From 47934dc4c277725c3615794a8409158cdae117e7 Mon Sep 17 00:00:00 2001
From: James Greenhalgh <james.greenhalgh@arm.com>
Date: Mon, 29 Apr 2013 11:08:30 +0000
Subject: [AArch64] fcvt instructions - arm_neon.h changes.

gcc/
	* config/aarch64/arm_neon.h
	(vcvt_f<32,64>_s<32,64>): Rewrite in C.
	(vcvt<q>_f<32,64>_s<32,64>): Rewrite using builtins.
	(vcvt_<high>_f<32,64>_f<32,64>): Likewise.
	(vcvt_<su><32,64>_f<32,64>): Likewise.
	(vcvta_<su><32,64>_f<32,64>): Likewise.
	(vcvtm_<su><32,64>_f<32,64>): Likewise.
	(vcvtn_<su><32,64>_f<32,64>): Likewise.
	(vcvtp_<su><32,64>_f<32,64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/vect-vcvt.c: New.

From-SVN: r198404
---
 gcc/ChangeLog                                |   12 +
 gcc/config/aarch64/arm_neon.h                | 4613 ++++++++++++--------------
 gcc/testsuite/ChangeLog                      |    4 +
 gcc/testsuite/gcc.target/aarch64/vect-vcvt.c |  132 +
 4 files changed, 2307 insertions(+), 2454 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-vcvt.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3c42d60..dc763cc 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,17 @@
 2013-04-29  James Greenhalgh  <james.greenhalgh@arm.com>
 
+	* config/aarch64/arm_neon.h
+	(vcvt_f<32,64>_s<32,64>): Rewrite in C.
+	(vcvt<q>_f<32,64>_s<32,64>): Rewrite using builtins.
+	(vcvt_<high>_f<32,64>_f<32,64>): Likewise.
+	(vcvt_<su><32,64>_f<32,64>): Likewise.
+	(vcvta_<su><32,64>_f<32,64>): Likewise.
+	(vcvtm_<su><32,64>_f<32,64>): Likewise.
+	(vcvtn_<su><32,64>_f<32,64>): Likewise.
+	(vcvtp_<su><32,64>_f<32,64>): Likewise.
+
+2013-04-29  James Greenhalgh  <james.greenhalgh@arm.com>
+
 	* config/aarch64/aarch64-simd.md
 	(2): New, maps to fix, fixuns.
 	(2): New, maps to
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c868a46..7d37744 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5882,100 +5882,12 @@ vcntq_u8 (uint8x16_t a)
 
 /* vcvt_f32_f16 not supported */
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_f64 (float64x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("fcvtn %0.2s,%1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_s32 (int32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("scvtf %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_u32 (uint32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("ucvtf %0.2s, %1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_f64_f32 (float32x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("fcvtl %0.2d,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vcvt_f64_s64 (uint64x1_t a)
-{
-  float64x1_t result;
-  __asm__ ("scvtf %d0, %d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vcvt_f64_u64 (uint64x1_t a)
-{
-  float64x1_t result;
-  __asm__ ("ucvtf %d0, %d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 /* vcvt_high_f16_f32 not supported */
 
 /* vcvt_high_f32_f16 not supported */
 
 static float32x2_t vdup_n_f32 (float32_t);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvt_high_f32_f64 (float32x2_t a, float64x2_t b)
-{
-  float32x4_t result = vcombine_f32 (a, vdup_n_f32 (0.0f));
-  __asm__ ("fcvtn2 %0.4s,%2.2d"
-           :
"+w"(result) - : "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vcvt_high_f64_f32 (float32x4_t a) -{ - float64x2_t result; - __asm__ ("fcvtl2 %0.2d,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - #define vcvt_n_f32_s32(a, b) \ __extension__ \ ({ \ @@ -6024,160 +5936,6 @@ vcvt_high_f64_f32 (float32x4_t a) result; \ }) -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvt_s32_f32 (float32x2_t a) -{ - int32x2_t result; - __asm__ ("fcvtzs %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvt_u32_f32 (float32x2_t a) -{ - uint32x2_t result; - __asm__ ("fcvtzu %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvta_s32_f32 (float32x2_t a) -{ - int32x2_t result; - __asm__ ("fcvtas %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvta_u32_f32 (float32x2_t a) -{ - uint32x2_t result; - __asm__ ("fcvtau %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtad_s64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtas %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtad_u64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtau %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtaq_s32_f32 (float32x4_t a) -{ - int32x4_t result; - __asm__ ("fcvtas %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtaq_s64_f64 (float64x2_t a) -{ - int64x2_t result; - __asm__ ("fcvtas %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtaq_u32_f32 (float32x4_t a) -{ - uint32x4_t result; - __asm__ ("fcvtau %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtaq_u64_f64 (float64x2_t a) -{ - uint64x2_t result; - __asm__ ("fcvtau %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtas_s64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtas %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtas_u64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtau %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vcvtd_f64_s64 (int64_t a) -{ - int64_t result; - __asm__ ("scvtf %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64_t 
__attribute__ ((__always_inline__)) -vcvtd_f64_u64 (uint64_t a) -{ - uint64_t result; - __asm__ ("ucvtf %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - #define vcvtd_n_f64_s64(a, b) \ __extension__ \ ({ \ @@ -6226,220 +5984,166 @@ vcvtd_f64_u64 (uint64_t a) result; \ }) -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtd_s64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtzs %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_f32_s32(a, b) \ + __extension__ \ + ({ \ + int32x4_t a_ = (a); \ + float32x4_t result; \ + __asm__ ("scvtf %0.4s, %1.4s, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtd_u64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtzu %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_f32_u32(a, b) \ + __extension__ \ + ({ \ + uint32x4_t a_ = (a); \ + float32x4_t result; \ + __asm__ ("ucvtf %0.4s, %1.4s, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvtm_s32_f32 (float32x2_t a) -{ - int32x2_t result; - __asm__ ("fcvtms %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_f64_s64(a, b) \ + __extension__ \ + ({ \ + int64x2_t a_ = (a); \ + float64x2_t result; \ + __asm__ ("scvtf %0.2d, %1.2d, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvtm_u32_f32 (float32x2_t a) -{ - uint32x2_t result; - __asm__ ("fcvtmu %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtmd_s64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtms %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtmd_u64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtmu %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtmq_s32_f32 (float32x4_t a) -{ - int32x4_t result; - __asm__ ("fcvtms %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtmq_s64_f64 (float64x2_t a) -{ - int64x2_t result; - __asm__ ("fcvtms %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtmq_u32_f32 (float32x4_t a) -{ - uint32x4_t result; - __asm__ ("fcvtmu %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtmq_u64_f64 (float64x2_t a) -{ - uint64x2_t result; - __asm__ ("fcvtmu %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_f64_u64(a, b) \ + __extension__ \ + ({ \ + uint64x2_t a_ = (a); \ + float64x2_t result; \ + __asm__ ("ucvtf %0.2d, %1.2d, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No 
clobbers */); \ + result; \ + }) -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtms_s64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtms %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_s32_f32(a, b) \ + __extension__ \ + ({ \ + float32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("fcvtzs %0.4s, %1.4s, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtms_u64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtmu %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_s64_f64(a, b) \ + __extension__ \ + ({ \ + float64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("fcvtzs %0.2d, %1.2d, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvtn_s32_f32 (float32x2_t a) -{ - int32x2_t result; - __asm__ ("fcvtns %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_u32_f32(a, b) \ + __extension__ \ + ({ \ + float32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("fcvtzu %0.4s, %1.4s, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvtn_u32_f32 (float32x2_t a) -{ - uint32x2_t result; - __asm__ ("fcvtnu %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvtq_n_u64_f64(a, b) \ + __extension__ \ + ({ \ + float64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("fcvtzu %0.2d, %1.2d, #%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtnd_s64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtns %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvts_n_f32_s32(a, b) \ + __extension__ \ + ({ \ + int32_t a_ = (a); \ + int32_t result; \ + __asm__ ("scvtf %s0,%s1,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtnd_u64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtnu %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvts_n_f32_u32(a, b) \ + __extension__ \ + ({ \ + uint32_t a_ = (a); \ + uint32_t result; \ + __asm__ ("ucvtf %s0,%s1,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtnq_s32_f32 (float32x4_t a) -{ - int32x4_t result; - __asm__ ("fcvtns %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvts_n_s32_f32(a, b) \ + __extension__ \ + ({ \ + float32_t a_ = (a); \ + float32_t result; \ + __asm__ ("fcvtzs %s0,%s1,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtnq_s64_f64 (float64x2_t a) -{ - int64x2_t result; - __asm__ ("fcvtns %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vcvts_n_u32_f32(a, b) \ + __extension__ \ + ({ \ + float32_t a_ = 
(a); \ + float32_t result; \ + __asm__ ("fcvtzu %s0,%s1,%2" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtnq_u32_f32 (float32x4_t a) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvtx_f32_f64 (float64x2_t a) { - uint32x4_t result; - __asm__ ("fcvtnu %0.4s, %1.4s" + float32x2_t result; + __asm__ ("fcvtxn %0.2s,%1.2d" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtnq_u64_f64 (float64x2_t a) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtx_high_f32_f64 (float64x2_t a) { - uint64x2_t result; - __asm__ ("fcvtnu %0.2d, %1.2d" + float32x4_t result; + __asm__ ("fcvtxn2 %0.4s,%1.2d" : "=w"(result) : "w"(a) : /* No clobbers */); @@ -6447,1406 +6151,976 @@ vcvtnq_u64_f64 (float64x2_t a) } __extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtns_s64_f64 (float32_t a) +vcvtxd_f32_f64 (float64_t a) { float32_t result; - __asm__ ("fcvtns %s0,%s1" + __asm__ ("fcvtxn %s0,%d1" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtns_u64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtnu %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vdup_lane_f32(a, b) \ + __extension__ \ + ({ \ + float32x2_t a_ = (a); \ + float32x2_t result; \ + __asm__ ("dup %0.2s,%1.s[%2]" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvtp_s32_f32 (float32x2_t a) -{ - int32x2_t result; - __asm__ ("fcvtps %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvtp_u32_f32 (float32x2_t a) -{ - uint32x2_t result; - __asm__ ("fcvtpu %0.2s, %1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtpd_s64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtps %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vcvtpd_u64_f64 (float64_t a) -{ - float64_t result; - __asm__ ("fcvtpu %d0,%d1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtpq_s32_f32 (float32x4_t a) -{ - int32x4_t result; - __asm__ ("fcvtps %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtpq_s64_f64 (float64x2_t a) -{ - int64x2_t result; - __asm__ ("fcvtps %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtpq_u32_f32 (float32x4_t a) -{ - uint32x4_t result; - __asm__ ("fcvtpu %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtpq_u64_f64 (float64x2_t a) -{ - uint64x2_t result; - __asm__ ("fcvtpu %0.2d, %1.2d" - : "=w"(result) 
- : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtps_s64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtps %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtps_u64_f64 (float32_t a) -{ - float32_t result; - __asm__ ("fcvtpu %s0,%s1" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vcvtq_f32_s32 (int32x4_t a) -{ - float32x4_t result; - __asm__ ("scvtf %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vcvtq_f32_u32 (uint32x4_t a) -{ - float32x4_t result; - __asm__ ("ucvtf %0.4s, %1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vcvtq_f64_s64 (int64x2_t a) -{ - float64x2_t result; - __asm__ ("scvtf %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vdup_lane_p8(a, b) \ + __extension__ \ + ({ \ + poly8x8_t a_ = (a); \ + poly8x8_t result; \ + __asm__ ("dup %0.8b,%1.b[%2]" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vcvtq_f64_u64 (uint64x2_t a) -{ - float64x2_t result; - __asm__ ("ucvtf %0.2d, %1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} +#define vdup_lane_p16(a, b) \ + __extension__ \ + ({ \ + poly16x4_t a_ = (a); \ + poly16x4_t result; \ + __asm__ ("dup %0.4h,%1.h[%2]" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) -#define vcvtq_n_f32_s32(a, b) \ +#define vdup_lane_s8(a, b) \ __extension__ \ ({ \ - int32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("scvtf %0.4s, %1.4s, #%2" \ + int8x8_t a_ = (a); \ + int8x8_t result; \ + __asm__ ("dup %0.8b,%1.b[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_f32_u32(a, b) \ +#define vdup_lane_s16(a, b) \ __extension__ \ ({ \ - uint32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("ucvtf %0.4s, %1.4s, #%2" \ + int16x4_t a_ = (a); \ + int16x4_t result; \ + __asm__ ("dup %0.4h,%1.h[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_f64_s64(a, b) \ +#define vdup_lane_s32(a, b) \ __extension__ \ ({ \ - int64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("scvtf %0.2d, %1.2d, #%2" \ + int32x2_t a_ = (a); \ + int32x2_t result; \ + __asm__ ("dup %0.2s,%1.s[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_f64_u64(a, b) \ +#define vdup_lane_s64(a, b) \ __extension__ \ ({ \ - uint64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("ucvtf %0.2d, %1.2d, #%2" \ + int64x1_t a_ = (a); \ + int64x1_t result; \ + __asm__ ("ins %0.d[0],%1.d[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_s32_f32(a, b) \ +#define vdup_lane_u8(a, b) \ __extension__ \ ({ \ - float32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("fcvtzs %0.4s, %1.4s, #%2" \ + uint8x8_t a_ = (a); \ + uint8x8_t result; \ + __asm__ ("dup %0.8b,%1.b[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) 
-#define vcvtq_n_s64_f64(a, b) \ +#define vdup_lane_u16(a, b) \ __extension__ \ ({ \ - float64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("fcvtzs %0.2d, %1.2d, #%2" \ + uint16x4_t a_ = (a); \ + uint16x4_t result; \ + __asm__ ("dup %0.4h,%1.h[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_u32_f32(a, b) \ +#define vdup_lane_u32(a, b) \ __extension__ \ ({ \ - float32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("fcvtzu %0.4s, %1.4s, #%2" \ + uint32x2_t a_ = (a); \ + uint32x2_t result; \ + __asm__ ("dup %0.2s,%1.s[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vcvtq_n_u64_f64(a, b) \ +#define vdup_lane_u64(a, b) \ __extension__ \ ({ \ - float64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("fcvtzu %0.2d, %1.2d, #%2" \ + uint64x1_t a_ = (a); \ + uint64x1_t result; \ + __asm__ ("ins %0.d[0],%1.d[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtq_s32_f32 (float32x4_t a) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_n_f32 (float32_t a) { - int32x4_t result; - __asm__ ("fcvtzs %0.4s, %1.4s" + float32x2_t result; + __asm__ ("dup %0.2s, %w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtq_s64_f64 (float64x2_t a) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_n_p8 (uint32_t a) { - int64x2_t result; - __asm__ ("fcvtzs %0.2d, %1.2d" + poly8x8_t result; + __asm__ ("dup %0.8b,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtq_u32_f32 (float32x4_t a) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_n_p16 (uint32_t a) { - uint32x4_t result; - __asm__ ("fcvtzu %0.4s, %1.4s" + poly16x4_t result; + __asm__ ("dup %0.4h,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtq_u64_f64 (float64x2_t a) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_n_s8 (int32_t a) { - uint64x2_t result; - __asm__ ("fcvtzu %0.2d, %1.2d" + int8x8_t result; + __asm__ ("dup %0.8b,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vcvts_f64_s32 (int32_t a) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_n_s16 (int32_t a) { - int32_t result; - __asm__ ("scvtf %s0,%s1" + int16x4_t result; + __asm__ ("dup %0.4h,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcvts_f64_u32 (uint32_t a) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_n_s32 (int32_t a) { - uint32_t result; - __asm__ ("ucvtf %s0,%s1" + int32x2_t result; + __asm__ ("dup %0.2s,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -#define vcvts_n_f32_s32(a, b) \ - __extension__ \ - ({ \ - int32_t a_ = (a); \ - int32_t result; \ - __asm__ ("scvtf %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcvts_n_f32_u32(a, b) \ - 
__extension__ \ - ({ \ - uint32_t a_ = (a); \ - uint32_t result; \ - __asm__ ("ucvtf %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcvts_n_s32_f32(a, b) \ - __extension__ \ - ({ \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fcvtzs %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vcvts_n_u32_f32(a, b) \ - __extension__ \ - ({ \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fcvtzu %s0,%s1,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvts_s64_f64 (float32_t a) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vdup_n_s64 (int64_t a) { - float32_t result; - __asm__ ("fcvtzs %s0,%s1" + int64x1_t result; + __asm__ ("ins %0.d[0],%x1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvts_u64_f64 (float32_t a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_n_u8 (uint32_t a) { - float32_t result; - __asm__ ("fcvtzu %s0,%s1" + uint8x8_t result; + __asm__ ("dup %0.8b,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vcvtx_f32_f64 (float64x2_t a) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_n_u16 (uint32_t a) { - float32x2_t result; - __asm__ ("fcvtxn %0.2s,%1.2d" + uint16x4_t result; + __asm__ ("dup %0.4h,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vcvtx_high_f32_f64 (float64x2_t a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_n_u32 (uint32_t a) { - float32x4_t result; - __asm__ ("fcvtxn2 %0.4s,%1.2d" + uint32x2_t result; + __asm__ ("dup %0.2s,%w1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vcvtxd_f32_f64 (float64_t a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vdup_n_u64 (uint64_t a) { - float32_t result; - __asm__ ("fcvtxn %s0,%d1" + uint64x1_t result; + __asm__ ("ins %0.d[0],%x1" : "=w"(result) - : "w"(a) + : "r"(a) : /* No clobbers */); return result; } -#define vdup_lane_f32(a, b) \ +#define vdupd_lane_f64(a, b) \ + __extension__ \ + ({ \ + float64x2_t a_ = (a); \ + float64_t result; \ + __asm__ ("dup %d0, %1.d[%2]" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vdupq_lane_f32(a, b) \ __extension__ \ ({ \ float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ + float32x4_t result; \ + __asm__ ("dup %0.4s,%1.s[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_p8(a, b) \ +#define vdupq_lane_f64(a, b) \ + __extension__ \ + ({ \ + float64x1_t a_ = (a); \ + float64x2_t result; \ + __asm__ ("dup %0.2d,%1.d[%2]" \ + : "=w"(result) \ + : "w"(a_), "i"(b) \ + : /* No clobbers */); \ + result; \ + }) + +#define vdupq_lane_p8(a, b) \ __extension__ \ ({ \ poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ + poly8x16_t result; \ + __asm__ ("dup %0.16b,%1.b[%2]" \ : "=w"(result) \ 
: "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_p16(a, b) \ +#define vdupq_lane_p16(a, b) \ __extension__ \ ({ \ poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ + poly16x8_t result; \ + __asm__ ("dup %0.8h,%1.h[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s8(a, b) \ +#define vdupq_lane_s8(a, b) \ __extension__ \ ({ \ int8x8_t a_ = (a); \ - int8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ + int8x16_t result; \ + __asm__ ("dup %0.16b,%1.b[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s16(a, b) \ +#define vdupq_lane_s16(a, b) \ __extension__ \ ({ \ int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ + int16x8_t result; \ + __asm__ ("dup %0.8h,%1.h[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s32(a, b) \ +#define vdupq_lane_s32(a, b) \ __extension__ \ ({ \ int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ + int32x4_t result; \ + __asm__ ("dup %0.4s,%1.s[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s64(a, b) \ +#define vdupq_lane_s64(a, b) \ __extension__ \ ({ \ int64x1_t a_ = (a); \ - int64x1_t result; \ - __asm__ ("ins %0.d[0],%1.d[%2]" \ + int64x2_t result; \ + __asm__ ("dup %0.2d,%1.d[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u8(a, b) \ +#define vdupq_lane_u8(a, b) \ __extension__ \ ({ \ uint8x8_t a_ = (a); \ - uint8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ + uint8x16_t result; \ + __asm__ ("dup %0.16b,%1.b[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u16(a, b) \ +#define vdupq_lane_u16(a, b) \ __extension__ \ ({ \ uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ + uint16x8_t result; \ + __asm__ ("dup %0.8h,%1.h[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u32(a, b) \ +#define vdupq_lane_u32(a, b) \ __extension__ \ ({ \ uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ + uint32x4_t result; \ + __asm__ ("dup %0.4s,%1.s[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u64(a, b) \ +#define vdupq_lane_u64(a, b) \ __extension__ \ ({ \ uint64x1_t a_ = (a); \ - uint64x1_t result; \ - __asm__ ("ins %0.d[0],%1.d[%2]" \ + uint64x2_t result; \ + __asm__ ("dup %0.2d,%1.d[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vdup_n_f32 (float32_t a) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_n_f32 (float32_t a) { - float32x2_t result; - __asm__ ("dup %0.2s, %w1" + float32x4_t result; + __asm__ ("dup %0.4s, %w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vdup_n_p8 (uint32_t a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_n_f64 (float64_t a) { - poly8x8_t result; - __asm__ ("dup %0.8b,%w1" + float64x2_t result; + __asm__ ("dup %0.2d, %x1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vdup_n_p16 (uint32_t a) 
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_n_p8 (uint32_t a) { - poly16x4_t result; - __asm__ ("dup %0.4h,%w1" + poly8x16_t result; + __asm__ ("dup %0.16b,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vdup_n_s8 (int32_t a) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_n_p16 (uint32_t a) { - int8x8_t result; - __asm__ ("dup %0.8b,%w1" + poly16x8_t result; + __asm__ ("dup %0.8h,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vdup_n_s16 (int32_t a) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_n_s8 (int32_t a) { - int16x4_t result; - __asm__ ("dup %0.4h,%w1" + int8x16_t result; + __asm__ ("dup %0.16b,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vdup_n_s32 (int32_t a) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_n_s16 (int32_t a) { - int32x2_t result; - __asm__ ("dup %0.2s,%w1" + int16x8_t result; + __asm__ ("dup %0.8h,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vdup_n_s64 (int64_t a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_n_s32 (int32_t a) { - int64x1_t result; - __asm__ ("ins %0.d[0],%x1" + int32x4_t result; + __asm__ ("dup %0.4s,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vdup_n_u8 (uint32_t a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_n_s64 (int64_t a) { - uint8x8_t result; - __asm__ ("dup %0.8b,%w1" + int64x2_t result; + __asm__ ("dup %0.2d,%x1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vdup_n_u16 (uint32_t a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_n_u8 (uint32_t a) { - uint16x4_t result; - __asm__ ("dup %0.4h,%w1" + uint8x16_t result; + __asm__ ("dup %0.16b,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vdup_n_u32 (uint32_t a) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_n_u16 (uint32_t a) { - uint32x2_t result; - __asm__ ("dup %0.2s,%w1" + uint16x8_t result; + __asm__ ("dup %0.8h,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vdup_n_u64 (uint64_t a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_n_u32 (uint32_t a) { - uint64x1_t result; - __asm__ ("ins %0.d[0],%x1" + uint32x4_t result; + __asm__ ("dup %0.4s,%w1" : "=w"(result) : "r"(a) : /* No clobbers */); return result; } -#define vdupd_lane_f64(a, b) \ +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_n_u64 (uint64_t a) +{ + uint64x2_t result; + __asm__ ("dup %0.2d,%x1" + : "=w"(result) + : "r"(a) + : /* No clobbers */); + return result; +} + +#define vdups_lane_f32(a, b) \ __extension__ \ ({ \ - float64x2_t a_ = (a); \ - float64_t result; \ - __asm__ 
("dup %d0, %1.d[%2]" \ + float32x4_t a_ = (a); \ + float32_t result; \ + __asm__ ("dup %s0, %1.s[%2]" \ : "=w"(result) \ : "w"(a_), "i"(b) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_f32(a, b) \ +#define vext_f32(a, b, c) \ __extension__ \ ({ \ + float32x2_t b_ = (b); \ float32x2_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ + float32x2_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_f64(a, b) \ +#define vext_f64(a, b, c) \ __extension__ \ ({ \ + float64x1_t b_ = (b); \ float64x1_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ + float64x1_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_p8(a, b) \ +#define vext_p8(a, b, c) \ __extension__ \ ({ \ + poly8x8_t b_ = (b); \ poly8x8_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ + poly8x8_t result; \ + __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_p16(a, b) \ +#define vext_p16(a, b, c) \ __extension__ \ ({ \ + poly16x4_t b_ = (b); \ poly16x4_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ + poly16x4_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s8(a, b) \ +#define vext_s8(a, b, c) \ __extension__ \ ({ \ + int8x8_t b_ = (b); \ int8x8_t a_ = (a); \ - int8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ + int8x8_t result; \ + __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s16(a, b) \ +#define vext_s16(a, b, c) \ __extension__ \ ({ \ + int16x4_t b_ = (b); \ int16x4_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ + int16x4_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s32(a, b) \ +#define vext_s32(a, b, c) \ __extension__ \ ({ \ + int32x2_t b_ = (b); \ int32x2_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ + int32x2_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s64(a, b) \ +#define vext_s64(a, b, c) \ __extension__ \ ({ \ + int64x1_t b_ = (b); \ int64x1_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ + int64x1_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u8(a, b) \ +#define vext_u8(a, b, c) \ __extension__ \ ({ \ + uint8x8_t b_ = (b); \ uint8x8_t a_ = (a); \ - uint8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ + uint8x8_t result; \ + __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u16(a, b) \ +#define vext_u16(a, b, c) \ __extension__ \ ({ \ + uint16x4_t b_ = (b); \ uint16x4_t a_ = (a); \ - uint16x8_t 
result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ + uint16x4_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u32(a, b) \ +#define vext_u32(a, b, c) \ __extension__ \ ({ \ + uint32x2_t b_ = (b); \ uint32x2_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ + uint32x2_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u64(a, b) \ +#define vext_u64(a, b, c) \ __extension__ \ ({ \ + uint64x1_t b_ = (b); \ uint64x1_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ + uint64x1_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vdupq_n_f32 (float32_t a) -{ - float32x4_t result; - __asm__ ("dup %0.4s, %w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} +#define vextq_f32(a, b, c) \ + __extension__ \ + ({ \ + float32x4_t b_ = (b); \ + float32x4_t a_ = (a); \ + float32x4_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vdupq_n_f64 (float64_t a) -{ - float64x2_t result; - __asm__ ("dup %0.2d, %x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} +#define vextq_f64(a, b, c) \ + __extension__ \ + ({ \ + float64x2_t b_ = (b); \ + float64x2_t a_ = (a); \ + float64x2_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vdupq_n_p8 (uint32_t a) -{ - poly8x16_t result; - __asm__ ("dup %0.16b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} +#define vextq_p8(a, b, c) \ + __extension__ \ + ({ \ + poly8x16_t b_ = (b); \ + poly8x16_t a_ = (a); \ + poly8x16_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vdupq_n_p16 (uint32_t a) -{ - poly16x8_t result; - __asm__ ("dup %0.8h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} +#define vextq_p16(a, b, c) \ + __extension__ \ + ({ \ + poly16x8_t b_ = (b); \ + poly16x8_t a_ = (a); \ + poly16x8_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vdupq_n_s8 (int32_t a) -{ - int8x16_t result; - __asm__ ("dup %0.16b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} +#define vextq_s8(a, b, c) \ + __extension__ \ + ({ \ + int8x16_t b_ = (b); \ + int8x16_t a_ = (a); \ + int8x16_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vdupq_n_s16 (int32_t a) -{ - int16x8_t 
result; - __asm__ ("dup %0.8h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vdupq_n_s32 (int32_t a) -{ - int32x4_t result; - __asm__ ("dup %0.4s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vdupq_n_s64 (int64_t a) -{ - int64x2_t result; - __asm__ ("dup %0.2d,%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vdupq_n_u8 (uint32_t a) -{ - uint8x16_t result; - __asm__ ("dup %0.16b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vdupq_n_u16 (uint32_t a) -{ - uint16x8_t result; - __asm__ ("dup %0.8h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vdupq_n_u32 (uint32_t a) -{ - uint32x4_t result; - __asm__ ("dup %0.4s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vdupq_n_u64 (uint64_t a) -{ - uint64x2_t result; - __asm__ ("dup %0.2d,%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -#define vdups_lane_f32(a, b) \ - __extension__ \ - ({ \ - float32x4_t a_ = (a); \ - float32_t result; \ - __asm__ ("dup %s0, %1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vext_f32(a, b, c) \ +#define vextq_s16(a, b, c) \ __extension__ \ ({ \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ + int16x8_t b_ = (b); \ + int16x8_t a_ = (a); \ + int16x8_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_f64(a, b, c) \ +#define vextq_s32(a, b, c) \ __extension__ \ ({ \ - float64x1_t b_ = (b); \ - float64x1_t a_ = (a); \ - float64x1_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ + int32x4_t b_ = (b); \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_p8(a, b, c) \ +#define vextq_s64(a, b, c) \ __extension__ \ ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ + int64x2_t b_ = (b); \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_p16(a, b, c) \ +#define vextq_u8(a, b, c) \ __extension__ \ ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ + uint8x16_t b_ = (b); \ + uint8x16_t a_ = (a); \ + uint8x16_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_s8(a, b, c) \ +#define vextq_u16(a, b, c) \ __extension__ \ ({ \ - int8x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x8_t result; \ - __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ + uint16x8_t b_ = (b); \ + uint16x8_t a_ = (a); \ + uint16x8_t 
result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_s16(a, b, c) \ +#define vextq_u32(a, b, c) \ __extension__ \ ({ \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ + uint32x4_t b_ = (b); \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_s32(a, b, c) \ +#define vextq_u64(a, b, c) \ __extension__ \ ({ \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ + uint64x2_t b_ = (b); \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_s64(a, b, c) \ +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +{ + float32x2_t result; + __asm__ ("fmla %0.2s,%2.2s,%3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vfma_lane_f32(a, b, c, d) \ __extension__ \ ({ \ - int64x1_t b_ = (b); \ - int64x1_t a_ = (a); \ - int64x1_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ + float32x2_t c_ = (c); \ + float32x2_t b_ = (b); \ + float32x2_t a_ = (a); \ + float32x2_t result; \ + __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \ : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vext_u8(a, b, c) \ +#define vfmad_lane_f64(a, b, c) \ __extension__ \ ({ \ - uint8x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x8_t result; \ - __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ + float64x2_t b_ = (b); \ + float64_t a_ = (a); \ + float64_t result; \ + __asm__ ("fmla %d0,%d1,%2.d[%3]" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vext_u16(a, b, c) \ +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +{ + float32x4_t result; + __asm__ ("fmla %0.4s,%2.4s,%3.4s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +{ + float64x2_t result; + __asm__ ("fmla %0.2d,%2.2d,%3.2d" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vfmaq_lane_f32(a, b, c, d) \ __extension__ \ ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ + float32x4_t c_ = (c); \ + float32x4_t b_ = (b); \ + float32x4_t a_ = (a); \ + float32x4_t result; \ + __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \ : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vext_u32(a, b, c) \ +#define vfmaq_lane_f64(a, b, c, d) \ __extension__ \ ({ \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ + float64x2_t c_ = (c); \ + float64x2_t b_ = (b); \ + float64x2_t a_ = (a); \ + float64x2_t result; \ + __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \ : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ + : "0"(a_), 
"w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vext_u64(a, b, c) \ +#define vfmas_lane_f32(a, b, c) \ __extension__ \ ({ \ - uint64x1_t b_ = (b); \ - uint64x1_t a_ = (a); \ - uint64x1_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s8(a, b, c) \ - __extension__ \ - ({ \ - int8x16_t b_ = (b); \ - int8x16_t a_ = (a); \ - int8x16_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u8(a, b, c) \ - __extension__ \ - ({ \ - uint8x16_t b_ = (b); \ - uint8x16_t a_ = (a); \ - uint8x16_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), 
"w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) -{ - float32x2_t result; - __asm__ ("fmla %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -#define vfma_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmad_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64_t a_ = (a); \ - float64_t result; \ - __asm__ ("fmla %d0,%d1,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) -{ - float32x4_t result; - __asm__ ("fmla %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) -{ - float64x2_t result; - __asm__ ("fmla %0.2d,%2.2d,%3.2d" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -#define vfmaq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmaq_lane_f64(a, b, c, d) \ - __extension__ \ - ({ \ - float64x2_t c_ = (c); \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmas_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fmla %s0,%s1,%2.s[%3]" \ + float32x4_t b_ = (b); \ + float32_t a_ = (a); \ + float32_t result; \ + __asm__ ("fmla %s0,%s1,%2.s[%3]" \ : "=w"(result) \ : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ @@ -18364,1838 +17638,2269 @@ __ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) __ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) __ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) -#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - intype b, const int c) \ - { \ - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ - "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*(intype *) ptr) \ - : "Q"(b), "i"(c) \ - : "memory", "v16", "v17", "v18"); \ - } +#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \ + lnsuffix, funcsuffix, Q) \ + __extension__ static __inline void \ + __attribute__ ((__always_inline__)) \ + vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ + intype b, const int c) \ + { \ + __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ + "st3 {v16." #lnsuffix " - v18." 
#lnsuffix "}[%2], %0\n\t" \ + : "=Q"(*(intype *) ptr) \ + : "Q"(b), "i"(c) \ + : "memory", "v16", "v17", "v18"); \ + } + +__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,) +__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) +__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) +__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) +__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) +__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) +__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) +__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) +__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) +__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) +__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) +__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) +__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) +__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) +__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) +__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) +__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) +__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) +__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) +__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) +__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) +__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) +__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) +__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) + +#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \ + lnsuffix, funcsuffix, Q) \ + __extension__ static __inline void \ + __attribute__ ((__always_inline__)) \ + vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ + intype b, const int c) \ + { \ + __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ + "st4 {v16." #lnsuffix " - v19." 
#lnsuffix "}[%2], %0\n\t" \ + : "=Q"(*(intype *) ptr) \ + : "Q"(b), "i"(c) \ + : "memory", "v16", "v17", "v18", "v19"); \ + } + +__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,) +__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) +__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) +__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) +__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) +__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) +__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) +__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) +__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) +__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) +__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) +__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) +__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) +__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) +__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) +__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) +__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) +__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) +__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) +__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) +__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) +__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) +__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) +__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vaddlv_s32 (int32x2_t a) +{ + int64_t result; + __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vaddlv_u32 (uint32x2_t a) +{ + uint64_t result; + __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vaddv_s32 (int32x2_t a) +{ + int32_t result; + __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vaddv_u32 (uint32x2_t a) +{ + uint32_t result; + __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vmaxnmv_f32 (float32x2_t a) +{ + float32_t result; + __asm__ ("fmaxnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vminnmv_f32 (float32x2_t a) +{ + float32_t result; + __asm__ ("fminnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vmaxnmvq_f64 (float64x2_t a) +{ + float64_t result; + __asm__ ("fmaxnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vmaxv_s32 (int32x2_t a) +{ + int32_t result; + __asm__ ("smaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vmaxv_u32 (uint32x2_t a) +{ + uint32_t result; + __asm__ ("umaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vminnmvq_f64 (float64x2_t a) +{ + float64_t result; + __asm__ ("fminnmp %0.2d, %1.2d, %1.2d" : 
"=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vminv_s32 (int32x2_t a) +{ + int32_t result; + __asm__ ("sminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vminv_u32 (uint32x2_t a) +{ + uint32_t result; + __asm__ ("uminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vpaddd_s64 (int64x2_t __a) +{ + return __builtin_aarch64_addpdi (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c); +} + +/* Table intrinsics. 
*/ + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +{ + poly8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl1_s8 (int8x16_t a, int8x8_t b) +{ + int8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +{ + uint8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +{ + poly8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl1q_s8 (int8x16_t a, int8x16_t b) +{ + int8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +{ + uint8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl2_s8 (int8x16x2_t tab, int8x8_t idx) +{ + int8x8_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +{ + uint8x8_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +{ + poly8x8_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl2q_s8 (int8x16x2_t tab, int8x16_t idx) +{ + int8x16_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) +{ + uint8x16_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) +{ + poly8x16_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline 
int8x8_t __attribute__ ((__always_inline__)) +vqtbl3_s8 (int8x16x3_t tab, int8x8_t idx) +{ + int8x8_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +{ + uint8x8_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) +{ + poly8x8_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl3q_s8 (int8x16x3_t tab, int8x16_t idx) +{ + int8x16_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +{ + uint8x16_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +{ + poly8x16_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl4_s8 (int8x16x4_t tab, int8x8_t idx) +{ + int8x8_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) +{ + uint8x8_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +{ + poly8x8_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl4q_s8 (int8x16x4_t tab, int8x16_t idx) +{ + int8x16_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +{ + uint8x16_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + 
:"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} -__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,) -__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) -__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) -__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) -__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) -__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) -__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) -__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) -__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) -__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) -__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) -__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) -__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +{ + poly8x16_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} -#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - intype b, const int c) \ - { \ - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "st4 {v16." #lnsuffix " - v19." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*(intype *) ptr) \ - : "Q"(b), "i"(c) \ - : "memory", "v16", "v17", "v18", "v19"); \ - } -__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,) -__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) -__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) -__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) -__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) -__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) -__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) -__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) -__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) -__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) -__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) -__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx1_s8 (int8x8_t r, int8x16_t tab, int8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vaddlv_s32 (int32x2_t a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) { - int64_t result; - __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + uint8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); return result; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vaddlv_u32 (uint32x2_t a) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) { - uint64_t result; - __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + poly8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); return result; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vaddv_s32 (int32x2_t a) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx1q_s8 (int8x16_t r, int8x16_t tab, int8x16_t idx) { - int32_t result; - __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + int8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); return result; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vaddv_u32 (uint32x2_t a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) { - uint32_t result; - __asm__ ("addp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + uint8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No 
clobbers */); return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vmaxnmv_f32 (float32x2_t a) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) { - float32_t result; - __asm__ ("fmaxnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + poly8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); return result; } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vminnmv_f32 (float32x2_t a) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, int8x8_t idx) { - float32_t result; - __asm__ ("fminnmp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + int8x8_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); return result; } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vmaxnmvq_f64 (float64x2_t a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) { - float64_t result; - __asm__ ("fmaxnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : ); + uint8x8_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); return result; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vmaxv_s32 (int32x2_t a) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) { - int32_t result; - __asm__ ("smaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + poly8x8_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); return result; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vmaxv_u32 (uint32x2_t a) + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, int8x16_t idx) { - uint32_t result; - __asm__ ("umaxp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + int8x16_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); return result; } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vminnmvq_f64 (float64x2_t a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) { - float64_t result; - __asm__ ("fminnmp %0.2d, %1.2d, %1.2d" : "=w"(result) : "w"(a) : ); + uint8x16_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +{ + poly8x16_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + + +__extension__ static __inline int8x8_t __attribute__ 
((__always_inline__)) +vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, int8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); return result; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vminv_s32 (int32x2_t a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) { - int32_t result; - __asm__ ("sminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + uint8x8_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); return result; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vminv_u32 (uint32x2_t a) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) { - uint32_t result; - __asm__ ("uminp %0.2s, %1.2s, %1.2s" : "=w"(result) : "w"(a) : ); + poly8x8_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); return result; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vpaddd_s64 (int64x2_t __a) -{ - return __builtin_aarch64_addpdi (__a); -} -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, int8x16_t idx) { - return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c); + int8x16_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c); + uint8x16_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c); + poly8x16_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, int8x8_t idx) { - return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c); + int8x8_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, 
%1\n\t" + "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c); + uint8x8_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) { - return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c); + poly8x8_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, int8x16_t idx) { - return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c); + int8x16_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) { - return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c); + uint8x16_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; } -/* Table intrinsics. */ - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) { - poly8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); + poly8x16_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); return result; } +/* V7 legacy table intrinsics. 
*/ + __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl1_s8 (int8x16_t a, int8x8_t b) +vtbl1_s8 (int8x8_t tab, int8x8_t idx) { int8x8_t result; + int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) - : "w"(a), "w"(b) + : "w"(temp), "w"(idx) : /* No clobbers */); return result; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) { uint8x8_t result; + uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) -{ - poly8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "w"(temp), "w"(idx) : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) { - uint8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + poly8x8_t result; + poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) - : "w"(a), "w"(b) + : "w"(temp), "w"(idx) : /* No clobbers */); return result; } __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl2_s8 (int8x16x2_t tab, int8x8_t idx) +vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) { int8x8_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); + int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); return result; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) { uint8x8_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); + uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); return result; } __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) { poly8x8_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); + poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_s8 (int8x16x2_t tab, int8x16_t idx) +__extension__ static __inline int8x8_t __attribute__ 
((__always_inline__)) +vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) { - int8x16_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); + int8x8_t result; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) -{ - uint8x16_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) { - poly8x16_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); + poly8x8_t result; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl3_s8 (int8x16x3_t tab, int8x8_t idx) +vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) { int8x8_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) { uint8x8_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t 
idx) +vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) { poly8x8_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_s8 (int8x16x3_t tab, int8x16_t idx) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx) { - int8x16_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); + int8x8_t result; + int8x8_t tmp1; + int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0))); + __asm__ ("movi %0.8b, 8\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {%2.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "w"(temp), "w"(idx), "w"(r) + : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx) { - uint8x16_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); + uint8x8_t result; + uint8x8_t tmp1; + uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0))); + __asm__ ("movi %0.8b, 8\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {%2.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "w"(temp), "w"(idx), "w"(r) + : /* No clobbers */); return result; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx) { - poly8x16_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); + poly8x8_t result; + poly8x8_t tmp1; + poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0))); + __asm__ ("movi %0.8b, 8\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {%2.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "w"(temp), "w"(idx), "w"(r) + : /* No clobbers */); return result; } __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl4_s8 (int8x16x4_t tab, int8x8_t idx) +vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) { - int8x8_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); + int8x8_t result = r; + int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); return result; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl4_u8 (uint8x16x4_t tab, 
uint8x8_t idx) +vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) { - uint8x8_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); + uint8x8_t result = r; + uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); return result; } __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) { - poly8x8_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); + poly8x8_t result = r; + poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); return result; } - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_s8 (int8x16x4_t tab, int8x16_t idx) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx) { - int8x16_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); + int8x8_t result; + int8x8_t tmp1; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" + "movi %0.8b, 24\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "Q"(temp), "w"(idx), "w"(r) + : "v16", "v17", "memory"); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx) { - uint8x16_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); + uint8x8_t result; + uint8x8_t tmp1; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" + "movi %0.8b, 24\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "Q"(temp), "w"(idx), "w"(r) + : "v16", "v17", "memory"); return result; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx) { - poly8x16_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); + poly8x8_t result; + poly8x8_t tmp1; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b}, 
%2\n\t" + "movi %0.8b, 24\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "Q"(temp), "w"(idx), "w"(r) + : "v16", "v17", "memory"); return result; } - __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx1_s8 (int8x8_t r, int8x16_t tab, int8x8_t idx) +vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx) { int8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx) { uint8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx) { poly8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_s8 (int8x16_t r, int8x16_t tab, int8x16_t idx) +/* End of temporary inline asm. */ + +/* Start of optimal implementations in approved order. 
*/ + +/* vabs */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vabs_f32 (float32x2_t __a) { - int8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __builtin_aarch64_absv2sf (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vabsq_f32 (float32x4_t __a) { - uint8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __builtin_aarch64_absv4sf (__a); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vabsq_f64 (float64x2_t __a) { - poly8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __builtin_aarch64_absv2df (__a); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, int8x8_t idx) +/* vadd */ + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vaddd_s64 (int64x1_t __a, int64x1_t __b) { - int8x8_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return __a + __b; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vaddd_u64 (uint64x1_t __a, uint64x1_t __b) { - uint8x8_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return __a + __b; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vaddv_f32 (float32x2_t __a) { - poly8x8_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + float32x2_t t = __builtin_aarch64_addvv2sf (__a); + return vget_lane_f32 (t, 0); } +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vaddvq_f32 (float32x4_t __a) +{ + float32x4_t t = __builtin_aarch64_addvv4sf (__a); + return vgetq_lane_f32 (t, 0); +} -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, int8x16_t idx) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vaddvq_f64 (float64x2_t __a) { - int8x16_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + float64x2_t t = __builtin_aarch64_addvv2df (__a); + return vgetq_lane_f64 (t, 0); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) +/* vceq */ + 
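 [Editorial aside, not part of the patch: the vceq* intrinsics below map
  directly onto the __builtin_aarch64_cmeq* builtins, so the comparisons
  stay visible to the compiler rather than hiding inside inline asm.  A
  minimal usage sketch; the helper name all_lanes_equal is hypothetical
  and only for illustration:

    #include <arm_neon.h>

    /* vceq_s32 sets a result lane to all-ones where the corresponding
       input lanes are equal, and to all-zeros otherwise.  */
    static int
    all_lanes_equal (int32x2_t a, int32x2_t b)
    {
      uint32x2_t m = vceq_s32 (a, b);
      return vget_lane_u32 (m, 0) == 0xffffffffu
             && vget_lane_u32 (m, 1) == 0xffffffffu;
    }]
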
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_p8 (poly8x8_t __a, poly8x8_t __b) { - uint8x16_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_s8 (int8x8_t __a, int8x8_t __b) { - poly8x16_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceq_s16 (int16x4_t __a, int16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); } +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_s32 (int32x2_t __a, int32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, int8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_s64 (int64x1_t __a, int64x1_t __b) { - int8x8_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) +vceq_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceq_u16 (uint16x4_t __a, uint16x4_t __b) { - poly8x8_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + (int16x4_t) __b); } +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + (int32x2_t) __b); +} -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, int8x16_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_u64 (uint64x1_t __a, uint64x1_t __b) { - int8x16_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint64x1_t) __builtin_aarch64_cmeqdi 
((int64x1_t) __a,
+    (int64x1_t) __b);
 }

 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx)
+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
 {
-  uint8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-           "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  poly8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t"
-           "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_s16 (int16x8_t __a, int16x8_t __b)
+{
+  return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_s32 (int32x4_t __a, int32x4_t __b)
+{
+  return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b);
 }

+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vceqq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b);
+}

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, int8x8_t idx)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  int8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-           "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  uint8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-           "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  poly8x8_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-           "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, (int64x2_t) __b);
+}

-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, int8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceqd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-           "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceqd_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-           "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
 }

-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceqzd_s64 (int64x1_t __a)
 {
-  poly8x16_t result = r;
-  __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t"
-           "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t"
-           :"+w"(result)
-           :"Q"(tab),"w"(idx)
-           :"memory", "v16", "v17", "v18", "v19");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, 0);
 }

-/* V7 legacy table intrinsics.  */
+/* vcge */

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl1_s8 (int8x8_t tab, int8x8_t idx)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcge_s8 (int8x8_t __a, int8x8_t __b)
 {
-  int8x8_t result;
-  int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0)));
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl1_u8 (uint8x8_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_s16 (int16x4_t __a, int16x4_t __b)
 {
-  uint8x8_t result;
-  uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0)));
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl1_p8 (poly8x8_t tab, uint8x8_t idx)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_s32 (int32x2_t __a, int32x2_t __b)
 {
-  poly8x8_t result;
-  poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0)));
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl2_s8 (int8x8x2_t tab, int8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcge_s64 (int64x1_t __a, int64x1_t __b)
 {
-  int8x8_t result;
-  int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
 }

 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx)
+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  uint8x8_t result;
-  uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __a, (int8x8_t) __b);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  poly8x8_t result;
-  poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]);
-  __asm__ ("tbl %0.8b, {%1.16b}, %2.8b"
-           : "=w"(result)
-           : "w"(temp), "w"(idx)
-           : /* No clobbers */);
-  return result;
+  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __a, (int16x4_t) __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  int8x8_t result;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __a, (int32x2_t) __b);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcge_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  uint8x8_t result;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, (int64x1_t) __b);
 }

-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  poly8x8_t result;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
+  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b);
 }

-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  int8x8_t result;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
"w"(idx) - : "v16", "v17", "memory"); - return result; + return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_s32 (int32x4_t __a, int32x4_t __b) { - uint8x8_t result; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_s64 (int64x2_t __a, int64x2_t __b) { - poly8x8_t result; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgeq_u16 (uint16x8_t __a, uint16x8_t __b) { - int8x8_t result; - int8x8_t tmp1; - int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (UINT64_C (0x0))); - __asm__ ("movi %0.8b, 8\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {%2.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "w"(temp), "w"(idx), "w"(r) - : /* No clobbers */); - return result; + return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __a, + (int16x8_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint8x8_t result; - uint8x8_t tmp1; - uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (UINT64_C (0x0))); - __asm__ ("movi %0.8b, 8\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {%2.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "w"(temp), "w"(idx), "w"(r) - : /* No clobbers */); - return result; + return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __a, + (int32x4_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_u64 (uint64x2_t __a, uint64x2_t __b) { - poly8x8_t result; - poly8x8_t tmp1; - poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (UINT64_C (0x0))); - __asm__ ("movi %0.8b, 8\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {%2.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "w"(temp), "w"(idx), "w"(r) - : /* No clobbers */); - return result; 
+ return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __a, + (int64x2_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcged_s64 (int64x1_t __a, int64x1_t __b) { - int8x8_t result = r; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcged_u64 (uint64x1_t __a, uint64x1_t __b) { - uint8x8_t result = r; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, + (int64x1_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgezd_s64 (int64x1_t __a) { - poly8x8_t result = r; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return (uint64x1_t) __builtin_aarch64_cmgedi (__a, 0); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx) +/* vcgt */ + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgt_s8 (int8x8_t __a, int8x8_t __b) { - int8x8_t result; - int8x8_t tmp1; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" - "movi %0.8b, 24\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "Q"(temp), "w"(idx), "w"(r) - : "v16", "v17", "memory"); - return result; + return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgt_s16 (int16x4_t __a, int16x4_t __b) { - uint8x8_t result; - uint8x8_t tmp1; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" - "movi %0.8b, 24\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "Q"(temp), "w"(idx), "w"(r) - : "v16", "v17", "memory"); - return result; + return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgt_s32 (int32x2_t __a, int32x2_t __b) { - poly8x8_t result; - poly8x8_t tmp1; - poly8x16x2_t temp; - 
temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" - "movi %0.8b, 24\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "Q"(temp), "w"(idx), "w"(r) - : "v16", "v17", "memory"); - return result; + return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgt_s64 (int64x1_t __a, int64x1_t __b) { - int8x8_t result = r; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx) +vcgt_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t result = r; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgt_u16 (uint16x4_t __a, uint16x4_t __b) { - poly8x8_t result = r; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __a, + (int16x4_t) __b); } -/* End of temporary inline asm. */ - -/* Start of optimal implementations in approved order. 
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __a, (int32x2_t) __b);
+}

-/* vabs */
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, (int64x1_t) __b);
+}

-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabs_f32 (float32x2_t __a)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_absv2sf (__a);
+  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b);
 }

-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabsq_f32 (float32x4_t __a)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_aarch64_absv4sf (__a);
+  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b);
 }

-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vabsq_f64 (float64x2_t __a)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_aarch64_absv2df (__a);
+  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b);
 }

-/* vadd */
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcgtq_s64 (int64x2_t __a, int64x2_t __b)
+{
+  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b);
+}

-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vaddd_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __a + __b;
+  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __a, (int8x16_t) __b);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vaddd_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __a + __b;
+  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __a, (int16x8_t) __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vaddv_f32 (float32x2_t __a)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  float32x2_t t = __builtin_aarch64_addvv2sf (__a);
-  return vget_lane_f32 (t, 0);
+  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __a, (int32x4_t) __b);
 }

-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vaddvq_f32 (float32x4_t __a)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  float32x4_t t = __builtin_aarch64_addvv4sf (__a);
-  return vgetq_lane_f32 (t, 0);
+  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __a, (int64x2_t) __b);
 }

-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vaddvq_f64 (float64x2_t __a)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgtd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  float64x2_t t = __builtin_aarch64_addvv2df (__a);
-  return vgetq_lane_f64 (t, 0);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
 }

-/* vceq */
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgtd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, (int64x1_t) __b);
+}

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcgtzd_s64 (int64x1_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, (int8x8_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, 0);
 }

+/* vcle */
+
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_s8 (int8x8_t __a, int8x8_t __b)
+vcle_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b);
+  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__b, __a);
 }

 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_s16 (int16x4_t __a, int16x4_t __b)
+vcle_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b);
+  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__b, __a);
 }

 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_s32 (int32x2_t __a, int32x2_t __b)
+vcle_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b);
+  return (uint32x2_t) __builtin_aarch64_cmgev2si (__b, __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_s64 (int64x1_t __a, int64x1_t __b)
+vcle_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
 }

 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, (int8x8_t) __b);
+  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __b, (int8x8_t) __a);
 }

 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, (int16x4_t) __b);
+  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __b, (int16x4_t) __a);
 }

 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, (int32x2_t) __b);
+  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __b, (int32x2_t) __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceq_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t) __builtin_aarch64_cmeqdi ((int64x1_t) __a, (int64x1_t) __b);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+vcle_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, (int8x16_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __b, (int64x1_t) __a);
 }

 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_s8 (int8x16_t __a, int8x16_t __b)
+vcleq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b);
+  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__b, __a);
 }

 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_s16 (int16x8_t __a, int16x8_t __b)
+vcleq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b);
+  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__b, __a);
 }

 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_s32 (int32x4_t __a, int32x4_t __b)
+vcleq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b);
+  return (uint32x4_t) __builtin_aarch64_cmgev4si (__b, __a);
 }

 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_s64 (int64x2_t __a, int64x2_t __b)
+vcleq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b);
+  return (uint64x2_t) __builtin_aarch64_cmgev2di (__b, __a);
 }

 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
+vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, (int8x16_t) __b);
+  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __b, (int8x16_t) __a);
 }

 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
+vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, (int16x8_t) __b);
+  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __b, (int16x8_t) __a);
 }

 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
+vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, (int32x4_t) __b);
+  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __b, (int32x4_t) __a);
 }

 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vceqq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, (int64x2_t) __b);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqd_s64 (int64x1_t __a, int64x1_t __b)
+vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
+  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __b, (int64x2_t) __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqd_u64 (uint64x1_t __a, uint64x1_t __b)
+vcled_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vceqzd_s64 (int64x1_t __a)
+vclezd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmeqdi (__a, 0);
+  return (uint64x1_t) __builtin_aarch64_cmledi (__a, 0);
 }

-/* vcge */
+/* vclt */

 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_s8 (int8x8_t __a, int8x8_t __b)
+vclt_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b);
+  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__b, __a);
 }

 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_s16 (int16x4_t __a, int16x4_t __b)
+vclt_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b);
+  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__b, __a);
 }

 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_s32 (int32x2_t __a, int32x2_t __b)
+vclt_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b);
+  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__b, __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_s64 (int64x1_t __a, int64x1_t __b)
+vclt_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
 }

 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_u8 (uint8x8_t __a, uint8x8_t __b)
+vclt_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __a, (int8x8_t) __b);
+  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __b, (int8x8_t) __a);
 }

 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_u16 (uint16x4_t __a, uint16x4_t __b)
+vclt_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __a, (int16x4_t) __b);
+  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __b, (int16x4_t) __a);
 }

 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_u32 (uint32x2_t __a, uint32x2_t __b)
+vclt_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __a, (int32x2_t) __b);
+  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __b, (int32x2_t) __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcge_u64 (uint64x1_t __a, uint64x1_t __b)
+vclt_u64 (uint64x1_t __a, uint64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, (int64x1_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __b, (int64x1_t) __a);
 }

 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_s8 (int8x16_t __a, int8x16_t __b)
+vcltq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b);
+  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__b, __a);
 }

 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_s16 (int16x8_t __a, int16x8_t __b)
+vcltq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b);
+  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__b, __a);
 }

 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_s32 (int32x4_t __a, int32x4_t __b)
+vcltq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b);
+  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__b, __a);
 }

 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_s64 (int64x2_t __a, int64x2_t __b)
+vcltq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b);
+  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__b, __a);
 }

 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __a, (int8x16_t) __b);
+  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __b, (int8x16_t) __a);
 }

 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __a, (int16x8_t) __b);
+  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __b, (int16x8_t) __a);
 }

 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __a, (int32x4_t) __b);
+  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __b, (int32x4_t) __a);
 }

 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgeq_u64 (uint64x2_t __a, uint64x2_t __b)
+vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __a, (int64x2_t) __b);
+  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __b, (int64x2_t) __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcged_s64 (int64x1_t __a, int64x1_t __b)
+vcltd_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, __b);
+  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
 }

 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcged_u64 (uint64x1_t __a, uint64x1_t __b)
+vcltzd_s64 (int64x1_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __a, (int64x1_t) __b);
+  return (uint64x1_t) __builtin_aarch64_cmltdi (__a, 0);
+}
+
+/* vcvt (double -> float).  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_f64 (float64x2_t __a)
+{
+  return __builtin_aarch64_float_truncate_lo_v2sf (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b);
+}
+
+/* vcvt (float -> double).  */
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvt_f64_f32 (float32x2_t __a)
+{
+  return __builtin_aarch64_float_extend_lo_v2df (__a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvt_high_f64_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
+}
+
+/* vcvt (int -> float)  */
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_f64_s64 (int64_t __a)
+{
+  return (float64_t) __a;
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_f64_u64 (uint64_t __a)
+{
+  return (float64_t) __a;
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_f32_s32 (int32_t __a)
+{
+  return (float32_t) __a;
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgezd_s64 (int64x1_t __a)
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_f32_u32 (uint32_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__a, 0);
+  return (float32_t) __a;
 }

-/* vcgt */
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_s32 (int32x2_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b);
+  return __builtin_aarch64_floatv2siv2sf (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_u32 (uint32x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b);
+  return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_s32 (int32x4_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b);
+  return __builtin_aarch64_floatv4siv4sf (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_u32 (uint32x4_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
+  return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_f64_s64 (int64x2_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __a, (int8x8_t) __b);
+  return __builtin_aarch64_floatv2div2df (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_f64_u64 (uint64x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __a, (int16x4_t) __b);
+  return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+/* vcvt (float -> int)  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtd_s64_f64 (float64_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __a, (int32x2_t) __b);
+  return (int64_t) __a;
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgt_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtd_u64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, (int64x1_t) __b);
+  return (uint64_t) __a;
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvts_s32_f32 (float32_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b);
+  return (int32_t) __a;
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvts_u32_f32 (float32_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b);
+  return (uint32_t) __a;
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_s32_f32 (float32x2_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b);
+  return __builtin_aarch64_lbtruncv2sfv2si (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_u32_f32 (float32x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lbtruncuv2sfv2si (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_s32_f32 (float32x4_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __a, (int8x16_t) __b);
+  return __builtin_aarch64_lbtruncv4sfv4si (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_u32_f32 (float32x4_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __a, (int16x8_t) __b);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lbtruncuv4sfv4si (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtq_s64_f64 (float64x2_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __a, (int32x4_t) __b);
+  return __builtin_aarch64_lbtruncv2dfv2di (__a);
 }

 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcgtq_u64 (uint64x2_t __a, uint64x2_t __b)
+vcvtq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __a, (int64x2_t) __b);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lbtruncuv2dfv2di (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtd_s64 (int64x1_t __a, int64x1_t __b)
+/* vcvta */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtad_s64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, __b);
+  return __builtin_aarch64_lrounddfdi (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtd_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtad_u64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __a, (int64x1_t) __b);
+  return __builtin_aarch64_lroundudfdi (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcgtzd_s64 (int64x1_t __a)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtas_s32_f32 (float32_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__a, 0);
+  return __builtin_aarch64_lroundsfsi (__a);
 }

-/* vcle */
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtas_u32_f32 (float32_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgev8qi (__b, __a);
+  return __builtin_aarch64_lroundusfsi (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvta_s32_f32 (float32x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgev4hi (__b, __a);
+  return __builtin_aarch64_lroundv2sfv2si (__a);
 }

 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_s32 (int32x2_t __a, int32x2_t __b)
+vcvta_u32_f32 (float32x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgev2si (__b, __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lrounduv2sfv2si (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtaq_s32_f32 (float32x4_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  return __builtin_aarch64_lroundv4sfv4si (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtaq_u32_f32 (float32x4_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhsv8qi ((int8x8_t) __b, (int8x8_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lrounduv4sfv4si (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtaq_s64_f64 (float64x2_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhsv4hi ((int16x4_t) __b, (int16x4_t) __a);
+  return __builtin_aarch64_lroundv2dfv2di (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtaq_u64_f64 (float64x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhsv2si ((int32x2_t) __b, (int32x2_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lrounduv2dfv2di (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcle_u64 (uint64x1_t __a, uint64x1_t __b)
+/* vcvtm */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtmd_s64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhsdi ((int64x1_t) __b, (int64x1_t) __a);
+  return __builtin_lfloor (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_s8 (int8x16_t __a, int8x16_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtmd_u64_f64 (float64_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgev16qi (__b, __a);
+  return __builtin_aarch64_lfloorudfdi (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtms_s32_f32 (float32_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgev8hi (__b, __a);
+  return __builtin_ifloorf (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtms_u32_f32 (float32_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgev4si (__b, __a);
+  return __builtin_aarch64_lfloorusfsi (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvtm_s32_f32 (float32x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgev2di (__b, __a);
+  return __builtin_aarch64_lfloorv2sfv2si (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvtm_u32_f32 (float32x2_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhsv16qi ((int8x16_t) __b, (int8x16_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lflooruv2sfv2si (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtmq_s32_f32 (float32x4_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhsv8hi ((int16x8_t) __b, (int16x8_t) __a);
+  return __builtin_aarch64_lfloorv4sfv4si (__a);
 }

 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
+vcvtmq_u32_f32 (float32x4_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhsv4si ((int32x4_t) __b, (int32x4_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lflooruv4sfv4si (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcleq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtmq_s64_f64 (float64x2_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhsv2di ((int64x2_t) __b, (int64x2_t) __a);
+  return __builtin_aarch64_lfloorv2dfv2di (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcled_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtmq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgedi (__b, __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lflooruv2dfv2di (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclezd_s64 (int64x1_t __a)
+/* vcvtn */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtnd_s64_f64 (float64_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmledi (__a, 0);
+  return __builtin_aarch64_lfrintndfdi (__a);
 }

-/* vclt */
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtnd_u64_f64 (float64_t __a)
+{
+  return __builtin_aarch64_lfrintnudfdi (__a);
+}

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_s8 (int8x8_t __a, int8x8_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtns_s32_f32 (float32_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__b, __a);
+  return __builtin_aarch64_lfrintnsfsi (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_s16 (int16x4_t __a, int16x4_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtns_u32_f32 (float32_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__b, __a);
+  return __builtin_aarch64_lfrintnusfsi (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_s32 (int32x2_t __a, int32x2_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvtn_s32_f32 (float32x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmgtv2si (__b, __a);
+  return __builtin_aarch64_lfrintnv2sfv2si (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclt_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvtn_u32_f32 (float32x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lfrintnuv2sfv2si (__a);
 }

-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_u8 (uint8x8_t __a, uint8x8_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtnq_s32_f32 (float32x4_t __a)
 {
-  return (uint8x8_t) __builtin_aarch64_cmhiv8qi ((int8x8_t) __b, (int8x8_t) __a);
+  return __builtin_aarch64_lfrintnv4sfv4si (__a);
 }

-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_u16 (uint16x4_t __a, uint16x4_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtnq_u32_f32 (float32x4_t __a)
 {
-  return (uint16x4_t) __builtin_aarch64_cmhiv4hi ((int16x4_t) __b, (int16x4_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lfrintnuv4sfv4si (__a);
 }

-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_u32 (uint32x2_t __a, uint32x2_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtnq_s64_f64 (float64x2_t __a)
 {
-  return (uint32x2_t) __builtin_aarch64_cmhiv2si ((int32x2_t) __b, (int32x2_t) __a);
+  return __builtin_aarch64_lfrintnv2dfv2di (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vclt_u64 (uint64x1_t __a, uint64x1_t __b)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtnq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmhidi ((int64x1_t) __b, (int64x1_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lfrintnuv2dfv2di (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_s8 (int8x16_t __a, int8x16_t __b)
+/* vcvtp */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtpd_s64_f64 (float64_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__b, __a);
+  return __builtin_lceil (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_s16 (int16x8_t __a, int16x8_t __b)
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtpd_u64_f64 (float64_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__b, __a);
+  return __builtin_aarch64_lceiludfdi (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_s32 (int32x4_t __a, int32x4_t __b)
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvtps_s32_f32 (float32_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmgtv4si (__b, __a);
+  return __builtin_iceilf (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_s64 (int64x2_t __a, int64x2_t __b)
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvtps_u32_f32 (float32_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmgtv2di (__b, __a);
+  return __builtin_aarch64_lceilusfsi (__a);
 }

-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvtp_s32_f32 (float32x2_t __a)
 {
-  return (uint8x16_t) __builtin_aarch64_cmhiv16qi ((int8x16_t) __b, (int8x16_t) __a);
+  return __builtin_aarch64_lceilv2sfv2si (__a);
 }

-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvtp_u32_f32 (float32x2_t __a)
 {
-  return (uint16x8_t) __builtin_aarch64_cmhiv8hi ((int16x8_t) __b, (int16x8_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x2_t) __builtin_aarch64_lceiluv2sfv2si (__a);
 }

-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtpq_s32_f32 (float32x4_t __a)
 {
-  return (uint32x4_t) __builtin_aarch64_cmhiv4si ((int32x4_t) __b, (int32x4_t) __a);
+  return __builtin_aarch64_lceilv4sfv4si (__a);
 }

-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcltq_u64 (uint64x2_t __a, uint64x2_t __b)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtpq_u32_f32 (float32x4_t __a)
 {
-  return (uint64x2_t) __builtin_aarch64_cmhiv2di ((int64x2_t) __b, (int64x2_t) __a);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint32x4_t) __builtin_aarch64_lceiluv4sfv4si (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcltd_s64 (int64x1_t __a, int64x1_t __b)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtpq_s64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmgtdi (__b, __a);
+  return __builtin_aarch64_lceilv2dfv2di (__a);
 }

-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcltzd_s64 (int64x1_t __a)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtpq_u64_f64 (float64x2_t __a)
 {
-  return (uint64x1_t) __builtin_aarch64_cmltdi (__a, 0);
+  /* TODO: This cast should go away when builtins have their correct types.  */
+  return (uint64x2_t) __builtin_aarch64_lceiluv2dfv2di (__a);
 }

 /* vdup */
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 898bfdf..d035aa5 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,9 @@
 2013-04-29  James Greenhalgh

+	* gcc.target/aarch64/vect-vcvt.c: New.
+
+2013-04-29  James Greenhalgh
+
 	* gcc.target/aarch64/vect-vrnd.c: New.

 2013-04-29  Richard Biener
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-vcvt.c b/gcc/testsuite/gcc.target/aarch64/vect-vcvt.c
new file mode 100644
index 0000000..6066d7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-vcvt.c
@@ -0,0 +1,132 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps -ffast-math" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+extern double fabs (double);
+
+#define NUM_TESTS 8
+#define DELTA 0.000001
+
+float input_f32[] = {0.1f, -0.1f, 0.4f, 10.3f,
+		     200.0f, -800.0f, -13.0f, -0.5f};
+double input_f64[] = {0.1, -0.1, 0.4, 10.3,
+		      200.0, -800.0, -13.0, -0.5};
+
+#define TEST(SUFFIX, Q, WIDTH, LANES, S, U, D) \
+int \
+test_vcvt##SUFFIX##_##S##WIDTH##_f##WIDTH##x##LANES##_t (void) \
+{ \
+  int ret = 1; \
+  int i = 0; \
+  int nlanes = LANES; \
+  U##int##WIDTH##_t expected_out[NUM_TESTS]; \
+  U##int##WIDTH##_t actual_out[NUM_TESTS]; \
+ \
+  for (i = 0; i < NUM_TESTS; i++) \
+    { \
+      expected_out[i] \
+	= vcvt##SUFFIX##D##_##S##WIDTH##_f##WIDTH (input_f##WIDTH[i]); \
+      /* Don't vectorize this.  */ \
+      asm volatile ("" : : : "memory"); \
+    } \
+ \
+  for (i = 0; i < NUM_TESTS; i+=nlanes) \
+    { \
+      U##int##WIDTH##x##LANES##_t out = \
+	vcvt##SUFFIX##Q##_##S##WIDTH##_f##WIDTH \
+	  (vld1##Q##_f##WIDTH (input_f##WIDTH + i)); \
+      vst1##Q##_##S##WIDTH (actual_out + i, out); \
+    } \
+ \
+  for (i = 0; i < NUM_TESTS; i++) \
+    ret &= fabs (expected_out[i] - actual_out[i]) < DELTA; \
+ \
+  return ret; \
+} \
+
+
+#define BUILD_VARIANTS(SUFFIX) \
+TEST (SUFFIX,  , 32, 2, s, ,s) \
+TEST (SUFFIX, q, 32, 4, s, ,s) \
+TEST (SUFFIX, q, 64, 2, s, ,d) \
+TEST (SUFFIX,  , 32, 2, u,u,s) \
+TEST (SUFFIX, q, 32, 4, u,u,s) \
+TEST (SUFFIX, q, 64, 2, u,u,d) \
+
+BUILD_VARIANTS ( )
+/* { dg-final { scan-assembler "fcvtzs\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtzs\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtzu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (a)
+/* { dg-final { scan-assembler "fcvtas\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtas\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtas\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtas\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtas\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtau\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtau\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtau\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtau\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtau\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (m)
+/* { dg-final { scan-assembler "fcvtms\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtms\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtms\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtms\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtms\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtmu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (n)
+/* { dg-final { scan-assembler "fcvtns\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtns\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtns\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtns\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtns\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtnu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+BUILD_VARIANTS (p)
+/* { dg-final { scan-assembler "fcvtps\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtps\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtps\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtps\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtps\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tw\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tx\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "fcvtpu\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d" } } */
+
+#undef TEST
+#define TEST(SUFFIX, Q, WIDTH, LANES, S, U, D) \
+{ \
+  if (!test_vcvt##SUFFIX##_##S##WIDTH##_f##WIDTH##x##LANES##_t ()) \
+    abort (); \
+}
+
+int
+main (int argc, char **argv)
+{
+  BUILD_VARIANTS ( )
+  BUILD_VARIANTS (a)
+  BUILD_VARIANTS (m)
+  BUILD_VARIANTS (n)
+  BUILD_VARIANTS (p)
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */
--
cgit v1.1