Diffstat (limited to 'target/arm/tcg/sve_helper.c')
-rw-r--r-- | target/arm/tcg/sve_helper.c | 1197 |
1 file changed, 1008 insertions, 189 deletions
diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index d786b4b..43b872c 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -20,15 +20,19 @@ #include "qemu/osdep.h" #include "cpu.h" #include "internals.h" -#include "exec/exec-all.h" #include "exec/page-protection.h" #include "exec/helper-proto.h" +#include "exec/target_page.h" +#include "exec/tlb-flags.h" #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" #include "tcg/tcg.h" #include "vec_internal.h" #include "sve_ldst_internal.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/helper-retaddr.h" #include "accel/tcg/cpu-ops.h" +#include "accel/tcg/probe.h" #ifdef CONFIG_USER_ONLY #include "user/page-protection.h" #endif @@ -119,6 +123,11 @@ static inline uint64_t expand_pred_s(uint8_t byte) return word[byte & 0x11]; } +static inline uint64_t expand_pred_d(uint8_t byte) +{ + return -(uint64_t)(byte & 1); +} + #define LOGICAL_PPPP(NAME, FUNC) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ { \ @@ -202,6 +211,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ #define DO_EOR(N, M) (N ^ M) #define DO_ORR(N, M) (N | M) #define DO_BIC(N, M) (N & ~M) +#define DO_ORC(N, M) (N | ~M) #define DO_ADD(N, M) (N + M) #define DO_SUB(N, M) (N - M) #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) @@ -523,14 +533,9 @@ DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) -static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) -{ - return val >= max ? max : val <= min ? min : val; -} - -#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) +#define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m) +#define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m) +#define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m) static inline int64_t do_sqadd_d(int64_t n, int64_t m) { @@ -547,9 +552,9 @@ DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) -#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) -#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) -#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) +#define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m) +#define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m) +#define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m) static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) { @@ -562,9 +567,9 @@ DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) -#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) -#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) -#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) +#define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m) +#define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m) +#define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m) static inline int64_t do_sqsub_d(int64_t n, int64_t m) { @@ -581,9 +586,9 @@ DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) -#define DO_UQSUB_B(n, m) 
do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) -#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) -#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) +#define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m) +#define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m) +#define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m) static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) { @@ -595,12 +600,9 @@ DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) -#define DO_SUQADD_B(n, m) \ - do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SUQADD_H(n, m) \ - do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SUQADD_S(n, m) \ - do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) +#define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m) +#define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m) +#define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m) static inline int64_t do_suqadd_d(int64_t n, uint64_t m) { @@ -630,12 +632,9 @@ DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) -#define DO_USQADD_B(n, m) \ - do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) -#define DO_USQADD_H(n, m) \ - do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) -#define DO_USQADD_S(n, m) \ - do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) +#define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m) +#define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m) +#define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m) static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) { @@ -1222,37 +1221,29 @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ } \ } -#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) -#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) -#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) - -DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) -DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) -DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) +DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b) +DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h) +DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s) -DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) -DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) -DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) +DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b) +DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h) +DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s) -#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) -#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) -#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) +DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b) +DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h) +DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s) -DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) -DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) -DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) +DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b) +DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h) +DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s) -DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) +DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b) +DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h) +DO_XTNB(sve2_sqxtunb_d, int64_t, 
do_usat_s) -DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) -DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) -DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) - -DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) +DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b) +DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h) +DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s) #undef DO_XTNB #undef DO_XTNT @@ -1829,6 +1820,52 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) #undef DO_VPZ #undef DO_VPZ_D +#define DO_VPQ(NAME, TYPE, H, INIT, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \ + TYPE *n = vn; uint16_t *g = vg; \ + uintptr_t oprsz = simd_oprsz(desc); \ + uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \ + for (uintptr_t s = 0; s < nseg; s++) { \ + uint16_t pg = g[H2(s)]; \ + for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \ + if (pg & 1) { \ + tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]); \ + } \ + } \ + } \ + memcpy(vd, tmp, 16); \ + clear_tail(vd, 16, simd_maxsz(desc)); \ +} + +DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD) +DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD) +DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD) +DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD) + +DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX) +DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX) +DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX) +DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX) + +DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN) +DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN) +DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN) +DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN) + +DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX) +DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX) +DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX) +DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX) + +DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN) +DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN) +DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN) +DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN) + +#undef DO_VPQ + /* Two vector operand, one scalar operand, unpredicated. 
*/ #define DO_ZZI(NAME, TYPE, OP) \ void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ @@ -1869,10 +1906,46 @@ DO_ZZI(sve_umini_d, uint64_t, DO_MIN) #undef DO_ZZI +#define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \ +void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + unsigned seg = simd_oprsz(desc) / 16; \ + uint64_t r0 = INIT, r1 = INIT; \ + for (unsigned s = 0; s < seg; s++) { \ + uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \ + uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \ + uint64_t v0 = *(uint64_t *)(vn + s * 16); \ + uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \ + v0 = POP(v0, p0), v1 = POP(v1, p1); \ + r0 = VOP(r0, v0), r1 = VOP(r1, v1); \ + } \ + *(uint64_t *)(vd + 0) = r0; \ + *(uint64_t *)(vd + 8) = r1; \ + clear_tail(vd, 16, simd_maxsz(desc)); \ +} + +DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND) +DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND) +DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND) +DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND) + +DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND) +DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND) +DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND) +DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND) + +DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC) +DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC) +DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC) +DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC) + +#undef DO_LOGIC_QV + #undef DO_AND #undef DO_ORR #undef DO_EOR #undef DO_BIC +#undef DO_ORC #undef DO_ADD #undef DO_SUB #undef DO_MAX @@ -2065,27 +2138,6 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ when N is negative, add 2**M-1. */ #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. */ - return 0; - } -} - DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) @@ -2183,10 +2235,9 @@ DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) -#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) -#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) -#define DO_SQSHRUN_D(x, sh) \ - do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX) +#define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh) +#define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh) +#define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? 
sh : 63)) DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) @@ -2196,9 +2247,9 @@ DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) -#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) -#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) -#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) +#define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh)) +#define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh)) +#define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh)) DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) @@ -2208,9 +2259,9 @@ DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) -#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) -#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) -#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) +#define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh) +#define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh) +#define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh) DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) @@ -2220,9 +2271,9 @@ DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) -#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) -#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) -#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) +#define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh)) +#define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh)) +#define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh)) DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) @@ -2984,6 +3035,56 @@ void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) } } +/* + * TODO: This could use half_shuffle64 and similar bit tricks to + * expand blocks of bits at once. + */ +#define DO_PMOV_PV(NAME, ESIZE) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + unsigned vl = simd_oprsz(desc); \ + unsigned idx = simd_data(desc); \ + unsigned elements = vl / ESIZE; \ + ARMPredicateReg *d = vd; \ + ARMVectorReg *s = vs; \ + memset(d, 0, sizeof(*d)); \ + for (unsigned e = 0; e < elements; ++e) { \ + depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \ + } \ +} + +DO_PMOV_PV(pmov_pv_h, 2) +DO_PMOV_PV(pmov_pv_s, 4) +DO_PMOV_PV(pmov_pv_d, 8) + +#undef DO_PMOV_PV + +/* + * TODO: This could use half_unshuffle64 and similar bit tricks to + * compress blocks of bits at once. 
+ */ +#define DO_PMOV_VP(NAME, ESIZE) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + unsigned vl = simd_oprsz(desc); \ + unsigned idx = simd_data(desc); \ + unsigned elements = vl / ESIZE; \ + ARMVectorReg *d = vd; \ + ARMPredicateReg *s = vs; \ + if (idx == 0) { \ + memset(d, 0, vl); \ + } \ + for (unsigned e = 0; e < elements; ++e) { \ + depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \ + } \ +} + +DO_PMOV_VP(pmov_vp_h, 2) +DO_PMOV_VP(pmov_vp_s, 4) +DO_PMOV_VP(pmov_vp_d, 8) + +#undef DO_PMOV_VP + typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, @@ -3449,6 +3550,45 @@ DO_UZP(sve_uzp_s, uint32_t, H1_4) DO_UZP(sve_uzp_d, uint64_t, H1_8) DO_UZP(sve2_uzp_q, Int128, ) +typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc); + +static void do_perseg_zzz(void *vd, void *vn, void *vm, + uint32_t desc, perseg_zzz_fn *fn) +{ + intptr_t oprsz = simd_oprsz(desc); + + desc = simd_desc(16, 16, simd_data(desc)); + for (intptr_t i = 0; i < oprsz; i += 16) { + fn(vd + i, vn + i, vm + i, desc); + } +} + +#define DO_PERSEG_ZZZ(NAME, FUNC) \ + void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ + { do_perseg_zzz(vd, vn, vm, desc, FUNC); } + +DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b) +DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h) +DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s) +DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d) + +DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b) +DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h) +DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s) +DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d) + +DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b) +DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h) +DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s) +DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d) + +DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b) +DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h) +DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s) +DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d) + +#undef DO_PERSEG_ZZZ + #define DO_TRN(NAME, TYPE, H) \ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ { \ @@ -3989,15 +4129,6 @@ static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, return flags; } -static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) -{ - /* It is quicker to zero the whole predicate than loop on OPRSZ. - * The compiler should turn this into 4 64-bit integer stores. 
- */ - memset(d, 0, sizeof(ARMPredicateReg)); - return PREDTEST_INIT; -} - void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, uint32_t pred_desc) { @@ -4005,7 +4136,7 @@ void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { compute_brk_z(vd, vm, vg, oprsz, true); } else { - do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); } } @@ -4016,7 +4147,8 @@ uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { return compute_brks_z(vd, vm, vg, oprsz, true); } else { - return do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; } } @@ -4027,7 +4159,7 @@ void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { compute_brk_z(vd, vm, vg, oprsz, false); } else { - do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); } } @@ -4038,7 +4170,8 @@ uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { return compute_brks_z(vd, vm, vg, oprsz, false); } else { - return do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; } } @@ -4094,35 +4227,30 @@ void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) { intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); if (!last_active_pred(vn, vg, oprsz)) { - do_zero(vd, oprsz); - } -} - -/* As if PredTest(Ones(PL), D, esz). */ -static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, - uint64_t esz_mask) -{ - uint32_t flags = PREDTEST_INIT; - intptr_t i; - - for (i = 0; i < oprsz / 8; i++) { - flags = iter_predtest_fwd(d->p[i], esz_mask, flags); - } - if (oprsz & 7) { - uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); - flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); + memset(vd, 0, sizeof(ARMPredicateReg)); } - return flags; } uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) { intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); if (last_active_pred(vn, vg, oprsz)) { - return predtest_ones(vd, oprsz, -1); - } else { - return do_zero(vd, oprsz); + ARMPredicateReg *d = vd; + uint32_t flags = PREDTEST_INIT; + intptr_t i; + + /* As if PredTest(Ones(PL), D, MO_8). 
*/ + for (i = 0; i < oprsz / 8; i++) { + flags = iter_predtest_fwd(d->p[i], -1, flags); + } + if (oprsz & 7) { + uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); + flags = iter_predtest_fwd(d->p[i], mask, flags); + } + return flags; } + memset(vd, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; } uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) @@ -4139,66 +4267,200 @@ uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) return sum; } -uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) +uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc) { - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t esz_mask = pred_esz_masks[esz]; - ARMPredicateReg *d = vd; - uint32_t flags; - intptr_t i; + int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); + int vl = pl * 8; + unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); + int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1; + DecodeCounter p = decode_counter(png, vl, v_esz); + unsigned maxelem = (vl << lg2_width) >> v_esz; + unsigned count = p.count; + + if (p.invert) { + if (count >= maxelem) { + return 0; + } + count = maxelem - count; + } else { + count = MIN(count, maxelem); + } + return count >> p.lg2_stride; +} + +/* C.f. Arm pseudocode EncodePredCount */ +static uint64_t encode_pred_count(uint32_t elements, uint32_t count, + uint32_t esz, bool invert) +{ + uint32_t pred; - /* Begin with a zero predicate register. */ - flags = do_zero(d, oprsz); if (count == 0) { - return flags; + return 0; + } + if (invert) { + count = elements - count; + } else if (count == elements) { + count = 0; + invert = true; } - /* Set all of the requested bits. */ - for (i = 0; i < count / 64; ++i) { - d->p[i] = esz_mask; + pred = (count << 1) | 1; + pred <<= esz; + pred |= invert << 15; + + return pred; +} + +/* C.f. Arm pseudocode PredCountTest */ +static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert) +{ + uint32_t flags; + + if (count == 0) { + flags = 1; /* !N, Z, C */ + } else if (!invert) { + flags = (1u << 31) | 2; /* N, !Z */ + flags |= count != elements; /* C */ + } else { + flags = 2; /* !Z, !C */ + flags |= (count == elements) << 31; /* N */ } - if (count & 63) { - d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + return flags; +} + +/* D must be cleared on entry. */ +static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask, + uint32_t count, uint32_t oprbits) +{ + tcg_debug_assert(count <= oprbits); + if (count) { + uint32_t i; + + /* Set all of the requested bits. 
*/ + for (i = 0; i < count / 64; ++i) { + d->p[i] = esz_mask; + } + if (count & 63) { + d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + } } +} - return predtest_ones(d, oprsz, esz_mask); +uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + + count <<= esz; + memset(d, 0, sizeof(*d)); + do_whilel(d, esz_mask, count, oprbits); + return pred_count_test(oprbits, count, false); } -uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) +uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc) { - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; uint64_t esz_mask = pred_esz_masks[esz]; ARMPredicateReg *d = vd; - intptr_t i, invcount, oprbits; - uint64_t bits; - if (count == 0) { - return do_zero(d, oprsz); + count <<= esz; + memset(d, 0, 2 * sizeof(*d)); + if (count <= oprbits) { + do_whilel(&d[0], esz_mask, count, oprbits); + } else { + do_whilel(&d[0], esz_mask, oprbits, oprbits); + do_whilel(&d[1], esz_mask, count - oprbits, oprbits); } - oprbits = oprsz * 8; + return pred_count_test(2 * oprbits, count, false); +} + +uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint32_t vl = pl * 8; + uint32_t elements = (vl >> esz) << scale; + ARMPredicateReg *d = vd; + + *d = (ARMPredicateReg) { + .p[0] = encode_pred_count(elements, count, esz, false) + }; + return pred_count_test(elements, count, false); +} + +/* D must be cleared on entry. 
*/ +static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask, + uint32_t count, uint32_t oprbits) +{ tcg_debug_assert(count <= oprbits); + if (count) { + uint32_t i, invcount = oprbits - count; + uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64); - bits = esz_mask; - if (oprbits & 63) { - bits &= MAKE_64BIT_MASK(0, oprbits & 63); + for (i = invcount / 64; i < oprbits / 64; ++i) { + d->p[i] = bits; + bits = esz_mask; + } + if (oprbits & 63) { + d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63); + } } +} - invcount = oprbits - count; - for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { - d->p[i] = bits; - bits = esz_mask; - } +uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; - d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); + count <<= esz; + memset(d, 0, sizeof(*d)); + do_whileg(d, esz_mask, count, oprbits); + return pred_count_test(oprbits, count, true); +} + +uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; - while (--i >= 0) { - d->p[i] = 0; + count <<= esz; + memset(d, 0, 2 * sizeof(*d)); + if (count <= oprbits) { + do_whileg(&d[1], esz_mask, count, oprbits); + } else { + do_whilel(&d[1], esz_mask, oprbits, oprbits); + do_whileg(&d[0], esz_mask, count - oprbits, oprbits); } - return predtest_ones(d, oprsz, esz_mask); + return pred_count_test(2 * oprbits, count, true); +} + +uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint32_t vl = pl * 8; + uint32_t elements = (vl >> esz) << scale; + ARMPredicateReg *d = vd; + + *d = (ARMPredicateReg) { + .p[0] = encode_pred_count(elements, count, esz, true) + }; + return pred_count_test(elements, count, true); } /* Recursive reduction on a function; @@ -4209,19 +4471,20 @@ uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) * The recursion is bounded to depth 7 (128 fp16 elements), so there's * little to gain with a more complex non-recursive form. 
*/ -#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ -static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ +#define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \ +static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \ { \ if (n == 1) { \ return *data; \ } else { \ uintptr_t half = n / 2; \ - TYPE lo = NAME##_reduce(data, status, half); \ - TYPE hi = NAME##_reduce(data + half, status, half); \ + TYPE lo = FUNC##_reduce(data, status, half); \ + TYPE hi = FUNC##_reduce(data + half, status, half); \ return FUNC(lo, hi, status); \ } \ } \ -uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ +uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \ + float_status *s, uint32_t desc) \ { \ uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ @@ -4236,39 +4499,54 @@ uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ for (; i < maxsz; i += sizeof(TYPE)) { \ *(TYPE *)((void *)data + i) = IDENT; \ } \ - return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ + return FUNC##_reduce(data, s, maxsz / sizeof(TYPE)); \ +} \ +void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \ + float_status *status, uint32_t desc) \ +{ \ + unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \ + for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \ + TYPE data[ARM_MAX_VQ]; \ + for (unsigned s = 0; s < segments; s++) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \ + TYPE nn = *(TYPE *)(vn + H(s * 16 + H(e))); \ + data[s] = (pg >> e) & 1 ? nn : IDENT; \ + } \ + *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \ + } \ + clear_tail(vd, 16, simd_maxsz(desc)); \ } -DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero) -DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero) -DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero) +DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero) +DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero) +DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero) /* Identity is floatN_default_nan, without the function call. 
*/ -DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) -DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) -DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) +DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, 0x7E00) +DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, 0x7FC00000) +DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) -DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) -DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) -DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) +DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, 0x7E00) +DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, 0x7FC00000) +DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) -DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) -DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) -DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) +DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity) +DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity) +DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity) -DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) -DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) -DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) +DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity)) +DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity)) +DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity)) -DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) -DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) -DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) +DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) +DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) +DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) -DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, +DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh, float16_chs(float16_infinity)) -DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, +DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs, float32_chs(float32_infinity)) -DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, +DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd, float64_chs(float64_infinity)) #undef DO_REDUCE @@ -4550,7 +4828,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, \ * FZ16. When converting from fp16, this affects flushing input denormals; * when converting to fp16, this affects flushing output denormals. 
*/ -static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) +float32 sve_f16_to_f32(float16 f, float_status *fpst) { bool save = get_flush_inputs_to_zero(fpst); float32 ret; @@ -4572,7 +4850,7 @@ static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) return ret; } -static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) +float16 sve_f32_to_f16(float32 f, float_status *fpst) { bool save = get_flush_to_zero(fpst); float16 ret; @@ -6081,6 +6359,9 @@ DO_LD1_2(ld1sds, MO_64, MO_32) DO_LD1_2(ld1dd, MO_64, MO_64) +DO_LD1_2(ld1squ, MO_32, MO_128) +DO_LD1_2(ld1dqu, MO_64, MO_128) + #undef DO_LD1_1 #undef DO_LD1_2 @@ -6140,6 +6421,10 @@ DO_LDN_2(2, dd, MO_64) DO_LDN_2(3, dd, MO_64) DO_LDN_2(4, dd, MO_64) +DO_LDN_2(2, qq, MO_128) +DO_LDN_2(3, qq, MO_128) +DO_LDN_2(4, qq, MO_128) + #undef DO_LDN_1 #undef DO_LDN_2 @@ -6703,6 +6988,13 @@ DO_STN_2(2, dd, MO_64, MO_64) DO_STN_2(3, dd, MO_64, MO_64) DO_STN_2(4, dd, MO_64, MO_64) +DO_STN_2(1, sq, MO_128, MO_32) +DO_STN_2(1, dq, MO_128, MO_64) + +DO_STN_2(2, qq, MO_128, MO_128) +DO_STN_2(3, qq, MO_128, MO_128) +DO_STN_2(4, qq, MO_128, MO_128) + #undef DO_STN_1 #undef DO_STN_2 @@ -6919,6 +7211,9 @@ DO_LD1_ZPZ_D(dd_be, zsu, MO_64) DO_LD1_ZPZ_D(dd_be, zss, MO_64) DO_LD1_ZPZ_D(dd_be, zd, MO_64) +DO_LD1_ZPZ_D(qq_le, zd, MO_128) +DO_LD1_ZPZ_D(qq_be, zd, MO_128) + #undef DO_LD1_ZPZ_S #undef DO_LD1_ZPZ_D @@ -7305,9 +7600,505 @@ DO_ST1_ZPZ_D(sd_be, zd, MO_32) DO_ST1_ZPZ_D(dd_le, zd, MO_64) DO_ST1_ZPZ_D(dd_be, zd, MO_64) +DO_ST1_ZPZ_D(qq_le, zd, MO_128) +DO_ST1_ZPZ_D(qq_be, zd, MO_128) + #undef DO_ST1_ZPZ_S #undef DO_ST1_ZPZ_D +/* + * SVE2.1 consecutive register load/store + */ + +static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr, + uint32_t png, intptr_t reg_max, + int N, int v_esz) +{ + const int esize = 1 << v_esz; + intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; + DecodeCounter p = decode_counter(png, reg_max, v_esz); + unsigned b_count = p.count << v_esz; + unsigned b_stride = 1 << (v_esz + p.lg2_stride); + intptr_t page_split; + + /* Set all of the element indices to -1, and the TLB data to 0. */ + memset(info, -1, offsetof(SVEContLdSt, page)); + memset(info->page, 0, sizeof(info->page)); + + if (p.invert) { + if (b_count >= reg_max * N) { + return 0; + } + reg_off_first = b_count; + reg_off_last = reg_max * N - b_stride; + } else { + if (b_count == 0) { + return 0; + } + reg_off_first = 0; + reg_off_last = MIN(b_count - esize, reg_max * N - b_stride); + } + + info->reg_off_first[0] = reg_off_first; + info->mem_off_first[0] = reg_off_first; + + page_split = -(addr | TARGET_PAGE_MASK); + if (reg_off_last + esize <= page_split || reg_off_first >= page_split) { + /* The entire operation fits within a single page. */ + info->reg_off_last[0] = reg_off_last; + return b_stride; + } + + info->page_split = page_split; + reg_off_split = ROUND_DOWN(page_split, esize); + + /* + * This is the last full element on the first page, but it is not + * necessarily active. If there is no full element, i.e. the first + * active element is the one that's split, this value remains -1. + * It is useful as iteration bounds. + */ + if (reg_off_split != 0) { + info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride); + } + + /* Determine if an unaligned element spans the pages. */ + if (page_split & (esize - 1)) { + /* It is helpful to know if the split element is active. 
*/ + if ((reg_off_split & (b_stride - 1)) == 0) { + info->reg_off_split = reg_off_split; + info->mem_off_split = reg_off_split; + } + reg_off_split += esize; + } + + /* + * We do want the first active element on the second page, because + * this may affect the address reported in an exception. + */ + reg_off_split = ROUND_UP(reg_off_split, b_stride); + if (reg_off_split <= reg_off_last) { + info->reg_off_first[1] = reg_off_split; + info->mem_off_first[1] = reg_off_split; + info->reg_off_last[1] = reg_off_last; + } + return b_stride; +} + +static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, + target_ulong addr, unsigned estride, + int esize, int wp_access, uintptr_t ra) +{ +#ifndef CONFIG_USER_ONLY + intptr_t count_off, count_last; + int flags0 = info->page[0].flags; + int flags1 = info->page[1].flags; + + if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { + return; + } + + /* Indicate that watchpoints are handled. */ + info->page[0].flags = flags0 & ~TLB_WATCHPOINT; + info->page[1].flags = flags1 & ~TLB_WATCHPOINT; + + if (flags0 & TLB_WATCHPOINT) { + count_off = info->reg_off_first[0]; + count_last = info->reg_off_split; + if (count_last < 0) { + count_last = info->reg_off_last[0]; + } + do { + cpu_check_watchpoint(env_cpu(env), addr + count_off, + esize, info->page[0].attrs, wp_access, ra); + count_off += estride; + } while (count_off <= count_last); + } + + count_off = info->reg_off_first[1]; + if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) { + count_last = info->reg_off_last[1]; + do { + cpu_check_watchpoint(env_cpu(env), addr + count_off, + esize, info->page[1].attrs, + wp_access, ra); + count_off += estride; + } while (count_off <= count_last); + } +#endif +} + +static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, + target_ulong addr, unsigned estride, + int esize, uint32_t mtedesc, + uintptr_t ra) +{ + intptr_t count_off, count_last; + + /* + * TODO: estride is always a small power of two, <= 8. + * Manipulate the stride within the loops such that + * - first iteration hits addr + off, as required, + * - second iteration hits ALIGN_UP(addr, 16), + * - other iterations advance addr by 16. + * This will minimize the probing to once per MTE granule. + */ + + /* Process the page only if MemAttr == Tagged. */ + if (info->page[0].tagged) { + count_off = info->reg_off_first[0]; + count_last = info->reg_off_split; + if (count_last < 0) { + count_last = info->reg_off_last[0]; + } + + do { + mte_check(env, mtedesc, addr + count_off, ra); + count_off += estride; + } while (count_off <= count_last); + } + + count_off = info->reg_off_first[1]; + if (count_off >= 0 && info->page[1].tagged) { + count_last = info->reg_off_last[1]; + do { + mte_check(env, mtedesc, addr + count_off, ra); + count_off += estride; + } while (count_off <= count_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, + uint32_t png, uint32_t desc, + const uintptr_t ra, const MemOp esz, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 
4 : 2; + const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + const intptr_t reg_max = simd_oprsz(desc); + const unsigned esize = 1 << esz; + intptr_t count_off, count_last; + intptr_t reg_off, reg_last, reg_n; + SVEContLdSt info; + unsigned estride, flags; + void *host; + + estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); + if (estride == 0) { + /* The entire predicate was false; no load occurs. */ + for (unsigned n = 0; n < N; n++) { + memset(zd + n * rstride, 0, reg_max); + } + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); + + /* Handle watchpoints for all active elements. */ + sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, + esize, BP_MEM_READ, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve2p1_cont_ldst_mte_check(&info, env, estride, addr, + esize, mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + ARMVectorReg scratch[4] = { }; + + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[1]; + if (count_last < 0) { + count_last = info.reg_off_split; + if (count_last < 0) { + count_last = info.reg_off_last[0]; + } + } + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + + do { + reg_last = MIN(count_last - count_off, reg_max - esize); + do { + tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + for (unsigned n = 0; n < N; ++n) { + memcpy(&zd[n * rstride], &scratch[n], reg_max); + } + return; + } + + /* The entire operation is in RAM, on valid pages. */ + + for (unsigned n = 0; n < N; ++n) { + memset(&zd[n * rstride], 0, reg_max); + } + + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[0]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[0].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. 
+ */ + count_off = info.reg_off_split; + if (unlikely(count_off >= 0)) { + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); + } + + count_off = info.reg_off_first[1]; + if (unlikely(count_off >= 0)) { + count_last = info.reg_off_last[1]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[1].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + } +} + +void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr, + uint32_t png, uint32_t desc) +{ + sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8, + sve_ld1bb_host, sve_ld1bb_tlb); +} + +#define DO_LD1_2(NAME, ESZ) \ +void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint32_t desc) \ +{ \ + sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint32_t desc) \ +{ \ + sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} + +DO_LD1_2(ld1hh, MO_16) +DO_LD1_2(ld1ss, MO_32) +DO_LD1_2(ld1dd, MO_64) + +#undef DO_LD1_2 + +static inline QEMU_ALWAYS_INLINE +void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, + uint32_t png, uint32_t desc, + const uintptr_t ra, const int esz, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; + const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + const intptr_t reg_max = simd_oprsz(desc); + const unsigned esize = 1 << esz; + intptr_t count_off, count_last; + intptr_t reg_off, reg_last, reg_n; + SVEContLdSt info; + unsigned estride, flags; + void *host; + + estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); + if (estride == 0) { + /* The entire predicate was false; no store occurs. */ + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); + + /* Handle watchpoints for all active elements. */ + sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, + esize, BP_MEM_WRITE, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve2p1_cont_ldst_mte_check(&info, env, estride, addr, + esize, mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. 
+ */ + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[1]; + if (count_last < 0) { + count_last = info.reg_off_split; + if (count_last < 0) { + count_last = info.reg_off_last[0]; + } + } + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + + do { + reg_last = MIN(count_last - count_off, reg_max - esize); + do { + tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + return; + } + + /* The entire operation is in RAM, on valid pages. */ + + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[0]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[0].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + count_off = info.reg_off_split; + if (unlikely(count_off >= 0)) { + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); + } + + count_off = info.reg_off_first[1]; + if (unlikely(count_off >= 0)) { + count_last = info.reg_off_last[1]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[1].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + } +} + +void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr, + uint32_t png, uint32_t desc) +{ + sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8, + sve_st1bb_host, sve_st1bb_tlb); +} + +#define DO_ST1_2(NAME, ESZ) \ +void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint32_t desc) \ +{ \ + sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint32_t desc) \ +{ \ + sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} + +DO_ST1_2(st1hh, MO_16) +DO_ST1_2(st1ss, MO_32) +DO_ST1_2(st1dd, MO_64) + +#undef DO_ST1_2 + void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) { intptr_t i, opr_sz = simd_oprsz(desc) / 8; @@ -7711,3 +8502,31 @@ DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) #undef DO_FCVTLT #undef DO_FCVTNT + +void HELPER(pext)(void *vd, uint32_t png, uint32_t desc) +{ + int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); + int vl = pl * 8; + unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); + int part = FIELD_EX32(desc, PREDDESC, DATA); + DecodeCounter p = decode_counter(png, vl, v_esz); + uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride]; + ARMPredicateReg *d = vd; + + /* + * Convert from element count to byte count and adjust + * for the portion of the 4*VL counter to be extracted. 
+ */ + int b_count = (p.count << v_esz) - vl * part; + + memset(d, 0, sizeof(*d)); + if (p.invert) { + if (b_count <= 0) { + do_whilel(vd, mask, vl, vl); + } else if (b_count < vl) { + do_whileg(vd, mask, vl - b_count, vl); + } + } else if (b_count > 0) { + do_whilel(vd, mask, MIN(b_count, vl), vl); + } +} |
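
A few illustrative sketches of helpers this patch introduces or relies on follow; none of this code is part of the patch itself. First, the hunks above replace the file-local do_sat_bhs(val, min, max) with shared do_ssat_b/h/s and do_usat_b/h/s helpers defined elsewhere (presumably vec_internal.h, which is already included). Judging from the call sites being rewritten, each helper clamps a 64-bit intermediate to the range of the narrower element type. A minimal stand-alone sketch of that assumed behaviour, using hypothetical *_sketch names so it is not mistaken for the real API:

#include <stdint.h>

/* Assumed semantics of do_ssat_b(): clamp to [INT8_MIN, INT8_MAX]. */
static inline int8_t do_ssat_b_sketch(int64_t v)
{
    return v < INT8_MIN ? INT8_MIN : v > INT8_MAX ? INT8_MAX : v;
}

/* Assumed semantics of do_usat_b(): clamp to [0, UINT8_MAX]. */
static inline uint8_t do_usat_b_sketch(int64_t v)
{
    return v < 0 ? 0 : v > UINT8_MAX ? UINT8_MAX : v;
}

Under that reading, DO_SQADD_B(n, m) == do_ssat_b((int64_t)n + m) saturates exactly as the old do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) did; the _h/_s variants simply widen the bounds.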
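The new expand_pred_d() mirrors the existing expand_pred_b/h/s helpers: it turns the low bit of a predicate byte into an all-ones or all-zero 64-bit mask. The DO_LOGIC_QV reductions (ORQV, EORQV, ANDQV) use that mask to force inactive lanes to the reduction's identity value. A tiny self-contained check of the expansion, copying the one-line helper from the hunk:

#include <assert.h>
#include <stdint.h>

/* Copy of the helper added by the patch, for illustration only. */
static inline uint64_t expand_pred_d(uint8_t byte)
{
    return -(uint64_t)(byte & 1);
}

int main(void)
{
    assert(expand_pred_d(0x01) == UINT64_MAX); /* active .d lane */
    assert(expand_pred_d(0x00) == 0);          /* inactive .d lane */
    /* ORQV/EORQV start from 0 and apply DO_AND(v, mask), so inactive lanes
     * contribute 0; ANDQV starts from -1 and applies DO_ORC(v, mask), so
     * inactive lanes contribute all-ones. */
    return 0;
}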
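DO_VPQ implements the SVE2p1 per-quadword reductions (ADDQV, SMAXQV, UMINQV, ...): rather than folding the whole vector to one scalar, it folds element e of every 128-bit segment into element e of a single 128-bit result, with inactive elements contributing the identity value. A simplified sketch of the same idea for 32-bit ADDQV, ignoring the H() byte-swapping macros and the real predicate layout (the 'active' callback is a stand-in for reading the governing predicate):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* vn holds 'segs' little-endian 16-byte segments of uint32_t elements. */
static void addqv_s_sketch(uint32_t res[4], const uint32_t *vn, unsigned segs,
                           bool (*active)(unsigned seg, unsigned elt))
{
    uint32_t tmp[4] = { 0, 0, 0, 0 };   /* identity element for ADD */

    for (unsigned s = 0; s < segs; s++) {
        for (unsigned e = 0; e < 4; e++) {
            if (active(s, e)) {
                tmp[e] += vn[s * 4 + e];   /* fold segment s into lane e */
            }
        }
    }
    memcpy(res, tmp, sizeof(tmp));       /* result occupies one segment */
}

For a 256-bit vector (two segments) with all elements active, lane e of the result is vn[e] + vn[4 + e], matching what the macro computes before clear_tail() zeroes the rest of the destination.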
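Finally, the new sve_whilecl/sve_whilecg helpers write a predicate-as-counter encoding rather than a bit-per-element mask: encode_pred_count() follows the Arm pseudocode EncodePredCount, storing the count shifted up one with a low tag bit, positioned by element size, with an invert flag in bit 15. A short worked check using a copy of the function as it appears in the hunk:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Copy of the helper from the patch, for a worked example. */
static uint64_t encode_pred_count(uint32_t elements, uint32_t count,
                                  uint32_t esz, bool invert)
{
    uint32_t pred;

    if (count == 0) {
        return 0;
    }
    if (invert) {
        count = elements - count;
    } else if (count == elements) {
        count = 0;
        invert = true;
    }

    pred = (count << 1) | 1;
    pred <<= esz;
    pred |= invert << 15;

    return pred;
}

int main(void)
{
    /* 16 byte-wide elements, 5 of them active, not inverted. */
    assert(encode_pred_count(16, 5, 0, false) == 0x000b);
    /* All 16 active: stored as "0 inactive" with the invert bit set. */
    assert(encode_pred_count(16, 16, 0, false) == 0x8001);
    return 0;
}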