Diffstat (limited to 'target/arm/tcg/sve_helper.c')
-rw-r--r--  target/arm/tcg/sve_helper.c  1197
1 file changed, 1008 insertions(+), 189 deletions(-)
diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c
index d786b4b..43b872c 100644
--- a/target/arm/tcg/sve_helper.c
+++ b/target/arm/tcg/sve_helper.c
@@ -20,15 +20,19 @@
#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
-#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
+#include "exec/target_page.h"
+#include "exec/tlb-flags.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
+#include "accel/tcg/cpu-ldst.h"
+#include "accel/tcg/helper-retaddr.h"
#include "accel/tcg/cpu-ops.h"
+#include "accel/tcg/probe.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif
@@ -119,6 +123,11 @@ static inline uint64_t expand_pred_s(uint8_t byte)
return word[byte & 0x11];
}
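+/* Expand predicate bit 0 of BYTE to a mask for one 64-bit element. */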
+static inline uint64_t expand_pred_d(uint8_t byte)
+{
+ return -(uint64_t)(byte & 1);
+}
+
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
@@ -202,6 +211,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
+#define DO_ORC(N, M) (N | ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
@@ -523,14 +533,9 @@ DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
-static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
-{
- return val >= max ? max : val <= min ? min : val;
-}
-
-#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
-#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
-#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
+#define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m)
+#define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m)
+#define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m)
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
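
(do_ssat_b() and friends are shared saturation helpers defined outside
this file.  A minimal sketch of the assumed semantics, clamping a
64-bit intermediate into the narrower type's range:

    static inline int8_t do_ssat_b(int64_t v)
    {
        return MIN(MAX(v, INT8_MIN), INT8_MAX);
    }

    static inline uint8_t do_usat_b(int64_t v)
    {
        return MIN(MAX(v, 0), UINT8_MAX);
    }

with the _h and _s variants clamping to the 16- and 32-bit ranges.)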
@@ -547,9 +552,9 @@ DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
-#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
-#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
-#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
+#define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m)
+#define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m)
+#define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m)
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
@@ -562,9 +567,9 @@ DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
-#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
-#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
-#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
+#define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
+#define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
+#define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
@@ -581,9 +586,9 @@ DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
-#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
-#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
-#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
+#define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
+#define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
+#define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
@@ -595,12 +600,9 @@ DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
-#define DO_SUQADD_B(n, m) \
- do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
-#define DO_SUQADD_H(n, m) \
- do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
-#define DO_SUQADD_S(n, m) \
- do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
+#define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m)
+#define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m)
+#define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m)
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
@@ -630,12 +632,9 @@ DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
-#define DO_USQADD_B(n, m) \
- do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
-#define DO_USQADD_H(n, m) \
- do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
-#define DO_USQADD_S(n, m) \
- do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
+#define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m)
+#define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m)
+#define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m)
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
@@ -1222,37 +1221,29 @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
} \
}
-#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
-#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
-#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
-
-DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
-DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
-DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
+DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b)
+DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h)
+DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s)
-DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
-DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
-DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
+DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b)
+DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h)
+DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s)
-#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
-#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
-#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
+DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b)
+DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h)
+DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s)
-DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
-DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
-DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
+DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b)
+DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h)
+DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s)
-DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
-DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
-DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
+DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b)
+DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h)
+DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s)
-DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
-DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
-DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
-
-DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
-DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
-DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
+DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b)
+DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h)
+DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s)
#undef DO_XTNB
#undef DO_XTNT
@@ -1829,6 +1820,52 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
#undef DO_VPZ
#undef DO_VPZ_D
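+/*
+ * Reduce elements lane-wise across the 128-bit segments of a vector.
+ * E.g. ADDQV on a 32-byte .S vector computes
+ *   Vd.S[e] = Zn.S[e] + Zn.S[e + 4], for e in [0, 3],
+ * with inactive elements contributing the identity INIT.
+ */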
+#define DO_VPQ(NAME, TYPE, H, INIT, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \
+ TYPE *n = vn; uint16_t *g = vg; \
+ uintptr_t oprsz = simd_oprsz(desc); \
+ uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \
+ for (uintptr_t s = 0; s < nseg; s++) { \
+ uint16_t pg = g[H2(s)]; \
+ for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \
+ if (pg & 1) { \
+                tmp[e] = OP(tmp[e], n[s * nsegelt + H(e)]); \
+ } \
+ } \
+ } \
+ memcpy(vd, tmp, 16); \
+ clear_tail(vd, 16, simd_maxsz(desc)); \
+}
+
+DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD)
+DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD)
+DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD)
+DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD)
+
+DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX)
+DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX)
+DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX)
+DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX)
+
+DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN)
+DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN)
+DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN)
+DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN)
+
+DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX)
+DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX)
+DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX)
+DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX)
+
+DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN)
+DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN)
+DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN)
+DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN)
+
+#undef DO_VPQ
+
/* Two vector operand, one scalar operand, unpredicated. */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
@@ -1869,10 +1906,46 @@ DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
#undef DO_ZZI
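+/*
+ * Logical reduction of 64-bit chunks down to one 128-bit segment.
+ * Inactive elements must contribute the identity of VOP: 0 for
+ * ORR/EOR, so POP masks them out with AND; all-ones for AND, so POP
+ * fills them in by ORing the complement of the predicate (DO_ORC).
+ */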
+#define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \
+void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ unsigned seg = simd_oprsz(desc) / 16; \
+ uint64_t r0 = INIT, r1 = INIT; \
+ for (unsigned s = 0; s < seg; s++) { \
+ uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \
+ uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \
+ uint64_t v0 = *(uint64_t *)(vn + s * 16); \
+ uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \
+ v0 = POP(v0, p0), v1 = POP(v1, p1); \
+ r0 = VOP(r0, v0), r1 = VOP(r1, v1); \
+ } \
+ *(uint64_t *)(vd + 0) = r0; \
+ *(uint64_t *)(vd + 8) = r1; \
+ clear_tail(vd, 16, simd_maxsz(desc)); \
+}
+
+DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND)
+DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND)
+DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND)
+DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND)
+
+DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND)
+DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND)
+DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND)
+DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND)
+
+DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC)
+DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC)
+DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC)
+DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC)
+
+#undef DO_LOGIC_QV
+
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
+#undef DO_ORC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
@@ -2065,27 +2138,6 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
when N is negative, add 2**M-1. */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
-static inline uint64_t do_urshr(uint64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else if (sh == 64) {
- return x >> 63;
- } else {
- return 0;
- }
-}
-
-static inline int64_t do_srshr(int64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else {
- /* Rounding the sign bit always produces 0. */
- return 0;
- }
-}
-
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
@@ -2183,10 +2235,9 @@ DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
-#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
-#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
-#define DO_SQSHRUN_D(x, sh) \
- do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
+#define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh)
+#define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh)
+#define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63))
DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
@@ -2196,9 +2247,9 @@ DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
-#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
-#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
-#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
+#define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh))
+#define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh))
+#define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh))
DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
@@ -2208,9 +2259,9 @@ DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
-#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
-#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
-#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
+#define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh)
+#define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh)
+#define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh)
DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
@@ -2220,9 +2271,9 @@ DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
-#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
-#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
-#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
+#define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh))
+#define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh))
+#define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh))
DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
@@ -2984,6 +3035,56 @@ void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
}
}
+/*
+ * TODO: This could use half_shuffle64 and similar bit tricks to
+ * expand blocks of bits at once.
+ */
+#define DO_PMOV_PV(NAME, ESIZE) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); \
+ unsigned idx = simd_data(desc); \
+ unsigned elements = vl / ESIZE; \
+ ARMPredicateReg *d = vd; \
+ ARMVectorReg *s = vs; \
+ memset(d, 0, sizeof(*d)); \
+ for (unsigned e = 0; e < elements; ++e) { \
+ depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \
+ } \
+}
+
+DO_PMOV_PV(pmov_pv_h, 2)
+DO_PMOV_PV(pmov_pv_s, 4)
+DO_PMOV_PV(pmov_pv_d, 8)
+
+#undef DO_PMOV_PV
+
+/*
+ * TODO: This could use half_unshuffle64 and similar bit tricks to
+ * compress blocks of bits at once.
+ */
+#define DO_PMOV_VP(NAME, ESIZE) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); \
+ unsigned idx = simd_data(desc); \
+ unsigned elements = vl / ESIZE; \
+ ARMVectorReg *d = vd; \
+ ARMPredicateReg *s = vs; \
+ if (idx == 0) { \
+ memset(d, 0, vl); \
+ } \
+ for (unsigned e = 0; e < elements; ++e) { \
+ depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \
+ } \
+}
+
+DO_PMOV_VP(pmov_vp_h, 2)
+DO_PMOV_VP(pmov_vp_s, 4)
+DO_PMOV_VP(pmov_vp_d, 8)
+
+#undef DO_PMOV_VP
+
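
(depositn() and extractn() are assumed to be deposit64()/extract64()
analogues over an array of uint64_t, indexed by absolute bit position.
For the len == 1 uses above, where a field never crosses a word
boundary, roughly:

    static inline uint64_t extractn(const uint64_t *p,
                                    unsigned bit, unsigned len)
    {
        return extract64(p[bit / 64], bit % 64, len);
    }

    static inline void depositn(uint64_t *p, unsigned bit,
                                unsigned len, uint64_t val)
    {
        p[bit / 64] = deposit64(p[bit / 64], bit % 64, len, val);
    }
)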
typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
@@ -3449,6 +3550,45 @@ DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
+typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc);
+
+static void do_perseg_zzz(void *vd, void *vn, void *vm,
+ uint32_t desc, perseg_zzz_fn *fn)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+
+ desc = simd_desc(16, 16, simd_data(desc));
+ for (intptr_t i = 0; i < oprsz; i += 16) {
+ fn(vd + i, vn + i, vm + i, desc);
+ }
+}
+
+#define DO_PERSEG_ZZZ(NAME, FUNC) \
+ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+ { do_perseg_zzz(vd, vn, vm, desc, FUNC); }
+
+DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b)
+DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h)
+DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s)
+DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d)
+
+DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b)
+DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h)
+DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s)
+DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d)
+
+DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b)
+DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h)
+DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s)
+DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d)
+
+DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b)
+DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h)
+DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s)
+DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d)
+
+#undef DO_PERSEG_ZZZ
+
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
@@ -3989,15 +4129,6 @@ static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
return flags;
}
-static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
-{
- /* It is quicker to zero the whole predicate than loop on OPRSZ.
- * The compiler should turn this into 4 64-bit integer stores.
- */
- memset(d, 0, sizeof(ARMPredicateReg));
- return PREDTEST_INIT;
-}
-
void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
uint32_t pred_desc)
{
@@ -4005,7 +4136,7 @@ void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
compute_brk_z(vd, vm, vg, oprsz, true);
} else {
- do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
}
}
@@ -4016,7 +4147,8 @@ uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
return compute_brks_z(vd, vm, vg, oprsz, true);
} else {
- return do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
}
}
@@ -4027,7 +4159,7 @@ void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
compute_brk_z(vd, vm, vg, oprsz, false);
} else {
- do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
}
}
@@ -4038,7 +4170,8 @@ uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
return compute_brks_z(vd, vm, vg, oprsz, false);
} else {
- return do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
}
}
@@ -4094,35 +4227,30 @@ void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (!last_active_pred(vn, vg, oprsz)) {
- do_zero(vd, oprsz);
- }
-}
-
-/* As if PredTest(Ones(PL), D, esz). */
-static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
- uint64_t esz_mask)
-{
- uint32_t flags = PREDTEST_INIT;
- intptr_t i;
-
- for (i = 0; i < oprsz / 8; i++) {
- flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
- }
- if (oprsz & 7) {
- uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
- flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
+ memset(vd, 0, sizeof(ARMPredicateReg));
}
- return flags;
}
uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
- return predtest_ones(vd, oprsz, -1);
- } else {
- return do_zero(vd, oprsz);
+ ARMPredicateReg *d = vd;
+ uint32_t flags = PREDTEST_INIT;
+ intptr_t i;
+
+ /* As if PredTest(Ones(PL), D, MO_8). */
+ for (i = 0; i < oprsz / 8; i++) {
+ flags = iter_predtest_fwd(d->p[i], -1, flags);
+ }
+ if (oprsz & 7) {
+ uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
+ flags = iter_predtest_fwd(d->p[i], mask, flags);
+ }
+ return flags;
}
+ memset(vd, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
}
uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
@@ -4139,66 +4267,200 @@ uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
return sum;
}
-uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
+uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc)
{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- uint64_t esz_mask = pred_esz_masks[esz];
- ARMPredicateReg *d = vd;
- uint32_t flags;
- intptr_t i;
+ int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
+ int vl = pl * 8;
+ unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
+ int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1;
+ DecodeCounter p = decode_counter(png, vl, v_esz);
+ unsigned maxelem = (vl << lg2_width) >> v_esz;
+ unsigned count = p.count;
+
+ if (p.invert) {
+ if (count >= maxelem) {
+ return 0;
+ }
+ count = maxelem - count;
+ } else {
+ count = MIN(count, maxelem);
+ }
+ return count >> p.lg2_stride;
+}
+
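+/*
+ * A predicate-as-counter register encodes, as produced by
+ * encode_pred_count() below: bit 15 is the invert flag and the low
+ * bits hold ((count << 1) | 1) << esz, with all-zeros meaning count 0.
+ * E.g. elements = 16, count = 5, esz = MO_8, invert = false gives 0x0b.
+ */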
+/* C.f. Arm pseudocode EncodePredCount */
+static uint64_t encode_pred_count(uint32_t elements, uint32_t count,
+ uint32_t esz, bool invert)
+{
+ uint32_t pred;
- /* Begin with a zero predicate register. */
- flags = do_zero(d, oprsz);
if (count == 0) {
- return flags;
+ return 0;
+ }
+ if (invert) {
+ count = elements - count;
+ } else if (count == elements) {
+ count = 0;
+ invert = true;
}
- /* Set all of the requested bits. */
- for (i = 0; i < count / 64; ++i) {
- d->p[i] = esz_mask;
+ pred = (count << 1) | 1;
+ pred <<= esz;
+ pred |= invert << 15;
+
+ return pred;
+}
+
+/* C.f. Arm pseudocode PredCountTest */
+static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert)
+{
+ uint32_t flags;
+
+ if (count == 0) {
+ flags = 1; /* !N, Z, C */
+ } else if (!invert) {
+ flags = (1u << 31) | 2; /* N, !Z */
+ flags |= count != elements; /* C */
+ } else {
+ flags = 2; /* !Z, !C */
+ flags |= (count == elements) << 31; /* N */
}
- if (count & 63) {
- d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
+ return flags;
+}
+
+/* D must be cleared on entry. */
+static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask,
+ uint32_t count, uint32_t oprbits)
+{
+ tcg_debug_assert(count <= oprbits);
+ if (count) {
+ uint32_t i;
+
+ /* Set all of the requested bits. */
+ for (i = 0; i < count / 64; ++i) {
+ d->p[i] = esz_mask;
+ }
+ if (count & 63) {
+ d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
+ }
}
+}
- return predtest_ones(d, oprsz, esz_mask);
+uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
+
+ count <<= esz;
+ memset(d, 0, sizeof(*d));
+ do_whilel(d, esz_mask, count, oprbits);
+ return pred_count_test(oprbits, count, false);
}
-uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
+uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc)
{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
uint64_t esz_mask = pred_esz_masks[esz];
ARMPredicateReg *d = vd;
- intptr_t i, invcount, oprbits;
- uint64_t bits;
- if (count == 0) {
- return do_zero(d, oprsz);
+ count <<= esz;
+ memset(d, 0, 2 * sizeof(*d));
+ if (count <= oprbits) {
+ do_whilel(&d[0], esz_mask, count, oprbits);
+ } else {
+ do_whilel(&d[0], esz_mask, oprbits, oprbits);
+ do_whilel(&d[1], esz_mask, count - oprbits, oprbits);
}
- oprbits = oprsz * 8;
+ return pred_count_test(2 * oprbits, count, false);
+}
+
+uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ uint32_t vl = pl * 8;
+ uint32_t elements = (vl >> esz) << scale;
+ ARMPredicateReg *d = vd;
+
+ *d = (ARMPredicateReg) {
+ .p[0] = encode_pred_count(elements, count, esz, false)
+ };
+ return pred_count_test(elements, count, false);
+}
+
+/* D must be cleared on entry. */
+static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask,
+ uint32_t count, uint32_t oprbits)
+{
tcg_debug_assert(count <= oprbits);
+ if (count) {
+ uint32_t i, invcount = oprbits - count;
+ uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64);
- bits = esz_mask;
- if (oprbits & 63) {
- bits &= MAKE_64BIT_MASK(0, oprbits & 63);
+ for (i = invcount / 64; i < oprbits / 64; ++i) {
+ d->p[i] = bits;
+ bits = esz_mask;
+ }
+ if (oprbits & 63) {
+ d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63);
+ }
}
+}
- invcount = oprbits - count;
- for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
- d->p[i] = bits;
- bits = esz_mask;
- }
+uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
- d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
+ count <<= esz;
+ memset(d, 0, sizeof(*d));
+ do_whileg(d, esz_mask, count, oprbits);
+ return pred_count_test(oprbits, count, true);
+}
+
+uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
- while (--i >= 0) {
- d->p[i] = 0;
+ count <<= esz;
+ memset(d, 0, 2 * sizeof(*d));
+ if (count <= oprbits) {
+ do_whileg(&d[1], esz_mask, count, oprbits);
+ } else {
+ do_whilel(&d[1], esz_mask, oprbits, oprbits);
+ do_whileg(&d[0], esz_mask, count - oprbits, oprbits);
}
- return predtest_ones(d, oprsz, esz_mask);
+ return pred_count_test(2 * oprbits, count, true);
+}
+
+uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ uint32_t vl = pl * 8;
+ uint32_t elements = (vl >> esz) << scale;
+ ARMPredicateReg *d = vd;
+
+ *d = (ARMPredicateReg) {
+ .p[0] = encode_pred_count(elements, count, esz, true)
+ };
+ return pred_count_test(elements, count, true);
}
/* Recursive reduction on a function;
@@ -4209,19 +4471,20 @@ uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
* The recursion is bounded to depth 7 (128 fp16 elements), so there's
* little to gain with a more complex non-recursive form.
*/
-#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
-static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
+#define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \
+static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{ \
if (n == 1) { \
return *data; \
} else { \
uintptr_t half = n / 2; \
- TYPE lo = NAME##_reduce(data, status, half); \
- TYPE hi = NAME##_reduce(data + half, status, half); \
+ TYPE lo = FUNC##_reduce(data, status, half); \
+ TYPE hi = FUNC##_reduce(data + half, status, half); \
return FUNC(lo, hi, status); \
} \
} \
-uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
+uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \
+ float_status *s, uint32_t desc) \
{ \
uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
@@ -4236,39 +4499,54 @@ uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
for (; i < maxsz; i += sizeof(TYPE)) { \
*(TYPE *)((void *)data + i) = IDENT; \
} \
- return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \
+ return FUNC##_reduce(data, s, maxsz / sizeof(TYPE)); \
+} \
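+/* Reduce each element column across the 128-bit segments of Zn. */ \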
+void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \
+ float_status *status, uint32_t desc) \
+{ \
+ unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \
+ for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \
+ TYPE data[ARM_MAX_VQ]; \
+ for (unsigned s = 0; s < segments; s++) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \
+            TYPE nn = *(TYPE *)(vn + H(s * 16 + e)); \
+ data[s] = (pg >> e) & 1 ? nn : IDENT; \
+ } \
+ *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \
+ } \
+ clear_tail(vd, 16, simd_maxsz(desc)); \
}
-DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
-DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
-DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
+DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero)
+DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero)
+DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero)
/* Identity is floatN_default_nan, without the function call. */
-DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
-DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
-DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
+DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, 0x7E00)
+DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, 0x7FC00000)
+DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
-DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
-DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
-DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
+DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, 0x7E00)
+DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, 0x7FC00000)
+DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
-DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
-DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
-DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
+DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity)
+DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity)
+DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity)
-DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
-DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
-DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
+DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity))
+DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity))
+DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity))
-DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
-DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
-DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
+DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
+DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
+DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
-DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
+DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh,
float16_chs(float16_infinity))
-DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
+DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs,
float32_chs(float32_infinity))
-DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
+DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd,
float64_chs(float64_infinity))
#undef DO_REDUCE
@@ -4550,7 +4828,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, \
* FZ16. When converting from fp16, this affects flushing input denormals;
* when converting to fp16, this affects flushing output denormals.
*/
-static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
+float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
bool save = get_flush_inputs_to_zero(fpst);
float32 ret;
@@ -4572,7 +4850,7 @@ static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
return ret;
}
-static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
+float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
bool save = get_flush_to_zero(fpst);
float16 ret;
@@ -6081,6 +6359,9 @@ DO_LD1_2(ld1sds, MO_64, MO_32)
DO_LD1_2(ld1dd, MO_64, MO_64)
+DO_LD1_2(ld1squ, MO_32, MO_128)
+DO_LD1_2(ld1dqu, MO_64, MO_128)
+
#undef DO_LD1_1
#undef DO_LD1_2
@@ -6140,6 +6421,10 @@ DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)
+DO_LDN_2(2, qq, MO_128)
+DO_LDN_2(3, qq, MO_128)
+DO_LDN_2(4, qq, MO_128)
+
#undef DO_LDN_1
#undef DO_LDN_2
@@ -6703,6 +6988,13 @@ DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)
+DO_STN_2(1, sq, MO_128, MO_32)
+DO_STN_2(1, dq, MO_128, MO_64)
+
+DO_STN_2(2, qq, MO_128, MO_128)
+DO_STN_2(3, qq, MO_128, MO_128)
+DO_STN_2(4, qq, MO_128, MO_128)
+
#undef DO_STN_1
#undef DO_STN_2
@@ -6919,6 +7211,9 @@ DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)
+DO_LD1_ZPZ_D(qq_le, zd, MO_128)
+DO_LD1_ZPZ_D(qq_be, zd, MO_128)
+
#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
@@ -7305,9 +7600,505 @@ DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)
+DO_ST1_ZPZ_D(qq_le, zd, MO_128)
+DO_ST1_ZPZ_D(qq_be, zd, MO_128)
+
#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
+/*
+ * SVE2.1 consecutive register load/store
+ */
+
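+/*
+ * These helpers access N registers, spaced rstride apart in the
+ * register file, under a predicate-as-counter.  As with the SVE1
+ * contiguous helpers, the access is split at most once at a page
+ * boundary, with at most one misaligned element spanning the pages.
+ */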
+static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr,
+ uint32_t png, intptr_t reg_max,
+ int N, int v_esz)
+{
+ const int esize = 1 << v_esz;
+ intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
+ DecodeCounter p = decode_counter(png, reg_max, v_esz);
+ unsigned b_count = p.count << v_esz;
+ unsigned b_stride = 1 << (v_esz + p.lg2_stride);
+ intptr_t page_split;
+
+ /* Set all of the element indices to -1, and the TLB data to 0. */
+ memset(info, -1, offsetof(SVEContLdSt, page));
+ memset(info->page, 0, sizeof(info->page));
+
+ if (p.invert) {
+ if (b_count >= reg_max * N) {
+ return 0;
+ }
+ reg_off_first = b_count;
+ reg_off_last = reg_max * N - b_stride;
+ } else {
+ if (b_count == 0) {
+ return 0;
+ }
+ reg_off_first = 0;
+ reg_off_last = MIN(b_count - esize, reg_max * N - b_stride);
+ }
+
+ info->reg_off_first[0] = reg_off_first;
+ info->mem_off_first[0] = reg_off_first;
+
+ page_split = -(addr | TARGET_PAGE_MASK);
+ if (reg_off_last + esize <= page_split || reg_off_first >= page_split) {
+ /* The entire operation fits within a single page. */
+ info->reg_off_last[0] = reg_off_last;
+ return b_stride;
+ }
+
+ info->page_split = page_split;
+ reg_off_split = ROUND_DOWN(page_split, esize);
+
+ /*
+ * This is the last full element on the first page, but it is not
+ * necessarily active. If there is no full element, i.e. the first
+ * active element is the one that's split, this value remains -1.
+ * It is useful as iteration bounds.
+ */
+ if (reg_off_split != 0) {
+ info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride);
+ }
+
+ /* Determine if an unaligned element spans the pages. */
+ if (page_split & (esize - 1)) {
+ /* It is helpful to know if the split element is active. */
+ if ((reg_off_split & (b_stride - 1)) == 0) {
+ info->reg_off_split = reg_off_split;
+ info->mem_off_split = reg_off_split;
+ }
+ reg_off_split += esize;
+ }
+
+ /*
+ * We do want the first active element on the second page, because
+ * this may affect the address reported in an exception.
+ */
+ reg_off_split = ROUND_UP(reg_off_split, b_stride);
+ if (reg_off_split <= reg_off_last) {
+ info->reg_off_first[1] = reg_off_split;
+ info->mem_off_first[1] = reg_off_split;
+ info->reg_off_last[1] = reg_off_last;
+ }
+ return b_stride;
+}
+
+static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
+ target_ulong addr, unsigned estride,
+ int esize, int wp_access, uintptr_t ra)
+{
+#ifndef CONFIG_USER_ONLY
+ intptr_t count_off, count_last;
+ int flags0 = info->page[0].flags;
+ int flags1 = info->page[1].flags;
+
+ if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
+ return;
+ }
+
+ /* Indicate that watchpoints are handled. */
+ info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
+ info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
+
+ if (flags0 & TLB_WATCHPOINT) {
+ count_off = info->reg_off_first[0];
+ count_last = info->reg_off_split;
+ if (count_last < 0) {
+ count_last = info->reg_off_last[0];
+ }
+ do {
+ cpu_check_watchpoint(env_cpu(env), addr + count_off,
+ esize, info->page[0].attrs, wp_access, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+
+ count_off = info->reg_off_first[1];
+ if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) {
+ count_last = info->reg_off_last[1];
+ do {
+ cpu_check_watchpoint(env_cpu(env), addr + count_off,
+ esize, info->page[1].attrs,
+ wp_access, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+#endif
+}
+
+static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
+ target_ulong addr, unsigned estride,
+ int esize, uint32_t mtedesc,
+ uintptr_t ra)
+{
+ intptr_t count_off, count_last;
+
+ /*
+ * TODO: estride is always a small power of two, <= 8.
+ * Manipulate the stride within the loops such that
+ * - first iteration hits addr + off, as required,
+ * - second iteration hits ALIGN_UP(addr, 16),
+ * - other iterations advance addr by 16.
+ * This will minimize the probing to once per MTE granule.
+ */
+
+ /* Process the page only if MemAttr == Tagged. */
+ if (info->page[0].tagged) {
+ count_off = info->reg_off_first[0];
+ count_last = info->reg_off_split;
+ if (count_last < 0) {
+ count_last = info->reg_off_last[0];
+ }
+
+ do {
+ mte_check(env, mtedesc, addr + count_off, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+
+ count_off = info->reg_off_first[1];
+ if (count_off >= 0 && info->page[1].tagged) {
+ count_last = info->reg_off_last[1];
+ do {
+ mte_check(env, mtedesc, addr + count_off, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
+ uint32_t png, uint32_t desc,
+ const uintptr_t ra, const MemOp esz,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
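+    /*
+     * Bit 0 of simd_data selects 2 vs 4 registers; the next two bits
+     * give the log2 register-file stride between the N registers.
+     */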
+ const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
+ const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const unsigned esize = 1 << esz;
+ intptr_t count_off, count_last;
+ intptr_t reg_off, reg_last, reg_n;
+ SVEContLdSt info;
+ unsigned estride, flags;
+ void *host;
+
+ estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
+ if (estride == 0) {
+ /* The entire predicate was false; no load occurs. */
+ for (unsigned n = 0; n < N; n++) {
+ memset(zd + n * rstride, 0, reg_max);
+ }
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
+
+ /* Handle watchpoints for all active elements. */
+ sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
+ esize, BP_MEM_READ, ra);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+        sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
+ esize, mtedesc, ra);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. Perform the load
+ * into scratch memory to preserve register state until the end.
+ */
+ ARMVectorReg scratch[4] = { };
+
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[1];
+ if (count_last < 0) {
+ count_last = info.reg_off_split;
+ if (count_last < 0) {
+ count_last = info.reg_off_last[0];
+ }
+ }
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+
+ do {
+            reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ for (unsigned n = 0; n < N; ++n) {
+ memcpy(&zd[n * rstride], &scratch[n], reg_max);
+ }
+ return;
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ for (unsigned n = 0; n < N; ++n) {
+ memset(&zd[n * rstride], 0, reg_max);
+ }
+
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[0];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[0].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ count_off = info.reg_off_split;
+ if (unlikely(count_off >= 0)) {
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
+ }
+
+ count_off = info.reg_off_first[1];
+ if (unlikely(count_off >= 0)) {
+ count_last = info.reg_off_last[1];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[1].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+ }
+}
+
+void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
+ uint32_t png, uint32_t desc)
+{
+ sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8,
+ sve_ld1bb_host, sve_ld1bb_tlb);
+}
+
+#define DO_LD1_2(NAME, ESZ) \
+void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+}
+
+DO_LD1_2(ld1hh, MO_16)
+DO_LD1_2(ld1ss, MO_32)
+DO_LD1_2(ld1dd, MO_64)
+
+#undef DO_LD1_2
+
+static inline QEMU_ALWAYS_INLINE
+void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
+ uint32_t png, uint32_t desc,
+ const uintptr_t ra, const int esz,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
+ const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const unsigned esize = 1 << esz;
+ intptr_t count_off, count_last;
+ intptr_t reg_off, reg_last, reg_n;
+ SVEContLdSt info;
+ unsigned estride, flags;
+ void *host;
+
+ estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
+ if (estride == 0) {
+ /* The entire predicate was false; no store occurs. */
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
+
+ /* Handle watchpoints for all active elements. */
+ sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
+ esize, BP_MEM_WRITE, ra);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+        sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
+ esize, mtedesc, ra);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+         * which for ARM will raise SyncExternal.  We cannot avoid
+         * this fault and will leave with the store incomplete.
+ */
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[1];
+ if (count_last < 0) {
+ count_last = info.reg_off_split;
+ if (count_last < 0) {
+ count_last = info.reg_off_last[0];
+ }
+ }
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+
+ do {
+            reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+ return;
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[0];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[0].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ count_off = info.reg_off_split;
+ if (unlikely(count_off >= 0)) {
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
+ }
+
+ count_off = info.reg_off_first[1];
+ if (unlikely(count_off >= 0)) {
+ count_last = info.reg_off_last[1];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[1].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+ }
+}
+
+void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
+ uint32_t png, uint32_t desc)
+{
+ sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8,
+ sve_st1bb_host, sve_st1bb_tlb);
+}
+
+#define DO_ST1_2(NAME, ESZ) \
+void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+}
+
+DO_ST1_2(st1hh, MO_16)
+DO_ST1_2(st1ss, MO_32)
+DO_ST1_2(st1dd, MO_64)
+
+#undef DO_ST1_2
+
void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
@@ -7711,3 +8502,31 @@ DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
#undef DO_FCVTLT
#undef DO_FCVTNT
+
+void HELPER(pext)(void *vd, uint32_t png, uint32_t desc)
+{
+ int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
+ int vl = pl * 8;
+ unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
+ int part = FIELD_EX32(desc, PREDDESC, DATA);
+ DecodeCounter p = decode_counter(png, vl, v_esz);
+ uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride];
+ ARMPredicateReg *d = vd;
+
+ /*
+ * Convert from element count to byte count and adjust
+ * for the portion of the 4*VL counter to be extracted.
+ */
+ int b_count = (p.count << v_esz) - vl * part;
+
+ memset(d, 0, sizeof(*d));
+ if (p.invert) {
+ if (b_count <= 0) {
+ do_whilel(vd, mask, vl, vl);
+ } else if (b_count < vl) {
+ do_whileg(vd, mask, vl - b_count, vl);
+ }
+ } else if (b_count > 0) {
+ do_whilel(vd, mask, MIN(b_count, vl), vl);
+ }
+}
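
(Example: vl = 32, v_esz = MO_8, part = 1, no stride: an uninverted
counter holding count = 40 gives b_count = 40 - 32 = 8, so this
portion of the counter's range sets the low 8 predicate bits of Pd.)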