about summary refs log tree commit diff
path: root/target/arm/tcg/vec_helper.c
diff options
context:
space:
mode:
Diffstat (limited to 'target/arm/tcg/vec_helper.c')
-rw-r--r-- target/arm/tcg/vec_helper.c 384
1 files changed, 331 insertions, 53 deletions
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 986eaf8..0603db0 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -825,11 +825,11 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
-DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
-DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
-DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
-DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
-DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
+DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t) /* int8 x int8 into int32; argument order presumably NAME, TYPED, TYPEN, TYPEM as in the 2-way DO_DOT -- confirm */
+DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t) /* uint8 x uint8 into uint32 */
+DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t) /* mixed signedness: uint8 n operand, int8 m operand */
+DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t) /* int16 x int16 into int64 */
+DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t) /* uint16 x uint16 into uint64 */
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
@@ -865,12 +865,63 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
-DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
-DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
-DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
-DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
-DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
-DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
+DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4) /* indexed form: int8 x int8 into int32, last argument is the host-index macro */
+DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4) /* uint8 x uint8 into uint32 */
+DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4) /* int8 n operand, uint8 m operand */
+DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4) /* uint8 n operand, int8 m operand */
+DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8) /* int16 x int16 into int64, H8 for 64-bit lanes */
+DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8) /* uint16 x uint16 into uint64 */
+
+#undef DO_DOT /* released so the name can be redefined for the 2-way form below */
+#undef DO_DOT_IDX /* likewise redefined below */
+
+/* Similar for 2-way dot product */
+#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) /* generates HELPER(NAME): each TYPED lane = addend + two adjacent n*m products */ \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) /* vd: result, vn/vm: multiplicands, va: addend */ \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); /* opr_sz is divided by sizeof(TYPED) below, i.e. a byte count */ \
+ TYPED *d = vd, *a = va; \
+ TYPEN *n = vn; \
+ TYPEM *m = vm; \
+ for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { /* one iteration per TYPED result lane */ \
+ d[i] = (a[i] + \
+ (TYPED)n[i * 2 + 0] * m[i * 2 + 0] + /* cast converts the n element to TYPED before the multiply */ \
+ (TYPED)n[i * 2 + 1] * m[i * 2 + 1]); /* lane i consumes source elements 2i and 2i+1 */ \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); /* presumably zeroes d between opr_sz and simd_maxsz(desc) -- confirm */ \
+}
+
+#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) /* 2-way dot product where the m pair is picked by an index from desc, per 16-byte segment */ \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i = 0, opr_sz = simd_oprsz(desc); \
+ intptr_t opr_sz_n = opr_sz / sizeof(TYPED); /* number of TYPED result lanes */ \
+ intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); /* end of the first segment: 16 bytes of lanes, or fewer for a short operation */ \
+ intptr_t index = simd_data(desc); /* the whole data field of desc is the index */ \
+ TYPED *d = vd, *a = va; \
+ TYPEN *n = vn; \
+ TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2; /* start of the selected pair; HD() presumably adjusts the lane index for host byte order -- confirm */ \
+ do { \
+ TYPED m0 = m_indexed[i * 2 + 0]; /* i is the first lane of the segment here, so the pair is re-read once per segment */ \
+ TYPED m1 = m_indexed[i * 2 + 1]; /* both multipliers are converted to TYPED on load */ \
+ do { \
+ d[i] = (a[i] + \
+ n[i * 2 + 0] * m0 + \
+ n[i * 2 + 1] * m1); \
+ } while (++i < segend); /* inner loop finishes the current segment */ \
+ segend = i + (16 / sizeof(TYPED)); /* next segment ends 16 bytes of lanes further on */ \
+ } while (i < opr_sz_n); /* do/while: the body runs at least once, so this assumes opr_sz is non-zero -- TODO confirm */ \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_DOT(gvec_sdot_2h, int32_t, int16_t, int16_t) /* two int16 products per int32 lane */
+DO_DOT(gvec_udot_2h, uint32_t, uint16_t, uint16_t) /* two uint16 products per uint32 lane */
+
+DO_DOT_IDX(gvec_sdot_idx_2h, int32_t, int16_t, int16_t, H4) /* indexed signed form; HD argument is H4 */
+DO_DOT_IDX(gvec_udot_idx_2h, uint32_t, uint16_t, uint16_t, H4) /* indexed unsigned form */
+
+#undef DO_DOT /* both generator macros are local to this section */
+#undef DO_DOT_IDX
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
float_status *fpst, uint32_t desc)
@@ -1419,10 +1470,12 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \
DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)
+DO_3OP(gvec_bfadd, bfloat16_add, bfloat16) /* bfloat16 counterpart of the gvec_fadd_* helpers above */
DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)
+DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16) /* bfloat16 counterpart of the gvec_fsub_* helpers above */
DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
@@ -1515,6 +1568,13 @@ DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
+DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16) /* instantiates DO_3OP with bfloat16_max on bfloat16 elements */
+DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16) /* same with bfloat16_min */
+DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16) /* "num" variants use bfloat16_maxnum / bfloat16_minnum */
+DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
+DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16) /* ah_ variants call helper_sme2_ah_* instead of bfloat16_max/min */
+DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)
+
#endif
#undef DO_3OP
@@ -1550,6 +1610,12 @@ static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(op1, op2, dest, 0, stat);
}
+static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2, /* dest is the addend, op1 and op2 the factors */
+ float_status *stat)
+{
+ return bfloat16_muladd(op1, op2, dest, 0, stat); /* adapts argument order: accumulator is first here but third in bfloat16_muladd; flags argument is 0 */
+}
+
static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1568,6 +1634,12 @@ static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}
+static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2, /* same signature as bfloat16_muladd_f; dest is the addend */
+ float_status *stat)
+{
+ return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat); /* op1 goes through bfloat16_chs() first (sign change, judging by the name -- confirm); flags stay 0 */
+}
+
static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1586,6 +1658,12 @@ static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}
+static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2, /* "ah" flavour of bfloat16_mulsub_f; dest is the addend */
+ float_status *stat)
+{
+ return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat); /* op1 is left untouched; negation is requested through the muladd flag instead */
+}
+
static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1610,23 +1688,28 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \
clear_tail(d, oprsz, simd_maxsz(desc)); \
}
-DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
-DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
+DO_MULADD(gvec_fmla_nf_h, float16_muladd_nf, float16) /* helper name carries the same _nf suffix as the function it wraps */
+DO_MULADD(gvec_fmla_nf_s, float32_muladd_nf, float32)
-DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
-DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
+DO_MULADD(gvec_fmls_nf_h, float16_mulsub_nf, float16) /* _nf subtract forms, named like the add forms above */
+DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32)
DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
+DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16) /* bfloat16 counterpart of gvec_vfma_*, built on bfloat16_muladd_f */
DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
+DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16) /* bfloat16 counterpart of gvec_vfms_*, built on bfloat16_mulsub_f */
DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
+DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16) /* bfloat16 counterpart of gvec_ah_vfms_*, built on bfloat16_ah_mulsub_f */
+
+#undef DO_MULADD /* generator macro is not used past this point in the visible code */
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
* For AdvSIMD, there is of course only one such vector segment.
@@ -1745,14 +1828,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
+DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0) /* bfloat16 form; same H2 and zero last two arguments as the float16 line */
DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
+DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0) /* INT16_MIN has only bit 15 set; presumably used as a sign-bit mask by the macro -- confirm */
DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
+DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product) /* ah form passes the negate-product flag instead of a mask */
#undef DO_FMLA_IDX
@@ -2184,7 +2270,8 @@ void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
intptr_t i, oprsz = simd_oprsz(desc);
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
- float_status *status = &env->vfp.fp_status[FPST_A64];
+ bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
bool fz16 = env->vfp.fpcr & FPCR_FZ16;
int negx = 0, negf = 0;
@@ -2267,8 +2354,9 @@ void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
intptr_t i, j, oprsz = simd_oprsz(desc);
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
- intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
- float_status *status = &env->vfp.fp_status[FPST_A64];
+ bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16);
+ float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
bool fz16 = env->vfp.fpcr & FPCR_FZ16;
int negx = 0, negf = 0;
@@ -2989,31 +3077,62 @@ float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
float_status *fpst, float_status *fpst_odd)
{
- /*
- * Compare f16_dotadd() in sme_helper.c, but here we have
- * bfloat16 inputs. In particular that means that we do not
- * want the FPCR.FZ16 flush semantics, so we use the normal
- * float_status for the input handling here.
- */
- float64 e1r = float32_to_float64(e1 << 16, fpst);
- float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
- float64 e2r = float32_to_float64(e2 << 16, fpst);
- float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
- float64 t64;
+ float32 s1r = e1 << 16;
+ float32 s1c = e1 & 0xffff0000u;
+ float32 s2r = e2 << 16;
+ float32 s2c = e2 & 0xffff0000u;
float32 t32;
- /*
- * The ARM pseudocode function FPDot performs both multiplies
- * and the add with a single rounding operation. Emulate this
- * by performing the first multiply in round-to-odd, then doing
- * the second multiply as fused multiply-add, and rounding to
- * float32 all in one step.
- */
- t64 = float64_mul(e1r, e2r, fpst_odd);
- t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
+ /* Cf. FPProcessNaNs4 */
+ if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) ||
+ float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) {
+ if (float32_is_signaling_nan(s1r, fpst)) {
+ t32 = s1r;
+ } else if (float32_is_signaling_nan(s1c, fpst)) {
+ t32 = s1c;
+ } else if (float32_is_signaling_nan(s2r, fpst)) {
+ t32 = s2r;
+ } else if (float32_is_signaling_nan(s2c, fpst)) {
+ t32 = s2c;
+ } else if (float32_is_any_nan(s1r)) {
+ t32 = s1r;
+ } else if (float32_is_any_nan(s1c)) {
+ t32 = s1c;
+ } else if (float32_is_any_nan(s2r)) {
+ t32 = s2r;
+ } else {
+ t32 = s2c;
+ }
+ /*
+ * FPConvertNaN(FPProcessNaN(t32)) will be done as part
+ * of the final addition below.
+ */
+ } else {
+ /*
+ * Compare f16_dotadd() in sme_helper.c, but here we have
+ * bfloat16 inputs. In particular that means that we do not
+ * want the FPCR.FZ16 flush semantics, so we use the normal
+ * float_status for the input handling here.
+ */
+ float64 e1r = float32_to_float64(s1r, fpst);
+ float64 e1c = float32_to_float64(s1c, fpst);
+ float64 e2r = float32_to_float64(s2r, fpst);
+ float64 e2c = float32_to_float64(s2c, fpst);
+ float64 t64;
+
+ /*
+ * The ARM pseudocode function FPDot performs both multiplies
+ * and the add with a single rounding operation. Emulate this
+ * by performing the first multiply in round-to-odd, then doing
+ * the second multiply as fused multiply-add, and rounding to
+ * float32 all in one step.
+ */
+ t64 = float64_mul(e1r, e2r, fpst_odd);
+ t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
- /* This conversion is exact, because we've already rounded. */
- t32 = float64_to_float32(t64, fpst);
+ /* This conversion is exact, because we've already rounded. */
+ t32 = float64_to_float32(t64, fpst);
+ }
/* The final accumulation step is not fused. */
return float32_add(sum, t32, fpst);
@@ -3070,6 +3189,45 @@ void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm, /* indexed bfloat16 dot product: vd = va + dot(pairs gathered from two n registers, indexed m pair) */
+ void *va, CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2); /* 2-bit index: which 32-bit word of m within each segment */
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1); /* 1-bit selector: which 16-bit element of each n pair is used */
+ intptr_t elements = opr_sz / 4; /* number of 4-byte result lanes */
+ intptr_t eltspersegment = MIN(16 / 4, elements); /* at most four lanes per 16-byte segment */
+ float32 *d = vd, *a = va;
+ uint16_t *n0 = vn;
+ uint16_t *n1 = vn + sizeof(ARMVectorReg); /* void-pointer arithmetic: second source starts one ARMVectorReg in bytes after vn */
+ uint32_t *m = vm;
+ float_status fpst, fpst_odd; /* not initialised here but used below, so is_ebf() presumably fills them in -- confirm */
+
+ if (is_ebf(env, &fpst, &fpst_odd)) { /* result of is_ebf() selects the bfdotadd_ebf() path */
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t m_idx = m[i + H4(idx)]; /* one 32-bit word (a pair of 16-bit values) of m, re-read per segment */
+
+ for (j = 0; j < eltspersegment; j++) {
+ uint32_t nn = (n0[H2(2 * (i + j) + sel)]) /* low half: selected element from the first n register */
+ | (n1[H2(2 * (i + j) + sel)] << 16); /* high half: same position from the second n register */
+ d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx,
+ &fpst, &fpst_odd);
+ }
+ }
+ } else {
+ for (i = 0; i < elements; i += eltspersegment) { /* same traversal as above, using bfdotadd() with a single status */
+ uint32_t m_idx = m[i + H4(idx)];
+
+ for (j = 0; j < eltspersegment; j++) {
+ uint32_t nn = (n0[H2(2 * (i + j) + sel)])
+ | (n1[H2(2 * (i + j) + sel)] << 16);
+ d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst);
+ }
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc)); /* presumably zeroes d between opr_sz and simd_maxsz(desc) -- confirm */
+}
+
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
CPUARMState *env, uint32_t desc)
{
@@ -3146,44 +3304,76 @@ void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
- float_status *stat, uint32_t desc)
+static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a, /* shared body of the gvec_bfmlal / gvec_bfmlsl / gvec_ah_bfmlsl helpers */
+ float_status *stat, uint32_t desc, int negx, int negf) /* negx: mask XORed into each n element; negf: flags passed to float32_muladd */
{
intptr_t i, opr_sz = simd_oprsz(desc);
- intptr_t sel = simd_data(desc);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); /* only bit 0 of the data field selects the element of each 16-bit pair */
for (i = 0; i < opr_sz / 4; ++i) {
- float32 nn = n[H2(i * 2 + sel)] << 16;
+ float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16; /* XOR with negx, then place the 16 bits in the top half of a 32-bit value */
float32 mm = m[H2(i * 2 + sel)] << 16;
- d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
+ d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat); /* a is the addend; negf replaces the former constant 0 flags */
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
- void *va, float_status *stat, uint32_t desc)
+void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, /* multiply-add entry point: thin wrapper over do_bfmlal */
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0); /* negx = 0 and negf = 0: operands and flags unmodified */
+}
+
+void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va, /* multiply-subtract entry point: thin wrapper over do_bfmlal */
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0); /* negx = 0x8000 flips bit 15 (top bit) of every 16-bit n element before the multiply */
+}
+
+void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va, /* "ah" multiply-subtract entry point: thin wrapper over do_bfmlal */
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product); /* n is left unchanged; negation is requested through the muladd flag */
+}
+
+static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a, /* shared body of the indexed gvec_bfmlal_idx / gvec_bfmlsl_idx / gvec_ah_bfmlsl_idx helpers */
+ float_status *stat, uint32_t desc, int negx, int negf) /* negx: mask XORed into each n element; negf: flags passed to float32_muladd */
{
intptr_t i, j, opr_sz = simd_oprsz(desc);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
intptr_t elements = opr_sz / 4;
intptr_t eltspersegment = MIN(16 / 4, elements);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
for (i = 0; i < elements; i += eltspersegment) {
float32 m_idx = m[H2(2 * i + index)] << 16;
for (j = i; j < i + eltspersegment; j++) {
- float32 n_j = n[H2(2 * j + sel)] << 16;
- d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
+ float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16; /* XOR with negx, then place the 16 bits in the top half of a 32-bit value */
+ d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat); /* m_idx is fixed for the whole segment; negf replaces the former constant 0 flags */
}
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
}
+void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va, /* indexed multiply-add entry point: thin wrapper over do_bfmlal_idx */
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0); /* negx = 0 and negf = 0: operands and flags unmodified */
+}
+
+void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va, /* indexed multiply-subtract entry point: thin wrapper over do_bfmlal_idx */
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0); /* negx = 0x8000 flips bit 15 (top bit) of every 16-bit n element before the multiply */
+}
+
+void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va, /* indexed "ah" multiply-subtract entry point: thin wrapper over do_bfmlal_idx */
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product); /* n is left unchanged; negation is requested through the muladd flag */
+}
+
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
{ \
@@ -3253,3 +3443,91 @@ void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table, /* table lookup producing byte results into nreg destination registers */
+ unsigned elements, unsigned segbase, /* elements: results per register; segbase: number of the first index to use */
+ unsigned dstride, unsigned isize, /* dstride: byte distance between destinations; isize: width of one index in bits */
+ unsigned tsize, unsigned nreg) /* tsize: multiplier turning an index into a table bit offset */
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint8_t *dst = zd + dstride * r; /* void-pointer arithmetic: register r lives dstride * r bytes past zd */
+ unsigned base = segbase + r * elements; /* each register consumes the next run of `elements` indexes */
+
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize); /* presumably extractn(array, bit position, bit length) -- confirm */
+ dst[H1(e)] = extractn(table, index * tsize, 8); /* 8 bits taken from the table at offset index * tsize */
+ }
+ }
+}
+
+static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table, /* table lookup producing 16-bit results into nreg destination registers */
+ unsigned elements, unsigned segbase, /* elements: results per register; segbase: number of the first index to use */
+ unsigned dstride, unsigned isize, /* dstride: byte distance between destinations; isize: width of one index in bits */
+ unsigned tsize, unsigned nreg) /* tsize: multiplier turning an index into a table bit offset */
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint16_t *dst = zd + dstride * r; /* void-pointer arithmetic: register r lives dstride * r bytes past zd */
+ unsigned base = segbase + r * elements; /* each register consumes the next run of `elements` indexes */
+
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize); /* presumably extractn(array, bit position, bit length) -- confirm */
+ dst[H2(e)] = extractn(table, index * tsize, 16); /* 16 bits taken from the table at offset index * tsize */
+ }
+ }
+}
+
+static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table, /* 32-bit variant: table is typed uint32_t and indexed directly */
+ unsigned elements, unsigned segbase, /* elements: results per register; segbase: number of the first index to use */
+ unsigned dstride, unsigned isize, /* dstride: byte distance between destinations; isize: width of one index in bits */
+ unsigned tsize, unsigned nreg) /* tsize is not used in this variant; kept so all do_lut_* share one parameter list */
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint32_t *dst = zd + dstride * r; /* void-pointer arithmetic: register r lives dstride * r bytes past zd */
+ unsigned base = segbase + r * elements; /* each register consumes the next run of `elements` indexes */
+
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize); /* presumably extractn(array, bit position, bit length) -- confirm */
+ dst[H4(e)] = table[H4(index)]; /* whole 32-bit table word, no bit extraction needed */
+ }
+ }
+}
+
+#define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) /* ISIZE: index bits, NREG: destination registers, SUFF: do_lut_* suffix, ESIZE: result bytes */ \
+void helper_sme2_luti##ISIZE##_##NREG##SUFF /* expands to e.g. helper_sme2_luti2_1b */ \
+ (void *zd, void *zn, CPUARMState *env, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); /* used as the byte count of the memcpy below */ \
+ unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1); /* data bit 0: strided destination layout */ \
+ unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4); /* data bits 1..4: segment selector */ \
+ unsigned elements = vl / ESIZE; /* results per destination register */ \
+ unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8); /* destination spacing in registers: 1 if not strided, else 4 for NREG == 4 and 8 otherwise */ \
+ unsigned segments = (ESIZE * 8) / (ISIZE * NREG); /* NOTE(review): evaluates to 0 for ISIZE=4, NREG=4, ESIZE=1, making the mask below all ones -- confirm that instantiation is never reached or is intended */ \
+ unsigned segment = idx & (segments - 1); /* mask relies on segments being a power of two */ \
+ ARMVectorReg indexes; \
+ memcpy(&indexes, zn, vl); /* indexes are copied out before zd is written; presumably because zd may overlap zn -- confirm */ \
+ do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, /* table is env->za_state.zt0, cast through void * to the callee's table type */ \
+ segment * NREG * elements, /* segbase: first index belonging to the selected segment */ \
+ dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG); /* stride converted to bytes; tsize fixed at 32 */ \
+}
+
+DO_SME2_LUT(2,1,b, 1) /* arguments: ISIZE, NREG, SUFF, ESIZE; this group uses 2-bit indexes */
+DO_SME2_LUT(2,1,h, 2)
+DO_SME2_LUT(2,1,s, 4)
+DO_SME2_LUT(2,2,b, 1) /* two destination registers */
+DO_SME2_LUT(2,2,h, 2)
+DO_SME2_LUT(2,2,s, 4)
+DO_SME2_LUT(2,4,b, 1) /* four destination registers */
+DO_SME2_LUT(2,4,h, 2)
+DO_SME2_LUT(2,4,s, 4)
+
+DO_SME2_LUT(4,1,b, 1) /* this group uses 4-bit indexes */
+DO_SME2_LUT(4,1,h, 2)
+DO_SME2_LUT(4,1,s, 4)
+DO_SME2_LUT(4,2,b, 1)
+DO_SME2_LUT(4,2,h, 2)
+DO_SME2_LUT(4,2,s, 4)
+DO_SME2_LUT(4,4,b, 1) /* NOTE(review): (ESIZE * 8) / (ISIZE * NREG) is 8 / 16 = 0 here -- confirm this helper is unused or intended */
+DO_SME2_LUT(4,4,h, 2)
+DO_SME2_LUT(4,4,s, 4)
+
+#undef DO_SME2_LUT /* generator macro is local to this section */