about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Richard Henderson <richard.henderson@linaro.org> 2025-07-04 08:21:03 -0600
committer Peter Maydell <peter.maydell@linaro.org> 2025-07-04 15:53:23 +0100
commit a19b104f849a0ab6f781755f76e567960629a69c (patch)
tree b54f4b85a1084ad79a19350956a27fecb929954d
parent ec2d9709653f5c0c5503f177e5cdf6ba3e8f2627 (diff)
download qemu-a19b104f849a0ab6f781755f76e567960629a69c.zip
qemu-a19b104f849a0ab6f781755f76e567960629a69c.tar.gz
qemu-a19b104f849a0ab6f781755f76e567960629a69c.tar.bz2
target/arm: Implement LUTI2, LUTI4 for SME2/SME2p1
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20250704142112.1018902-101-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- target/arm/tcg/helper.h 24
-rw-r--r-- target/arm/tcg/sme.decode 42
-rw-r--r-- target/arm/tcg/translate-sme.c 56
-rw-r--r-- target/arm/tcg/vec_helper.c 88
4 files changed, 210 insertions, 0 deletions
diff --git a/target/arm/tcg/helper.h b/target/arm/tcg/helper.h
index 392bf7b..d9565c8 100644
--- a/target/arm/tcg/helper.h
+++ b/target/arm/tcg/helper.h
@@ -1188,3 +1188,27 @@ DEF_HELPER_FLAGS_4(gvec_uminp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_urecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_ursqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+/* SME2 LUTI helpers, named luti<index bits>_<register count><element suffix>; every one takes (ptr, ptr, env, i32). */
+DEF_HELPER_FLAGS_4(sme2_luti2_1b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_1h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_1s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+/* 2-bit indexes, two destination registers */
+DEF_HELPER_FLAGS_4(sme2_luti2_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+/* 2-bit indexes, four destination registers */
+DEF_HELPER_FLAGS_4(sme2_luti2_4b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+/* 4-bit indexes, one destination register */
+DEF_HELPER_FLAGS_4(sme2_luti4_1b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_1h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_1s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+/* 4-bit indexes, two destination registers */
+DEF_HELPER_FLAGS_4(sme2_luti4_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+/* 4-bit indexes, four destination registers */
+DEF_HELPER_FLAGS_4(sme2_luti4_4b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) /* NOTE(review): no decode pattern or translator in this patch uses this one */
+DEF_HELPER_FLAGS_4(sme2_luti4_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index f7e4143..5a5b8ff 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -945,3 +945,45 @@ ZERO_za 11000000 000011 110 .. 0000000000 00. \
&zero_za ngrp=2 nvec=4 rv=%mova_rv off=%off1_x4
ZERO_za 11000000 000011 111 .. 0000000000 00. \
&zero_za ngrp=4 nvec=4 rv=%mova_rv off=%off1_x4
+
+### SME Lookup Table Read
+# Every pattern in this section fills the same three-field argument set.
+&lut zd zn idx
+
+# LUTI2, consecutive: the idx field narrows from 4 bits (one register) to 2 bits (four registers)
+LUTI2_c_1b 1100 0000 1100 11 idx:4 00 00 zn:5 zd:5 &lut
+LUTI2_c_1h 1100 0000 1100 11 idx:4 01 00 zn:5 zd:5 &lut
+LUTI2_c_1s 1100 0000 1100 11 idx:4 10 00 zn:5 zd:5 &lut
+# Two registers: encoding bit 0 is fixed at 0 and zd is supplied by %zd_ax2 (defined outside this hunk)
+LUTI2_c_2b 1100 0000 1000 11 idx:3 1 00 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI2_c_2h 1100 0000 1000 11 idx:3 1 01 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI2_c_2s 1100 0000 1000 11 idx:3 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2
+# Four registers: the low two encoding bits are fixed at 0 and zd is supplied by %zd_ax4 (defined outside this hunk)
+LUTI2_c_4b 1100 0000 1000 11 idx:2 10 00 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI2_c_4h 1100 0000 1000 11 idx:2 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI2_c_4s 1100 0000 1000 11 idx:2 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4
+
+# LUTI2, strided: zd is decoded as a raw 5-bit field, so the translator must check zd alignment
+LUTI2_s_2b 1100 0000 1001 11 idx:3 1 00 00 zn:5 zd:5 &lut
+LUTI2_s_2h 1100 0000 1001 11 idx:3 1 01 00 zn:5 zd:5 &lut
+
+LUTI2_s_4b 1100 0000 1001 11 idx:2 10 00 00 zn:5 zd:5 &lut
+LUTI2_s_4h 1100 0000 1001 11 idx:2 10 01 00 zn:5 zd:5 &lut
+
+# LUTI4, consecutive: the idx field narrows from 3 bits (one register) to 1 bit (four registers)
+LUTI4_c_1b 1100 0000 1100 101 idx:3 00 00 zn:5 zd:5 &lut
+LUTI4_c_1h 1100 0000 1100 101 idx:3 01 00 zn:5 zd:5 &lut
+LUTI4_c_1s 1100 0000 1100 101 idx:3 10 00 zn:5 zd:5 &lut
+# Two registers: same zd handling as the LUTI2 two-register forms
+LUTI4_c_2b 1100 0000 1000 101 idx:2 1 00 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI4_c_2h 1100 0000 1000 101 idx:2 1 01 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI4_c_2s 1100 0000 1000 101 idx:2 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2
+# Four registers: only the h and s patterns are listed (there is no LUTI4_c_4b)
+LUTI4_c_4h 1100 0000 1000 101 idx:1 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI4_c_4s 1100 0000 1000 101 idx:1 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4
+
+# LUTI4, strided: zd is decoded as a raw 5-bit field, so the translator must check zd alignment
+LUTI4_s_2b 1100 0000 1001 101 idx:2 1 00 00 zn:5 zd:5 &lut
+LUTI4_s_2h 1100 0000 1001 101 idx:2 1 01 00 zn:5 zd:5 &lut
+
+LUTI4_s_4h 1100 0000 1001 101 idx:1 10 01 00 zn:5 zd:5 &lut
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index 9592f6a..d38b8a5 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -1697,3 +1697,59 @@ static bool trans_SEL(DisasContext *s, arg_SEL *a)
}
return true;
}
+/* do_lut: emit the tcg_gen_gvec_2_ptr() call shared by every LUTI2/LUTI4 translator in this section. */
+static bool do_lut(DisasContext *s, arg_lut *a, /* a: decoded fields; zd, zn and idx are read */
+ gen_helper_gvec_2_ptr *fn, bool strided) /* fn: helper to invoke; strided: becomes bit 0 of the data word */
+{
+ if (sme_sm_enabled_check(s) && sme2_zt0_enabled_check(s)) { /* nothing is emitted unless both checks succeed */
+ int svl = streaming_vec_reg_size(s); /* passed twice below, as both size arguments */
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ tcg_env, svl, svl, strided | (a->idx << 1), fn); /* data word: strided in bit 0, idx from bit 1 upward */
+ }
+ return true; /* true on both paths, whether or not the call was emitted */
+}
+/* Consecutive forms: each is gated on aa64_sme2 and routed through do_lut() with strided = false. */
+TRANS_FEAT(LUTI2_c_1b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1b, false)
+TRANS_FEAT(LUTI2_c_1h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1h, false)
+TRANS_FEAT(LUTI2_c_1s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1s, false)
+/* LUTI2, two registers */
+TRANS_FEAT(LUTI2_c_2b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2b, false)
+TRANS_FEAT(LUTI2_c_2h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2h, false)
+TRANS_FEAT(LUTI2_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2s, false)
+/* LUTI2, four registers */
+TRANS_FEAT(LUTI2_c_4b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4b, false)
+TRANS_FEAT(LUTI2_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4h, false)
+TRANS_FEAT(LUTI2_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4s, false)
+/* LUTI4, one register */
+TRANS_FEAT(LUTI4_c_1b, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1b, false)
+TRANS_FEAT(LUTI4_c_1h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1h, false)
+TRANS_FEAT(LUTI4_c_1s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1s, false)
+/* LUTI4, two registers */
+TRANS_FEAT(LUTI4_c_2b, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2b, false)
+TRANS_FEAT(LUTI4_c_2h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2h, false)
+TRANS_FEAT(LUTI4_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2s, false)
+/* LUTI4, four registers: only the h and s element sizes get a translator */
+TRANS_FEAT(LUTI4_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4h, false)
+TRANS_FEAT(LUTI4_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4s, false)
+/* do_lut_s4: strided wrapper used with the *_4b / *_4h helpers; rejects zd when bit 2 or bit 3 is set. */
+static bool do_lut_s4(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn)
+{
+ return !(a->zd & 0b01100) && do_lut(s, a, fn, true); /* false without calling do_lut() if either bit is set; else do_lut() with strided = true */
+}
+/* do_lut_s8: strided wrapper used with the *_2b / *_2h helpers; rejects zd when bit 3 is set. */
+static bool do_lut_s8(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn)
+{
+ return !(a->zd & 0b01000) && do_lut(s, a, fn, true); /* false without calling do_lut() if bit 3 is set; else do_lut() with strided = true */
+}
+/* Strided forms: gated on aa64_sme2p1; two-register patterns use do_lut_s8, four-register patterns use do_lut_s4. */
+TRANS_FEAT(LUTI2_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti2_2b)
+TRANS_FEAT(LUTI2_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti2_2h)
+/* LUTI2, four registers */
+TRANS_FEAT(LUTI2_s_4b, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti2_4b)
+TRANS_FEAT(LUTI2_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti2_4h)
+/* LUTI4, two registers */
+TRANS_FEAT(LUTI4_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2b)
+TRANS_FEAT(LUTI4_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2h)
+/* LUTI4, four registers: halfword only */
+TRANS_FEAT(LUTI4_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti4_4h)
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index d4ee6f4..0603db0 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -3443,3 +3443,91 @@ void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+/* do_lut_b: for each of nreg destinations, store `elements` 8-bit values selected from table by the packed indexes. */
+static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table,
+ unsigned elements, unsigned segbase, /* segbase: number of the first index element to consume */
+ unsigned dstride, unsigned isize, /* dstride: byte distance between destinations; isize: bits per index */
+ unsigned tsize, unsigned nreg) /* tsize: bits per table entry; nreg: number of destinations */
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint8_t *dst = zd + dstride * r; /* arithmetic on void * (GNU C extension) */
+ unsigned base = segbase + r * elements; /* each destination consumes the next `elements` indexes */
+ /* NOTE(review): extractn() is defined elsewhere; presumably (words, bit offset, bit count) - confirm */
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize); /* index number (base + e), isize bits wide */
+ dst[H1(e)] = extractn(table, index * tsize, 8); /* 8 bits taken at bit offset index * tsize of the table */
+ }
+ }
+}
+/* do_lut_h: for each of nreg destinations, store `elements` 16-bit values selected from table by the packed indexes. */
+static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table,
+ unsigned elements, unsigned segbase, /* segbase: number of the first index element to consume */
+ unsigned dstride, unsigned isize, /* dstride: byte distance between destinations; isize: bits per index */
+ unsigned tsize, unsigned nreg) /* tsize: bits per table entry; nreg: number of destinations */
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint16_t *dst = zd + dstride * r; /* arithmetic on void * (GNU C extension), so dstride counts bytes */
+ unsigned base = segbase + r * elements; /* each destination consumes the next `elements` indexes */
+ /* NOTE(review): extractn() is defined elsewhere; presumably (words, bit offset, bit count) - confirm */
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize); /* index number (base + e), isize bits wide */
+ dst[H2(e)] = extractn(table, index * tsize, 16); /* 16 bits taken at bit offset index * tsize of the table */
+ }
+ }
+}
+/* do_lut_s: for each of nreg destinations, store `elements` 32-bit values selected from table by the packed indexes. */
+static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table, /* table is uint32_t * here, unlike do_lut_b/do_lut_h */
+ unsigned elements, unsigned segbase, /* segbase: number of the first index element to consume */
+ unsigned dstride, unsigned isize, /* dstride: byte distance between destinations; isize: bits per index */
+ unsigned tsize, unsigned nreg) /* tsize is never read in this function; nreg: number of destinations */
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint32_t *dst = zd + dstride * r; /* arithmetic on void * (GNU C extension), so dstride counts bytes */
+ unsigned base = segbase + r * elements; /* each destination consumes the next `elements` indexes */
+ /* NOTE(review): extractn() is defined elsewhere; presumably (words, bit offset, bit count) - confirm */
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize); /* index number (base + e), isize bits wide */
+ dst[H4(e)] = table[H4(index)]; /* whole 32-bit entry read by array subscript rather than extractn() */
+ }
+ }
+}
+/* DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE): define helper_sme2_luti<ISIZE>_<NREG><SUFF>, which unpacks desc and calls do_lut_<SUFF>. */
+#define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \
+void helper_sme2_luti##ISIZE##_##NREG##SUFF \
+ (void *zd, void *zn, CPUARMState *env, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); /* byte count: divided by ESIZE and used as the memcpy length */ \
+ unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1); /* data bit 0 */ \
+ unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4); /* data bits 1..4 */ \
+ unsigned elements = vl / ESIZE; /* elements written per destination */ \
+ unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8); /* in ARMVectorReg units; scaled to bytes in the call */ \
+ unsigned segments = (ESIZE * 8) / (ISIZE * NREG); /* NOTE(review): integer division gives 0 for ISIZE=4, NREG=4, ESIZE=1 */ \
+ unsigned segment = idx & (segments - 1); /* idx masked by segments - 1 (mask is all ones when segments is 0) */ \
+ ARMVectorReg indexes; \
+ memcpy(&indexes, zn, vl); /* local copy of zn made before zd is written; presumably so zd may overlap zn */ \
+ do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, /* table argument is env->za_state.zt0 */ \
+ segment * NREG * elements, /* segbase: first index element of the selected segment */ \
+ dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG); /* tsize argument is fixed at 32 */ \
+}
+/* Instantiations, as DO_SME2_LUT(index bits, registers, suffix, element bytes); first the 2-bit-index helpers. */
+DO_SME2_LUT(2,1,b, 1)
+DO_SME2_LUT(2,1,h, 2)
+DO_SME2_LUT(2,1,s, 4)
+DO_SME2_LUT(2,2,b, 1)
+DO_SME2_LUT(2,2,h, 2)
+DO_SME2_LUT(2,2,s, 4)
+DO_SME2_LUT(2,4,b, 1)
+DO_SME2_LUT(2,4,h, 2)
+DO_SME2_LUT(2,4,s, 4)
+/* The same register-count / element-size matrix for 4-bit indexes. */
+DO_SME2_LUT(4,1,b, 1)
+DO_SME2_LUT(4,1,h, 2)
+DO_SME2_LUT(4,1,s, 4)
+DO_SME2_LUT(4,2,b, 1)
+DO_SME2_LUT(4,2,h, 2)
+DO_SME2_LUT(4,2,s, 4)
+DO_SME2_LUT(4,4,b, 1) /* NOTE(review): defines helper_sme2_luti4_4b, which no decode pattern or translator in this patch uses */
+DO_SME2_LUT(4,4,h, 2)
+DO_SME2_LUT(4,4,s, 4)
+
+#undef DO_SME2_LUT