-rw-r--r--  docs/about/deprecated.rst            |    7
-rw-r--r--  docs/system/arm/emulation.rst        |    6
-rw-r--r--  hw/arm/highbank.c                    |    2
-rw-r--r--  linux-user/aarch64/signal.c          |    4
-rw-r--r--  linux-user/elfload.c                 |    8
-rw-r--r--  target/arm/cpu-features.h            |   63
-rw-r--r--  target/arm/cpu.c                     |   11
-rw-r--r--  target/arm/cpu.h                     |   70
-rw-r--r--  target/arm/cpu64.c                   |    8
-rw-r--r--  target/arm/helper.c                  |    8
-rw-r--r--  target/arm/machine.c                 |   22
-rw-r--r--  target/arm/syndrome.h                |    1
-rw-r--r--  target/arm/tcg/cpu64.c               |   10
-rw-r--r--  target/arm/tcg/gengvec64.c           |   11
-rw-r--r--  target/arm/tcg/helper-a64.c          |    2
-rw-r--r--  target/arm/tcg/helper-sme.h          |  213
-rw-r--r--  target/arm/tcg/helper-sve.h          |  212
-rw-r--r--  target/arm/tcg/helper.h              |   91
-rw-r--r--  target/arm/tcg/hflags.c              |   34
-rw-r--r--  target/arm/tcg/m_helper.c            |   33
-rw-r--r--  target/arm/tcg/mve_helper.c          |  183
-rw-r--r--  target/arm/tcg/neon_helper.c         |   30
-rw-r--r--  target/arm/tcg/sme.decode            |  937
-rw-r--r--  target/arm/tcg/sme_helper.c          | 1672
-rw-r--r--  target/arm/tcg/sve.decode            |  327
-rw-r--r--  target/arm/tcg/sve_helper.c          | 1191
-rw-r--r--  target/arm/tcg/sve_ldst_internal.h   |   89
-rw-r--r--  target/arm/tcg/translate-a64.c       |   45
-rw-r--r--  target/arm/tcg/translate-a64.h       |   10
-rw-r--r--  target/arm/tcg/translate-neon.c      |   18
-rw-r--r--  target/arm/tcg/translate-sme.c       | 1476
-rw-r--r--  target/arm/tcg/translate-sve.c       | 1021
-rw-r--r--  target/arm/tcg/translate.h           |    9
-rw-r--r--  target/arm/tcg/vec_helper.c          |  384
-rw-r--r--  target/arm/tcg/vec_internal.h        |  148
-rw-r--r--  target/arm/tcg/vfp_helper.c          |   12
36 files changed, 7597 insertions(+), 771 deletions(-)
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 5a3ed71..d50645a 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -352,6 +352,13 @@ they want to use and avoids confusion. Existing users of the ``spike``
machine must ensure that they're setting the ``spike`` machine in the
command line (``-M spike``).
+Arm ``highbank`` and ``midway`` machines (since 10.1)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+There are no known users left for these machines (if you still use them,
+please write a mail to the qemu-devel mailing list). If you just want to
+boot Linux on a Cortex-A15 or Cortex-A9, use the ``virt`` machine instead.
+
System emulator binaries
------------------------
diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
index 78c2fd2..890dc6f 100644
--- a/docs/system/arm/emulation.rst
+++ b/docs/system/arm/emulation.rst
@@ -129,16 +129,22 @@ the following architecture extensions:
- FEAT_SM3 (Advanced SIMD SM3 instructions)
- FEAT_SM4 (Advanced SIMD SM4 instructions)
- FEAT_SME (Scalable Matrix Extension)
+- FEAT_SME2 (Scalable Matrix Extension version 2)
+- FEAT_SME2p1 (Scalable Matrix Extension version 2.1)
+- FEAT_SME_B16B16 (Non-widening BFloat16 arithmetic for SME2)
- FEAT_SME_FA64 (Full A64 instruction set in Streaming SVE mode)
+- FEAT_SME_F16F16 (Non-widening half-precision FP16 arithmetic for SME2)
- FEAT_SME_F64F64 (Double-precision floating-point outer product instructions)
- FEAT_SME_I16I64 (16-bit to 64-bit integer widening outer product instructions)
- FEAT_SVE (Scalable Vector Extension)
- FEAT_SVE_AES (Scalable Vector AES instructions)
+- FEAT_SVE_B16B16 (Non-widening BFloat16 arithmetic for SVE2)
- FEAT_SVE_BitPerm (Scalable Vector Bit Permutes instructions)
- FEAT_SVE_PMULL128 (Scalable Vector PMULL instructions)
- FEAT_SVE_SHA3 (Scalable Vector SHA3 instructions)
- FEAT_SVE_SM4 (Scalable Vector SM4 instructions)
- FEAT_SVE2 (Scalable Vector Extension version 2)
+- FEAT_SVE2p1 (Scalable Vector Extension version 2.1)
- FEAT_SPECRES (Speculation restriction instructions)
- FEAT_SSBS (Speculative Store Bypass Safe)
- FEAT_SSBS2 (MRS and MSR instructions for SSBS version 2)
diff --git a/hw/arm/highbank.c b/hw/arm/highbank.c
index 3ae26eb..165c0b7 100644
--- a/hw/arm/highbank.c
+++ b/hw/arm/highbank.c
@@ -357,6 +357,7 @@ static void highbank_class_init(ObjectClass *oc, const void *data)
mc->max_cpus = 4;
mc->ignore_memory_transaction_failures = true;
mc->default_ram_id = "highbank.dram";
+ mc->deprecation_reason = "no known users left for this machine";
}
static const TypeInfo highbank_type = {
@@ -381,6 +382,7 @@ static void midway_class_init(ObjectClass *oc, const void *data)
mc->max_cpus = 4;
mc->ignore_memory_transaction_failures = true;
mc->default_ram_id = "highbank.dram";
+ mc->deprecation_reason = "no known users left for this machine";
}
static const TypeInfo midway_type = {
diff --git a/linux-user/aarch64/signal.c b/linux-user/aarch64/signal.c
index bc7a138..d50cab7 100644
--- a/linux-user/aarch64/signal.c
+++ b/linux-user/aarch64/signal.c
@@ -248,7 +248,7 @@ static void target_setup_za_record(struct target_za_context *za,
for (i = 0; i < vl; ++i) {
uint64_t *z = (void *)za + TARGET_ZA_SIG_ZAV_OFFSET(vq, i);
for (j = 0; j < vq * 2; ++j) {
- __put_user_e(env->zarray[i].d[j], z + j, le);
+ __put_user_e(env->za_state.za[i].d[j], z + j, le);
}
}
}
@@ -397,7 +397,7 @@ static bool target_restore_za_record(CPUARMState *env,
for (i = 0; i < vl; ++i) {
uint64_t *z = (void *)za + TARGET_ZA_SIG_ZAV_OFFSET(vq, i);
for (j = 0; j < vq * 2; ++j) {
- __get_user_e(env->zarray[i].d[j], z + j, le);
+ __get_user_e(env->za_state.za[i].d[j], z + j, le);
}
}
return true;
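
For reference, the loops above walk the ZA signal record row by row. A
minimal sketch of the layout they assume, with za_rec_offset() as a
hypothetical simplification of the TARGET_ZA_SIG_ZAV_OFFSET() macro (the
real macro also accounts for the record header):

    #include <stddef.h>

    /* Each of the vl rows of ZA occupies vl bytes (vq * 16, i.e. vq * 2
     * doublewords), so row i, doubleword j lands at hdr + i * vl + j * 8. */
    static inline size_t za_rec_offset(size_t hdr, unsigned vl,
                                       unsigned i, unsigned j)
    {
        return hdr + (size_t)i * vl + (size_t)j * 8;
    }
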
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 2add166..ea21410 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -915,6 +915,14 @@ uint64_t get_elf_hwcap2(void)
GET_FEATURE_ID(aa64_sme_fa64, ARM_HWCAP2_A64_SME_FA64);
GET_FEATURE_ID(aa64_hbc, ARM_HWCAP2_A64_HBC);
GET_FEATURE_ID(aa64_mops, ARM_HWCAP2_A64_MOPS);
+ GET_FEATURE_ID(aa64_sve2p1, ARM_HWCAP2_A64_SVE2P1);
+ GET_FEATURE_ID(aa64_sme2, (ARM_HWCAP2_A64_SME2 |
+ ARM_HWCAP2_A64_SME_I16I32 |
+ ARM_HWCAP2_A64_SME_BI32I32));
+ GET_FEATURE_ID(aa64_sme2p1, ARM_HWCAP2_A64_SME2P1);
+ GET_FEATURE_ID(aa64_sme_b16b16, ARM_HWCAP2_A64_SME_B16B16);
+ GET_FEATURE_ID(aa64_sme_f16f16, ARM_HWCAP2_A64_SME_F16F16);
+ GET_FEATURE_ID(aa64_sve_b16b16, ARM_HWCAP2_A64_SVE_B16B16);
return hwcaps;
}
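
These new hwcap bits are what an AArch64 Linux guest sees in its auxiliary
vector. A minimal detection sketch, assuming glibc and kernel headers
recent enough to define the HWCAP2_SME2 constant:

    #include <stdio.h>
    #include <sys/auxv.h>
    #include <asm/hwcap.h>      /* AArch64 HWCAP2_* bit definitions */

    int main(void)
    {
        unsigned long hwcap2 = getauxval(AT_HWCAP2);
    #ifdef HWCAP2_SME2
        printf("SME2: %s\n", (hwcap2 & HWCAP2_SME2) ? "yes" : "no");
    #else
        puts("these headers predate HWCAP2_SME2");
    #endif
        return 0;
    }
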
diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h
index 5d8adfb..5876162 100644
--- a/target/arm/cpu-features.h
+++ b/target/arm/cpu-features.h
@@ -604,6 +604,11 @@ static inline bool isar_feature_aa64_rpres(const ARMISARegisters *id)
return FIELD_EX64_IDREG(id, ID_AA64ISAR2, RPRES);
}
+static inline bool isar_feature_aa64_lut(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64ISAR2, LUT);
+}
+
static inline bool isar_feature_aa64_fp_simd(const ARMISARegisters *id)
{
/* We always set the AdvSIMD and FP fields identically. */
@@ -932,6 +937,11 @@ static inline bool isar_feature_aa64_sve2(const ARMISARegisters *id)
return FIELD_EX64_IDREG(id, ID_AA64ZFR0, SVEVER) != 0;
}
+static inline bool isar_feature_aa64_sve2p1(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64ZFR0, SVEVER) >= 2;
+}
+
static inline bool isar_feature_aa64_sve2_aes(const ARMISARegisters *id)
{
return FIELD_EX64_IDREG(id, ID_AA64ZFR0, AES) != 0;
@@ -977,6 +987,21 @@ static inline bool isar_feature_aa64_sve_f64mm(const ARMISARegisters *id)
return FIELD_EX64_IDREG(id, ID_AA64ZFR0, F64MM) != 0;
}
+static inline bool isar_feature_aa64_sve_b16b16(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64ZFR0, B16B16);
+}
+
+static inline bool isar_feature_aa64_sme_b16b16(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64SMFR0, B16B16);
+}
+
+static inline bool isar_feature_aa64_sme_f16f16(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64SMFR0, F16F16);
+}
+
static inline bool isar_feature_aa64_sme_f64f64(const ARMISARegisters *id)
{
return FIELD_EX64_IDREG(id, ID_AA64SMFR0, F64F64);
@@ -992,6 +1017,44 @@ static inline bool isar_feature_aa64_sme_fa64(const ARMISARegisters *id)
return FIELD_EX64_IDREG(id, ID_AA64SMFR0, FA64);
}
+static inline bool isar_feature_aa64_sme2(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64SMFR0, SMEVER) != 0;
+}
+
+static inline bool isar_feature_aa64_sme2p1(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64SMFR0, SMEVER) >= 2;
+}
+
+/*
+ * Combinations of feature tests, for ease of use with TRANS_FEAT.
+ */
+static inline bool isar_feature_aa64_sme_or_sve2p1(const ARMISARegisters *id)
+{
+ return isar_feature_aa64_sme(id) || isar_feature_aa64_sve2p1(id);
+}
+
+static inline bool isar_feature_aa64_sme2_or_sve2p1(const ARMISARegisters *id)
+{
+ return isar_feature_aa64_sme2(id) || isar_feature_aa64_sve2p1(id);
+}
+
+static inline bool isar_feature_aa64_sme2p1_or_sve2p1(const ARMISARegisters *id)
+{
+ return isar_feature_aa64_sme2p1(id) || isar_feature_aa64_sve2p1(id);
+}
+
+static inline bool isar_feature_aa64_sme2_i16i64(const ARMISARegisters *id)
+{
+ return isar_feature_aa64_sme2(id) && isar_feature_aa64_sme_i16i64(id);
+}
+
+static inline bool isar_feature_aa64_sme2_f64f64(const ARMISARegisters *id)
+{
+ return isar_feature_aa64_sme2(id) && isar_feature_aa64_sme_f64f64(id);
+}
+
/*
* Feature tests for "does this exist in either 32-bit or 64-bit?"
*/
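
The SVEVER comparisons above rely on the field encoding successive
versions as increasing values (1 for SVE2, 2 for SVE2p1). A standalone
sketch of the same test, with extract_field() standing in for QEMU's
FIELD_EX64_IDREG() machinery:

    #include <stdbool.h>
    #include <stdint.h>

    /* SVEVER occupies bits [3:0] of ID_AA64ZFR0_EL1. */
    static inline uint64_t extract_field(uint64_t reg, int pos, int len)
    {
        return (reg >> pos) & ((1ull << len) - 1);
    }

    static inline bool has_sve2p1(uint64_t id_aa64zfr0)
    {
        return extract_field(id_aa64zfr0, 0, 4) >= 2;
    }
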
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index ebac86f..08c43f6 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -554,11 +554,15 @@ static void arm_cpu_reset_hold(Object *obj, ResetType type)
set_flush_inputs_to_zero(1, &env->vfp.fp_status[FPST_STD]);
set_default_nan_mode(1, &env->vfp.fp_status[FPST_STD]);
set_default_nan_mode(1, &env->vfp.fp_status[FPST_STD_F16]);
+ set_default_nan_mode(1, &env->vfp.fp_status[FPST_ZA]);
+ set_default_nan_mode(1, &env->vfp.fp_status[FPST_ZA_F16]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A32]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
+ arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_ZA]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_STD]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A32_F16]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
+ arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_ZA_F16]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_STD_F16]);
arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_AH]);
set_flush_to_zero(1, &env->vfp.fp_status[FPST_AH]);
@@ -631,6 +635,9 @@ void arm_emulate_firmware_reset(CPUState *cpustate, int target_el)
env->cp15.cptr_el[3] |= R_CPTR_EL3_ESM_MASK;
env->cp15.scr_el3 |= SCR_ENTP2;
env->vfp.smcr_el[3] = 0xf;
+ if (cpu_isar_feature(aa64_sme2, cpu)) {
+ env->vfp.smcr_el[3] |= R_SMCR_EZT0_MASK;
+ }
}
if (cpu_isar_feature(aa64_hcx, cpu)) {
env->cp15.scr_el3 |= SCR_HXEN;
@@ -1331,8 +1338,8 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, int flags)
qemu_fprintf(f, "ZA[%0*d]=", svl_lg10, i);
for (j = zcr_len; j >= 0; --j) {
qemu_fprintf(f, "%016" PRIx64 ":%016" PRIx64 "%c",
- env->zarray[i].d[2 * j + 1],
- env->zarray[i].d[2 * j],
+ env->za_state.za[i].d[2 * j + 1],
+ env->za_state.za[i].d[2 * j],
j ? ':' : '\n');
}
}
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 0338153..c8cf0ab 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -207,6 +207,8 @@ typedef struct NVICState NVICState;
* when FPCR.AH == 1 (bfloat16 conversions and multiplies,
* and the reciprocal and square root estimate/step insns);
* for half-precision
+ * ZA: the "streaming sve" fp status.
+ * ZA_F16: likewise for half-precision.
*
* Half-precision operations are governed by a separate
* flush-to-zero control bit in FPSCR:FZ16. We pass a separate
@@ -227,6 +229,12 @@ typedef struct NVICState NVICState;
* they ignore FPCR.RMode. But they don't ignore FPCR.FZ16,
* which means we need an FPST_AH_F16 as well.
*
+ * The "ZA" float_status are for Streaming SVE operations which use
+ * default-NaN and do not generate fp exceptions, which means that they
+ * do not accumulate exception bits back into FPCR.
+ * See e.g. FPAdd vs FPAdd_ZA pseudocode functions, and the setting
+ * of fpcr.DN and fpexec parameters.
+ *
* To avoid having to transfer exception bits around, we simply
* say that the FPSCR cumulative exception flags are the logical
* OR of the flags in the four fp statuses. This relies on the
@@ -240,10 +248,12 @@ typedef enum ARMFPStatusFlavour {
FPST_A64_F16,
FPST_AH,
FPST_AH_F16,
+ FPST_ZA,
+ FPST_ZA_F16,
FPST_STD,
FPST_STD_F16,
} ARMFPStatusFlavour;
-#define FPST_COUNT 8
+#define FPST_COUNT 10
typedef struct CPUArchState {
/* Regs for current mode. */
@@ -669,9 +679,6 @@ typedef struct CPUArchState {
uint32_t xregs[16];
- /* Scratch space for aa32 neon expansion. */
- uint32_t scratch[8];
-
/* There are a number of distinct float control structures. */
float_status fp_status[FPST_COUNT];
@@ -708,27 +715,36 @@ typedef struct CPUArchState {
uint64_t scxtnum_el[4];
- /*
- * SME ZA storage -- 256 x 256 byte array, with bytes in host word order,
- * as we do with vfp.zregs[]. This corresponds to the architectural ZA
- * array, where ZA[N] is in the least-significant bytes of env->zarray[N].
- * When SVL is less than the architectural maximum, the accessible
- * storage is restricted, such that if the SVL is X bytes the guest can
- * see only the bottom X elements of zarray[], and only the least
- * significant X bytes of each element of the array. (In other words,
- * the observable part is always square.)
- *
- * The ZA storage can also be considered as a set of square tiles of
- * elements of different sizes. The mapping from tiles to the ZA array
- * is architecturally defined, such that for tiles of elements of esz
- * bytes, the Nth row (or "horizontal slice") of tile T is in
- * ZA[T + N * esz]. Note that this means that each tile is not contiguous
- * in the ZA storage, because its rows are striped through the ZA array.
- *
- * Because this is so large, keep this toward the end of the reset area,
- * to keep the offsets into the rest of the structure smaller.
- */
- ARMVectorReg zarray[ARM_MAX_VQ * 16];
+ struct {
+ /* SME2 ZT0 -- 512 bit array, with data ordered like ARMVectorReg. */
+ uint64_t zt0[512 / 64] QEMU_ALIGNED(16);
+
+ /*
+ * SME ZA storage -- 256 x 256 byte array, with bytes in host
+ * word order, as we do with vfp.zregs[]. This corresponds to
+ * the architectural ZA array, where ZA[N] is in the least
+ * significant bytes of env->za_state.za[N].
+ *
+ * When SVL is less than the architectural maximum, the accessible
+ * storage is restricted, such that if the SVL is X bytes the guest
+ * can see only the bottom X elements of za[], and only the least
+ * significant X bytes of each element of the array. (In other words,
+ * the observable part is always square.)
+ *
+ * The ZA storage can also be considered as a set of square tiles of
+ * elements of different sizes. The mapping from tiles to the ZA array
+ * is architecturally defined, such that for tiles of elements of esz
+ * bytes, the Nth row (or "horizontal slice") of tile T is in
+ * ZA[T + N * esz]. Note that this means that each tile is not
+ * contiguous in the ZA storage, because its rows are striped through
+ * the ZA array.
+ *
+ * Because this is so large, keep this toward the end of the
+ * reset area, to keep the offsets into the rest of the structure
+ * smaller.
+ */
+ ARMVectorReg za[ARM_MAX_VQ * 16];
+ } za_state;
struct CPUBreakpoint *cpu_breakpoint[16];
struct CPUWatchpoint *cpu_watchpoint[16];
@@ -1120,6 +1136,7 @@ struct ArchCPU {
/* Used to set the maximum vector length the cpu will support. */
uint32_t sve_max_vq;
+ uint32_t sme_max_vq;
#ifdef CONFIG_USER_ONLY
/* Used to set the default vector length at process start. */
@@ -1497,6 +1514,7 @@ FIELD(SVCR, ZA, 1, 1)
/* Fields for SMCR_ELx. */
FIELD(SMCR, LEN, 0, 4)
+FIELD(SMCR, EZT0, 30, 1)
FIELD(SMCR, FA64, 31, 1)
/* Write a new value to v7m.exception, thus transitioning into or out
@@ -2198,6 +2216,7 @@ FIELD(ID_AA64ISAR2, SYSINSTR_128, 36, 4)
FIELD(ID_AA64ISAR2, PRFMSLC, 40, 4)
FIELD(ID_AA64ISAR2, RPRFM, 48, 4)
FIELD(ID_AA64ISAR2, CSSC, 52, 4)
+FIELD(ID_AA64ISAR2, LUT, 56, 4)
FIELD(ID_AA64ISAR2, ATS1A, 60, 4)
FIELD(ID_AA64PFR0, EL0, 0, 4)
@@ -3067,6 +3086,7 @@ FIELD(TBFLAG_A64, NV2_MEM_E20, 35, 1)
FIELD(TBFLAG_A64, NV2_MEM_BE, 36, 1)
FIELD(TBFLAG_A64, AH, 37, 1) /* FPCR.AH */
FIELD(TBFLAG_A64, NEP, 38, 1) /* FPCR.NEP */
+FIELD(TBFLAG_A64, ZT0EXC_EL, 39, 2)
/*
* Helpers for using the above. Note that only the A64 accessors use
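
The tile striping described in the za_state comment above can be made
concrete with a small sketch (hypothetical helper, not part of the patch):

    /* For a tile of esz-byte elements, row N of tile T is ZA array row
     * T + N * esz, so tiles interleave rather than occupying contiguous
     * rows.  E.g. for the 32-bit tile ZA2 (esz = 4), row 3 is ZA row 14. */
    static inline int za_row_of_tile(int tile, int row, int esz)
    {
        return tile + row * esz;
    }
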
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 1f34067..bd33d6c 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -259,6 +259,13 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp)
/* From now on sve_max_vq is the actual maximum supported length. */
cpu->sve_max_vq = max_vq;
cpu->sve_vq.map = vq_map;
+
+ /* FEAT_F64MM requires the existence of a 256-bit vector size. */
+ if (max_vq < 2) {
+ uint64_t t = GET_IDREG(&cpu->isar, ID_AA64ZFR0);
+ t = FIELD_DP64(t, ID_AA64ZFR0, F64MM, 0);
+ SET_IDREG(&cpu->isar, ID_AA64ZFR0, t);
+ }
}
/*
@@ -356,6 +363,7 @@ void arm_cpu_sme_finalize(ARMCPU *cpu, Error **errp)
}
cpu->sme_vq.map = vq_map;
+ cpu->sme_max_vq = 32 - clz32(vq_map);
}
static bool cpu_arm_get_sme(Object *obj, Error **errp)
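
The new sme_max_vq assignment derives the largest supported vector length
from the bitmap, where bit (vq - 1) is set when a length of vq quadwords
is supported. A sketch of the arithmetic, with the GCC/Clang builtin
standing in for QEMU's clz32():

    #include <stdint.h>

    static inline uint32_t max_vq_from_map(uint32_t vq_map)
    {
        /* 32 - clz is the 1-based position of the highest set bit. */
        return vq_map ? 32 - __builtin_clz(vq_map) : 0;
    }
    /* max_vq_from_map(0xb) == 4: VQs 1, 2 and 4 supported, maximum 4. */
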
diff --git a/target/arm/helper.c b/target/arm/helper.c
index c311d2d..b3f0d6f 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -6663,7 +6663,7 @@ void aarch64_set_svcr(CPUARMState *env, uint64_t new, uint64_t mask)
* when disabled either.
*/
if (change & new & R_SVCR_ZA_MASK) {
- memset(env->zarray, 0, sizeof(env->zarray));
+ memset(&env->za_state, 0, sizeof(env->za_state));
}
if (tcg_enabled()) {
@@ -6682,10 +6682,14 @@ static void smcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
{
int cur_el = arm_current_el(env);
int old_len = sve_vqm1_for_el(env, cur_el);
+ uint64_t valid_mask = R_SMCR_LEN_MASK | R_SMCR_FA64_MASK;
int new_len;
QEMU_BUILD_BUG_ON(ARM_MAX_VQ > R_SMCR_LEN_MASK + 1);
- value &= R_SMCR_LEN_MASK | R_SMCR_FA64_MASK;
+ if (cpu_isar_feature(aa64_sme2, env_archcpu(env))) {
+ valid_mask |= R_SMCR_EZT0_MASK;
+ }
+ value &= valid_mask;
raw_write(env, ri, value);
/*
diff --git a/target/arm/machine.c b/target/arm/machine.c
index e442d48..6986915 100644
--- a/target/arm/machine.c
+++ b/target/arm/machine.c
@@ -315,12 +315,31 @@ static const VMStateDescription vmstate_za = {
.minimum_version_id = 1,
.needed = za_needed,
.fields = (const VMStateField[]) {
- VMSTATE_STRUCT_ARRAY(env.zarray, ARMCPU, ARM_MAX_VQ * 16, 0,
+ VMSTATE_STRUCT_ARRAY(env.za_state.za, ARMCPU, ARM_MAX_VQ * 16, 0,
vmstate_vreg, ARMVectorReg),
VMSTATE_END_OF_LIST()
}
};
+static bool zt0_needed(void *opaque)
+{
+ ARMCPU *cpu = opaque;
+
+ return za_needed(cpu) && cpu_isar_feature(aa64_sme2, cpu);
+}
+
+static const VMStateDescription vmstate_zt0 = {
+ .name = "cpu/zt0",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = zt0_needed,
+ .fields = (const VMStateField[]) {
+ VMSTATE_UINT64_ARRAY(env.za_state.zt0, ARMCPU,
+ ARRAY_SIZE(((CPUARMState *)0)->za_state.zt0)),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
static bool serror_needed(void *opaque)
{
ARMCPU *cpu = opaque;
@@ -1096,6 +1115,7 @@ const VMStateDescription vmstate_arm_cpu = {
&vmstate_m_security,
&vmstate_sve,
&vmstate_za,
+ &vmstate_zt0,
&vmstate_serror,
&vmstate_irq_line_state,
&vmstate_wfxt_timer,
diff --git a/target/arm/syndrome.h b/target/arm/syndrome.h
index 3244e07..c48d3b8 100644
--- a/target/arm/syndrome.h
+++ b/target/arm/syndrome.h
@@ -80,6 +80,7 @@ typedef enum {
SME_ET_Streaming,
SME_ET_NotStreaming,
SME_ET_InactiveZA,
+ SME_ET_InaccessibleZT0,
} SMEExceptionType;
#define ARM_EL_EC_LENGTH 6
diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c
index 937f29e2..d0df50a 100644
--- a/target/arm/tcg/cpu64.c
+++ b/target/arm/tcg/cpu64.c
@@ -1201,7 +1201,7 @@ void aarch64_max_tcg_initfn(Object *obj)
*/
t = FIELD_DP64(t, ID_AA64PFR1, MTE, 3); /* FEAT_MTE3 */
t = FIELD_DP64(t, ID_AA64PFR1, RAS_FRAC, 0); /* FEAT_RASv1p1 + FEAT_DoubleFault */
- t = FIELD_DP64(t, ID_AA64PFR1, SME, 1); /* FEAT_SME */
+ t = FIELD_DP64(t, ID_AA64PFR1, SME, 2); /* FEAT_SME2 */
t = FIELD_DP64(t, ID_AA64PFR1, CSV2_FRAC, 0); /* FEAT_CSV2_3 */
t = FIELD_DP64(t, ID_AA64PFR1, NMI, 1); /* FEAT_NMI */
SET_IDREG(isar, ID_AA64PFR1, t);
@@ -1250,10 +1250,11 @@ void aarch64_max_tcg_initfn(Object *obj)
FIELD_DP64_IDREG(isar, ID_AA64MMFR3, SPEC_FPACC, 1); /* FEAT_FPACC_SPEC */
t = GET_IDREG(isar, ID_AA64ZFR0);
- t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 1);
+ t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 2); /* FEAT_SVE2p1 */
t = FIELD_DP64(t, ID_AA64ZFR0, AES, 2); /* FEAT_SVE_PMULL128 */
t = FIELD_DP64(t, ID_AA64ZFR0, BITPERM, 1); /* FEAT_SVE_BitPerm */
t = FIELD_DP64(t, ID_AA64ZFR0, BFLOAT16, 2); /* FEAT_BF16, FEAT_EBF16 */
+ t = FIELD_DP64(t, ID_AA64ZFR0, B16B16, 1); /* FEAT_SVE_B16B16 */
t = FIELD_DP64(t, ID_AA64ZFR0, SHA3, 1); /* FEAT_SVE_SHA3 */
t = FIELD_DP64(t, ID_AA64ZFR0, SM4, 1); /* FEAT_SVE_SM4 */
t = FIELD_DP64(t, ID_AA64ZFR0, I8MM, 1); /* FEAT_I8MM */
@@ -1269,11 +1270,16 @@ void aarch64_max_tcg_initfn(Object *obj)
t = GET_IDREG(isar, ID_AA64SMFR0);
t = FIELD_DP64(t, ID_AA64SMFR0, F32F32, 1); /* FEAT_SME */
+ t = FIELD_DP64(t, ID_AA64SMFR0, BI32I32, 1); /* FEAT_SME2 */
t = FIELD_DP64(t, ID_AA64SMFR0, B16F32, 1); /* FEAT_SME */
t = FIELD_DP64(t, ID_AA64SMFR0, F16F32, 1); /* FEAT_SME */
t = FIELD_DP64(t, ID_AA64SMFR0, I8I32, 0xf); /* FEAT_SME */
+ t = FIELD_DP64(t, ID_AA64SMFR0, F16F16, 1); /* FEAT_SME_F16F16 */
+ t = FIELD_DP64(t, ID_AA64SMFR0, B16B16, 1); /* FEAT_SME_B16B16 */
+ t = FIELD_DP64(t, ID_AA64SMFR0, I16I32, 5); /* FEAT_SME2 */
t = FIELD_DP64(t, ID_AA64SMFR0, F64F64, 1); /* FEAT_SME_F64F64 */
t = FIELD_DP64(t, ID_AA64SMFR0, I16I64, 0xf); /* FEAT_SME_I16I64 */
+ t = FIELD_DP64(t, ID_AA64SMFR0, SMEVER, 2); /* FEAT_SME2p1 */
t = FIELD_DP64(t, ID_AA64SMFR0, FA64, 1); /* FEAT_SME_FA64 */
SET_IDREG(isar, ID_AA64SMFR0, t);
diff --git a/target/arm/tcg/gengvec64.c b/target/arm/tcg/gengvec64.c
index 2617cde..2429cab 100644
--- a/target/arm/tcg/gengvec64.c
+++ b/target/arm/tcg/gengvec64.c
@@ -369,3 +369,14 @@ void gen_gvec_usqadd_qc(unsigned vece, uint32_t rd_ofs,
tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
+
+void gen_gvec_sve2_sqdmulh(unsigned vece, uint32_t rd_ofs,
+ uint32_t rn_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_3 * const fns[4] = {
+ gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h,
+ gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d,
+ };
+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
+}
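
gen_gvec_sve2_sqdmulh() follows the usual gvec dispatch pattern: vece
selects the element size (MO_8 through MO_64) and indexes the per-size
helper table. A hypothetical call site for 16-bit elements:

    gen_gvec_sve2_sqdmulh(MO_16, rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz);
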
diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c
index 4f618ae..c66d521 100644
--- a/target/arm/tcg/helper-a64.c
+++ b/target/arm/tcg/helper-a64.c
@@ -402,6 +402,8 @@ AH_MINMAX_HELPER(vfp_ah_mind, float64, float64, min)
AH_MINMAX_HELPER(vfp_ah_maxh, dh_ctype_f16, float16, max)
AH_MINMAX_HELPER(vfp_ah_maxs, float32, float32, max)
AH_MINMAX_HELPER(vfp_ah_maxd, float64, float64, max)
+AH_MINMAX_HELPER(sme2_ah_fmax_b16, bfloat16, bfloat16, max)
+AH_MINMAX_HELPER(sme2_ah_fmin_b16, bfloat16, bfloat16, min)
/* 64-bit versions of the CRC helpers. Note that although the operation
* (and the prototypes of crc32c() and crc32() mean that only the bottom
diff --git a/target/arm/tcg/helper-sme.h b/target/arm/tcg/helper-sme.h
index 858d691..1fc756b 100644
--- a/target/arm/tcg/helper-sme.h
+++ b/target/arm/tcg/helper-sme.h
@@ -33,6 +33,21 @@ DEF_HELPER_FLAGS_4(sme_mova_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sme_mova_cz_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sme_mova_zc_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_cz_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_zc_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_cz_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_zc_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_cz_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_zc_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_cz_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_mova_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_q, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_5(sme_ld1b_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_5(sme_ld1b_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_5(sme_ld1b_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32)
@@ -120,14 +135,45 @@ DEF_HELPER_FLAGS_5(sme_addva_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sme_addha_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sme_addva_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_7(sme_fmopa_h, TCG_CALL_NO_RWG,
+DEF_HELPER_FLAGS_7(sme_fmopa_w_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_7(sme_fmopa_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_7(sme_fmopa_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_7(sme_fmopa_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_bfmopa_w, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_7(sme_bfmopa, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_7(sme_fmops_w_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_7(sme_fmops_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_fmops_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_fmops_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_bfmops_w, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_7(sme_bfmops, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_7(sme_ah_fmops_w_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_7(sme_ah_fmops_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_ah_fmops_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_ah_fmops_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sme_ah_bfmops_w, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_7(sme_ah_bfmops, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+
DEF_HELPER_FLAGS_6(sme_smopa_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_6(sme_umopa_s, TCG_CALL_NO_RWG,
@@ -144,3 +190,168 @@ DEF_HELPER_FLAGS_6(sme_sumopa_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_6(sme_usmopa_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sme2_bmopa_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sme2_smopa2_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sme2_umopa2_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmax_b16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fmin_b16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmax_b16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmin_b16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fmaxnum_b16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fminnum_b16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_6(sme2_fdot_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_6(sme2_fdot_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_6(sme2_fvdot_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_svdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_suvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_usvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_svdot_idx_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uvdot_idx_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_svdot_idx_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uvdot_idx_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sme2_smlall_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_smlall_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_smlsll_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_smlsll_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlall_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlall_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlsll_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlsll_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_usmlall_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sme2_smlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_smlall_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_smlsll_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_smlsll_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlall_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlsll_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_umlsll_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_usmlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sme2_sumlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_bfcvt, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_bfcvtn, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_fcvt_n, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_fcvtn, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_fcvt_w, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_fcvtl, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_scvtf, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sme2_ucvtf, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_3(sme2_sqcvt_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqcvt_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtu_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtu_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvt_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqcvt_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtu_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_sqcvtn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqcvtn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtun_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqcvtn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtun_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqcvtn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqcvtun_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_sunpk2_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk2_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk2_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk4_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk2_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk2_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk2_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk4_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_zip2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_uzp2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_zip4_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_zip4_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_zip4_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_zip4_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_zip4_q, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_uzp4_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uzp4_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uzp4_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uzp4_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uzp4_q, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_sqrshr_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqrshr_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshru_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshr_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqrshr_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshru_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshr_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqrshr_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshru_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sme2_sqrshrn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqrshrn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshrun_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshrn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqrshrn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshrun_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshrn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_uqrshrn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(sme2_sqrshrun_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_sclamp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_sclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_sclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_sclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_uclamp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sme2_fclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sme2_fclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sme2_fclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sme2_bfclamp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sme2_sel_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32)
+DEF_HELPER_FLAGS_5(sme2_sel_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32)
+DEF_HELPER_FLAGS_5(sme2_sel_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32)
+DEF_HELPER_FLAGS_5(sme2_sel_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32)
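
For orientation when reading these lists: each
DEF_HELPER_FLAGS_N(name, flags, ret, t1, ..., tN) line declares a
C-callable TCG helper. Roughly speaking (a simplified sketch of QEMU's
helper machinery):

    /* DEF_HELPER_FLAGS_4(sme2_sclamp_b, TCG_CALL_NO_RWG,
     *                    void, ptr, ptr, ptr, i32)
     * expands to a prototype along the lines of: */
    void helper_sme2_sclamp_b(void *d, void *n, void *m, uint32_t desc);

where the trailing i32 descriptor packs per-operation parameters such as
the vector length, and TCG_CALL_NO_RWG tells the optimizer the helper
neither reads nor writes TCG globals.
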
diff --git a/target/arm/tcg/helper-sve.h b/target/arm/tcg/helper-sve.h
index 0b1b588..c36090d 100644
--- a/target/arm/tcg/helper-sve.h
+++ b/target/arm/tcg/helper-sve.h
@@ -676,11 +676,21 @@ DEF_HELPER_FLAGS_5(sve2_tbl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_tbl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_tbl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tblq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tblq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tblq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tblq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(sve2_tbx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_tbx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_tbx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_tbx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tbxq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tbxq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tbxq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_tbxq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_3(sve_sunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sve_sunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sve_sunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
@@ -701,12 +711,22 @@ DEF_HELPER_FLAGS_4(sve_zip_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_zip_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_zip_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_zipq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_zipq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_zipq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_zipq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(sve_uzp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_uzp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_uzp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_uzp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_uzp_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uzpq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uzpq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uzpq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uzpq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(sve_trn_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_trn_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_trn_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -937,10 +957,17 @@ DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32)
+DEF_HELPER_FLAGS_2(sve2p1_cntp_c, TCG_CALL_NO_RWG_SE, i64, i32, i32)
DEF_HELPER_FLAGS_3(sve_whilel, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
DEF_HELPER_FLAGS_3(sve_whileg, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(sve_while2l, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(sve_while2g, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
+
+DEF_HELPER_FLAGS_3(sve_whilecl, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(sve_whilecg, TCG_CALL_NO_RWG, i32, ptr, i32, i32)
+
DEF_HELPER_FLAGS_4(sve_subri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(sve_subri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(sve_subri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
@@ -1071,6 +1098,55 @@ DEF_HELPER_FLAGS_4(sve_ah_fminv_s, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_4(sve_ah_fminv_d, TCG_CALL_NO_RWG,
i64, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_faddqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_faddqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_faddqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_fmaxnmqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fmaxnmqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fmaxnmqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_fminnmqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fminnmqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fminnmqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_fmaxqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fmaxqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fmaxqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_fminqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fminqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_fminqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_ah_fmaxqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ah_fmaxqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ah_fmaxqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_ah_fminqv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ah_fminqv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ah_fminqv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, fpst, i32)
+
DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG,
i64, i64, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG,
@@ -1582,6 +1658,14 @@ DEF_HELPER_FLAGS_4(sve_ld2dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld3dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld4dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld2qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_ld1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1602,9 +1686,15 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1squ_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1dqu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1squ_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1dqu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_ld1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1640,6 +1730,14 @@ DEF_HELPER_FLAGS_4(sve_ld2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld2qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_ld1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1660,9 +1758,15 @@ DEF_HELPER_FLAGS_4(sve_ld1hds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1squ_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1dqu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld1sds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1squ_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1dqu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1858,6 +1962,14 @@ DEF_HELPER_FLAGS_4(sve_st2dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st3dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st4dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st2qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_st1bh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1bs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1bd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1870,6 +1982,11 @@ DEF_HELPER_FLAGS_4(sve_st1hd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1sd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1sd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1sq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1sq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1dq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1dq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_st1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1905,6 +2022,14 @@ DEF_HELPER_FLAGS_4(sve_st2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st2qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_4(sve_st1bh_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1bs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1bd_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
@@ -1917,6 +2042,11 @@ DEF_HELPER_FLAGS_4(sve_st1hd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1sd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_st1sd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1sq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1sq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1dq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1dq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
DEF_HELPER_FLAGS_6(sve_ldbsu_zsu, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_ldhsu_le_zsu, TCG_CALL_NO_WG,
@@ -2025,6 +2155,10 @@ DEF_HELPER_FLAGS_6(sve_ldsds_le_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_ldsds_be_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldqq_le_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldqq_be_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_ldbsu_zsu_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
@@ -2134,6 +2268,10 @@ DEF_HELPER_FLAGS_6(sve_ldsds_le_zd_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_ldsds_be_zd_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldqq_le_zd_mte, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldqq_be_zd_mte, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
@@ -2419,6 +2557,10 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stqq_le_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stqq_be_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_stbs_zsu_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
@@ -2486,6 +2628,10 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_6(sve_stdd_be_zd_mte, TCG_CALL_NO_WG,
void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stqq_le_zd_mte, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stqq_be_zd_mte, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve2_sqdmull_zzz_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, i32)
@@ -2922,3 +3068,69 @@ DEF_HELPER_FLAGS_4(sve2_sqshlu_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_sqshlu_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_sqshlu_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2_sqshlu_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_addqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_addqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_addqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_addqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_smaxqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_smaxqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_smaxqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_smaxqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_sminqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_sminqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_sminqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_sminqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_umaxqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_umaxqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_umaxqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_umaxqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_uminqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uminqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uminqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_uminqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(pext, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_orqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_orqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_orqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_orqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_eorqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_eorqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_eorqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_eorqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve2p1_andqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_andqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_andqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve2p1_andqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(pmov_pv_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(pmov_pv_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(pmov_pv_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(pmov_vp_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(pmov_vp_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(pmov_vp_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_ld1bb_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ld1hh_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ld1hh_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ld1ss_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ld1ss_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ld1dd_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_ld1dd_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+
+DEF_HELPER_FLAGS_5(sve2p1_st1bb_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_st1hh_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_st1hh_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_st1ss_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_st1ss_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_st1dd_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
+DEF_HELPER_FLAGS_5(sve2p1_st1dd_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i32)
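
The sve2p1_*qv declarations above are the new SVE2p1 "quadword" reductions, and the *_c loads/stores are the predicate-as-counter contiguous accesses. For the reductions, element j of the 128-bit result combines element j of every 128-bit segment of the source vector, under predicate control. A minimal stand-alone sketch of that reading for UMINQV.B follows; uminqv_b, pred and vl are illustrative names and layouts, not the QEMU helper ABI.

#include <stdint.h>
#include <stdio.h>

#define SEG_BYTES 16

/* Unsigned min of element j across all 128-bit segments of zn. */
static void uminqv_b(uint8_t *res, const uint8_t *zn,
                     const uint8_t *pred, unsigned vl)
{
    for (unsigned j = 0; j < SEG_BYTES; j++) {
        uint8_t m = UINT8_MAX;              /* identity for unsigned min */
        for (unsigned i = j; i < vl; i += SEG_BYTES) {
            if (pred[i]) {
                m = zn[i] < m ? zn[i] : m;
            }
        }
        res[j] = m;
    }
}

int main(void)
{
    uint8_t zn[32], pred[32], res[16];
    for (int i = 0; i < 32; i++) {
        zn[i] = 200 - i;                    /* two 16-byte segments */
        pred[i] = 1;
    }
    uminqv_b(res, zn, pred, 32);
    printf("res[0] = %u\n", res[0]);        /* min(200, 184) = 184 */
    return 0;
}
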
diff --git a/target/arm/tcg/helper.h b/target/arm/tcg/helper.h
index 80db7c2..d9565c8 100644
--- a/target/arm/tcg/helper.h
+++ b/target/arm/tcg/helper.h
@@ -353,6 +353,14 @@ DEF_HELPER_FLAGS_4(gvec_urshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_urshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_urshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_srshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_srshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_urshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_urshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_urshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_2(neon_add_u8, i32, i32, i32)
DEF_HELPER_2(neon_add_u16, i32, i32, i32)
DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
@@ -608,23 +616,31 @@ DEF_HELPER_FLAGS_5(sve2_sqrdmlah_d, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_udot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_usdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_usdot_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_sdot_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_sdot_idx_b, TCG_CALL_NO_RWG,
+DEF_HELPER_FLAGS_5(gvec_sdot_idx_4b, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_idx_4b, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_idx_4h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_udot_idx_b, TCG_CALL_NO_RWG,
+DEF_HELPER_FLAGS_5(gvec_udot_idx_4h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_sdot_idx_h, TCG_CALL_NO_RWG,
+DEF_HELPER_FLAGS_5(gvec_sudot_idx_4b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_udot_idx_h, TCG_CALL_NO_RWG,
+DEF_HELPER_FLAGS_5(gvec_usdot_idx_4b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_sudot_idx_b, TCG_CALL_NO_RWG,
+
+DEF_HELPER_FLAGS_5(gvec_sdot_idx_2h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_5(gvec_usdot_idx_b, TCG_CALL_NO_RWG,
+DEF_HELPER_FLAGS_5(gvec_udot_idx_2h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG,
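
The renaming in the hunk above makes the reduction shape explicit in the helper name: the _4b forms sum four byte products into each 32-bit lane, the _4h forms four halfword products into each 64-bit lane, and the new _2h forms (used by SME2) sum two halfword products into a 32-bit lane. A stand-alone sketch of the two 32-bit shapes; these toy functions illustrate the naming only and are not the QEMU helpers.

#include <stdint.h>
#include <stdio.h>

/* One 32-bit lane of SDOT (4-way, byte sources). */
static int32_t sdot_4b(int32_t acc, const int8_t *n, const int8_t *m)
{
    for (int i = 0; i < 4; i++) {
        acc += n[i] * m[i];
    }
    return acc;
}

/* One 32-bit lane of the new 2-way halfword form. */
static int32_t sdot_2h(int32_t acc, const int16_t *n, const int16_t *m)
{
    return acc + n[0] * m[0] + n[1] * m[1];
}

int main(void)
{
    int8_t  nb[4] = { 1, -2, 3, -4 }, mb[4] = { 5, 6, 7, 8 };
    int16_t nh[2] = { 100, -200 },    mh[2] = { 300, 400 };
    printf("%d %d\n", sdot_4b(0, nb, mb), sdot_2h(0, nh, mh));
    return 0;
}
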
@@ -715,10 +731,12 @@ DEF_HELPER_FLAGS_4(gvec_fclt0_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_bfadd, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fsub_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fsub_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_bfsub, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
@@ -774,23 +792,26 @@ DEF_HELPER_FLAGS_5(gvec_recps_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst,
DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
-DEF_HELPER_FLAGS_5(gvec_fmla_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
-DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
-DEF_HELPER_FLAGS_5(gvec_fmls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
-DEF_HELPER_FLAGS_5(gvec_fmls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fmls_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_fmls_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfma_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfma_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfma_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_bfmla, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_bfmls, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ah_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ah_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ah_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_bfmls, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, fpst, i32)
@@ -822,6 +843,8 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmla_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_fmls_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
@@ -829,6 +852,8 @@ DEF_HELPER_FLAGS_6(gvec_fmls_idx_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_fmls_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmls_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
@@ -836,6 +861,8 @@ DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmls_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_uqadd_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
@@ -1081,14 +1108,24 @@ DEF_HELPER_FLAGS_6(gvec_bfdot, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_6(gvec_bfdot_idx, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_6(sme2_bfvdot_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_6(gvec_bfmmla, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlsl, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlsl_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_sclamp_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
@@ -1151,3 +1188,27 @@ DEF_HELPER_FLAGS_4(gvec_uminp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_urecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_ursqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_luti2_1b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_1h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_1s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_luti2_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_luti2_4b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti2_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_luti4_1b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_1h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_1s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_luti4_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_4(sme2_luti4_4b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
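
The sme2_luti2_* and sme2_luti4_* declarations above are the SME2 table lookups that read ZT0 (hence the env argument): 2-bit indices select among 4 table entries, 4-bit indices among 16, with the 1/2/4 infix giving the number of destination vectors. A stand-alone sketch of just the core lookup step, assuming that reading and ignoring ZT0's segmenting and the multi-register forms:

#include <stdint.h>

/* One destination element of a 2-bit-index lookup: idx selects one of
 * four table entries.  The table layout is an illustrative assumption. */
uint16_t luti2_elem_h(const uint16_t *zt0_table, unsigned idx)
{
    return zt0_table[idx & 3];
}
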
diff --git a/target/arm/tcg/hflags.c b/target/arm/tcg/hflags.c
index 1ccec63..59ab526 100644
--- a/target/arm/tcg/hflags.c
+++ b/target/arm/tcg/hflags.c
@@ -214,6 +214,31 @@ static CPUARMTBFlags rebuild_hflags_a32(CPUARMState *env, int fp_el,
return rebuild_hflags_common_32(env, fp_el, mmu_idx, flags);
}
+/*
+ * Return the exception level to which exceptions should be taken for ZT0.
+ * Cf. the ARM pseudocode function CheckSMEZT0Enabled, after the ZA check.
+ */
+static int zt0_exception_el(CPUARMState *env, int el)
+{
+#ifndef CONFIG_USER_ONLY
+ if (el <= 1
+ && !el_is_in_host(env, el)
+ && !FIELD_EX64(env->vfp.smcr_el[1], SMCR, EZT0)) {
+ return 1;
+ }
+ if (el <= 2
+ && arm_is_el2_enabled(env)
+ && !FIELD_EX64(env->vfp.smcr_el[2], SMCR, EZT0)) {
+ return 2;
+ }
+ if (arm_feature(env, ARM_FEATURE_EL3)
+ && !FIELD_EX64(env->vfp.smcr_el[3], SMCR, EZT0)) {
+ return 3;
+ }
+#endif
+ return 0;
+}
+
static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el,
ARMMMUIdx mmu_idx)
{
@@ -269,7 +294,14 @@ static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el,
DP_TBFLAG_A64(flags, PSTATE_SM, 1);
DP_TBFLAG_A64(flags, SME_TRAP_NONSTREAMING, !sme_fa64(env, el));
}
- DP_TBFLAG_A64(flags, PSTATE_ZA, FIELD_EX64(env->svcr, SVCR, ZA));
+
+ if (FIELD_EX64(env->svcr, SVCR, ZA)) {
+ DP_TBFLAG_A64(flags, PSTATE_ZA, 1);
+ if (cpu_isar_feature(aa64_sme2, env_archcpu(env))) {
+ int zt0_el = zt0_exception_el(env, el);
+ DP_TBFLAG_A64(flags, ZT0EXC_EL, zt0_el);
+ }
+ }
}
sctlr = regime_sctlr(env, stage1);
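
zt0_exception_el() above walks the SMCR_ELx.EZT0 gates from the lowest applicable exception level upward, and the rebuild_hflags_a64() change caches the result in the TB flags only while ZA is active on an SME2 CPU. A stand-alone sketch of that priority walk (the el_is_in_host() case is omitted); the bit position of EZT0 and the toy names here are assumptions for illustration only.

#include <stdbool.h>
#include <stdint.h>

#define SMCR_EZT0(smcr) (((smcr) >> 30) & 1)   /* assumed: EZT0 is bit 30 */

/* Lowest EL whose EZT0 gate is clear traps the ZT0 access; 0 = allowed. */
int toy_zt0_trap_el(uint64_t smcr_el1, uint64_t smcr_el2, uint64_t smcr_el3,
                    int cur_el, bool el2_enabled, bool have_el3)
{
    if (cur_el <= 1 && !SMCR_EZT0(smcr_el1)) {
        return 1;
    }
    if (cur_el <= 2 && el2_enabled && !SMCR_EZT0(smcr_el2)) {
        return 2;
    }
    if (have_el3 && !SMCR_EZT0(smcr_el3)) {
        return 3;
    }
    return 0;
}
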
diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c
index 6614719..28307b5 100644
--- a/target/arm/tcg/m_helper.c
+++ b/target/arm/tcg/m_helper.c
@@ -632,8 +632,11 @@ void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest)
}
/* Note that these stores can throw exceptions on MPU faults */
- cpu_stl_data_ra(env, sp, nextinst, GETPC());
- cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC());
+ ARMMMUIdx mmu_idx = arm_mmu_idx(env);
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN,
+ arm_to_core_mmu_idx(mmu_idx));
+ cpu_stl_mmu(env, sp, nextinst, oi, GETPC());
+ cpu_stl_mmu(env, sp + 4, saved_psr, oi, GETPC());
env->regs[13] = sp;
env->regs[14] = 0xfeffffff;
@@ -1048,6 +1051,9 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK;
uintptr_t ra = GETPC();
+ ARMMMUIdx mmu_idx = arm_mmu_idx(env);
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN,
+ arm_to_core_mmu_idx(mmu_idx));
assert(env->v7m.secure);
@@ -1073,7 +1079,7 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
* Note that we do not use v7m_stack_write() here, because the
* accesses should not set the FSR bits for stacking errors if they
* fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK
- * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions
+ * or AccType_LAZYFP). Faults in cpu_stl_mmu() will throw exceptions
* and longjmp out.
*/
if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) {
@@ -1089,12 +1095,12 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
if (i >= 16) {
faddr += 8; /* skip the slot for the FPSCR */
}
- cpu_stl_data_ra(env, faddr, slo, ra);
- cpu_stl_data_ra(env, faddr + 4, shi, ra);
+ cpu_stl_mmu(env, faddr, slo, oi, ra);
+ cpu_stl_mmu(env, faddr + 4, shi, oi, ra);
}
- cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra);
+ cpu_stl_mmu(env, fptr + 0x40, vfp_get_fpscr(env), oi, ra);
if (cpu_isar_feature(aa32_mve, cpu)) {
- cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra);
+ cpu_stl_mmu(env, fptr + 0x44, env->v7m.vpr, oi, ra);
}
/*
@@ -1121,6 +1127,9 @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
{
ARMCPU *cpu = env_archcpu(env);
uintptr_t ra = GETPC();
+ ARMMMUIdx mmu_idx = arm_mmu_idx(env);
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN,
+ arm_to_core_mmu_idx(mmu_idx));
/* fptr is the value of Rn, the frame pointer we load the FP regs from */
assert(env->v7m.secure);
@@ -1155,16 +1164,16 @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
faddr += 8; /* skip the slot for the FPSCR and VPR */
}
- slo = cpu_ldl_data_ra(env, faddr, ra);
- shi = cpu_ldl_data_ra(env, faddr + 4, ra);
+ slo = cpu_ldl_mmu(env, faddr, oi, ra);
+ shi = cpu_ldl_mmu(env, faddr + 4, oi, ra);
dn = (uint64_t) shi << 32 | slo;
*aa32_vfp_dreg(env, i / 2) = dn;
}
- fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra);
+ fpscr = cpu_ldl_mmu(env, fptr + 0x40, oi, ra);
vfp_set_fpscr(env, fpscr);
if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra);
+ env->v7m.vpr = cpu_ldl_mmu(env, fptr + 0x44, oi, ra);
}
}
@@ -1937,7 +1946,7 @@ static bool do_v7m_function_return(ARMCPU *cpu)
* do them as secure, so work out what MMU index that is.
*/
mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
- oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx));
+ oi = make_memop_idx(MO_LEUL | MO_ALIGN, arm_to_core_mmu_idx(mmu_idx));
newpc = cpu_ldl_mmu(env, frameptr, oi, 0);
newpsr = cpu_ldl_mmu(env, frameptr + 4, oi, 0);
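
The conversions in this file replace cpu_*_data_ra(), which uses a default memop for the current mmu index, with cpu_*_mmu() plus an explicit MemOpIdx carrying MO_ALIGN, evidently so that unaligned accesses raise alignment faults where the M-profile architecture requires them. A minimal sketch of the packing a MemOpIdx performs, mirroring what I understand make_memop_idx() to do (the 4-bit shift is an assumption for illustration):

#include <assert.h>
#include <stdint.h>

typedef uint32_t MemOpIdxToy;

/* Pack memory-op flags (size, endianness, MO_ALIGN, ...) with the MMU
 * index into a single integer for the load/store slow path. */
MemOpIdxToy toy_make_memop_idx(unsigned memop, unsigned mmu_idx)
{
    assert(mmu_idx <= 15);           /* the index must fit in 4 bits */
    return (memop << 4) | mmu_idx;
}
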
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 506d1c3..63ddcf3 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -148,13 +148,15 @@ static void mve_advance_vpt(CPUARMState *env)
}
/* For loads, predicated lanes are zeroed instead of keeping their old values */
-#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \
+#define DO_VLDR(OP, MFLAG, MSIZE, MTYPE, LDTYPE, ESIZE, TYPE) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
{ \
TYPE *d = vd; \
uint16_t mask = mve_element_mask(env); \
uint16_t eci_mask = mve_eci_mask(env); \
unsigned b, e; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \
/* \
* R_SXTM allows the dest reg to become UNKNOWN for abandoned \
* beats so we don't care if we update part of the dest and \
@@ -163,46 +165,48 @@ static void mve_advance_vpt(CPUARMState *env)
for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
if (eci_mask & (1 << b)) { \
d[H##ESIZE(e)] = (mask & (1 << b)) ? \
- cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+ (MTYPE)cpu_##LDTYPE##_mmu(env, addr, oi, GETPC()) : 0;\
} \
addr += MSIZE; \
} \
mve_advance_vpt(env); \
}
-#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \
+#define DO_VSTR(OP, MFLAG, MSIZE, STTYPE, ESIZE, TYPE) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
{ \
TYPE *d = vd; \
uint16_t mask = mve_element_mask(env); \
unsigned b, e; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \
for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
if (mask & (1 << b)) { \
- cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+ cpu_##STTYPE##_mmu(env, addr, d[H##ESIZE(e)], oi, GETPC()); \
} \
addr += MSIZE; \
} \
mve_advance_vpt(env); \
}
-DO_VLDR(vldrb, 1, ldub, 1, uint8_t)
-DO_VLDR(vldrh, 2, lduw, 2, uint16_t)
-DO_VLDR(vldrw, 4, ldl, 4, uint32_t)
+DO_VLDR(vldrb, MO_UB, 1, uint8_t, ldb, 1, uint8_t)
+DO_VLDR(vldrh, MO_TEUW, 2, uint16_t, ldw, 2, uint16_t)
+DO_VLDR(vldrw, MO_TEUL, 4, uint32_t, ldl, 4, uint32_t)
-DO_VSTR(vstrb, 1, stb, 1, uint8_t)
-DO_VSTR(vstrh, 2, stw, 2, uint16_t)
-DO_VSTR(vstrw, 4, stl, 4, uint32_t)
+DO_VSTR(vstrb, MO_UB, 1, stb, 1, uint8_t)
+DO_VSTR(vstrh, MO_TEUW, 2, stw, 2, uint16_t)
+DO_VSTR(vstrw, MO_TEUL, 4, stl, 4, uint32_t)
-DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t)
-DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t)
-DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t)
-DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t)
-DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t)
-DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t)
+DO_VLDR(vldrb_sh, MO_SB, 1, int8_t, ldb, 2, int16_t)
+DO_VLDR(vldrb_sw, MO_SB, 1, int8_t, ldb, 4, int32_t)
+DO_VLDR(vldrb_uh, MO_UB, 1, uint8_t, ldb, 2, uint16_t)
+DO_VLDR(vldrb_uw, MO_UB, 1, uint8_t, ldb, 4, uint32_t)
+DO_VLDR(vldrh_sw, MO_TESW, 2, int16_t, ldw, 4, int32_t)
+DO_VLDR(vldrh_uw, MO_TEUW, 2, uint16_t, ldw, 4, uint32_t)
-DO_VSTR(vstrb_h, 1, stb, 2, int16_t)
-DO_VSTR(vstrb_w, 1, stb, 4, int32_t)
-DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
+DO_VSTR(vstrb_h, MO_UB, 1, stb, 2, int16_t)
+DO_VSTR(vstrb_w, MO_UB, 1, stb, 4, int32_t)
+DO_VSTR(vstrh_w, MO_TEUW, 2, stw, 4, int32_t)
#undef DO_VLDR
#undef DO_VSTR
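
Note the new MTYPE parameter in the rewritten DO_VLDR: cpu_*_mmu() returns the raw loaded value at the memory width, so the (MTYPE) cast is what re-applies sign or zero extension into the element TYPE (e.g. vldrb_sh loads one byte and must sign-extend it into an int16_t lane). The extension step in isolation, as a stand-alone illustration:

#include <stdint.h>

/* Sign-extend a loaded byte into a 16-bit lane, as the (MTYPE) cast in
 * DO_VLDR does for vldrb_sh. */
int16_t load_sb_into_h(uint8_t byte_from_mem)
{
    return (int16_t)(int8_t)byte_from_mem;   /* 0xff -> -1, not 255 */
}
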
@@ -214,7 +218,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
* For loads, predicated lanes are zeroed instead of retaining
* their previous values.
*/
-#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \
+#define DO_VLDR_SG(OP, MFLAG, MTYPE, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB)\
void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
uint32_t base) \
{ \
@@ -224,13 +228,15 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
uint16_t eci_mask = mve_eci_mask(env); \
unsigned e; \
uint32_t addr; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \
for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
if (!(eci_mask & 1)) { \
continue; \
} \
addr = ADDRFN(base, m[H##ESIZE(e)]); \
d[H##ESIZE(e)] = (mask & 1) ? \
- cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+ (MTYPE)cpu_##LDTYPE##_mmu(env, addr, oi, GETPC()) : 0; \
if (WB) { \
m[H##ESIZE(e)] = addr; \
} \
@@ -239,7 +245,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
}
/* We know that TYPE here is unsigned, so it is always the same as the offset type */
-#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
+#define DO_VSTR_SG(OP, MFLAG, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
uint32_t base) \
{ \
@@ -249,13 +255,15 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
uint16_t eci_mask = mve_eci_mask(env); \
unsigned e; \
uint32_t addr; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \
for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
if (!(eci_mask & 1)) { \
continue; \
} \
addr = ADDRFN(base, m[H##ESIZE(e)]); \
if (mask & 1) { \
- cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+ cpu_##STTYPE##_mmu(env, addr, d[H##ESIZE(e)], oi, GETPC()); \
} \
if (WB) { \
m[H##ESIZE(e)] = addr; \
@@ -282,13 +290,15 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
uint16_t eci_mask = mve_eci_mask(env); \
unsigned e; \
uint32_t addr; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
if (!(eci_mask & 1)) { \
continue; \
} \
addr = ADDRFN(base, m[H4(e & ~1)]); \
addr += 4 * (e & 1); \
- d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
+ d[H4(e)] = (mask & 1) ? cpu_ldl_mmu(env, addr, oi, GETPC()) : 0; \
if (WB && (e & 1)) { \
m[H4(e & ~1)] = addr - 4; \
} \
@@ -306,6 +316,8 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
uint16_t eci_mask = mve_eci_mask(env); \
unsigned e; \
uint32_t addr; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
if (!(eci_mask & 1)) { \
continue; \
@@ -313,7 +325,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
addr = ADDRFN(base, m[H4(e & ~1)]); \
addr += 4 * (e & 1); \
if (mask & 1) { \
- cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
+ cpu_stl_mmu(env, addr, d[H4(e)], oi, GETPC()); \
} \
if (WB && (e & 1)) { \
m[H4(e & ~1)] = addr - 4; \
@@ -327,40 +339,44 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
-DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_sh, MO_SB, int8_t, ldb, 2, int16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_sw, MO_SB, int8_t, ldb, 4, int32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_sw, MO_TESW, int16_t, ldw, 4, int32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_ub, MO_UB, uint8_t, ldb, 1, uint8_t, uint8_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uh, MO_UB, uint8_t, ldb, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uw, MO_UB, uint8_t, ldb, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uh, MO_TEUW, uint16_t, ldw, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uw, MO_TEUW, uint16_t, ldw, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrw_sg_uw, MO_TEUL, uint32_t, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false)
-DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false)
-DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false)
-DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false)
+DO_VLDR_SG(vldrh_sg_os_sw, MO_TESW, int16_t, ldw, 4,
+ int32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uh, MO_TEUW, uint16_t, ldw, 2,
+ uint16_t, uint16_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uw, MO_TEUW, uint16_t, ldw, 4,
+ uint32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrw_sg_os_uw, MO_TEUL, uint32_t, ldl, 4,
+ uint32_t, uint32_t, ADDR_ADD_OSW, false)
DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false)
-DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_ub, MO_UB, stb, 1, uint8_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_uh, MO_UB, stb, 2, uint16_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_uw, MO_UB, stb, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrh_sg_uh, MO_TEUW, stw, 2, uint16_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrh_sg_uw, MO_TEUW, stw, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrw_sg_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD, false)
DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false)
-DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false)
-DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false)
-DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false)
+DO_VSTR_SG(vstrh_sg_os_uh, MO_TEUW, stw, 2, uint16_t, ADDR_ADD_OSH, false)
+DO_VSTR_SG(vstrh_sg_os_uw, MO_TEUW, stw, 4, uint32_t, ADDR_ADD_OSH, false)
+DO_VSTR_SG(vstrw_sg_os_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD_OSW, false)
DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false)
-DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
+DO_VLDR_SG(vldrw_sg_wb_uw, MO_TEUL, uint32_t, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true)
-DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true)
+DO_VSTR_SG(vstrw_sg_wb_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD, true)
DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
/*
@@ -387,13 +403,15 @@ DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
uint16_t mask = mve_eci_mask(env); \
static const uint8_t off[4] = { O1, O2, O3, O4 }; \
uint32_t addr, data; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
continue; \
} \
addr = base + off[beat] * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ data = cpu_ldl_mmu(env, addr, oi, GETPC()); \
for (e = 0; e < 4; e++, data >>= 8) { \
uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
qd[H1(off[beat])] = data; \
@@ -411,13 +429,15 @@ DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
uint32_t addr, data; \
int y; /* y counts 0 2 0 2 */ \
uint16_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
continue; \
} \
addr = base + off[beat] * 8 + (beat & 1) * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ data = cpu_ldl_mmu(env, addr, oi, GETPC()); \
qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \
qd[H2(off[beat])] = data; \
data >>= 16; \
@@ -436,13 +456,15 @@ DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
uint32_t addr, data; \
uint32_t *qd; \
int y; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
continue; \
} \
addr = base + off[beat] * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ data = cpu_ldl_mmu(env, addr, oi, GETPC()); \
y = (beat + (O1 & 2)) & 3; \
qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \
qd[H4(off[beat] >> 2)] = data; \
@@ -473,13 +495,15 @@ DO_VLD4W(vld43w, 6, 7, 8, 9)
static const uint8_t off[4] = { O1, O2, O3, O4 }; \
uint32_t addr, data; \
uint8_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
continue; \
} \
addr = base + off[beat] * 2; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ data = cpu_ldl_mmu(env, addr, oi, GETPC()); \
for (e = 0; e < 4; e++, data >>= 8) { \
qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \
qd[H1(off[beat] + (e >> 1))] = data; \
@@ -497,13 +521,15 @@ DO_VLD4W(vld43w, 6, 7, 8, 9)
uint32_t addr, data; \
int e; \
uint16_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
continue; \
} \
addr = base + off[beat] * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ data = cpu_ldl_mmu(env, addr, oi, GETPC()); \
for (e = 0; e < 2; e++, data >>= 16) { \
qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \
qd[H2(off[beat])] = data; \
@@ -520,13 +546,15 @@ DO_VLD4W(vld43w, 6, 7, 8, 9)
static const uint8_t off[4] = { O1, O2, O3, O4 }; \
uint32_t addr, data; \
uint32_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
continue; \
} \
addr = base + off[beat]; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ data = cpu_ldl_mmu(env, addr, oi, GETPC()); \
qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \
qd[H4(off[beat] >> 3)] = data; \
} \
@@ -549,6 +577,8 @@ DO_VLD2W(vld21w, 8, 12, 16, 20)
uint16_t mask = mve_eci_mask(env); \
static const uint8_t off[4] = { O1, O2, O3, O4 }; \
uint32_t addr, data; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
@@ -560,7 +590,7 @@ DO_VLD2W(vld21w, 8, 12, 16, 20)
uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
data = (data << 8) | qd[H1(off[beat])]; \
} \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ cpu_stl_mmu(env, addr, data, oi, GETPC()); \
} \
}
@@ -574,6 +604,8 @@ DO_VLD2W(vld21w, 8, 12, 16, 20)
uint32_t addr, data; \
int y; /* y counts 0 2 0 2 */ \
uint16_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
@@ -584,7 +616,7 @@ DO_VLD2W(vld21w, 8, 12, 16, 20)
data = qd[H2(off[beat])]; \
qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \
data |= qd[H2(off[beat])] << 16; \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ cpu_stl_mmu(env, addr, data, oi, GETPC()); \
} \
}
@@ -598,6 +630,8 @@ DO_VLD2W(vld21w, 8, 12, 16, 20)
uint32_t addr, data; \
uint32_t *qd; \
int y; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
@@ -607,7 +641,7 @@ DO_VLD2W(vld21w, 8, 12, 16, 20)
y = (beat + (O1 & 2)) & 3; \
qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \
data = qd[H4(off[beat] >> 2)]; \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ cpu_stl_mmu(env, addr, data, oi, GETPC()); \
} \
}
@@ -635,6 +669,8 @@ DO_VST4W(vst43w, 6, 7, 8, 9)
static const uint8_t off[4] = { O1, O2, O3, O4 }; \
uint32_t addr, data; \
uint8_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
@@ -646,7 +682,7 @@ DO_VST4W(vst43w, 6, 7, 8, 9)
qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \
data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \
} \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ cpu_stl_mmu(env, addr, data, oi, GETPC()); \
} \
}
@@ -660,6 +696,8 @@ DO_VST4W(vst43w, 6, 7, 8, 9)
uint32_t addr, data; \
int e; \
uint16_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
@@ -671,7 +709,7 @@ DO_VST4W(vst43w, 6, 7, 8, 9)
qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \
data = (data << 16) | qd[H2(off[beat])]; \
} \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ cpu_stl_mmu(env, addr, data, oi, GETPC()); \
} \
}
@@ -684,6 +722,8 @@ DO_VST4W(vst43w, 6, 7, 8, 9)
static const uint8_t off[4] = { O1, O2, O3, O4 }; \
uint32_t addr, data; \
uint32_t *qd; \
+ int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \
+ MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \
for (beat = 0; beat < 4; beat++, mask >>= 4) { \
if ((mask & 1) == 0) { \
/* ECI says skip this beat */ \
@@ -692,7 +732,7 @@ DO_VST4W(vst43w, 6, 7, 8, 9)
addr = base + off[beat]; \
qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \
data = qd[H4(off[beat] >> 3)]; \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ cpu_stl_mmu(env, addr, data, oi, GETPC()); \
} \
}
@@ -2164,27 +2204,6 @@ DO_VSHLL_ALL(vshllt, true)
DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \
DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)
-static inline uint64_t do_urshr(uint64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else if (sh == 64) {
- return x >> 63;
- } else {
- return 0;
- }
-}
-
-static inline int64_t do_srshr(int64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else {
- /* Rounding the sign bit always produces 0. */
- return 0;
- }
-}
-
DO_VSHRN_ALL(vshrn, DO_SHR)
DO_VSHRN_ALL(vrshrn, do_urshr)
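
The do_urshr()/do_srshr() definitions removed above (presumably relocated to a shared header elsewhere in this series) implement rounding right shifts: adding ((x >> (sh - 1)) & 1) adds back the last bit shifted out, i.e. rounds to nearest with ties upward. A worked stand-alone example:

#include <stdint.h>
#include <stdio.h>

/* Same formula as the removed do_urshr(); sh must be >= 1 here, as it
 * is for the original callers. */
static uint64_t urshr(uint64_t x, unsigned sh)
{
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    }
    return sh == 64 ? x >> 63 : 0;
}

int main(void)
{
    /* 13 = 0b1101: dropped bit is 0, so 13 >> 2 stays 3;
     * 14 = 0b1110: dropped bit is 1, so 14 >> 2 rounds up to 4. */
    printf("%d %d\n", (int)urshr(13, 2), (int)urshr(14, 2));
    return 0;
}
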
diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c
index 2cc8241..8d288f3 100644
--- a/target/arm/tcg/neon_helper.c
+++ b/target/arm/tcg/neon_helper.c
@@ -229,15 +229,30 @@ NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 16, true, NULL))
+NEON_GVEC_VOP2(sme2_srshl_h, int16_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, src2, 32, true, NULL))
+NEON_GVEC_VOP2(sme2_srshl_s, int32_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_d(src1, src2, true, NULL))
+NEON_GVEC_VOP2(sme2_srshl_d, int64_t)
+#undef NEON_FN
+
uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
@@ -261,15 +276,30 @@ NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int16_t)src2, 16, true, NULL))
+NEON_GVEC_VOP2(sme2_urshl_h, uint16_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, src2, 32, true, NULL))
+NEON_GVEC_VOP2(sme2_urshl_s, int32_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_d(src1, src2, true, NULL))
+NEON_GVEC_VOP2(sme2_urshl_d, int64_t)
+#undef NEON_FN
+
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
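
The only difference between each gvec_*rshl_* helper and its new sme2_* twin above is the shift operand: the Neon-derived helpers truncate src2 to (int8_t), since AdvSIMD takes the shift count from the bottom byte of each element, while the SME2 forms consume the whole element. The underlying operation is the usual rounding shift by a signed count; a stand-alone sketch under that reading, without the saturating variants:

#include <stdint.h>

/* Rounding shift left by a signed count: negative counts shift right
 * with rounding; counts of 64 or more (either way) shift everything out. */
int64_t toy_srshl64(int64_t x, int64_t shift)
{
    if (shift >= 64 || shift <= -65) {
        return 0;
    }
    if (shift >= 0) {
        return (int64_t)((uint64_t)x << shift);   /* avoid signed-shift UB */
    }
    int sh = -shift;
    if (sh == 64) {
        return 0;                   /* rounding the sign bit gives 0 */
    }
    return (x >> sh) + ((x >> (sh - 1)) & 1);     /* arithmetic >> assumed */
}
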
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index 628804e..6bb9aa2 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -22,30 +22,139 @@
### SME Misc
ZERO 11000000 00 001 00000000000 imm:8
+ZERO_zt0 11000000 01 001 00000000000 00000001
### SME Move into/from Array
%mova_rs 13:2 !function=plus_12
-&mova esz rs pg zr za_imm v:bool to_vec:bool
+%mova_rv 13:2 !function=plus_8
+&mova_a rv zr off
+&mova_p esz rs pg zr za off v:bool
+&mova_t esz rs zr za off v:bool
-MOVA 11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4 \
- &mova to_vec=0 rs=%mova_rs
-MOVA 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za_imm:4 \
- &mova to_vec=0 rs=%mova_rs esz=4
+MOVA_tz 11000000 00 00000 0 v:1 .. pg:3 zr:5 0 off:4 \
+ &mova_p rs=%mova_rs esz=0 za=0
+MOVA_tz 11000000 01 00000 0 v:1 .. pg:3 zr:5 0 za:1 off:3 \
+ &mova_p rs=%mova_rs esz=1
+MOVA_tz 11000000 10 00000 0 v:1 .. pg:3 zr:5 0 za:2 off:2 \
+ &mova_p rs=%mova_rs esz=2
+MOVA_tz 11000000 11 00000 0 v:1 .. pg:3 zr:5 0 za:3 off:1 \
+ &mova_p rs=%mova_rs esz=3
+MOVA_tz 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za:4 \
+ &mova_p rs=%mova_rs esz=4 off=0
-MOVA 11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5 \
- &mova to_vec=1 rs=%mova_rs
-MOVA 11000000 11 00001 1 v:1 .. pg:3 0 za_imm:4 zr:5 \
- &mova to_vec=1 rs=%mova_rs esz=4
+MOVA_zt 11000000 00 00001 0 v:1 .. pg:3 0 off:4 zr:5 \
+ &mova_p rs=%mova_rs esz=0 za=0
+MOVA_zt 11000000 01 00001 0 v:1 .. pg:3 0 za:1 off:3 zr:5 \
+ &mova_p rs=%mova_rs esz=1
+MOVA_zt 11000000 10 00001 0 v:1 .. pg:3 0 za:2 off:2 zr:5 \
+ &mova_p rs=%mova_rs esz=2
+MOVA_zt 11000000 11 00001 0 v:1 .. pg:3 0 za:3 off:1 zr:5 \
+ &mova_p rs=%mova_rs esz=3
+MOVA_zt 11000000 11 00001 1 v:1 .. pg:3 0 za:4 zr:5 \
+ &mova_p rs=%mova_rs esz=4 off=0
+
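
The MOVA rewrite above replaces the old flat 4-bit za_imm with explicit za (tile number) and off (slice offset) fields; since element size 2^esz bytes gives 2^esz tiles, the tile field widens and the offset narrows as esz grows. A sketch of the split for the esz=1 (halfword) patterns, following the bit order shown in the decode lines:

/* For "za:1 off:3", the old 4-bit immediate splits as tile = bit 3,
 * slice offset = bits 2:0.  Illustration of the halfword case only. */
void split_za_imm_h(unsigned za_imm, unsigned *tile, unsigned *slice_off)
{
    *tile      = (za_imm >> 3) & 1;   /* ZA0H / ZA1H tile select */
    *slice_off =  za_imm       & 7;   /* row within the tile */
}
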
+MOVA_tz2 11000000 00 00010 0 v:1 .. 000 zr:4 0 00 off:3 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVA_tz2 11000000 01 00010 0 v:1 .. 000 zr:4 0 00 za:1 off:2 \
+ &mova_t rs=%mova_rs esz=1
+MOVA_tz2 11000000 10 00010 0 v:1 .. 000 zr:4 0 00 za:2 off:1 \
+ &mova_t rs=%mova_rs esz=2
+MOVA_tz2 11000000 11 00010 0 v:1 .. 000 zr:4 0 00 za:3 \
+ &mova_t rs=%mova_rs esz=3 off=0
+
+MOVA_zt2 11000000 00 00011 0 v:1 .. 000 00 off:3 zr:4 0 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVA_zt2 11000000 01 00011 0 v:1 .. 000 00 za:1 off:2 zr:4 0 \
+ &mova_t rs=%mova_rs esz=1
+MOVA_zt2 11000000 10 00011 0 v:1 .. 000 00 za:2 off:1 zr:4 0 \
+ &mova_t rs=%mova_rs esz=2
+MOVA_zt2 11000000 11 00011 0 v:1 .. 000 00 za:3 zr:4 0 \
+ &mova_t rs=%mova_rs esz=3 off=0
+
+MOVA_tz4 11000000 00 00010 0 v:1 .. 001 zr:3 00 000 off:2 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVA_tz4 11000000 01 00010 0 v:1 .. 001 zr:3 00 000 za:1 off:1 \
+ &mova_t rs=%mova_rs esz=1
+MOVA_tz4 11000000 10 00010 0 v:1 .. 001 zr:3 00 000 za:2 \
+ &mova_t rs=%mova_rs esz=2 off=0
+MOVA_tz4 11000000 11 00010 0 v:1 .. 001 zr:3 00 00 za:3 \
+ &mova_t rs=%mova_rs esz=3 off=0
+
+MOVA_zt4 11000000 00 00011 0 v:1 .. 001 000 off:2 zr:3 00 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVA_zt4 11000000 01 00011 0 v:1 .. 001 000 za:1 off:1 zr:3 00 \
+ &mova_t rs=%mova_rs esz=1
+MOVA_zt4 11000000 10 00011 0 v:1 .. 001 000 za:2 zr:3 00 \
+ &mova_t rs=%mova_rs esz=2 off=0
+MOVA_zt4 11000000 11 00011 0 v:1 .. 001 00 za:3 zr:3 00 \
+ &mova_t rs=%mova_rs esz=3 off=0
+
+MOVA_az2 11000000 00 00010 00 .. 010 zr:4 000 off:3 \
+ &mova_a rv=%mova_rv
+MOVA_az4 11000000 00 00010 00 .. 011 zr:3 0000 off:3 \
+ &mova_a rv=%mova_rv
+
+MOVA_za2 11000000 00 00011 00 .. 010 00 off:3 zr:4 0 \
+ &mova_a rv=%mova_rv
+MOVA_za4 11000000 00 00011 00 .. 011 00 off:3 zr:3 00 \
+ &mova_a rv=%mova_rv
+
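
The %mova_rs and %mova_rv fields used throughout the patterns above rely on decodetree's !function transform: a 2-bit field is extracted at bit 13 and biased, so the slice/vector index register is one of W12-W15 (plus_12) or W8-W11 (plus_8). A stand-alone sketch of the generated extraction, assuming the usual decodetree conventions; the toy_* names are illustrative.

#include <stdint.h>

static inline uint32_t toy_extract32(uint32_t value, int start, int length)
{
    return (value >> start) & ((1u << length) - 1);
}

/* %mova_rs 13:2 !function=plus_12 -> selects W12..W15 */
int toy_mova_rs(uint32_t insn)
{
    return toy_extract32(insn, 13, 2) + 12;
}
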
+### SME Move and Zero
+
+MOVAZ_za2 11000000 00000110 0 .. 01010 off:3 zr:4 0 \
+ &mova_a rv=%mova_rv
+MOVAZ_za4 11000000 00000110 0 .. 01110 off:3 zr:3 00 \
+ &mova_a rv=%mova_rv
+
+MOVAZ_zt 11000000 00 00001 0 v:1 .. 0001 off:4 zr:5 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVAZ_zt 11000000 01 00001 0 v:1 .. 0001 za:1 off:3 zr:5 \
+ &mova_t rs=%mova_rs esz=1
+MOVAZ_zt 11000000 10 00001 0 v:1 .. 0001 za:2 off:2 zr:5 \
+ &mova_t rs=%mova_rs esz=2
+MOVAZ_zt 11000000 11 00001 0 v:1 .. 0001 za:3 off:1 zr:5 \
+ &mova_t rs=%mova_rs esz=3
+MOVAZ_zt 11000000 11 00001 1 v:1 .. 0001 za:4 zr:5 \
+ &mova_t rs=%mova_rs esz=4 off=0
+
+MOVAZ_zt2 11000000 00 00011 0 v:1 .. 00010 off:3 zr:4 0 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVAZ_zt2 11000000 01 00011 0 v:1 .. 00010 za:1 off:2 zr:4 0 \
+ &mova_t rs=%mova_rs esz=1
+MOVAZ_zt2 11000000 10 00011 0 v:1 .. 00010 za:2 off:1 zr:4 0 \
+ &mova_t rs=%mova_rs esz=2
+MOVAZ_zt2 11000000 11 00011 0 v:1 .. 00010 za:3 zr:4 0 \
+ &mova_t rs=%mova_rs esz=3 off=0
+
+MOVAZ_zt4 11000000 00 00011 0 v:1 .. 001100 off:2 zr:3 00 \
+ &mova_t rs=%mova_rs esz=0 za=0
+MOVAZ_zt4 11000000 01 00011 0 v:1 .. 001100 za:1 off:1 zr:3 00 \
+ &mova_t rs=%mova_rs esz=1
+MOVAZ_zt4 11000000 10 00011 0 v:1 .. 001100 za:2 zr:3 00 \
+ &mova_t rs=%mova_rs esz=2 off=0
+MOVAZ_zt4 11000000 11 00011 0 v:1 .. 00110 za:3 zr:3 00 \
+ &mova_t rs=%mova_rs esz=3 off=0
+
+### SME Move into/from ZT0
+
+MOVT_rzt 1100 0000 0100 1100 0 off:3 00 11111 rt:5
+MOVT_ztr 1100 0000 0100 1110 0 off:3 00 11111 rt:5
### SME Memory
-&ldst esz rs pg rn rm za_imm v:bool st:bool
+&ldst esz rs pg rn rm za off v:bool st:bool
-LDST1 1110000 0 esz:2 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
- &ldst rs=%mova_rs
-LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
- &ldst esz=4 rs=%mova_rs
+LDST1 1110000 0 00 st:1 rm:5 v:1 .. pg:3 rn:5 0 off:4 \
+ &ldst rs=%mova_rs esz=0 za=0
+LDST1 1110000 0 01 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:1 off:3 \
+ &ldst rs=%mova_rs esz=1
+LDST1 1110000 0 10 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:2 off:2 \
+ &ldst rs=%mova_rs esz=2
+LDST1 1110000 0 11 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:3 off:1 \
+ &ldst rs=%mova_rs esz=3
+LDST1 1110000 1 11 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:4 \
+ &ldst rs=%mova_rs esz=4 off=0
&ldstr rv rn imm
@ldstr ....... ... . ...... .. ... rn:5 . imm:4 \
@@ -54,6 +163,12 @@ LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
LDR 1110000 100 0 000000 .. 000 ..... 0 .... @ldstr
STR 1110000 100 1 000000 .. 000 ..... 0 .... @ldstr
+&ldstzt0 rn
+@ldstzt0 ....... ... . ...... .. ... rn:5 ..... &ldstzt0
+
+LDR_zt0 1110000 100 0 111111 00 000 ..... 00000 @ldstzt0
+STR_zt0 1110000 100 1 111111 00 000 ..... 00000 @ldstzt0
+
### SME Add Vector to Array
&adda zad zn pm pn
@@ -68,14 +183,18 @@ ADDVA_d 11000000 11 01000 1 ... ... ..... 00 ... @adda_64
### SME Outer Product
&op zad zn zm pm pn sub:bool
+@op_16 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 ... zad:1 &op
@op_32 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 .. zad:2 &op
@op_64 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 . zad:3 &op
+FMOPA_h 10000001 100 ..... ... ... ..... . 100 . @op_16
FMOPA_s 10000000 100 ..... ... ... ..... . 00 .. @op_32
FMOPA_d 10000000 110 ..... ... ... ..... . 0 ... @op_64
-BFMOPA 10000001 100 ..... ... ... ..... . 00 .. @op_32
-FMOPA_h 10000001 101 ..... ... ... ..... . 00 .. @op_32
+BFMOPA 10000001 101 ..... ... ... ..... . 100 . @op_16
+
+BFMOPA_w 10000001 100 ..... ... ... ..... . 00 .. @op_32
+FMOPA_w_h 10000001 101 ..... ... ... ..... . 00 .. @op_32
SMOPA_s 1010000 0 10 0 ..... ... ... ..... . 00 .. @op_32
SUMOPA_s 1010000 0 10 1 ..... ... ... ..... . 00 .. @op_32
@@ -86,3 +205,789 @@ SMOPA_d 1010000 0 11 0 ..... ... ... ..... . 0 ... @op_64
SUMOPA_d 1010000 0 11 1 ..... ... ... ..... . 0 ... @op_64
USMOPA_d 1010000 1 11 0 ..... ... ... ..... . 0 ... @op_64
UMOPA_d 1010000 1 11 1 ..... ... ... ..... . 0 ... @op_64
+
+BMOPA 1000000 0 10 0 ..... ... ... ..... . 10 .. @op_32
+SMOPA2_s 1010000 0 10 0 ..... ... ... ..... . 10 .. @op_32
+UMOPA2_s 1010000 1 10 0 ..... ... ... ..... . 10 .. @op_32
+
+### SME2 Multi-vector Multiple and Single SVE Destructive
+
+%zd_ax2 1:4 !function=times_2
+%zd_ax4 2:3 !function=times_4
+
+&z2z_en zdn zm esz n
+@z2z_2x1 ....... . esz:2 .. zm:4 ....0. ..... .... . \
+ &z2z_en n=2 zdn=%zd_ax2
+@z2z_4x1 ....... . esz:2 .. zm:4 ....1. ..... ...0 . \
+ &z2z_en n=4 zdn=%zd_ax4
+
+SMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 0 @z2z_2x1
+SMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 0 @z2z_4x1
+UMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 1 @z2z_2x1
+UMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 1 @z2z_4x1
+SMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 0 @z2z_2x1
+SMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 0 @z2z_4x1
+UMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 1 @z2z_2x1
+UMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 1 @z2z_4x1
+
+FMAX_n1 1100000 1 .. 10 .... 1010.0 01000 .... 0 @z2z_2x1
+FMAX_n1 1100000 1 .. 10 .... 1010.0 01000 .... 0 @z2z_4x1
+FMIN_n1 1100000 1 .. 10 .... 1010.0 01000 .... 1 @z2z_2x1
+FMIN_n1 1100000 1 .. 10 .... 1010.0 01000 .... 1 @z2z_4x1
+FMAXNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 0 @z2z_2x1
+FMAXNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 0 @z2z_4x1
+FMINNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 1 @z2z_2x1
+FMINNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 1 @z2z_4x1
+
+SRSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 0 @z2z_2x1
+SRSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 0 @z2z_4x1
+URSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 1 @z2z_2x1
+URSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 1 @z2z_4x1
+
+ADD_n1 1100000 1 .. 10 .... 1010.0 11000 .... 0 @z2z_2x1
+ADD_n1 1100000 1 .. 10 .... 1010.0 11000 .... 0 @z2z_4x1
+
+SQDMULH_n1 1100000 1 .. 10 .... 1010.1 00000 .... 0 @z2z_2x1
+SQDMULH_n1 1100000 1 .. 10 .... 1010.1 00000 .... 0 @z2z_4x1
+
+### SME2 Multi-vector Multiple Vectors SVE Destructive
+
+%zm_ax2 17:4 !function=times_2
+%zm_ax4 18:3 !function=times_4
+
+@z2z_2x2 ....... . esz:2 . ....0 ....0. ..... .... . \
+ &z2z_en n=2 zdn=%zd_ax2 zm=%zm_ax2
+@z2z_4x4 ....... . esz:2 . ...00 ....1. ..... ...0 . \
+ &z2z_en n=4 zdn=%zd_ax4 zm=%zm_ax4
+
+SMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 0 @z2z_2x2
+SMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 0 @z2z_4x4
+UMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 1 @z2z_2x2
+UMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 1 @z2z_4x4
+SMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 0 @z2z_2x2
+SMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 0 @z2z_4x4
+UMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 1 @z2z_2x2
+UMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 1 @z2z_4x4
+
+FMAX_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 0 @z2z_2x2
+FMAX_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 0 @z2z_4x4
+FMIN_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 1 @z2z_2x2
+FMIN_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 1 @z2z_4x4
+FMAXNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 0 @z2z_2x2
+FMAXNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 0 @z2z_4x4
+FMINNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 1 @z2z_2x2
+FMINNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 1 @z2z_4x4
+
+SRSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 0 @z2z_2x2
+SRSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 0 @z2z_4x4
+URSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 1 @z2z_2x2
+URSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 1 @z2z_4x4
+
+SQDMULH_nn 1100000 1 .. 1 ..... 1011.1 00000 .... 0 @z2z_2x2
+SQDMULH_nn 1100000 1 .. 1 ..... 1011.1 00000 .... 0 @z2z_4x4
+
+### SME2 Multi-vector Multiple and Single Array Vectors
+
+&azz_n n off rv zn zm
+@azz_nx1_o3 ........ .... zm:4 ...... zn:5 .. off:3 &azz_n rv=%mova_rv
+
+ADD_azz_n1_s 11000001 0010 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=2
+ADD_azz_n1_s 11000001 0011 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=4
+ADD_azz_n1_d 11000001 0110 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=2
+ADD_azz_n1_d 11000001 0111 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=4
+
+SUB_azz_n1_s 11000001 0010 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=2
+SUB_azz_n1_s 11000001 0011 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=4
+SUB_azz_n1_d 11000001 0110 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=2
+SUB_azz_n1_d 11000001 0111 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=4
+
+%off3_x2 0:3 !function=times_2
+%off2_x2 0:2 !function=times_2
+
+@azz_nx1_o3x2 ........ ... . zm:4 . .. ... zn:5 .. ... \
+ &azz_n off=%off3_x2 rv=%mova_rv
+@azz_nx1_o2x2 ........ ... . zm:4 . .. ... zn:5 ... .. \
+ &azz_n off=%off2_x2 rv=%mova_rv
+
+FMLAL_n1 11000001 001 0 .... 0 .. 011 ..... 00 ... @azz_nx1_o3x2 n=1
+FMLAL_n1 11000001 001 0 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=2
+FMLAL_n1 11000001 001 1 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=4
+
+FMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 01 ... @azz_nx1_o3x2 n=1
+FMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=2
+FMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=4
+
+BFMLAL_n1 11000001 001 0 .... 0 .. 011 ..... 10 ... @azz_nx1_o3x2 n=1
+BFMLAL_n1 11000001 001 0 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=2
+BFMLAL_n1 11000001 001 1 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=4
+
+BFMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 11 ... @azz_nx1_o3x2 n=1
+BFMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=2
+BFMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=4
+
+FDOT_n1 11000001 001 0 .... 0 .. 100 ..... 00 ... @azz_nx1_o3 n=2
+FDOT_n1 11000001 001 1 .... 0 .. 100 ..... 00 ... @azz_nx1_o3 n=4
+
+BFDOT_n1 11000001 001 0 .... 0 .. 100 ..... 10 ... @azz_nx1_o3 n=2
+BFDOT_n1 11000001 001 1 .... 0 .. 100 ..... 10 ... @azz_nx1_o3 n=4
+
+USDOT_n1 11000001 001 0 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=2
+USDOT_n1 11000001 001 1 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=4
+
+SUDOT_n1 11000001 001 0 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=2
+SUDOT_n1 11000001 001 1 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=4
+
+SDOT_n1_4b 11000001 001 0 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=2
+SDOT_n1_4b 11000001 001 1 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=4
+SDOT_n1_4h 11000001 011 0 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=2
+SDOT_n1_4h 11000001 011 1 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=4
+SDOT_n1_2h 11000001 011 0 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=2
+SDOT_n1_2h 11000001 011 1 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=4
+
+UDOT_n1_4b 11000001 001 0 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=2
+UDOT_n1_4b 11000001 001 1 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=4
+UDOT_n1_4h 11000001 011 0 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=2
+UDOT_n1_4h 11000001 011 1 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=4
+UDOT_n1_2h 11000001 011 0 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=2
+UDOT_n1_2h 11000001 011 1 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=4
+
+SMLAL_n1 11000001 011 0 .... 0 .. 011 ..... 00 ... @azz_nx1_o3x2 n=1
+SMLAL_n1 11000001 011 0 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=2
+SMLAL_n1 11000001 011 1 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=4
+
+SMLSL_n1 11000001 011 0 .... 0 .. 011 ..... 01 ... @azz_nx1_o3x2 n=1
+SMLSL_n1 11000001 011 0 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=2
+SMLSL_n1 11000001 011 1 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=4
+
+UMLAL_n1 11000001 011 0 .... 0 .. 011 ..... 10 ... @azz_nx1_o3x2 n=1
+UMLAL_n1 11000001 011 0 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=2
+UMLAL_n1 11000001 011 1 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=4
+
+UMLSL_n1 11000001 011 0 .... 0 .. 011 ..... 11 ... @azz_nx1_o3x2 n=1
+UMLSL_n1 11000001 011 0 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=2
+UMLSL_n1 11000001 011 1 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=4
+
+%off2_x4 0:2 !function=times_4
+%off1_x4 0:1 !function=times_4
+
+@azz_nx1_o2x4 ........ ... . zm:4 . .. ... zn:5 ... .. \
+ &azz_n off=%off2_x4 rv=%mova_rv
+@azz_nx1_o1x4 ........ ... . zm:4 . .. ... zn:5 .... . \
+ &azz_n off=%off1_x4 rv=%mova_rv
+
+SMLALL_n1_s 11000001 001 0 .... 0 .. 001 ..... 000 .. @azz_nx1_o2x4 n=1
+SMLALL_n1_d 11000001 011 0 .... 0 .. 001 ..... 000 .. @azz_nx1_o2x4 n=1
+SMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=2
+SMLALL_n1_d 11000001 011 0 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=2
+SMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=4
+SMLALL_n1_d 11000001 011 1 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=4
+
+SMLSLL_n1_s 11000001 001 0 .... 0 .. 001 ..... 010 .. @azz_nx1_o2x4 n=1
+SMLSLL_n1_d 11000001 011 0 .... 0 .. 001 ..... 010 .. @azz_nx1_o2x4 n=1
+SMLSLL_n1_s 11000001 001 0 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=2
+SMLSLL_n1_d 11000001 011 0 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=2
+SMLSLL_n1_s 11000001 001 1 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=4
+SMLSLL_n1_d 11000001 011 1 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=4
+
+UMLALL_n1_s 11000001 001 0 .... 0 .. 001 ..... 100 .. @azz_nx1_o2x4 n=1
+UMLALL_n1_d 11000001 011 0 .... 0 .. 001 ..... 100 .. @azz_nx1_o2x4 n=1
+UMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=2
+UMLALL_n1_d 11000001 011 0 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=2
+UMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=4
+UMLALL_n1_d 11000001 011 1 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=4
+
+UMLSLL_n1_s 11000001 001 0 .... 0 .. 001 ..... 110 .. @azz_nx1_o2x4 n=1
+UMLSLL_n1_d 11000001 011 0 .... 0 .. 001 ..... 110 .. @azz_nx1_o2x4 n=1
+UMLSLL_n1_s 11000001 001 0 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=2
+UMLSLL_n1_d 11000001 011 0 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=2
+UMLSLL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=4
+UMLSLL_n1_d 11000001 011 1 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=4
+
+USMLALL_n1_s 11000001 001 0 .... 0 .. 001 ..... 001 .. @azz_nx1_o2x4 n=1
+USMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 0010 . @azz_nx1_o1x4 n=2
+USMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 0010 . @azz_nx1_o1x4 n=4
+
+SUMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=2
+SUMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=4
+
+BFMLA_n1 11000001 011 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2
+FMLA_n1_h 11000001 001 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2
+FMLA_n1_s 11000001 001 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2
+FMLA_n1_d 11000001 011 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2
+
+BFMLA_n1 11000001 011 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4
+FMLA_n1_h 11000001 001 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4
+FMLA_n1_s 11000001 001 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4
+FMLA_n1_d 11000001 011 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4
+
+BFMLS_n1 11000001 011 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2
+FMLS_n1_h 11000001 001 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2
+FMLS_n1_s 11000001 001 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2
+FMLS_n1_d 11000001 011 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2
+
+BFMLS_n1 11000001 011 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4
+FMLS_n1_h 11000001 001 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4
+FMLS_n1_s 11000001 001 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4
+FMLS_n1_d 11000001 011 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4
+
+### SME2 Multi-vector Multiple Array Vectors
+
+%zn_ax2 6:4 !function=times_2
+%zn_ax4 7:3 !function=times_4
+
+@azz_2x2_o3 ........ ... ..... . .. ... ..... .. off:3 \
+ &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2
+@azz_4x4_o3 ........ ... ..... . .. ... ..... .. off:3 \
+ &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4
+
+ADD_azz_nn_s 11000001 101 ....0 0 .. 110 ....0 10 ... @azz_2x2_o3
+ADD_azz_nn_s 11000001 101 ...01 0 .. 110 ...00 10 ... @azz_4x4_o3
+ADD_azz_nn_d 11000001 111 ....0 0 .. 110 ....0 10 ... @azz_2x2_o3
+ADD_azz_nn_d 11000001 111 ...01 0 .. 110 ...00 10 ... @azz_4x4_o3
+
+SUB_azz_nn_s 11000001 101 ....0 0 .. 110 ....0 11 ... @azz_2x2_o3
+SUB_azz_nn_s 11000001 101 ...01 0 .. 110 ...00 11 ... @azz_4x4_o3
+SUB_azz_nn_d 11000001 111 ....0 0 .. 110 ....0 11 ... @azz_2x2_o3
+SUB_azz_nn_d 11000001 111 ...01 0 .. 110 ...00 11 ... @azz_4x4_o3
+
+@azz_2x2_o2x2 ........ ... ..... . .. ... ..... ... .. \
+ &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2 off=%off2_x2
+@azz_4x4_o2x2 ........ ... ..... . .. ... ..... ... .. \
+ &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4 off=%off2_x2
+
+FMLAL_nn 11000001 101 ....0 0 .. 010 ....0 000 .. @azz_2x2_o2x2
+FMLAL_nn 11000001 101 ...01 0 .. 010 ...00 000 .. @azz_4x4_o2x2
+
+FMLSL_nn 11000001 101 ....0 0 .. 010 ....0 010 .. @azz_2x2_o2x2
+FMLSL_nn 11000001 101 ...01 0 .. 010 ...00 010 .. @azz_4x4_o2x2
+
+BFMLAL_nn 11000001 101 ....0 0 .. 010 ....0 100 .. @azz_2x2_o2x2
+BFMLAL_nn 11000001 101 ...01 0 .. 010 ...00 100 .. @azz_4x4_o2x2
+
+BFMLSL_nn 11000001 101 ....0 0 .. 010 ....0 110 .. @azz_2x2_o2x2
+BFMLSL_nn 11000001 101 ...01 0 .. 010 ...00 110 .. @azz_4x4_o2x2
+
+FDOT_nn 11000001 101 ....0 0 .. 100 ....0 00 ... @azz_2x2_o3
+FDOT_nn 11000001 101 ...01 0 .. 100 ...00 00 ... @azz_4x4_o3
+
+BFDOT_nn 11000001 101 ....0 0 .. 100 ....0 10 ... @azz_2x2_o3
+BFDOT_nn 11000001 101 ...01 0 .. 100 ...00 10 ... @azz_4x4_o3
+
+USDOT_nn 11000001 101 ....0 0 .. 101 ....0 01 ... @azz_2x2_o3
+USDOT_nn 11000001 101 ...01 0 .. 101 ...00 01 ... @azz_4x4_o3
+
+SDOT_nn_4b 11000001 101 ....0 0 .. 101 ....0 00 ... @azz_2x2_o3
+SDOT_nn_4b 11000001 101 ...01 0 .. 101 ...00 00 ... @azz_4x4_o3
+SDOT_nn_4h 11000001 111 ....0 0 .. 101 ....0 00 ... @azz_2x2_o3
+SDOT_nn_4h 11000001 111 ...01 0 .. 101 ...00 00 ... @azz_4x4_o3
+SDOT_nn_2h 11000001 111 ....0 0 .. 101 ....0 01 ... @azz_2x2_o3
+SDOT_nn_2h 11000001 111 ...01 0 .. 101 ...00 01 ... @azz_4x4_o3
+
+UDOT_nn_4b 11000001 101 ....0 0 .. 101 ....0 10 ... @azz_2x2_o3
+UDOT_nn_4b 11000001 101 ...01 0 .. 101 ...00 10 ... @azz_4x4_o3
+UDOT_nn_4h 11000001 111 ....0 0 .. 101 ....0 10 ... @azz_2x2_o3
+UDOT_nn_4h 11000001 111 ...01 0 .. 101 ...00 10 ... @azz_4x4_o3
+UDOT_nn_2h 11000001 111 ....0 0 .. 101 ....0 11 ... @azz_2x2_o3
+UDOT_nn_2h 11000001 111 ...01 0 .. 101 ...00 11 ... @azz_4x4_o3
+
+SMLAL_nn 11000001 111 ....0 0 .. 010 ....0 000 .. @azz_2x2_o2x2
+SMLAL_nn 11000001 111 ...01 0 .. 010 ...00 000 .. @azz_4x4_o2x2
+
+SMLSL_nn 11000001 111 ....0 0 .. 010 ....0 010 .. @azz_2x2_o2x2
+SMLSL_nn 11000001 111 ...01 0 .. 010 ...00 010 .. @azz_4x4_o2x2
+
+UMLAL_nn 11000001 111 ....0 0 .. 010 ....0 100 .. @azz_2x2_o2x2
+UMLAL_nn 11000001 111 ...01 0 .. 010 ...00 100 .. @azz_4x4_o2x2
+
+UMLSL_nn 11000001 111 ....0 0 .. 010 ....0 110 .. @azz_2x2_o2x2
+UMLSL_nn 11000001 111 ...01 0 .. 010 ...00 110 .. @azz_4x4_o2x2
+
+@azz_2x2_o1x4 ........ ... ..... . .. ... ..... ... .. \
+ &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2 off=%off1_x4
+@azz_4x4_o1x4 ........ ... ..... . .. ... ..... ... .. \
+ &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4 off=%off1_x4
+
+SMLALL_nn_s 11000001 101 ....0 0 .. 000 ....0 0000 . @azz_2x2_o1x4
+SMLALL_nn_d 11000001 111 ....0 0 .. 000 ....0 0000 . @azz_2x2_o1x4
+SMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 0000 . @azz_4x4_o1x4
+SMLALL_nn_d 11000001 111 ...01 0 .. 000 ...00 0000 . @azz_4x4_o1x4
+
+SMLSLL_nn_s 11000001 101 ....0 0 .. 000 ....0 0100 . @azz_2x2_o1x4
+SMLSLL_nn_d 11000001 111 ....0 0 .. 000 ....0 0100 . @azz_2x2_o1x4
+SMLSLL_nn_s 11000001 101 ...01 0 .. 000 ...00 0100 . @azz_4x4_o1x4
+SMLSLL_nn_d 11000001 111 ...01 0 .. 000 ...00 0100 . @azz_4x4_o1x4
+
+UMLALL_nn_s 11000001 101 ....0 0 .. 000 ....0 1000 . @azz_2x2_o1x4
+UMLALL_nn_d 11000001 111 ....0 0 .. 000 ....0 1000 . @azz_2x2_o1x4
+UMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 1000 . @azz_4x4_o1x4
+UMLALL_nn_d 11000001 111 ...01 0 .. 000 ...00 1000 . @azz_4x4_o1x4
+
+UMLSLL_nn_s 11000001 101 ....0 0 .. 000 ....0 1100 . @azz_2x2_o1x4
+UMLSLL_nn_d 11000001 111 ....0 0 .. 000 ....0 1100 . @azz_2x2_o1x4
+UMLSLL_nn_s 11000001 101 ...01 0 .. 000 ...00 1100 . @azz_4x4_o1x4
+UMLSLL_nn_d 11000001 111 ...01 0 .. 000 ...00 1100 . @azz_4x4_o1x4
+
+USMLALL_nn_s 11000001 101 ....0 0 .. 000 ....0 0010 . @azz_2x2_o1x4
+USMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 0010 . @azz_4x4_o1x4
+
+BFMLA_nn 11000001 111 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3
+FMLA_nn_h 11000001 101 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3
+FMLA_nn_s 11000001 101 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3
+FMLA_nn_d 11000001 111 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3
+
+BFMLA_nn 11000001 111 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3
+FMLA_nn_h 11000001 101 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3
+FMLA_nn_s 11000001 101 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3
+FMLA_nn_d 11000001 111 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3
+
+BFMLS_nn 11000001 111 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3
+FMLS_nn_h 11000001 101 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3
+FMLS_nn_s 11000001 101 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3
+FMLS_nn_d 11000001 111 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3
+
+BFMLS_nn 11000001 111 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3
+FMLS_nn_h 11000001 101 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3
+FMLS_nn_s 11000001 101 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3
+FMLS_nn_d 11000001 111 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3
+
+&az_n n off rv zm
+@az_2x2_o3 ........ ... ..... . .. ... ..... .. off:3 \
+ &az_n n=2 rv=%mova_rv zm=%zn_ax2
+@az_4x4_o3 ........ ... ..... . .. ... ..... .. off:3 \
+ &az_n n=4 rv=%mova_rv zm=%zn_ax4
+
+FADD_nn_h 11000001 101 00100 0 .. 111 ....0 00 ... @az_2x2_o3
+FADD_nn_s 11000001 101 00000 0 .. 111 ....0 00 ... @az_2x2_o3
+FADD_nn_d 11000001 111 00000 0 .. 111 ....0 00 ... @az_2x2_o3
+FADD_nn_h 11000001 101 00101 0 .. 111 ...00 00 ... @az_4x4_o3
+FADD_nn_s 11000001 101 00001 0 .. 111 ...00 00 ... @az_4x4_o3
+FADD_nn_d 11000001 111 00001 0 .. 111 ...00 00 ... @az_4x4_o3
+
+FSUB_nn_h 11000001 101 00100 0 .. 111 ....0 01 ... @az_2x2_o3
+FSUB_nn_s 11000001 101 00000 0 .. 111 ....0 01 ... @az_2x2_o3
+FSUB_nn_d 11000001 111 00000 0 .. 111 ....0 01 ... @az_2x2_o3
+FSUB_nn_h 11000001 101 00101 0 .. 111 ...00 01 ... @az_4x4_o3
+FSUB_nn_s 11000001 101 00001 0 .. 111 ...00 01 ... @az_4x4_o3
+FSUB_nn_d 11000001 111 00001 0 .. 111 ...00 01 ... @az_4x4_o3
+
+BFADD_nn 11000001 111 00100 0 .. 111 ....0 00 ... @az_2x2_o3
+BFADD_nn 11000001 111 00101 0 .. 111 ...00 00 ... @az_4x4_o3
+BFSUB_nn 11000001 111 00100 0 .. 111 ....0 01 ... @az_2x2_o3
+BFSUB_nn 11000001 111 00101 0 .. 111 ...00 01 ... @az_4x4_o3
+
+### SME2 Multi-vector Indexed
+
+&azx_n n off rv zn zm idx
+
+%idx3_15_10 15:1 10:2
+%idx2_10_2 10:2 2:1
+
+@azx_1x1_o3x2 ........ .... zm:4 . .. . .. zn:5 .. ... \
+ &azx_n n=1 rv=%mova_rv off=%off3_x2 idx=%idx3_15_10
+@azx_2x1_o2x2 ........ .... zm:4 . .. . .. ..... .. ... \
+ &azx_n n=2 rv=%mova_rv off=%off2_x2 zn=%zn_ax2 idx=%idx2_10_2
+@azx_4x1_o2x2 ........ .... zm:4 . .. . .. ..... .. ... \
+ &azx_n n=4 rv=%mova_rv off=%off2_x2 zn=%zn_ax4 idx=%idx2_10_2
+
+FMLAL_nx 11000001 1000 .... . .. 1 .. ..... 00 ... @azx_1x1_o3x2
+FMLAL_nx 11000001 1001 .... 0 .. 1 .. ....0 00 ... @azx_2x1_o2x2
+FMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 00 ... @azx_4x1_o2x2
+
+FMLSL_nx 11000001 1000 .... . .. 1 .. ..... 01 ... @azx_1x1_o3x2
+FMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 01 ... @azx_2x1_o2x2
+FMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 01 ... @azx_4x1_o2x2
+
+BFMLAL_nx 11000001 1000 .... . .. 1 .. ..... 10 ... @azx_1x1_o3x2
+BFMLAL_nx 11000001 1001 .... 0 .. 1 .. ....0 10 ... @azx_2x1_o2x2
+BFMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 10 ... @azx_4x1_o2x2
+
+BFMLSL_nx 11000001 1000 .... . .. 1 .. ..... 11 ... @azx_1x1_o3x2
+BFMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 11 ... @azx_2x1_o2x2
+BFMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 11 ... @azx_4x1_o2x2
+
+@azx_2x1_i2_o3 ........ .... zm:4 . .. . idx:2 .... ... off:3 \
+ &azx_n n=2 rv=%mova_rv zn=%zn_ax2
+@azx_4x1_i2_o3 ........ .... zm:4 . .. . idx:2 .... ... off:3 \
+ &azx_n n=4 rv=%mova_rv zn=%zn_ax4
+@azx_2x1_i1_o3 ........ .... zm:4 . .. .. idx:1 .... ... off:3 \
+ &azx_n n=2 rv=%mova_rv zn=%zn_ax2
+@azx_4x1_i1_o3 ........ .... zm:4 . .. .. idx:1 .... ... off:3 \
+ &azx_n n=4 rv=%mova_rv zn=%zn_ax4
+
+FDOT_nx 11000001 0101 .... 0 .. 1 .. ....0 01 ... @azx_2x1_i2_o3
+FDOT_nx 11000001 0101 .... 1 .. 1 .. ...00 01 ... @azx_4x1_i2_o3
+
+BFDOT_nx 11000001 0101 .... 0 .. 1 .. ....0 11 ... @azx_2x1_i2_o3
+BFDOT_nx 11000001 0101 .... 1 .. 1 .. ...00 11 ... @azx_4x1_i2_o3
+
+FVDOT 11000001 0101 .... 0 .. 0 .. ....0 01 ... @azx_2x1_i2_o3
+BFVDOT 11000001 0101 .... 0 .. 0 .. ....0 11 ... @azx_2x1_i2_o3
+
+SDOT_nx_2h 11000001 0101 .... 0 .. 1 .. ....0 00 ... @azx_2x1_i2_o3
+SDOT_nx_2h 11000001 0101 .... 1 .. 1 .. ...00 00 ... @azx_4x1_i2_o3
+SDOT_nx_4b 11000001 0101 .... 0 .. 1 .. ....1 00 ... @azx_2x1_i2_o3
+SDOT_nx_4b 11000001 0101 .... 1 .. 1 .. ...01 00 ... @azx_4x1_i2_o3
+SDOT_nx_4h 11000001 1101 .... 0 .. 00 . ....0 01 ... @azx_2x1_i1_o3
+SDOT_nx_4h 11000001 1101 .... 1 .. 00 . ...00 01 ... @azx_4x1_i1_o3
+
+UDOT_nx_2h 11000001 0101 .... 0 .. 1 .. ....0 10 ... @azx_2x1_i2_o3
+UDOT_nx_2h 11000001 0101 .... 1 .. 1 .. ...00 10 ... @azx_4x1_i2_o3
+UDOT_nx_4b 11000001 0101 .... 0 .. 1 .. ....1 10 ... @azx_2x1_i2_o3
+UDOT_nx_4b 11000001 0101 .... 1 .. 1 .. ...01 10 ... @azx_4x1_i2_o3
+UDOT_nx_4h 11000001 1101 .... 0 .. 00 . ....0 11 ... @azx_2x1_i1_o3
+UDOT_nx_4h 11000001 1101 .... 1 .. 00 . ...00 11 ... @azx_4x1_i1_o3
+
+USDOT_nx 11000001 0101 .... 0 .. 1 .. ....1 01 ... @azx_2x1_i2_o3
+USDOT_nx 11000001 0101 .... 1 .. 1 .. ...01 01 ... @azx_4x1_i2_o3
+
+SUDOT_nx 11000001 0101 .... 0 .. 1 .. ....1 11 ... @azx_2x1_i2_o3
+SUDOT_nx 11000001 0101 .... 1 .. 1 .. ...01 11 ... @azx_4x1_i2_o3
+
+SVDOT_nx_2h 11000001 0101 .... 0 .. 0 .. ....1 00 ... @azx_2x1_i2_o3
+SVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 00 ... @azx_4x1_i2_o3
+SVDOT_nx_4h 11000001 1101 .... 1 .. 01 . ...00 01 ... @azx_4x1_i1_o3
+
+UVDOT_nx_2h 11000001 0101 .... 0 .. 0 .. ....1 10 ... @azx_2x1_i2_o3
+UVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 10 ... @azx_4x1_i2_o3
+UVDOT_nx_4h 11000001 1101 .... 1 .. 01 . ...00 11 ... @azx_4x1_i1_o3
+
+SUVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 11 ... @azx_4x1_i2_o3
+USVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 01 ... @azx_4x1_i2_o3
+
+SMLAL_nx 11000001 1100 .... . .. 1 .. ..... 00 ... @azx_1x1_o3x2
+SMLAL_nx 11000001 1101 .... 0 .. 1 .. ....0 00 ... @azx_2x1_o2x2
+SMLAL_nx 11000001 1101 .... 1 .. 1 .. ...00 00 ... @azx_4x1_o2x2
+
+SMLSL_nx 11000001 1100 .... . .. 1 .. ..... 01 ... @azx_1x1_o3x2
+SMLSL_nx 11000001 1101 .... 0 .. 1 .. ....0 01 ... @azx_2x1_o2x2
+SMLSL_nx 11000001 1101 .... 1 .. 1 .. ...00 01 ... @azx_4x1_o2x2
+
+UMLAL_nx 11000001 1100 .... . .. 1 .. ..... 10 ... @azx_1x1_o3x2
+UMLAL_nx 11000001 1101 .... 0 .. 1 .. ....0 10 ... @azx_2x1_o2x2
+UMLAL_nx 11000001 1101 .... 1 .. 1 .. ...00 10 ... @azx_4x1_o2x2
+
+UMLSL_nx 11000001 1100 .... . .. 1 .. ..... 11 ... @azx_1x1_o3x2
+UMLSL_nx 11000001 1101 .... 0 .. 1 .. ....0 11 ... @azx_2x1_o2x2
+UMLSL_nx 11000001 1101 .... 1 .. 1 .. ...00 11 ... @azx_4x1_o2x2
+
+%idx4_15_10 15:1 10:3
+%idx4_10_1 10:2 1:2
+%idx3_10_1 10:1 1:2
+
+@azx_1x1_i4_o2 ........ .... zm:4 . .. ... zn:5 ... .. \
+ &azx_n n=1 rv=%mova_rv off=%off2_x4 idx=%idx4_15_10
+@azx_1x1_i3_o2 ........ .... zm:4 . .. ... zn:5 ... .. \
+ &azx_n n=1 rv=%mova_rv off=%off2_x4 idx=%idx3_15_10
+@azx_2x1_i4_o1 ........ .... zm:4 . .. ... ..... ... .. \
+ &azx_n n=2 rv=%mova_rv off=%off1_x4 zn=%zn_ax2 idx=%idx4_10_1
+@azx_2x1_i3_o1 ........ .... zm:4 . .. ... ..... ... .. \
+ &azx_n n=2 rv=%mova_rv off=%off1_x4 zn=%zn_ax2 idx=%idx3_10_1
+@azx_4x1_i4_o1 ........ .... zm:4 . .. ... ..... ... .. \
+ &azx_n n=4 rv=%mova_rv off=%off1_x4 zn=%zn_ax4 idx=%idx4_10_1
+@azx_4x1_i3_o1 ........ .... zm:4 . .. ... ..... ... .. \
+ &azx_n n=4 rv=%mova_rv off=%off1_x4 zn=%zn_ax4 idx=%idx3_10_1
+
+SMLALL_nx_s 11000001 0000 .... . .. ... ..... 000 .. @azx_1x1_i4_o2
+SMLALL_nx_d 11000001 1000 .... . .. 0.. ..... 000 .. @azx_1x1_i3_o2
+SMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....0 00 ... @azx_2x1_i4_o1
+SMLALL_nx_d 11000001 1001 .... 0 .. 00. ....0 00 ... @azx_2x1_i3_o1
+SMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...00 00 ... @azx_4x1_i4_o1
+SMLALL_nx_d 11000001 1001 .... 1 .. 00. ...00 00 ... @azx_4x1_i3_o1
+
+SMLSLL_nx_s 11000001 0000 .... . .. ... ..... 010 .. @azx_1x1_i4_o2
+SMLSLL_nx_d 11000001 1000 .... . .. 0.. ..... 010 .. @azx_1x1_i3_o2
+SMLSLL_nx_s 11000001 0001 .... 0 .. 0.. ....0 01 ... @azx_2x1_i4_o1
+SMLSLL_nx_d 11000001 1001 .... 0 .. 00. ....0 01 ... @azx_2x1_i3_o1
+SMLSLL_nx_s 11000001 0001 .... 1 .. 0.. ...00 01 ... @azx_4x1_i4_o1
+SMLSLL_nx_d 11000001 1001 .... 1 .. 00. ...00 01 ... @azx_4x1_i3_o1
+
+UMLALL_nx_s 11000001 0000 .... . .. ... ..... 100 .. @azx_1x1_i4_o2
+UMLALL_nx_d 11000001 1000 .... . .. 0.. ..... 100 .. @azx_1x1_i3_o2
+UMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....0 10 ... @azx_2x1_i4_o1
+UMLALL_nx_d 11000001 1001 .... 0 .. 00. ....0 10 ... @azx_2x1_i3_o1
+UMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...00 10 ... @azx_4x1_i4_o1
+UMLALL_nx_d 11000001 1001 .... 1 .. 00. ...00 10 ... @azx_4x1_i3_o1
+
+UMLSLL_nx_s 11000001 0000 .... . .. ... ..... 110 .. @azx_1x1_i4_o2
+UMLSLL_nx_d 11000001 1000 .... . .. 0.. ..... 110 .. @azx_1x1_i3_o2
+UMLSLL_nx_s 11000001 0001 .... 0 .. 0.. ....0 11 ... @azx_2x1_i4_o1
+UMLSLL_nx_d 11000001 1001 .... 0 .. 00. ....0 11 ... @azx_2x1_i3_o1
+UMLSLL_nx_s 11000001 0001 .... 1 .. 0.. ...00 11 ... @azx_4x1_i4_o1
+UMLSLL_nx_d 11000001 1001 .... 1 .. 00. ...00 11 ... @azx_4x1_i3_o1
+
+USMLALL_nx_s 11000001 0000 .... . .. ... ..... 001 .. @azx_1x1_i4_o2
+USMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....1 00 ... @azx_2x1_i4_o1
+USMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...01 00 ... @azx_4x1_i4_o1
+
+SUMLALL_nx_s 11000001 0000 .... . .. ... ..... 101 .. @azx_1x1_i4_o2
+SUMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....1 10 ... @azx_2x1_i4_o1
+SUMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...01 10 ... @azx_4x1_i4_o1
+
+%idx3_10_3 10:2 3:1
+@azx_2x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \
+ &azx_n n=2 rv=%mova_rv zn=%zn_ax2 idx=%idx3_10_3
+@azx_4x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \
+ &azx_n n=4 rv=%mova_rv zn=%zn_ax4 idx=%idx3_10_3
+
+BFMLA_nx 11000001 0001 .... 0 .. 1.. ....1 0 .... @azx_2x1_i3_o3
+FMLA_nx_h 11000001 0001 .... 0 .. 1.. ....0 0 .... @azx_2x1_i3_o3
+FMLA_nx_s 11000001 0101 .... 0 .. 0.. ....0 00 ... @azx_2x1_i2_o3
+FMLA_nx_d 11000001 1101 .... 0 .. 00. ....0 00 ... @azx_2x1_i1_o3
+
+BFMLA_nx 11000001 0001 .... 1 .. 1.. ...01 0 .... @azx_4x1_i3_o3
+FMLA_nx_h 11000001 0001 .... 1 .. 1.. ...00 0 .... @azx_4x1_i3_o3
+FMLA_nx_s 11000001 0101 .... 1 .. 0.. ...00 00 ... @azx_4x1_i2_o3
+FMLA_nx_d 11000001 1101 .... 1 .. 00. ...00 00 ... @azx_4x1_i1_o3
+
+BFMLS_nx 11000001 0001 .... 0 .. 1.. ....1 1 .... @azx_2x1_i3_o3
+FMLS_nx_h 11000001 0001 .... 0 .. 1.. ....0 1 .... @azx_2x1_i3_o3
+FMLS_nx_s 11000001 0101 .... 0 .. 0.. ....0 10 ... @azx_2x1_i2_o3
+FMLS_nx_d 11000001 1101 .... 0 .. 00. ....0 10 ... @azx_2x1_i1_o3
+
+BFMLS_nx 11000001 0001 .... 1 .. 1.. ...01 1 .... @azx_4x1_i3_o3
+FMLS_nx_h 11000001 0001 .... 1 .. 1.. ...00 1 .... @azx_4x1_i3_o3
+FMLS_nx_s 11000001 0101 .... 1 .. 0.. ...00 10 ... @azx_4x1_i2_o3
+FMLS_nx_d 11000001 1101 .... 1 .. 00. ...00 10 ... @azx_4x1_i1_o3
+
+### SME2 Add / Sub array accumulators
+
+ADD_aaz_s 11000001 101 000000 .. 111 ....0 10 ... @az_2x2_o3
+ADD_aaz_s 11000001 101 000010 .. 111 ...00 10 ... @az_4x4_o3
+ADD_aaz_d 11000001 111 000000 .. 111 ....0 10 ... @az_2x2_o3
+ADD_aaz_d 11000001 111 000010 .. 111 ...00 10 ... @az_4x4_o3
+
+SUB_aaz_s 11000001 101 000000 .. 111 ....0 11 ... @az_2x2_o3
+SUB_aaz_s 11000001 101 000010 .. 111 ...00 11 ... @az_4x4_o3
+SUB_aaz_d 11000001 111 000000 .. 111 ....0 11 ... @az_2x2_o3
+SUB_aaz_d 11000001 111 000010 .. 111 ...00 11 ... @az_4x4_o3
+
+### SME2 Multi-vector SVE Constructive Unary
+
+&zz_e zd zn esz
+&zz_n zd zn n
+@zz_1x2 ........ ... ..... ...... ..... zd:5 \
+ &zz_n n=1 zn=%zn_ax2
+@zz_1x4 ........ ... ..... ...... ..... zd:5 \
+ &zz_n n=1 zn=%zn_ax4
+@zz_2x1 ........ ... ..... ...... zn:5 ..... \
+ &zz_n n=1 zd=%zd_ax2
+@zz_2x2 ........ ... ..... ...... .... . ..... \
+ &zz_n n=2 zd=%zd_ax2 zn=%zn_ax2
+@zz_4x4 ........ ... ..... ...... .... . ..... \
+ &zz_n n=4 zd=%zd_ax4 zn=%zn_ax4
+@zz_4x2_n1 ........ ... ..... ...... .... . ..... \
+ &zz_n n=1 zd=%zd_ax4 zn=%zn_ax2
+
+BFCVT 11000001 011 00000 111000 ....0 ..... @zz_1x2
+BFCVTN 11000001 011 00000 111000 ....1 ..... @zz_1x2
+
+FCVT_n 11000001 001 00000 111000 ....0 ..... @zz_1x2
+FCVTN 11000001 001 00000 111000 ....1 ..... @zz_1x2
+
+FCVT_w 11000001 101 00000 111000 ..... ....0 @zz_2x1
+FCVTL 11000001 101 00000 111000 ..... ....1 @zz_2x1
+
+FCVTZS 11000001 001 00001 111000 ....0 ....0 @zz_2x2
+FCVTZS 11000001 001 10001 111000 ...00 ...00 @zz_4x4
+FCVTZU 11000001 001 00001 111000 ....1 ....0 @zz_2x2
+FCVTZU 11000001 001 10001 111000 ...01 ...00 @zz_4x4
+
+SCVTF 11000001 001 00010 111000 ....0 ....0 @zz_2x2
+SCVTF 11000001 001 10010 111000 ...00 ...00 @zz_4x4
+UCVTF 11000001 001 00010 111000 ....1 ....0 @zz_2x2
+UCVTF 11000001 001 10010 111000 ...01 ...00 @zz_4x4
+
+FRINTN 11000001 101 01000 111000 ....0 ....0 @zz_2x2
+FRINTN 11000001 101 11000 111000 ...00 ...00 @zz_4x4
+FRINTP 11000001 101 01001 111000 ....0 ....0 @zz_2x2
+FRINTP 11000001 101 11001 111000 ...00 ...00 @zz_4x4
+FRINTM 11000001 101 01010 111000 ....0 ....0 @zz_2x2
+FRINTM 11000001 101 11010 111000 ...00 ...00 @zz_4x4
+FRINTA 11000001 101 01100 111000 ....0 ....0 @zz_2x2
+FRINTA 11000001 101 11100 111000 ...00 ...00 @zz_4x4
+
+SQCVT_sh 11000001 001 00011 111000 ....0 ..... @zz_1x2
+UQCVT_sh 11000001 001 00011 111000 ....1 ..... @zz_1x2
+SQCVTU_sh 11000001 011 00011 111000 ....0 ..... @zz_1x2
+
+SQCVT_sb 11000001 001 10011 111000 ...00 ..... @zz_1x4
+UQCVT_sb 11000001 001 10011 111000 ...01 ..... @zz_1x4
+SQCVTU_sb 11000001 011 10011 111000 ...00 ..... @zz_1x4
+
+SQCVT_dh 11000001 101 10011 111000 ...00 ..... @zz_1x4
+UQCVT_dh 11000001 101 10011 111000 ...01 ..... @zz_1x4
+SQCVTU_dh 11000001 111 10011 111000 ...00 ..... @zz_1x4
+
+SQCVTN_sb 11000001 001 10011 111000 ...10 ..... @zz_1x4
+UQCVTN_sb 11000001 001 10011 111000 ...11 ..... @zz_1x4
+SQCVTUN_sb 11000001 011 10011 111000 ...10 ..... @zz_1x4
+
+SQCVTN_dh 11000001 101 10011 111000 ...10 ..... @zz_1x4
+UQCVTN_dh 11000001 101 10011 111000 ...11 ..... @zz_1x4
+SQCVTUN_dh 11000001 111 10011 111000 ...10 ..... @zz_1x4
+
+SUNPK_2bh 11000001 011 00101 111000 ..... ....0 @zz_2x1
+SUNPK_2hs 11000001 101 00101 111000 ..... ....0 @zz_2x1
+SUNPK_2sd 11000001 111 00101 111000 ..... ....0 @zz_2x1
+
+UUNPK_2bh 11000001 011 00101 111000 ..... ....1 @zz_2x1
+UUNPK_2hs 11000001 101 00101 111000 ..... ....1 @zz_2x1
+UUNPK_2sd 11000001 111 00101 111000 ..... ....1 @zz_2x1
+
+SUNPK_4bh 11000001 011 10101 111000 ....0 ...00 @zz_4x2_n1
+SUNPK_4hs 11000001 101 10101 111000 ....0 ...00 @zz_4x2_n1
+SUNPK_4sd 11000001 111 10101 111000 ....0 ...00 @zz_4x2_n1
+
+UUNPK_4bh 11000001 011 10101 111000 ....0 ...01 @zz_4x2_n1
+UUNPK_4hs 11000001 101 10101 111000 ....0 ...01 @zz_4x2_n1
+UUNPK_4sd 11000001 111 10101 111000 ....0 ...01 @zz_4x2_n1
+
+ZIP_4 11000001 esz:2 1 10110 111000 ...00 ... 00 \
+ &zz_e zd=%zd_ax4 zn=%zn_ax4
+ZIP_4 11000001 001 10111 111000 ...00 ... 00 \
+ &zz_e esz=4 zd=%zd_ax4 zn=%zn_ax4
+
+UZP_4 11000001 esz:2 1 10110 111000 ...00 ... 10 \
+ &zz_e zd=%zd_ax4 zn=%zn_ax4
+UZP_4 11000001 001 10111 111000 ...00 ... 10 \
+ &zz_e esz=4 zd=%zd_ax4 zn=%zn_ax4
+
+### SME2 Multi-vector SVE Constructive Binary
+
+&rshr zd zn shift
+
+%rshr_sh_shift 16:4 !function=rsub_16
+%rshr_sb_shift 16:5 !function=rsub_32
+%rshr_dh_shift 22:1 16:5 !function=rsub_64
+
+@rshr_sh ........ .... .... ...... ..... zd:5 \
+ &rshr zn=%zn_ax2 shift=%rshr_sh_shift
+@rshr_sb ........ ... ..... ...... ..... zd:5 \
+ &rshr zn=%zn_ax4 shift=%rshr_sb_shift
+@rshr_dh ........ ... ..... ...... ..... zd:5 \
+ &rshr zn=%zn_ax4 shift=%rshr_dh_shift
+
+SQRSHR_sh 11000001 1110 .... 110101 ....0 ..... @rshr_sh
+UQRSHR_sh 11000001 1110 .... 110101 ....1 ..... @rshr_sh
+SQRSHRU_sh 11000001 1111 .... 110101 ....0 ..... @rshr_sh
+
+SQRSHR_sb 11000001 011 ..... 110110 ...00 ..... @rshr_sb
+SQRSHR_dh 11000001 1.1 ..... 110110 ...00 ..... @rshr_dh
+UQRSHR_sb 11000001 011 ..... 110110 ...01 ..... @rshr_sb
+UQRSHR_dh 11000001 1.1 ..... 110110 ...01 ..... @rshr_dh
+SQRSHRU_sb 11000001 011 ..... 110110 ...10 ..... @rshr_sb
+SQRSHRU_dh 11000001 1.1 ..... 110110 ...10 ..... @rshr_dh
+
+SQRSHRN_sh 01000101 1011 .... 001010 ....0 ..... @rshr_sh
+UQRSHRN_sh 01000101 1011 .... 001110 ....0 ..... @rshr_sh
+SQRSHRUN_sh 01000101 1011 .... 000010 ....0 ..... @rshr_sh
+
+SQRSHRN_sb 11000001 011 ..... 110111 ...00 ..... @rshr_sb
+SQRSHRN_dh 11000001 1.1 ..... 110111 ...00 ..... @rshr_dh
+UQRSHRN_sb 11000001 011 ..... 110111 ...01 ..... @rshr_sb
+UQRSHRN_dh 11000001 1.1 ..... 110111 ...01 ..... @rshr_dh
+SQRSHRUN_sb 11000001 011 ..... 110111 ...10 ..... @rshr_sb
+SQRSHRUN_dh 11000001 1.1 ..... 110111 ...10 ..... @rshr_dh
+
+&zzz_e zd zn zm esz
+
+ZIP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 0 \
+ &zzz_e zd=%zd_ax2
+ZIP_2 11000001 00 1 zm:5 110101 zn:5 .... 0 \
+ &zzz_e zd=%zd_ax2 esz=4
+
+UZP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 1 \
+ &zzz_e zd=%zd_ax2
+UZP_2 11000001 00 1 zm:5 110101 zn:5 .... 1 \
+ &zzz_e zd=%zd_ax2 esz=4
+
+&zzz_en zd zn zm esz n
+
+FCLAMP 11000001 esz:2 1 zm:5 110000 zn:5 .... 0 \
+ &zzz_en zd=%zd_ax2 n=2
+FCLAMP 11000001 esz:2 1 zm:5 110010 zn:5 ...0 0 \
+ &zzz_en zd=%zd_ax4 n=4
+
+SCLAMP 11000001 esz:2 1 zm:5 110001 zn:5 .... 0 \
+ &zzz_en zd=%zd_ax2 n=2
+SCLAMP 11000001 esz:2 1 zm:5 110011 zn:5 ...0 0 \
+ &zzz_en zd=%zd_ax4 n=4
+
+UCLAMP 11000001 esz:2 1 zm:5 110001 zn:5 .... 1 \
+ &zzz_en zd=%zd_ax2 n=2
+UCLAMP 11000001 esz:2 1 zm:5 110011 zn:5 ...0 1 \
+ &zzz_en zd=%zd_ax4 n=4
+
+### SME2 Multi-vector SVE Select
+
+%sel_pg 10:3 !function=plus_8
+
+SEL 11000001 esz:2 1 ....0 100 ... ....0 ....0 \
+ n=2 zd=%zd_ax2 zn=%zn_ax2 zm=%zm_ax2 pg=%sel_pg
+SEL 11000001 esz:2 1 ...01 100 ... ...00 ...00 \
+ n=4 zd=%zd_ax4 zn=%zn_ax4 zm=%zm_ax4 pg=%sel_pg
+
+### SME Multiple Zero
+
+&zero_za rv off ngrp nvec
+
+ZERO_za 11000000 000011 000 .. 0000000000 off:3 \
+ &zero_za ngrp=2 nvec=1 rv=%mova_rv
+ZERO_za 11000000 000011 100 .. 0000000000 off:3 \
+ &zero_za ngrp=4 nvec=1 rv=%mova_rv
+
+ZERO_za 11000000 000011 001 .. 0000000000 ... \
+ &zero_za ngrp=1 nvec=2 rv=%mova_rv off=%off3_x2
+ZERO_za 11000000 000011 010 .. 0000000000 0.. \
+ &zero_za ngrp=2 nvec=2 rv=%mova_rv off=%off2_x2
+ZERO_za 11000000 000011 011 .. 0000000000 0.. \
+ &zero_za ngrp=4 nvec=2 rv=%mova_rv off=%off2_x2
+
+ZERO_za 11000000 000011 101 .. 0000000000 0.. \
+ &zero_za ngrp=1 nvec=4 rv=%mova_rv off=%off2_x4
+ZERO_za 11000000 000011 110 .. 0000000000 00. \
+ &zero_za ngrp=2 nvec=4 rv=%mova_rv off=%off1_x4
+ZERO_za 11000000 000011 111 .. 0000000000 00. \
+ &zero_za ngrp=4 nvec=4 rv=%mova_rv off=%off1_x4
+
+### SME Lookup Table Read
+
+&lut zd zn idx
+
+# LUTI2, consecutive
+LUTI2_c_1b 1100 0000 1100 11 idx:4 00 00 zn:5 zd:5 &lut
+LUTI2_c_1h 1100 0000 1100 11 idx:4 01 00 zn:5 zd:5 &lut
+LUTI2_c_1s 1100 0000 1100 11 idx:4 10 00 zn:5 zd:5 &lut
+
+LUTI2_c_2b 1100 0000 1000 11 idx:3 1 00 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI2_c_2h 1100 0000 1000 11 idx:3 1 01 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI2_c_2s 1100 0000 1000 11 idx:3 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2
+
+LUTI2_c_4b 1100 0000 1000 11 idx:2 10 00 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI2_c_4h 1100 0000 1000 11 idx:2 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI2_c_4s 1100 0000 1000 11 idx:2 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4
+
+# LUTI2, strided (must check zd alignment)
+LUTI2_s_2b 1100 0000 1001 11 idx:3 1 00 00 zn:5 zd:5 &lut
+LUTI2_s_2h 1100 0000 1001 11 idx:3 1 01 00 zn:5 zd:5 &lut
+
+LUTI2_s_4b 1100 0000 1001 11 idx:2 10 00 00 zn:5 zd:5 &lut
+LUTI2_s_4h 1100 0000 1001 11 idx:2 10 01 00 zn:5 zd:5 &lut
+
+# LUTI4, consecutive
+LUTI4_c_1b 1100 0000 1100 101 idx:3 00 00 zn:5 zd:5 &lut
+LUTI4_c_1h 1100 0000 1100 101 idx:3 01 00 zn:5 zd:5 &lut
+LUTI4_c_1s 1100 0000 1100 101 idx:3 10 00 zn:5 zd:5 &lut
+
+LUTI4_c_2b 1100 0000 1000 101 idx:2 1 00 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI4_c_2h 1100 0000 1000 101 idx:2 1 01 00 zn:5 .... 0 &lut zd=%zd_ax2
+LUTI4_c_2s 1100 0000 1000 101 idx:2 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2
+
+LUTI4_c_4h 1100 0000 1000 101 idx:1 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI4_c_4s 1100 0000 1000 101 idx:1 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4
+
+# LUTI4, strided (must check zd alignment)
+LUTI4_s_2b 1100 0000 1001 101 idx:2 1 00 00 zn:5 zd:5 &lut
+LUTI4_s_2h 1100 0000 1001 101 idx:2 1 01 00 zn:5 zd:5 &lut
+
+LUTI4_s_4h 1100 0000 1001 101 idx:1 10 01 00 zn:5 zd:5 &lut
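
The multi-vector operand fields above lean on decodetree's !function hook to scale a compressed register field back to a full register number. As a minimal sketch of the idea (the real helpers live in the translate code and also take a DisasContext argument, so these signatures are an assumption):

    /* "%zn_ax2 6:4 !function=times_2": 4 bits at [9:6] name Z0,Z2,...,Z30 */
    static int times_2(int x) { return x * 2; }

    /* "%zn_ax4 7:3 !function=times_4": 3 bits at [9:7] name Z0,Z4,...,Z28 */
    static int times_4(int x) { return x * 4; }

This is how the decode keeps consecutive-register pairs and quads aligned: the scaled field simply cannot express a misaligned base register.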
diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c
index de0c6e5..bb8ed1e 100644
--- a/target/arm/tcg/sme_helper.c
+++ b/target/arm/tcg/sme_helper.c
@@ -29,6 +29,13 @@
#include "vec_internal.h"
#include "sve_ldst_internal.h"
+
+static bool vectors_overlap(ARMVectorReg *x, unsigned nx,
+ ARMVectorReg *y, unsigned ny)
+{
+ return !(x + nx <= y || y + ny <= x);
+}
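
vectors_overlap() treats each operand as a half-open range of ARMVectorReg slots: two ranges overlap exactly when neither one ends before the other begins. A standalone illustration of the same test on plain register indices (assumed names, not from the patch):

    #include <assert.h>
    #include <stdbool.h>

    static bool ranges_overlap(unsigned x, unsigned nx, unsigned y, unsigned ny)
    {
        return !(x + nx <= y || y + ny <= x);
    }

    int main(void)
    {
        assert(!ranges_overlap(0, 1, 1, 2));  /* Z0 vs Z1-Z2: disjoint */
        assert(ranges_overlap(1, 1, 0, 2));   /* Z1 vs Z0-Z1: overlap */
        assert(ranges_overlap(0, 4, 3, 1));   /* Z0-Z3 vs Z3: overlap */
        return 0;
    }

The narrowing helpers later in this file use it to decide when a destination aliases one of its wider sources and must be staged through a scratch buffer.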
+
void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
aarch64_set_svcr(env, val, mask);
@@ -39,12 +46,12 @@ void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
uint32_t i;
/*
- * Special case clearing the entire ZA space.
+ * Special case clearing the entire ZA array.
* This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
* parts of the ZA storage outside of SVL.
*/
if (imm == 0xff) {
- memset(env->zarray, 0, sizeof(env->zarray));
+ memset(env->za_state.za, 0, sizeof(env->za_state.za));
return;
}
@@ -54,7 +61,7 @@ void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
*/
for (i = 0; i < svl; i++) {
if (imm & (1 << (i % 8))) {
- memset(&env->zarray[i], 0, svl);
+ memset(&env->za_state.za[i], 0, svl);
}
}
}
@@ -206,6 +213,110 @@ void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
#undef DO_MOVA_Z
+void HELPER(sme2_mova_zc_b)(void *vdst, void *vsrc, uint32_t desc)
+{
+ const uint8_t *src = vsrc;
+ uint8_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc);
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ }
+}
+
+void HELPER(sme2_mova_zc_h)(void *vdst, void *vsrc, uint32_t desc)
+{
+ const uint16_t *src = vsrc;
+ uint16_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 2;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ }
+}
+
+void HELPER(sme2_mova_zc_s)(void *vdst, void *vsrc, uint32_t desc)
+{
+ const uint32_t *src = vsrc;
+ uint32_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 4;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ }
+}
+
+void HELPER(sme2_mova_zc_d)(void *vdst, void *vsrc, uint32_t desc)
+{
+ const uint64_t *src = vsrc;
+ uint64_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 8;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ }
+}
+
+void HELPER(sme2p1_movaz_zc_b)(void *vdst, void *vsrc, uint32_t desc)
+{
+ uint8_t *src = vsrc;
+ uint8_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc);
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ src[tile_vslice_index(i)] = 0;
+ }
+}
+
+void HELPER(sme2p1_movaz_zc_h)(void *vdst, void *vsrc, uint32_t desc)
+{
+ uint16_t *src = vsrc;
+ uint16_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 2;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ src[tile_vslice_index(i)] = 0;
+ }
+}
+
+void HELPER(sme2p1_movaz_zc_s)(void *vdst, void *vsrc, uint32_t desc)
+{
+ uint32_t *src = vsrc;
+ uint32_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 4;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ src[tile_vslice_index(i)] = 0;
+ }
+}
+
+void HELPER(sme2p1_movaz_zc_d)(void *vdst, void *vsrc, uint32_t desc)
+{
+ uint64_t *src = vsrc;
+ uint64_t *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 8;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ src[tile_vslice_index(i)] = 0;
+ }
+}
+
+void HELPER(sme2p1_movaz_zc_q)(void *vdst, void *vsrc, uint32_t desc)
+{
+ Int128 *src = vsrc;
+ Int128 *dst = vdst;
+ size_t i, n = simd_oprsz(desc) / 16;
+
+ for (i = 0; i < n; ++i) {
+ dst[i] = src[tile_vslice_index(i)];
+ memset(&src[tile_vslice_index(i)], 0, 16);
+ }
+}
+
/*
* Clear elements in a tile slice comprising len bytes.
*/
@@ -314,6 +425,26 @@ static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
}
}
+void HELPER(sme2_mova_cz_b)(void *vdst, void *vsrc, uint32_t desc)
+{
+ copy_vertical_b(vdst, vsrc, simd_oprsz(desc));
+}
+
+void HELPER(sme2_mova_cz_h)(void *vdst, void *vsrc, uint32_t desc)
+{
+ copy_vertical_h(vdst, vsrc, simd_oprsz(desc));
+}
+
+void HELPER(sme2_mova_cz_s)(void *vdst, void *vsrc, uint32_t desc)
+{
+ copy_vertical_s(vdst, vsrc, simd_oprsz(desc));
+}
+
+void HELPER(sme2_mova_cz_d)(void *vdst, void *vsrc, uint32_t desc)
+{
+ copy_vertical_d(vdst, vsrc, simd_oprsz(desc));
+}
+
/*
* Host and TLB primitives for vertical tile slice addressing.
*/
@@ -344,54 +475,22 @@ static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
TLB(env, useronly_clean_ptr(addr), val, ra); \
}
-/*
- * The ARMVectorReg elements are stored in host-endian 64-bit units.
- * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
- * corresponds to storing the two 64-bit pieces in little-endian order.
- */
-#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \
-static inline void HNAME##_host(void *za, intptr_t off, void *host) \
-{ \
- uint64_t val0 = HOST(host), val1 = HOST(host + 8); \
- uint64_t *ptr = za + off; \
- ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
-} \
+#define DO_LDQ(HNAME, VNAME) \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
{ \
HNAME##_host(za, tile_vslice_offset(off), host); \
} \
-static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
- target_ulong addr, uintptr_t ra) \
-{ \
- uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \
- uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \
- uint64_t *ptr = za + off; \
- ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
-} \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
target_ulong addr, uintptr_t ra) \
{ \
HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
}
-#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \
-static inline void HNAME##_host(void *za, intptr_t off, void *host) \
-{ \
- uint64_t *ptr = za + off; \
- HOST(host, ptr[BE]); \
- HOST(host + 8, ptr[!BE]); \
-} \
+#define DO_STQ(HNAME, VNAME) \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
{ \
HNAME##_host(za, tile_vslice_offset(off), host); \
} \
-static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
- target_ulong addr, uintptr_t ra) \
-{ \
- uint64_t *ptr = za + off; \
- TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \
- TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \
-} \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
target_ulong addr, uintptr_t ra) \
{ \
@@ -406,8 +505,8 @@ DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
-DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
-DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
+DO_LDQ(sve_ld1qq_be, sme_ld1q_be)
+DO_LDQ(sve_ld1qq_le, sme_ld1q_le)
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
@@ -417,8 +516,8 @@ DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
-DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
-DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
+DO_STQ(sve_st1qq_be, sme_st1q_be)
+DO_STQ(sve_st1qq_le, sme_st1q_le)
#undef DO_LD
#undef DO_ST
@@ -903,28 +1002,69 @@ void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
}
}
-void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, float_status *fpst_in, uint32_t desc)
+static void do_fmopa_h(void *vza, void *vzn, void *vzm, uint16_t *pn,
+ uint16_t *pm, float_status *fpst, uint32_t desc,
+ uint16_t negx, int negf)
{
intptr_t row, col, oprsz = simd_maxsz(desc);
- uint32_t neg = simd_data(desc) << 31;
- uint16_t *pn = vpn, *pm = vpm;
- float_status fpst;
- /*
- * Make a copy of float_status because this operation does not
- * update the cumulative fp exception status. It also produces
- * default nans.
- */
- fpst = *fpst_in;
- set_default_nan_mode(true, &fpst);
+ for (row = 0; row < oprsz; ) {
+ uint16_t pa = pn[H2(row >> 4)];
+ do {
+ if (pa & 1) {
+ void *vza_row = vza + tile_vslice_offset(row);
+ uint16_t n = *(uint16_t *)(vzn + H1_2(row)) ^ negx;
+
+ for (col = 0; col < oprsz; ) {
+ uint16_t pb = pm[H2(col >> 4)];
+ do {
+ if (pb & 1) {
+ uint16_t *a = vza_row + H1_2(col);
+ uint16_t *m = vzm + H1_2(col);
+ *a = float16_muladd(n, *m, *a, negf, fpst);
+ }
+ col += 2;
+ pb >>= 2;
+ } while (col & 15);
+ }
+ }
+ row += 2;
+ pa >>= 2;
+ } while (row & 15);
+ }
+}
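+
The two-at-a-time stepping in do_fmopa_h (row += 2, pa >>= 2, looping while row & 15) follows the SVE predicate layout: one predicate bit per byte of vector, so a 16-bit element is governed by the low bit of its two-bit group and each uint16_t chunk of predicate covers 16 bytes of vector. A restatement of that addressing rule as a hypothetical helper (not from the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* Is element 'elem' of size 'esz' bytes active in predicate 'pg'? */
    static bool pred_elem_active(const uint8_t *pg, unsigned elem, unsigned esz)
    {
        unsigned bit = elem * esz;       /* one predicate bit per byte */
        return (pg[bit / 8] >> (bit % 8)) & 1;
    }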
+
+void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_h(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0);
+}
+
+void HELPER(sme_fmops_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_h(vza, vzn, vzm, vpn, vpm, fpst, desc, 1u << 15, 0);
+}
+
+void HELPER(sme_ah_fmops_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_h(vza, vzn, vzm, vpn, vpm, fpst, desc, 0,
+ float_muladd_negate_product);
+}
+
+static void do_fmopa_s(void *vza, void *vzn, void *vzm, uint16_t *pn,
+ uint16_t *pm, float_status *fpst, uint32_t desc,
+ uint32_t negx, int negf)
+{
+ intptr_t row, col, oprsz = simd_maxsz(desc);
for (row = 0; row < oprsz; ) {
uint16_t pa = pn[H2(row >> 4)];
do {
if (pa & 1) {
void *vza_row = vza + tile_vslice_offset(row);
- uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
+ uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ negx;
for (col = 0; col < oprsz; ) {
uint16_t pb = pm[H2(col >> 4)];
@@ -932,7 +1072,7 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
if (pb & 1) {
uint32_t *a = vza_row + H1_4(col);
uint32_t *m = vzm + H1_4(col);
- *a = float32_muladd(n, *m, *a, 0, &fpst);
+ *a = float32_muladd(n, *m, *a, negf, fpst);
}
col += 4;
pb >>= 4;
@@ -945,32 +1085,116 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
}
}
-void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, float_status *fpst_in, uint32_t desc)
+void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
- uint64_t neg = (uint64_t)simd_data(desc) << 63;
- uint64_t *za = vza, *zn = vzn, *zm = vzm;
- uint8_t *pn = vpn, *pm = vpm;
- float_status fpst = *fpst_in;
+ do_fmopa_s(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0);
+}
- set_default_nan_mode(true, &fpst);
+void HELPER(sme_fmops_s)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_s(vza, vzn, vzm, vpn, vpm, fpst, desc, 1u << 31, 0);
+}
+
+void HELPER(sme_ah_fmops_s)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_s(vza, vzn, vzm, vpn, vpm, fpst, desc, 0,
+ float_muladd_negate_product);
+}
+
+static void do_fmopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm, uint8_t *pn,
+ uint8_t *pm, float_status *fpst, uint32_t desc,
+ uint64_t negx, int negf)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
for (row = 0; row < oprsz; ++row) {
if (pn[H1(row)] & 1) {
uint64_t *za_row = &za[tile_vslice_index(row)];
- uint64_t n = zn[row] ^ neg;
+ uint64_t n = zn[row] ^ negx;
for (col = 0; col < oprsz; ++col) {
if (pm[H1(col)] & 1) {
uint64_t *a = &za_row[col];
- *a = float64_muladd(n, zm[col], *a, 0, &fpst);
+ *a = float64_muladd(n, zm[col], *a, negf, fpst);
}
}
}
}
}
+void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_d(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0);
+}
+
+void HELPER(sme_fmops_d)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_d(vza, vzn, vzm, vpn, vpm, fpst, desc, 1ull << 63, 0);
+}
+
+void HELPER(sme_ah_fmops_d)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_fmopa_d(vza, vzn, vzm, vpn, vpm, fpst, desc, 0,
+ float_muladd_negate_product);
+}
+
+static void do_bfmopa(void *vza, void *vzn, void *vzm, uint16_t *pn,
+ uint16_t *pm, float_status *fpst, uint32_t desc,
+ uint16_t negx, int negf)
+{
+ intptr_t row, col, oprsz = simd_maxsz(desc);
+
+ for (row = 0; row < oprsz; ) {
+ uint16_t pa = pn[H2(row >> 4)];
+ do {
+ if (pa & 1) {
+ void *vza_row = vza + tile_vslice_offset(row);
+ uint16_t n = *(uint16_t *)(vzn + H1_2(row)) ^ negx;
+
+ for (col = 0; col < oprsz; ) {
+ uint16_t pb = pm[H2(col >> 4)];
+ do {
+ if (pb & 1) {
+ uint16_t *a = vza_row + H1_2(col);
+ uint16_t *m = vzm + H1_2(col);
+ *a = bfloat16_muladd(n, *m, *a, negf, fpst);
+ }
+ col += 2;
+ pb >>= 2;
+ } while (col & 15);
+ }
+ }
+ row += 2;
+ pa >>= 2;
+ } while (row & 15);
+ }
+}
+
+void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_bfmopa(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0);
+}
+
+void HELPER(sme_bfmops)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_bfmopa(vza, vzn, vzm, vpn, vpm, fpst, desc, 1u << 15, 0);
+}
+
+void HELPER(sme_ah_bfmops)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, float_status *fpst, uint32_t desc)
+{
+ do_bfmopa(vza, vzn, vzm, vpn, vpm, fpst, desc, 0,
+ float_muladd_negate_product);
+}
+
/*
* Alter PAIR as needed for controlling predicates being false,
* and for NEG on an enabled row element.
@@ -991,6 +1215,20 @@ static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
return pair;
}
+static inline uint32_t f16mop_ah_neg_adj_pair(uint32_t pair, uint32_t pg)
+{
+ uint32_t l = pg & 1 ? float16_ah_chs(pair) : 0;
+ uint32_t h = pg & 4 ? float16_ah_chs(pair >> 16) : 0;
+ return l | (h << 16);
+}
+
+static inline uint32_t bf16mop_ah_neg_adj_pair(uint32_t pair, uint32_t pg)
+{
+ uint32_t l = pg & 1 ? bfloat16_ah_chs(pair) : 0;
+ uint32_t h = pg & 4 ? bfloat16_ah_chs(pair >> 16) : 0;
+ return l | (h << 16);
+}
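+
Both *_ah_neg_adj_pair helpers rely on FPCR.AH negation semantics: with alternate handling enabled, negating an operand leaves NaN inputs untouched instead of flipping their sign bit. A self-contained sketch of what float16_ah_chs is taken to do here (an assumption about its behaviour, not patch text):

    #include <stdbool.h>
    #include <stdint.h>

    static uint16_t ah_chs_f16(uint16_t v)
    {
        /* IEEE half: NaN = exponent all ones, mantissa nonzero. */
        bool is_nan = (v & 0x7c00) == 0x7c00 && (v & 0x03ff) != 0;
        return is_nan ? v : v ^ 0x8000;  /* flip sign only for non-NaNs */
    }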
+
static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
float_status *s_f16, float_status *s_std,
float_status *s_odd)
@@ -1005,49 +1243,67 @@ static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
* - we have a pre-set-up copy of s_std which is set to round-to-odd,
* for the multiply (see below)
*/
- float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16);
- float64 e1c = float16_to_float64(e1 >> 16, true, s_f16);
- float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16);
- float64 e2c = float16_to_float64(e2 >> 16, true, s_f16);
- float64 t64;
+ float16 h1r = e1 & 0xffff;
+ float16 h1c = e1 >> 16;
+ float16 h2r = e2 & 0xffff;
+ float16 h2c = e2 >> 16;
float32 t32;
- /*
- * The ARM pseudocode function FPDot performs both multiplies
- * and the add with a single rounding operation. Emulate this
- * by performing the first multiply in round-to-odd, then doing
- * the second multiply as fused multiply-add, and rounding to
- * float32 all in one step.
- */
- t64 = float64_mul(e1r, e2r, s_odd);
- t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
+ /* Cf. FPProcessNaNs4 */
+ if (float16_is_any_nan(h1r) || float16_is_any_nan(h1c) ||
+ float16_is_any_nan(h2r) || float16_is_any_nan(h2c)) {
+ float16 t16;
+
+ if (float16_is_signaling_nan(h1r, s_f16)) {
+ t16 = h1r;
+ } else if (float16_is_signaling_nan(h1c, s_f16)) {
+ t16 = h1c;
+ } else if (float16_is_signaling_nan(h2r, s_f16)) {
+ t16 = h2r;
+ } else if (float16_is_signaling_nan(h2c, s_f16)) {
+ t16 = h2c;
+ } else if (float16_is_any_nan(h1r)) {
+ t16 = h1r;
+ } else if (float16_is_any_nan(h1c)) {
+ t16 = h1c;
+ } else if (float16_is_any_nan(h2r)) {
+ t16 = h2r;
+ } else {
+ t16 = h2c;
+ }
+ t32 = float16_to_float32(t16, true, s_f16);
+ } else {
+ float64 e1r = float16_to_float64(h1r, true, s_f16);
+ float64 e1c = float16_to_float64(h1c, true, s_f16);
+ float64 e2r = float16_to_float64(h2r, true, s_f16);
+ float64 e2c = float16_to_float64(h2c, true, s_f16);
+ float64 t64;
+
+ /*
+ * The ARM pseudocode function FPDot performs both multiplies
+ * and the add with a single rounding operation. Emulate this
+ * by performing the first multiply in round-to-odd, then doing
+ * the second multiply as fused multiply-add, and rounding to
+ * float32 all in one step.
+ */
+ t64 = float64_mul(e1r, e2r, s_odd);
+ t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
- /* This conversion is exact, because we've already rounded. */
- t32 = float64_to_float32(t64, s_std);
+ /* This conversion is exact, because we've already rounded. */
+ t32 = float64_to_float32(t64, s_std);
+ }
/* The final accumulation step is not fused. */
return float32_add(sum, t32, s_std);
}
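
Restating the non-NaN path of f16_dotadd in isolation may make the round-to-odd trick easier to follow. The f16-to-f64 conversions are exact and an f16 x f16 product fits exactly in f64, so the sum is the only place precision can be lost; rounding the first product to odd keeps a sticky low bit, which guarantees the single rounding to float32 precision performed by float64r32_muladd() is not disturbed by double rounding. A sketch using the same softfloat calls as the code above:

    static float32 fpdot2_f16(float16 a, float16 b, float16 c, float16 d,
                              float_status *std, float_status *odd)
    {
        float64 e1 = float16_to_float64(a, true, std);  /* exact */
        float64 e2 = float16_to_float64(b, true, std);
        float64 e3 = float16_to_float64(c, true, std);
        float64 e4 = float16_to_float64(d, true, std);
        /* First product rounded to odd: the low bit becomes "sticky". */
        float64 t = float64_mul(e1, e2, odd);
        /* Fused second product plus sum, rounded once to f32 precision. */
        t = float64r32_muladd(e3, e4, t, 0, std);
        /* Exact conversion: t already carries float32 precision. */
        return float64_to_float32(t, std);
    }

The NaN screening added above exists because this fast path would otherwise produce a default NaN where the FPProcessNaNs4 pseudocode requires one of the inputs to propagate.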
-void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, CPUARMState *env, uint32_t desc)
+static void do_fmopa_w_h(void *vza, void *vzn, void *vzm, uint16_t *pn,
+ uint16_t *pm, CPUARMState *env, uint32_t desc,
+ uint32_t negx, bool ah_neg)
{
intptr_t row, col, oprsz = simd_maxsz(desc);
- uint32_t neg = simd_data(desc) * 0x80008000u;
- uint16_t *pn = vpn, *pm = vpm;
- float_status fpst_odd, fpst_std, fpst_f16;
+ float_status fpst_odd = env->vfp.fp_status[FPST_ZA];
- /*
- * Make copies of the fp status fields we use, because this operation
- * does not update the cumulative fp exception status. It also
- * produces default NaNs. We also need a second copy of fp_status with
- * round-to-odd -- see above.
- */
- fpst_f16 = env->vfp.fp_status[FPST_A64_F16];
- fpst_std = env->vfp.fp_status[FPST_A64];
- set_default_nan_mode(true, &fpst_std);
- set_default_nan_mode(true, &fpst_f16);
- fpst_odd = fpst_std;
set_float_rounding_mode(float_round_to_odd, &fpst_odd);
for (row = 0; row < oprsz; ) {
@@ -1056,7 +1312,11 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
void *vza_row = vza + tile_vslice_offset(row);
uint32_t n = *(uint32_t *)(vzn + H1_4(row));
- n = f16mop_adj_pair(n, prow, neg);
+ if (ah_neg) {
+ n = f16mop_ah_neg_adj_pair(n, prow);
+ } else {
+ n = f16mop_adj_pair(n, prow, negx);
+ }
for (col = 0; col < oprsz; ) {
uint16_t pcol = pm[H2(col >> 4)];
@@ -1067,7 +1327,9 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
m = f16mop_adj_pair(m, pcol, 0);
*a = f16_dotadd(*a, n, m,
- &fpst_f16, &fpst_std, &fpst_odd);
+ &env->vfp.fp_status[FPST_ZA_F16],
+ &env->vfp.fp_status[FPST_ZA],
+ &fpst_odd);
}
col += 4;
pcol >>= 4;
@@ -1079,12 +1341,103 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
}
}
-void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
- void *vpn, void *vpm, CPUARMState *env, uint32_t desc)
+void HELPER(sme_fmopa_w_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, CPUARMState *env, uint32_t desc)
+{
+ do_fmopa_w_h(vza, vzn, vzm, vpn, vpm, env, desc, 0, false);
+}
+
+void HELPER(sme_fmops_w_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, CPUARMState *env, uint32_t desc)
+{
+ do_fmopa_w_h(vza, vzn, vzm, vpn, vpm, env, desc, 0x80008000u, false);
+}
+
+void HELPER(sme_ah_fmops_w_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, CPUARMState *env, uint32_t desc)
+{
+ do_fmopa_w_h(vza, vzn, vzm, vpn, vpm, env, desc, 0, true);
+}
+
+void HELPER(sme2_fdot_h)(void *vd, void *vn, void *vm, void *va,
+ CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_maxsz(desc);
+ bool za = extract32(desc, SIMD_DATA_SHIFT, 1);
+ float_status *fpst_std = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
+ float_status *fpst_f16 = &env->vfp.fp_status[za ? FPST_ZA_F16 : FPST_A64_F16];
+ float_status fpst_odd = *fpst_std;
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ set_float_rounding_mode(float_round_to_odd, &fpst_odd);
+
+ for (i = 0; i < oprsz / sizeof(float32); ++i) {
+ d[H4(i)] = f16_dotadd(a[H4(i)], n[H4(i)], m[H4(i)],
+ fpst_f16, fpst_std, &fpst_odd);
+ }
+}
+
+void HELPER(sme2_fdot_idx_h)(void *vd, void *vn, void *vm, void *va,
+ CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, j, oprsz = simd_maxsz(desc);
+ intptr_t elements = oprsz / sizeof(float32);
+ intptr_t eltspersegment = MIN(4, elements);
+ int idx = extract32(desc, SIMD_DATA_SHIFT, 2);
+ bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ float_status *fpst_std = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
+ float_status *fpst_f16 = &env->vfp.fp_status[za ? FPST_ZA_F16 : FPST_A64_F16];
+ float_status fpst_odd = *fpst_std;
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = (uint32_t *)vm + H4(idx);
+
+ set_float_rounding_mode(float_round_to_odd, &fpst_odd);
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t mm = m[i];
+ for (j = 0; j < eltspersegment; ++j) {
+ d[H4(i + j)] = f16_dotadd(a[H4(i + j)], n[H4(i + j)], mm,
+ fpst_f16, fpst_std, &fpst_odd);
+ }
+ }
+}
+
+void HELPER(sme2_fvdot_idx_h)(void *vd, void *vn, void *vm, void *va,
+ CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, j, oprsz = simd_maxsz(desc);
+ intptr_t elements = oprsz / sizeof(float32);
+ intptr_t eltspersegment = MIN(4, elements);
+ int idx = extract32(desc, SIMD_DATA_SHIFT, 2);
+ int sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ float_status fpst_odd, *fpst_std, *fpst_f16;
+ float32 *d = vd, *a = va;
+ uint16_t *n0 = vn;
+ uint16_t *n1 = vn + sizeof(ARMVectorReg);
+ uint32_t *m = (uint32_t *)vm + H4(idx);
+
+ fpst_std = &env->vfp.fp_status[FPST_ZA];
+ fpst_f16 = &env->vfp.fp_status[FPST_ZA_F16];
+ fpst_odd = *fpst_std;
+ set_float_rounding_mode(float_round_to_odd, &fpst_odd);
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t mm = m[i];
+ for (j = 0; j < eltspersegment; ++j) {
+ uint32_t nn = (n0[H2(2 * (i + j) + sel)])
+ | (n1[H2(2 * (i + j) + sel)] << 16);
+ d[i + H4(j)] = f16_dotadd(a[i + H4(j)], nn, mm,
+ fpst_f16, fpst_std, &fpst_odd);
+ }
+ }
+}
+
+static void do_bfmopa_w(void *vza, void *vzn, void *vzm,
+ uint16_t *pn, uint16_t *pm, CPUARMState *env,
+ uint32_t desc, uint32_t negx, bool ah_neg)
{
intptr_t row, col, oprsz = simd_maxsz(desc);
- uint32_t neg = simd_data(desc) * 0x80008000u;
- uint16_t *pn = vpn, *pm = vpm;
float_status fpst, fpst_odd;
if (is_ebf(env, &fpst, &fpst_odd)) {
@@ -1094,7 +1447,11 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
void *vza_row = vza + tile_vslice_offset(row);
uint32_t n = *(uint32_t *)(vzn + H1_4(row));
- n = f16mop_adj_pair(n, prow, neg);
+ if (ah_neg) {
+ n = bf16mop_ah_neg_adj_pair(n, prow);
+ } else {
+ n = f16mop_adj_pair(n, prow, negx);
+ }
for (col = 0; col < oprsz; ) {
uint16_t pcol = pm[H2(col >> 4)];
@@ -1121,7 +1478,11 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
void *vza_row = vza + tile_vslice_offset(row);
uint32_t n = *(uint32_t *)(vzn + H1_4(row));
- n = f16mop_adj_pair(n, prow, neg);
+ if (ah_neg) {
+ n = bf16mop_ah_neg_adj_pair(n, prow);
+ } else {
+ n = f16mop_adj_pair(n, prow, negx);
+ }
for (col = 0; col < oprsz; ) {
uint16_t pcol = pm[H2(col >> 4)];
@@ -1144,6 +1505,24 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm,
}
}
+void HELPER(sme_bfmopa_w)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, CPUARMState *env, uint32_t desc)
+{
+ do_bfmopa_w(vza, vzn, vzm, vpn, vpm, env, desc, 0, false);
+}
+
+void HELPER(sme_bfmops_w)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, CPUARMState *env, uint32_t desc)
+{
+ do_bfmopa_w(vza, vzn, vzm, vpn, vpm, env, desc, 0x80008000u, false);
+}
+
+void HELPER(sme_ah_bfmops_w)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, CPUARMState *env, uint32_t desc)
+{
+ do_bfmopa_w(vza, vzn, vzm, vpn, vpm, env, desc, 0, true);
+}
+
typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool);
static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm,
uint8_t *pn, uint8_t *pm,
@@ -1188,7 +1567,7 @@ static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm,
}
}
-#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
+#define DEF_IMOP_8x4_32(NAME, NTYPE, MTYPE) \
static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
{ \
uint32_t sum = 0; \
@@ -1201,7 +1580,7 @@ static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
return neg ? a - sum : a + sum; \
}
-#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
+#define DEF_IMOP_16x4_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{ \
uint64_t sum = 0; \
@@ -1214,27 +1593,1070 @@ static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
return neg ? a - sum : a + sum; \
}
-DEF_IMOP_32(smopa_s, int8_t, int8_t)
-DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
-DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
-DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
+DEF_IMOP_8x4_32(smopa_s, int8_t, int8_t)
+DEF_IMOP_8x4_32(umopa_s, uint8_t, uint8_t)
+DEF_IMOP_8x4_32(sumopa_s, int8_t, uint8_t)
+DEF_IMOP_8x4_32(usmopa_s, uint8_t, int8_t)
-DEF_IMOP_64(smopa_d, int16_t, int16_t)
-DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
-DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
-DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
+DEF_IMOP_16x4_64(smopa_d, int16_t, int16_t)
+DEF_IMOP_16x4_64(umopa_d, uint16_t, uint16_t)
+DEF_IMOP_16x4_64(sumopa_d, int16_t, uint16_t)
+DEF_IMOP_16x4_64(usmopa_d, uint16_t, int16_t)
-#define DEF_IMOPH(NAME, S) \
- void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm, \
+#define DEF_IMOPH(P, NAME, S) \
+ void HELPER(P##_##NAME##_##S)(void *vza, void *vzn, void *vzm, \
void *vpn, void *vpm, uint32_t desc) \
{ do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); }
-DEF_IMOPH(smopa, s)
-DEF_IMOPH(umopa, s)
-DEF_IMOPH(sumopa, s)
-DEF_IMOPH(usmopa, s)
+DEF_IMOPH(sme, smopa, s)
+DEF_IMOPH(sme, umopa, s)
+DEF_IMOPH(sme, sumopa, s)
+DEF_IMOPH(sme, usmopa, s)
+
+DEF_IMOPH(sme, smopa, d)
+DEF_IMOPH(sme, umopa, d)
+DEF_IMOPH(sme, sumopa, d)
+DEF_IMOPH(sme, usmopa, d)
+
+static uint32_t bmopa_s(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg)
+{
+ uint32_t sum = ctpop32(~(n ^ m));
+ if (neg) {
+ sum = -sum;
+ }
+ if (!(p & 1)) {
+ sum = 0;
+ }
+ return a + sum;
+}
+
+DEF_IMOPH(sme2, bmopa, s)
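+
bmopa_s() is the 1-bit outer product step: ~(n ^ m) is the bitwise XNOR of two 32-bit element groups, so ctpop32() counts the positions in which they agree, i.e. the sum of 32 one-bit products. A standalone check of that identity:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t agree_bits(uint32_t n, uint32_t m)
    {
        return __builtin_popcount(~(n ^ m));
    }

    int main(void)
    {
        assert(agree_bits(0xF0F0F0F0u, 0xF0F0F0F0u) == 32); /* identical */
        assert(agree_bits(0xF0F0F0F0u, 0x0F0F0F0Fu) == 0);  /* complement */
        assert(agree_bits(0xF0F0F0F0u, 0xFF00FF00u) == 16); /* half agree */
        return 0;
    }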
+
+#define DEF_IMOP_16x2_32(NAME, NTYPE, MTYPE) \
+static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \
+{ \
+ uint32_t sum = 0; \
+ /* Apply P to N as a mask, making the inactive elements 0. */ \
+ n &= expand_pred_h(p); \
+ sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
+ sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
+ return neg ? a - sum : a + sum; \
+}
+
+DEF_IMOP_16x2_32(smopa2_s, int16_t, int16_t)
+DEF_IMOP_16x2_32(umopa2_s, uint16_t, uint16_t)
+
+DEF_IMOPH(sme2, smopa2, s)
+DEF_IMOPH(sme2, umopa2, s)
+
+#define DO_VDOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD, HN) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t svl = simd_oprsz(desc); \
+ intptr_t elements = svl / sizeof(TYPED); \
+ intptr_t eltperseg = 16 / sizeof(TYPED); \
+ intptr_t nreg = sizeof(TYPED) / sizeof(TYPEN); \
+ intptr_t vstride = (svl / nreg) * sizeof(ARMVectorReg); \
+ intptr_t zstride = sizeof(ARMVectorReg) / sizeof(TYPEN); \
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2); \
+ TYPEN *n = vn; \
+ TYPEM *m = vm; \
+ for (intptr_t r = 0; r < nreg; r++) { \
+ TYPED *d = vd + r * vstride; \
+ for (intptr_t seg = 0; seg < elements; seg += eltperseg) { \
+ intptr_t s = seg + idx; \
+ for (intptr_t e = seg; e < seg + eltperseg; e++) { \
+ TYPED sum = d[HD(e)]; \
+ for (intptr_t i = 0; i < nreg; i++) { \
+ TYPED nn = n[i * zstride + HN(nreg * e + r)]; \
+ TYPED mm = m[HN(nreg * s + i)]; \
+ sum += nn * mm; \
+ } \
+ d[HD(e)] = sum; \
+ } \
+ } \
+ } \
+}
+
+DO_VDOT_IDX(sme2_svdot_idx_4b, int32_t, int8_t, int8_t, H4, H1)
+DO_VDOT_IDX(sme2_uvdot_idx_4b, uint32_t, uint8_t, uint8_t, H4, H1)
+DO_VDOT_IDX(sme2_suvdot_idx_4b, int32_t, int8_t, uint8_t, H4, H1)
+DO_VDOT_IDX(sme2_usvdot_idx_4b, int32_t, uint8_t, int8_t, H4, H1)
+
+DO_VDOT_IDX(sme2_svdot_idx_4h, int64_t, int16_t, int16_t, H8, H2)
+DO_VDOT_IDX(sme2_uvdot_idx_4h, uint64_t, uint16_t, uint16_t, H8, H2)
+
+DO_VDOT_IDX(sme2_svdot_idx_2h, int32_t, int16_t, int16_t, H4, H2)
+DO_VDOT_IDX(sme2_uvdot_idx_2h, uint32_t, uint16_t, uint16_t, H4, H2)
+
+#undef DO_VDOT_IDX
+
+#define DO_MLALL(NAME, TYPEW, TYPEN, TYPEM, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t elements = simd_oprsz(desc) / sizeof(TYPEW); \
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 2); \
+ TYPEW *d = vd, *a = va; TYPEN *n = vn; TYPEM *m = vm; \
+ for (intptr_t i = 0; i < elements; ++i) { \
+ TYPEW nn = n[HN(i * 4 + sel)]; \
+ TYPEM mm = m[HN(i * 4 + sel)]; \
+ d[HW(i)] = a[HW(i)] OP (nn * mm); \
+ } \
+}
+
+DO_MLALL(sme2_smlall_s, int32_t, int8_t, int8_t, H4, H1, +)
+DO_MLALL(sme2_smlall_d, int64_t, int16_t, int16_t, H8, H2, +)
+DO_MLALL(sme2_smlsll_s, int32_t, int8_t, int8_t, H4, H1, -)
+DO_MLALL(sme2_smlsll_d, int64_t, int16_t, int16_t, H8, H2, -)
+
+DO_MLALL(sme2_umlall_s, uint32_t, uint8_t, uint8_t, H4, H1, +)
+DO_MLALL(sme2_umlall_d, uint64_t, uint16_t, uint16_t, H8, H2, +)
+DO_MLALL(sme2_umlsll_s, uint32_t, uint8_t, uint8_t, H4, H1, -)
+DO_MLALL(sme2_umlsll_d, uint64_t, uint16_t, uint16_t, H8, H2, -)
+
+DO_MLALL(sme2_usmlall_s, uint32_t, uint8_t, int8_t, H4, H1, +)
+
+#undef DO_MLALL
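+
In the DO_MLALL expansion each widened destination element i accumulates the product of the narrow source elements at index 4 * i + sel, i.e. sel picks one of the four narrow lanes that overlay every wide lane. A scalar model of the signed 8-bit-to-32-bit case, ignoring the H-macro byte-order adjustments (illustration only):

    #include <stddef.h>
    #include <stdint.h>

    static void smlall_s_model(int32_t *d, const int32_t *a,
                               const int8_t *n, const int8_t *m,
                               size_t elements, unsigned sel)
    {
        for (size_t i = 0; i < elements; i++) {
            d[i] = a[i] + (int32_t)n[4 * i + sel] * (int32_t)m[4 * i + sel];
        }
    }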
+
+#define DO_MLALL_IDX(NAME, TYPEW, TYPEN, TYPEM, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t elements = simd_oprsz(desc) / sizeof(TYPEW); \
+ intptr_t eltspersegment = 16 / sizeof(TYPEW); \
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 2); \
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 4); \
+ TYPEW *d = vd, *a = va; TYPEN *n = vn; TYPEM *m = vm; \
+ for (intptr_t i = 0; i < elements; i += eltspersegment) { \
+ TYPEW mm = m[HN(i * 4 + idx)]; \
+ for (intptr_t j = 0; j < eltspersegment; ++j) { \
+ TYPEN nn = n[HN((i + j) * 4 + sel)]; \
+ d[HW(i + j)] = a[HW(i + j)] OP (nn * mm); \
+ } \
+ } \
+}
+
+DO_MLALL_IDX(sme2_smlall_idx_s, int32_t, int8_t, int8_t, H4, H1, +)
+DO_MLALL_IDX(sme2_smlall_idx_d, int64_t, int16_t, int16_t, H8, H2, +)
+DO_MLALL_IDX(sme2_smlsll_idx_s, int32_t, int8_t, int8_t, H4, H1, -)
+DO_MLALL_IDX(sme2_smlsll_idx_d, int64_t, int16_t, int16_t, H8, H2, -)
+
+DO_MLALL_IDX(sme2_umlall_idx_s, uint32_t, uint8_t, uint8_t, H4, H1, +)
+DO_MLALL_IDX(sme2_umlall_idx_d, uint64_t, uint16_t, uint16_t, H8, H2, +)
+DO_MLALL_IDX(sme2_umlsll_idx_s, uint32_t, uint8_t, uint8_t, H4, H1, -)
+DO_MLALL_IDX(sme2_umlsll_idx_d, uint64_t, uint16_t, uint16_t, H8, H2, -)
+
+DO_MLALL_IDX(sme2_usmlall_idx_s, uint32_t, uint8_t, int8_t, H4, H1, +)
+DO_MLALL_IDX(sme2_sumlall_idx_s, uint32_t, int8_t, uint8_t, H4, H1, +)
+
+#undef DO_MLALL_IDX
+
+/* Convert and compress */
+void HELPER(sme2_bfcvt)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ ARMVectorReg scratch;
+ size_t oprsz = simd_oprsz(desc);
+ size_t i, n = oprsz / 4;
+ float32 *s0 = vs;
+ float32 *s1 = vs + sizeof(ARMVectorReg);
+ bfloat16 *d = vd;
+
+ if (vd == s1) {
+ s1 = memcpy(&scratch, s1, oprsz);
+ }
+
+ for (i = 0; i < n; ++i) {
+ d[H2(i)] = float32_to_bfloat16(s0[H4(i)], fpst);
+ }
+ for (i = 0; i < n; ++i) {
+ d[H2(i) + n] = float32_to_bfloat16(s1[H4(i)], fpst);
+ }
+}
+
+void HELPER(sme2_fcvt_n)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ ARMVectorReg scratch;
+ size_t oprsz = simd_oprsz(desc);
+ size_t i, n = oprsz / 4;
+ float32 *s0 = vs;
+ float32 *s1 = vs + sizeof(ARMVectorReg);
+ float16 *d = vd;
+
+ if (vd == s1) {
+ s1 = memcpy(&scratch, s1, oprsz);
+ }
+
+ for (i = 0; i < n; ++i) {
+ d[H2(i)] = sve_f32_to_f16(s0[H4(i)], fpst);
+ }
+ for (i = 0; i < n; ++i) {
+ d[H2(i) + n] = sve_f32_to_f16(s1[H4(i)], fpst);
+ }
+}
+
+#define SQCVT2(NAME, TW, TN, HW, HN, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 2)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(i)] = SAT(s0[HW(i)]); \
+ d[HN(i + n)] = SAT(s1[HW(i)]); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQCVT2(sme2_sqcvt_sh, int32_t, int16_t, H4, H2, do_ssat_h)
+SQCVT2(sme2_uqcvt_sh, uint32_t, uint16_t, H4, H2, do_usat_h)
+SQCVT2(sme2_sqcvtu_sh, int32_t, uint16_t, H4, H2, do_usat_h)
+
+#undef SQCVT2
+
+#define SQCVT4(NAME, TW, TN, HW, HN, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TW *s2 = vs + 2 * sizeof(ARMVectorReg); \
+ TW *s3 = vs + 3 * sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 4)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(i)] = SAT(s0[HW(i)]); \
+ d[HN(i + n)] = SAT(s1[HW(i)]); \
+ d[HN(i + 2 * n)] = SAT(s2[HW(i)]); \
+ d[HN(i + 3 * n)] = SAT(s3[HW(i)]); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQCVT4(sme2_sqcvt_sb, int32_t, int8_t, H4, H2, do_ssat_b)
+SQCVT4(sme2_uqcvt_sb, uint32_t, uint8_t, H4, H2, do_usat_b)
+SQCVT4(sme2_sqcvtu_sb, int32_t, uint8_t, H4, H2, do_usat_b)
+
+SQCVT4(sme2_sqcvt_dh, int64_t, int16_t, H8, H2, do_ssat_h)
+SQCVT4(sme2_uqcvt_dh, uint64_t, uint16_t, H8, H2, do_usat_h)
+SQCVT4(sme2_sqcvtu_dh, int64_t, uint16_t, H8, H2, do_usat_h)
+
+#undef SQCVT4
+
+#define SQRSHR2(NAME, TW, TN, HW, HN, RSHR, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ int shift = simd_data(desc); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 2)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(i)] = SAT(RSHR(s0[HW(i)], shift)); \
+ d[HN(i + n)] = SAT(RSHR(s1[HW(i)], shift)); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQRSHR2(sme2_sqrshr_sh, int32_t, int16_t, H4, H2, do_srshr, do_ssat_h)
+SQRSHR2(sme2_uqrshr_sh, uint32_t, uint16_t, H4, H2, do_urshr, do_usat_h)
+SQRSHR2(sme2_sqrshru_sh, int32_t, uint16_t, H4, H2, do_srshr, do_usat_h)
+
+#undef SQRSHR2
+
+#define SQRSHR4(NAME, TW, TN, HW, HN, RSHR, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ int shift = simd_data(desc); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TW *s2 = vs + 2 * sizeof(ARMVectorReg); \
+ TW *s3 = vs + 3 * sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 4)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(i)] = SAT(RSHR(s0[HW(i)], shift)); \
+ d[HN(i + n)] = SAT(RSHR(s1[HW(i)], shift)); \
+ d[HN(i + 2 * n)] = SAT(RSHR(s2[HW(i)], shift)); \
+ d[HN(i + 3 * n)] = SAT(RSHR(s3[HW(i)], shift)); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQRSHR4(sme2_sqrshr_sb, int32_t, int8_t, H4, H1, do_srshr, do_ssat_b)
+SQRSHR4(sme2_uqrshr_sb, uint32_t, uint8_t, H4, H1, do_urshr, do_usat_b)
+SQRSHR4(sme2_sqrshru_sb, int32_t, uint8_t, H4, H1, do_srshr, do_usat_b)
+
+SQRSHR4(sme2_sqrshr_dh, int64_t, int16_t, H8, H2, do_srshr, do_ssat_h)
+SQRSHR4(sme2_uqrshr_dh, uint64_t, uint16_t, H8, H2, do_urshr, do_usat_h)
+SQRSHR4(sme2_sqrshru_dh, int64_t, uint16_t, H8, H2, do_srshr, do_usat_h)
+
+#undef SQRSHR4
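+
+/*
+ * do_srshr and do_urshr round before shifting by adding 1 << (sh - 1)
+ * (see the definitions removed from sve_helper.c below), so e.g.
+ * do_srshr(5, 1) == 3 and do_srshr(-5, 1) == -2; the SAT macro then
+ * narrows the rounded result.
+ */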
+
+/* Convert and interleave */
+void HELPER(sme2_bfcvtn)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ size_t i, n = simd_oprsz(desc) / 4;
+ float32 *s0 = vs;
+ float32 *s1 = vs + sizeof(ARMVectorReg);
+ bfloat16 *d = vd;
+
+ for (i = 0; i < n; ++i) {
+ bfloat16 d0 = float32_to_bfloat16(s0[H4(i)], fpst);
+ bfloat16 d1 = float32_to_bfloat16(s1[H4(i)], fpst);
+ d[H2(i * 2 + 0)] = d0;
+ d[H2(i * 2 + 1)] = d1;
+ }
+}
+
+void HELPER(sme2_fcvtn)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ size_t i, n = simd_oprsz(desc) / 4;
+ float32 *s0 = vs;
+ float32 *s1 = vs + sizeof(ARMVectorReg);
+    float16 *d = vd;
+
+ for (i = 0; i < n; ++i) {
+        float16 d0 = sve_f32_to_f16(s0[H4(i)], fpst);
+        float16 d1 = sve_f32_to_f16(s1[H4(i)], fpst);
+ d[H2(i * 2 + 0)] = d0;
+ d[H2(i * 2 + 1)] = d1;
+ }
+}
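+
+/*
+ * Contrast with the non-interleaving forms above: with four elements
+ * per source vector, sme2_fcvt_n produces { a0 a1 a2 a3 b0 b1 b2 b3 }
+ * while sme2_fcvtn produces { a0 b0 a1 b1 a2 b2 a3 b3 }.
+ */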
+
+#define SQCVTN2(NAME, TW, TN, HW, HN, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 2)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(2 * i + 0)] = SAT(s0[HW(i)]); \
+ d[HN(2 * i + 1)] = SAT(s1[HW(i)]); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQCVTN2(sme2_sqcvtn_sh, int32_t, int16_t, H4, H2, do_ssat_h)
+SQCVTN2(sme2_uqcvtn_sh, uint32_t, uint16_t, H4, H2, do_usat_h)
+SQCVTN2(sme2_sqcvtun_sh, int32_t, uint16_t, H4, H2, do_usat_h)
+
+#undef SQCVTN2
+
+#define SQCVTN4(NAME, TW, TN, HW, HN, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TW *s2 = vs + 2 * sizeof(ARMVectorReg); \
+ TW *s3 = vs + 3 * sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 4)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(4 * i + 0)] = SAT(s0[HW(i)]); \
+ d[HN(4 * i + 1)] = SAT(s1[HW(i)]); \
+ d[HN(4 * i + 2)] = SAT(s2[HW(i)]); \
+ d[HN(4 * i + 3)] = SAT(s3[HW(i)]); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQCVTN4(sme2_sqcvtn_sb, int32_t, int8_t, H4, H1, do_ssat_b)
+SQCVTN4(sme2_uqcvtn_sb, uint32_t, uint8_t, H4, H1, do_usat_b)
+SQCVTN4(sme2_sqcvtun_sb, int32_t, uint8_t, H4, H1, do_usat_b)
+
+SQCVTN4(sme2_sqcvtn_dh, int64_t, int16_t, H8, H2, do_ssat_h)
+SQCVTN4(sme2_uqcvtn_dh, uint64_t, uint16_t, H8, H2, do_usat_h)
+SQCVTN4(sme2_sqcvtun_dh, int64_t, uint16_t, H8, H2, do_usat_h)
+
+#undef SQCVTN4
+
+#define SQRSHRN2(NAME, TW, TN, HW, HN, RSHR, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ int shift = simd_data(desc); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 2)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(2 * i + 0)] = SAT(RSHR(s0[HW(i)], shift)); \
+ d[HN(2 * i + 1)] = SAT(RSHR(s1[HW(i)], shift)); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQRSHRN2(sme2_sqrshrn_sh, int32_t, int16_t, H4, H2, do_srshr, do_ssat_h)
+SQRSHRN2(sme2_uqrshrn_sh, uint32_t, uint16_t, H4, H2, do_urshr, do_usat_h)
+SQRSHRN2(sme2_sqrshrun_sh, int32_t, uint16_t, H4, H2, do_srshr, do_usat_h)
+
+#undef SQRSHRN2
+
+#define SQRSHRN4(NAME, TW, TN, HW, HN, RSHR, SAT) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch; \
+ size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \
+ int shift = simd_data(desc); \
+ TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \
+ TW *s2 = vs + 2 * sizeof(ARMVectorReg); \
+ TW *s3 = vs + 3 * sizeof(ARMVectorReg); \
+ TN *d = vd; \
+ if (vectors_overlap(vd, 1, vs, 4)) { \
+ d = (TN *)&scratch; \
+ } \
+ for (size_t i = 0; i < n; ++i) { \
+ d[HN(4 * i + 0)] = SAT(RSHR(s0[HW(i)], shift)); \
+ d[HN(4 * i + 1)] = SAT(RSHR(s1[HW(i)], shift)); \
+ d[HN(4 * i + 2)] = SAT(RSHR(s2[HW(i)], shift)); \
+ d[HN(4 * i + 3)] = SAT(RSHR(s3[HW(i)], shift)); \
+ } \
+ if (d != vd) { \
+ memcpy(vd, d, oprsz); \
+ } \
+}
+
+SQRSHRN4(sme2_sqrshrn_sb, int32_t, int8_t, H4, H1, do_srshr, do_ssat_b)
+SQRSHRN4(sme2_uqrshrn_sb, uint32_t, uint8_t, H4, H1, do_urshr, do_usat_b)
+SQRSHRN4(sme2_sqrshrun_sb, int32_t, uint8_t, H4, H1, do_srshr, do_usat_b)
+
+SQRSHRN4(sme2_sqrshrn_dh, int64_t, int16_t, H8, H2, do_srshr, do_ssat_h)
+SQRSHRN4(sme2_uqrshrn_dh, uint64_t, uint16_t, H8, H2, do_urshr, do_usat_h)
+SQRSHRN4(sme2_sqrshrun_dh, int64_t, uint16_t, H8, H2, do_srshr, do_usat_h)
+
+#undef SQRSHRN4
+
+/* Expand and convert */
+void HELPER(sme2_fcvt_w)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ ARMVectorReg scratch;
+ size_t oprsz = simd_oprsz(desc);
+ size_t i, n = oprsz / 4;
+ float16 *s = vs;
+ float32 *d0 = vd;
+ float32 *d1 = vd + sizeof(ARMVectorReg);
+
+ if (vectors_overlap(vd, 1, vs, 2)) {
+ s = memcpy(&scratch, s, oprsz);
+ }
+
+ for (i = 0; i < n; ++i) {
+ d0[H4(i)] = sve_f16_to_f32(s[H2(i)], fpst);
+ }
+ for (i = 0; i < n; ++i) {
+ d1[H4(i)] = sve_f16_to_f32(s[H2(n + i)], fpst);
+ }
+}
+
+#define UNPK(NAME, SREG, TW, TN, HW, HN) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch[SREG]; \
+ size_t oprsz = simd_oprsz(desc); \
+ size_t n = oprsz / sizeof(TW); \
+ if (vectors_overlap(vd, 2 * SREG, vs, SREG)) { \
+ vs = memcpy(scratch, vs, sizeof(scratch)); \
+ } \
+ for (size_t r = 0; r < SREG; ++r) { \
+ TN *s = vs + r * sizeof(ARMVectorReg); \
+ for (size_t i = 0; i < 2; ++i) { \
+ TW *d = vd + (2 * r + i) * sizeof(ARMVectorReg); \
+ for (size_t e = 0; e < n; ++e) { \
+ d[HW(e)] = s[HN(i * n + e)]; \
+ } \
+ } \
+ } \
+}
+
+UNPK(sme2_sunpk2_bh, 1, int16_t, int8_t, H2, H1)
+UNPK(sme2_sunpk2_hs, 1, int32_t, int16_t, H4, H2)
+UNPK(sme2_sunpk2_sd, 1, int64_t, int32_t, H8, H4)
+
+UNPK(sme2_sunpk4_bh, 2, int16_t, int8_t, H2, H1)
+UNPK(sme2_sunpk4_hs, 2, int32_t, int16_t, H4, H2)
+UNPK(sme2_sunpk4_sd, 2, int64_t, int32_t, H8, H4)
+
+UNPK(sme2_uunpk2_bh, 1, uint16_t, uint8_t, H2, H1)
+UNPK(sme2_uunpk2_hs, 1, uint32_t, uint16_t, H4, H2)
+UNPK(sme2_uunpk2_sd, 1, uint64_t, uint32_t, H8, H4)
+
+UNPK(sme2_uunpk4_bh, 2, uint16_t, uint8_t, H2, H1)
+UNPK(sme2_uunpk4_hs, 2, uint32_t, uint16_t, H4, H2)
+UNPK(sme2_uunpk4_sd, 2, uint64_t, uint32_t, H8, H4)
+
+#undef UNPK
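+
+/*
+ * UNPK widens each source register into two destination registers:
+ * for SREG = 1 and 8 -> 16 bits, a source { a0 a1 ... a7 } becomes
+ * d0 = { a0 a1 a2 a3 } and d1 = { a4 a5 a6 a7 }, sign- or
+ * zero-extended according to the TW/TN types.
+ */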
+
+/* Deinterleave and convert. */
+void HELPER(sme2_fcvtl)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ size_t i, n = simd_oprsz(desc) / 4;
+ float16 *s = vs;
+ float32 *d0 = vd;
+ float32 *d1 = vd + sizeof(ARMVectorReg);
+
+ for (i = 0; i < n; ++i) {
+ float32 v0 = sve_f16_to_f32(s[H2(i * 2 + 0)], fpst);
+ float32 v1 = sve_f16_to_f32(s[H2(i * 2 + 1)], fpst);
+ d0[H4(i)] = v0;
+ d1[H4(i)] = v1;
+ }
+}
+
+void HELPER(sme2_scvtf)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ size_t i, n = simd_oprsz(desc) / 4;
+    float32 *d = vd;
+    int32_t *s = vs;
+
+ for (i = 0; i < n; ++i) {
+ d[i] = int32_to_float32(s[i], fpst);
+ }
+}
+
+void HELPER(sme2_ucvtf)(void *vd, void *vs, float_status *fpst, uint32_t desc)
+{
+ size_t i, n = simd_oprsz(desc) / 4;
+    float32 *d = vd;
+    uint32_t *s = vs;
+
+ for (i = 0; i < n; ++i) {
+ d[i] = uint32_to_float32(s[i], fpst);
+ }
+}
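+
+/*
+ * Lane-wise int32 -> float32 and uint32 -> float32 conversions:
+ * int32_to_float32(-1, fpst) yields -1.0f, with fpst supplying the
+ * rounding mode for inexact inputs such as (1 << 25) + 1, which is
+ * not representable in float32.
+ */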
+
+#define ZIP2(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ ARMVectorReg scratch[2]; \
+ size_t oprsz = simd_oprsz(desc); \
+ size_t pairs = oprsz / (sizeof(TYPE) * 2); \
+ TYPE *n = vn, *m = vm; \
+ if (vectors_overlap(vd, 2, vn, 1)) { \
+ n = memcpy(&scratch[0], vn, oprsz); \
+ } \
+ if (vectors_overlap(vd, 2, vm, 1)) { \
+ m = memcpy(&scratch[1], vm, oprsz); \
+ } \
+ for (size_t r = 0; r < 2; ++r) { \
+ TYPE *d = vd + r * sizeof(ARMVectorReg); \
+ size_t base = r * pairs; \
+ for (size_t p = 0; p < pairs; ++p) { \
+ d[H(2 * p + 0)] = n[base + H(p)]; \
+ d[H(2 * p + 1)] = m[base + H(p)]; \
+ } \
+ } \
+}
+
+ZIP2(sme2_zip2_b, uint8_t, H1)
+ZIP2(sme2_zip2_h, uint16_t, H2)
+ZIP2(sme2_zip2_s, uint32_t, H4)
+ZIP2(sme2_zip2_d, uint64_t, )
+ZIP2(sme2_zip2_q, Int128, )
+
+#undef ZIP2
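+
+/*
+ * Example with eight elements per register: n = { n0 ... n7 } and
+ * m = { m0 ... m7 } give d0 = { n0 m0 n1 m1 n2 m2 n3 m3 } and
+ * d1 = { n4 m4 n5 m5 n6 m6 n7 m7 }.
+ */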
+
+#define ZIP4(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch[4]; \
+ size_t oprsz = simd_oprsz(desc); \
+ size_t quads = oprsz / (sizeof(TYPE) * 4); \
+ TYPE *s0, *s1, *s2, *s3; \
+ if (vs == vd) { \
+ vs = memcpy(scratch, vs, sizeof(scratch)); \
+ } \
+ s0 = vs; \
+ s1 = vs + sizeof(ARMVectorReg); \
+ s2 = vs + 2 * sizeof(ARMVectorReg); \
+ s3 = vs + 3 * sizeof(ARMVectorReg); \
+ for (size_t r = 0; r < 4; ++r) { \
+ TYPE *d = vd + r * sizeof(ARMVectorReg); \
+ size_t base = r * quads; \
+ for (size_t q = 0; q < quads; ++q) { \
+ d[H(4 * q + 0)] = s0[base + H(q)]; \
+ d[H(4 * q + 1)] = s1[base + H(q)]; \
+ d[H(4 * q + 2)] = s2[base + H(q)]; \
+ d[H(4 * q + 3)] = s3[base + H(q)]; \
+ } \
+ } \
+}
+
+ZIP4(sme2_zip4_b, uint8_t, H1)
+ZIP4(sme2_zip4_h, uint16_t, H2)
+ZIP4(sme2_zip4_s, uint32_t, H4)
+ZIP4(sme2_zip4_d, uint64_t, )
+ZIP4(sme2_zip4_q, Int128, )
+
+#undef ZIP4
+
+#define UZP2(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ ARMVectorReg scratch[2]; \
+ size_t oprsz = simd_oprsz(desc); \
+ size_t pairs = oprsz / (sizeof(TYPE) * 2); \
+ TYPE *d0 = vd, *d1 = vd + sizeof(ARMVectorReg); \
+ if (vectors_overlap(vd, 2, vn, 1)) { \
+ vn = memcpy(&scratch[0], vn, oprsz); \
+ } \
+ if (vectors_overlap(vd, 2, vm, 1)) { \
+ vm = memcpy(&scratch[1], vm, oprsz); \
+ } \
+ for (size_t r = 0; r < 2; ++r) { \
+ TYPE *s = r ? vm : vn; \
+ size_t base = r * pairs; \
+ for (size_t p = 0; p < pairs; ++p) { \
+ d0[base + H(p)] = s[H(2 * p + 0)]; \
+ d1[base + H(p)] = s[H(2 * p + 1)]; \
+ } \
+ } \
+}
+
+UZP2(sme2_uzp2_b, uint8_t, H1)
+UZP2(sme2_uzp2_h, uint16_t, H2)
+UZP2(sme2_uzp2_s, uint32_t, H4)
+UZP2(sme2_uzp2_d, uint64_t, )
+UZP2(sme2_uzp2_q, Int128, )
+
+#undef UZP2
+
+#define UZP4(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ ARMVectorReg scratch[4]; \
+ size_t oprsz = simd_oprsz(desc); \
+ size_t quads = oprsz / (sizeof(TYPE) * 4); \
+ TYPE *d0, *d1, *d2, *d3; \
+ if (vs == vd) { \
+ vs = memcpy(scratch, vs, sizeof(scratch)); \
+ } \
+ d0 = vd; \
+ d1 = vd + sizeof(ARMVectorReg); \
+ d2 = vd + 2 * sizeof(ARMVectorReg); \
+ d3 = vd + 3 * sizeof(ARMVectorReg); \
+ for (size_t r = 0; r < 4; ++r) { \
+ TYPE *s = vs + r * sizeof(ARMVectorReg); \
+ size_t base = r * quads; \
+ for (size_t q = 0; q < quads; ++q) { \
+ d0[base + H(q)] = s[H(4 * q + 0)]; \
+ d1[base + H(q)] = s[H(4 * q + 1)]; \
+ d2[base + H(q)] = s[H(4 * q + 2)]; \
+ d3[base + H(q)] = s[H(4 * q + 3)]; \
+ } \
+ } \
+}
+
+UZP4(sme2_uzp4_b, uint8_t, H1)
+UZP4(sme2_uzp4_h, uint16_t, H2)
+UZP4(sme2_uzp4_s, uint32_t, H4)
+UZP4(sme2_uzp4_d, uint64_t, )
+UZP4(sme2_uzp4_q, Int128, )
+
+#undef UZP4
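+
+/*
+ * UZP is the inverse permutation of ZIP at the same element size:
+ * feeding the two registers produced by sme2_zip2_* through
+ * sme2_uzp2_* returns the original operands.
+ */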
+
+#define ICLAMP(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ size_t stride = sizeof(ARMVectorReg) / sizeof(TYPE); \
+ size_t elements = simd_oprsz(desc) / sizeof(TYPE); \
+ size_t nreg = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (size_t e = 0; e < elements; e++) { \
+ TYPE nn = n[H(e)], mm = m[H(e)]; \
+ for (size_t r = 0; r < nreg; r++) { \
+ TYPE *dd = &d[r * stride + H(e)]; \
+ *dd = MIN(MAX(*dd, nn), mm); \
+ } \
+ } \
+}
+
+ICLAMP(sme2_sclamp_b, int8_t, H1)
+ICLAMP(sme2_sclamp_h, int16_t, H2)
+ICLAMP(sme2_sclamp_s, int32_t, H4)
+ICLAMP(sme2_sclamp_d, int64_t, H8)
+
+ICLAMP(sme2_uclamp_b, uint8_t, H1)
+ICLAMP(sme2_uclamp_h, uint16_t, H2)
+ICLAMP(sme2_uclamp_s, uint32_t, H4)
+ICLAMP(sme2_uclamp_d, uint64_t, H8)
+
+#undef ICLAMP
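+
+/*
+ * SCLAMP/UCLAMP compute MIN(MAX(d, n), m) per lane: with n = 0 and
+ * m = 5, a lane value of 9 clamps to 5 and -3 clamps to 0.  The
+ * single n/m pair bounds all nreg destination registers.
+ */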
+
+/*
+ * Note that the argument ordering to minnum and maxnum must match
+ * the ARM pseudocode so that NaNs are propagated properly.
+ */
+#define FCLAMP(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, \
+ float_status *fpst, uint32_t desc) \
+{ \
+ size_t stride = sizeof(ARMVectorReg) / sizeof(TYPE); \
+ size_t elements = simd_oprsz(desc) / sizeof(TYPE); \
+ size_t nreg = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (size_t e = 0; e < elements; e++) { \
+ TYPE nn = n[H(e)], mm = m[H(e)]; \
+ for (size_t r = 0; r < nreg; r++) { \
+ TYPE *dd = &d[r * stride + H(e)]; \
+ *dd = TYPE##_minnum(TYPE##_maxnum(nn, *dd, fpst), mm, fpst); \
+ } \
+ } \
+}
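+
+/*
+ * Expanded, each lane computes minnum(maxnum(n, d), m).  Using
+ * minnum/maxnum rather than plain min/max means a quiet NaN bound
+ * selects the other operand instead of poisoning the result.
+ */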
+
+FCLAMP(sme2_fclamp_h, float16, H2)
+FCLAMP(sme2_fclamp_s, float32, H4)
+FCLAMP(sme2_fclamp_d, float64, H8)
+FCLAMP(sme2_bfclamp, bfloat16, H2)
+
+#undef FCLAMP
+
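+/*
+ * decode_counter() unpacks a predicate-as-counter register: the first
+ * p.count elements are true (false when p.invert is set), and when
+ * the counter's element size is wider than the access, p.lg2_stride
+ * is nonzero and only every (1 << lg2_stride)-th lane of the true
+ * portion is actually true.
+ */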
+void HELPER(sme2_sel_b)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint8_t);
+ DecodeCounter p = decode_counter(png, vl, MO_8);
+
+ if (p.lg2_stride == 0) {
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H1(e)] = n[H1(e)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H1(e)] = n[H1(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ }
+ }
+ }
+ } else {
+ int estride = 1 << p.lg2_stride;
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ for (; e < elements; e += estride) {
+ d[H1(e)] = n[H1(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H1(e + i)] = m[H1(e + i)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e += estride) {
+ d[H1(e)] = n[H1(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H1(e + i)] = m[H1(e + i)];
+ }
+ }
+ for (; e < elements; e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ }
+ }
+ }
+}
+
+void HELPER(sme2_sel_h)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint16_t);
+ DecodeCounter p = decode_counter(png, vl, MO_16);
+
+ if (p.lg2_stride == 0) {
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H2(e)] = n[H2(e)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H2(e)] = n[H2(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ }
+ }
+ }
+ } else {
+ int estride = 1 << p.lg2_stride;
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ for (; e < elements; e += estride) {
+ d[H2(e)] = n[H2(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H2(e + i)] = m[H2(e + i)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e += estride) {
+ d[H2(e)] = n[H2(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H2(e + i)] = m[H2(e + i)];
+ }
+ }
+ for (; e < elements; e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ }
+ }
+ }
+}
-DEF_IMOPH(smopa, d)
-DEF_IMOPH(umopa, d)
-DEF_IMOPH(sumopa, d)
-DEF_IMOPH(usmopa, d)
+void HELPER(sme2_sel_s)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint32_t);
+ DecodeCounter p = decode_counter(png, vl, MO_32);
+
+ if (p.lg2_stride == 0) {
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H4(e)] = n[H4(e)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H4(e)] = n[H4(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ }
+ }
+ }
+ } else {
+ /* p.esz must be MO_64, so stride must be 2. */
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ for (; e < elements; e += 2) {
+ d[H4(e)] = n[H4(e)];
+ d[H4(e + 1)] = m[H4(e + 1)];
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e += 2) {
+ d[H4(e)] = n[H4(e)];
+ d[H4(e + 1)] = m[H4(e + 1)];
+ }
+ for (; e < elements; e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ }
+ }
+ }
+}
+
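+/*
+ * For 64-bit elements the counter's element size cannot exceed the
+ * access size, so there is no strided case, and the lanes are
+ * host-contiguous (no H macro), so the partial split can also use
+ * memcpy.
+ */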
+void HELPER(sme2_sel_d)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint64_t);
+ DecodeCounter p = decode_counter(png, vl, MO_64);
+
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint64_t *d = vd + r * sizeof(ARMVectorReg);
+ uint64_t *n = vn + r * sizeof(ARMVectorReg);
+ uint64_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ memcpy(d, m, split * sizeof(uint64_t));
+ memcpy(d + split, n + split,
+ (elements - split) * sizeof(uint64_t));
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint64_t *d = vd + r * sizeof(ARMVectorReg);
+ uint64_t *n = vn + r * sizeof(ARMVectorReg);
+ uint64_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ memcpy(d, n, split * sizeof(uint64_t));
+ memcpy(d + split, m + split,
+ (elements - split) * sizeof(uint64_t));
+ }
+ }
+ }
+}
diff --git a/target/arm/tcg/sve.decode b/target/arm/tcg/sve.decode
index 04b6fcc..2efd5f5 100644
--- a/target/arm/tcg/sve.decode
+++ b/target/arm/tcg/sve.decode
@@ -30,6 +30,7 @@
%size_23 23:2
%dtype_23_13 23:2 13:2
%index3_22_19 22:1 19:2
+%index3_22_17 22:1 17:2
%index3_19_11 19:2 11:1
%index2_20_11 20:1 11:1
@@ -57,6 +58,11 @@
# as propagated via the MOVPRFX instruction.
%reg_movprfx 0:5
+%rn_ax2 6:4 !function=times_2
+
+%pnd 0:3 !function=plus_8
+%pnn 5:3 !function=plus_8
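+# The 3-bit fields above can only name the predicate-as-counter
+# registers PN8-PN15, hence the plus_8 adjustment.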
+
###########################################################################
# Named attribute sets. These are used to make nice(er) names
# when creating helpers common to those for the individual
@@ -102,6 +108,7 @@
# Two operand
@pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz
@rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz
+@rd_rnx2 ........ ... ..... ...... ..... rd:5 &rr_esz rn=%rn_ax2
# Two operand with governing predicate, flags setting
@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s
@@ -131,11 +138,11 @@
@rda_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 \
&rrrr_esz ra=%reg_movprfx
-# Four operand with unused vector element size
-@rda_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 \
- &rrrr_esz esz=0 ra=%reg_movprfx
-@rdn_ra_rm_e0 ........ ... rm:5 ... ... ra:5 rd:5 \
- &rrrr_esz esz=0 rn=%reg_movprfx
+# Four operand with explicit vector element size
+@rda_rn_rm_ex ........ ... rm:5 ... ... rn:5 rd:5 \
+ &rrrr_esz ra=%reg_movprfx
+@rdn_ra_rm_ex ........ ... rm:5 ... ... ra:5 rd:5 \
+ &rrrr_esz rn=%reg_movprfx
# Three operand with "memory" size, aka immediate left shift
@rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri
@@ -222,6 +229,9 @@
@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load
@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load
+@rprr_load ....... .... rm:5 ... pg:3 rn:5 rd:5 &rprr_load
+@rpri_load ....... .... . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load
+
@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \
&rprr_load dtype=%msz_dtype
@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \
@@ -245,7 +255,7 @@
# Stores; user must fill in ESZ, MSZ, NREG as needed.
@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store
-@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store
+@rpri_store ....... .. .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store
@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \
&rprr_store nreg=0
@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \
@@ -320,6 +330,11 @@ ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn
EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn
ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn
+# SVE2.1 bitwise logical reduction (quadwords)
+ORQV 00000100 .. 011 100 001 ... ..... ..... @rd_pg_rn
+EORQV 00000100 .. 011 101 001 ... ..... ..... @rd_pg_rn
+ANDQV 00000100 .. 011 110 001 ... ..... ..... @rd_pg_rn
+
# SVE constructive prefix (predicated)
MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn
MOVPRFX_m 00000100 .. 010 001 001 ... ..... ..... @rd_pg_rn
@@ -335,6 +350,13 @@ UMAXV 00000100 .. 001 001 001 ... ..... ..... @rd_pg_rn
SMINV 00000100 .. 001 010 001 ... ..... ..... @rd_pg_rn
UMINV 00000100 .. 001 011 001 ... ..... ..... @rd_pg_rn
+# SVE2.1 segment reduction
+ADDQV 00000100 .. 000 101 001 ... ..... ..... @rd_pg_rn
+SMAXQV 00000100 .. 001 100 001 ... ..... ..... @rd_pg_rn
+SMINQV 00000100 .. 001 110 001 ... ..... ..... @rd_pg_rn
+UMAXQV 00000100 .. 001 101 001 ... ..... ..... @rd_pg_rn
+UMINQV 00000100 .. 001 111 001 ... ..... ..... @rd_pg_rn
+
### SVE Shift by Immediate - Predicated Group
# SVE bitwise shift by immediate (predicated)
@@ -428,12 +450,12 @@ XAR 00000100 .. 1 ..... 001 101 rm:5 rd:5 &rrri_esz \
rn=%reg_movprfx esz=%tszimm16_esz imm=%tszimm16_shr
# SVE2 bitwise ternary operations
-EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0
-BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
-BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0
-BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
-BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
-NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0
+EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_ex esz=0
+BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0
+BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_ex esz=0
+BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0
+BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0
+NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0
### SVE Index Generation Group
@@ -559,6 +581,14 @@ DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn
DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \
&rri imm=%imm7_22_16
+# SVE Permute Vector - one source quadwords
+DUPQ 00000101 001 imm:4 1 001001 rn:5 rd:5 &rri_esz esz=0
+DUPQ 00000101 001 imm:3 10 001001 rn:5 rd:5 &rri_esz esz=1
+DUPQ 00000101 001 imm:2 100 001001 rn:5 rd:5 &rri_esz esz=2
+DUPQ 00000101 001 imm:1 1000 001001 rn:5 rd:5 &rri_esz esz=3
+
+EXTQ 00000101 0110 imm:4 001001 rn:5 rd:5 &rri
+
# SVE insert SIMD&FP scalar register
INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm
@@ -568,6 +598,22 @@ INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm
# SVE reverse vector elements
REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn
+# SVE move predicate to/from vector
+
+PMOV_pv 00000101 00 101 01 0001110 rn:5 0 rd:4 \
+ &rri_esz esz=0 imm=0
+PMOV_pv 00000101 00 101 1 imm:1 0001110 rn:5 0 rd:4 &rri_esz esz=1
+PMOV_pv 00000101 01 101 imm:2 0001110 rn:5 0 rd:4 &rri_esz esz=2
+PMOV_pv 00000101 1. 101 .. 0001110 rn:5 0 rd:4 \
+ &rri_esz esz=3 imm=%index3_22_17
+
+PMOV_vp 00000101 00 101 01 1001110 0 rn:4 rd:5 \
+ &rri_esz esz=0 imm=0
+PMOV_vp 00000101 00 101 1 imm:1 1001110 0 rn:4 rd:5 &rri_esz esz=1
+PMOV_vp 00000101 01 101 imm:2 1001110 0 rn:4 rd:5 &rri_esz esz=2
+PMOV_vp 00000101 1. 101 .. 1001110 0 rn:4 rd:5 \
+ &rri_esz esz=3 imm=%index3_22_17
+
# SVE vector table lookup
TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm
@@ -614,6 +660,15 @@ UZP2_q 00000101 10 1 ..... 000 011 ..... ..... @rd_rn_rm_e0
TRN1_q 00000101 10 1 ..... 000 110 ..... ..... @rd_rn_rm_e0
TRN2_q 00000101 10 1 ..... 000 111 ..... ..... @rd_rn_rm_e0
+# SVE2.1 permute vector elements (quadwords)
+ZIPQ1 01000100 .. 0 ..... 111 000 ..... ..... @rd_rn_rm
+ZIPQ2 01000100 .. 0 ..... 111 001 ..... ..... @rd_rn_rm
+UZPQ1 01000100 .. 0 ..... 111 010 ..... ..... @rd_rn_rm
+UZPQ2 01000100 .. 0 ..... 111 011 ..... ..... @rd_rn_rm
+
+TBLQ 01000100 .. 0 ..... 111 110 ..... ..... @rd_rn_rm
+TBXQ 00000101 .. 1 ..... 001 101 ..... ..... @rd_rn_rm
+
### SVE Permute - Predicated Group
# SVE compress active elements
@@ -725,6 +780,7 @@ PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000
# SVE predicate initialize
PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4
+PTRUE_cnt 00100101 esz:2 1000000111100000010 ... rd=%pnd
# SVE initialize FFR
SETFFR 00100101 0010 1100 1001 0000 0000 0000
@@ -765,7 +821,8 @@ BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s
### SVE Predicate Count Group
# SVE predicate count
-CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn
+CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn
+CNTP_c 00100101 esz:2 100 000 10 000 vl:1 1 rn:4 rd:5
# SVE inc/dec register by predicate count
INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1
@@ -786,11 +843,35 @@ SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred
CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000
# SVE integer compare scalar count and limit
-WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 lt:1 rn:5 eq:1 rd:4
+&while esz rd rn rm sf u eq
+WHILE_lt 00100101 esz:2 1 rm:5 000 sf:1 u:1 1 rn:5 eq:1 rd:4 &while
+WHILE_gt 00100101 esz:2 1 rm:5 000 sf:1 u:1 0 rn:5 eq:1 rd:4 &while
# SVE2 pointer conflict compare
WHILE_ptr 00100101 esz:2 1 rm:5 001 100 rn:5 rw:1 rd:4
+# SVE2.1 predicate pair
+%pd_pair 1:3 !function=times_2
+@while_pair ........ esz:2 . rm:5 .... u:1 . rn:5 . ... eq:1 \
+ &while rd=%pd_pair sf=1
+
+WHILE_lt_pair 00100101 .. 1 ..... 0101 . 1 ..... 1 ... . @while_pair
+WHILE_gt_pair 00100101 .. 1 ..... 0101 . 0 ..... 1 ... . @while_pair
+
+# SVE2.1 predicate as count
+@while_cnt ........ esz:2 . rm:5 .... u:1 . rn:5 . eq:1 ... \
+ &while rd=%pnd sf=1
+
+WHILE_lt_cnt2 00100101 .. 1 ..... 0100 . 1 ..... 1 . ... @while_cnt
+WHILE_lt_cnt4 00100101 .. 1 ..... 0110 . 1 ..... 1 . ... @while_cnt
+WHILE_gt_cnt2 00100101 .. 1 ..... 0100 . 0 ..... 1 . ... @while_cnt
+WHILE_gt_cnt4 00100101 .. 1 ..... 0110 . 0 ..... 1 . ... @while_cnt
+
+# SVE2.1 extract mask predicate from predicate-as-counter
+&pext rd rn esz imm
+PEXT_1 00100101 esz:2 1 00000 0111 00 imm:2 ... 1 rd:4 &pext rn=%pnn
+PEXT_2 00100101 esz:2 1 00000 0111 010 imm:1 ... 1 rd:4 &pext rn=%pnn
+
### SVE Integer Wide Immediate - Unpredicated Group
# SVE broadcast floating-point immediate (unpredicated)
@@ -851,10 +932,13 @@ CDOT_zzzz 01000100 esz:2 0 rm:5 0001 rot:2 rn:5 rd:5 ra=%reg_movprfx
#### SVE Multiply - Indexed
# SVE integer dot product (indexed)
-SDOT_zzxw_s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2
-SDOT_zzxw_d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3
-UDOT_zzxw_s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2
-UDOT_zzxw_d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3
+SDOT_zzxw_4s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2
+SDOT_zzxw_4d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3
+UDOT_zzxw_4s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2
+UDOT_zzxw_4d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3
+
+SDOT_zzxw_2s 01000100 10 0 ..... 110010 ..... ..... @rrxr_2 esz=2
+UDOT_zzxw_2s 01000100 10 0 ..... 110011 ..... ..... @rrxr_2 esz=2
# SVE2 integer multiply-add (indexed)
MLA_zzxz_h 01000100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=1
@@ -873,8 +957,8 @@ SQRDMLSH_zzxz_s 01000100 10 1 ..... 000101 ..... ..... @rrxr_2 esz=2
SQRDMLSH_zzxz_d 01000100 11 1 ..... 000101 ..... ..... @rrxr_1 esz=3
# SVE mixed sign dot product (indexed)
-USDOT_zzxw_s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2
-SUDOT_zzxw_s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2
+USDOT_zzxw_4s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2
+SUDOT_zzxw_4s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2
# SVE2 saturating multiply-add (indexed)
SQDMLALB_zzxw_s 01000100 10 1 ..... 0010.0 ..... ..... @rrxr_3a esz=2
@@ -990,6 +1074,14 @@ FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn
FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn
FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn
+### SVE FP recursive reduction (quadwords)
+
+FADDQV 01100100 .. 010 000 101 ... ..... ..... @rd_pg_rn
+FMAXNMQV 01100100 .. 010 100 101 ... ..... ..... @rd_pg_rn
+FMINNMQV 01100100 .. 010 101 101 ... ..... ..... @rd_pg_rn
+FMAXQV 01100100 .. 010 110 101 ... ..... ..... @rd_pg_rn
+FMINQV 01100100 .. 010 111 101 ... ..... ..... @rd_pg_rn
+
## SVE Floating Point Unary Operations - Unpredicated Group
FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn
@@ -1151,12 +1243,24 @@ LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \
# SVE contiguous load (scalar plus scalar)
LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0
+# LD1W (128-bit element)
+LD_zprr 1010010 1000 rm:5 100 pg:3 rn:5 rd:5 \
+ &rprr_load dtype=16 nreg=0
+# LD1D (128-bit element)
+LD_zprr 1010010 1100 rm:5 100 pg:3 rn:5 rd:5 \
+ &rprr_load dtype=17 nreg=0
# SVE contiguous first-fault load (scalar plus scalar)
LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0
# SVE contiguous load (scalar plus immediate)
LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0
+# LD1W (128-bit element)
+LD_zpri 1010010 1000 1 imm:s4 001 pg:3 rn:5 rd:5 \
+ &rpri_load dtype=16 nreg=0
+# LD1D (128-bit element)
+LD_zpri 1010010 1100 1 imm:s4 001 pg:3 rn:5 rd:5 \
+ &rpri_load dtype=17 nreg=0
# SVE contiguous non-fault load (scalar plus immediate)
LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0
@@ -1166,12 +1270,26 @@ LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0
# SVE load multiple structures (scalar plus scalar)
# LD2B, LD2H, LD2W, LD2D; etc.
LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz
+# LD[234]Q
+LD_zprr 1010010 01 01 ..... 100 ... ..... ..... \
+ @rprr_load dtype=18 nreg=1
+LD_zprr 1010010 10 01 ..... 100 ... ..... ..... \
+ @rprr_load dtype=18 nreg=2
+LD_zprr 1010010 11 01 ..... 100 ... ..... ..... \
+ @rprr_load dtype=18 nreg=3
# SVE contiguous non-temporal load (scalar plus immediate)
# LDNT1B, LDNT1H, LDNT1W, LDNT1D
# SVE load multiple structures (scalar plus immediate)
# LD2B, LD2H, LD2W, LD2D; etc.
LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz
+# LD[234]Q
+LD_zpri 1010010 01 001 .... 111 ... ..... ..... \
+ @rpri_load dtype=18 nreg=1
+LD_zpri 1010010 10 001 .... 111 ... ..... ..... \
+ @rpri_load dtype=18 nreg=2
+LD_zpri 1010010 11 001 .... 111 ... ..... ..... \
+ @rpri_load dtype=18 nreg=3
# SVE load and broadcast quadword (scalar plus scalar)
LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \
@@ -1222,6 +1340,10 @@ LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \
LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \
@rprr_g_load_sc esz=3 msz=3 u=1
+# LD1Q
+LD1_zprz 1100 0100 000 rm:5 101 pg:3 rn:5 rd:5 \
+ &rprr_gather_load u=0 ff=0 xs=2 esz=4 msz=4 scale=0
+
# SVE 64-bit gather load (vector plus immediate)
LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \
@rpri_g_load esz=3
@@ -1245,8 +1367,20 @@ STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9
# SVE contiguous store (scalar plus immediate)
# ST1B, ST1H, ST1W, ST1D; require msz <= esz
-ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \
- @rpri_store_msz nreg=0
+ST_zpri 1110010 00 esz:2 0.... 111 ... ..... ..... \
+ @rpri_store msz=0 nreg=0
+ST_zpri 1110010 01 esz:2 0.... 111 ... ..... ..... \
+ @rpri_store msz=1 nreg=0
+ST_zpri 1110010 10 10 0.... 111 ... ..... ..... \
+ @rpri_store msz=2 esz=2 nreg=0
+ST_zpri 1110010 10 11 0.... 111 ... ..... ..... \
+ @rpri_store msz=2 esz=3 nreg=0
+ST_zpri 1110010 11 11 0.... 111 ... ..... ..... \
+ @rpri_store msz=3 esz=3 nreg=0
+ST_zpri 1110010 10 00 0.... 111 ... ..... ..... \
+ @rpri_store msz=2 esz=4 nreg=0
+ST_zpri 1110010 11 10 0.... 111 ... ..... ..... \
+ @rpri_store msz=3 esz=4 nreg=0
# SVE contiguous store (scalar plus scalar)
# ST1B, ST1H, ST1W, ST1D; require msz <= esz
@@ -1255,20 +1389,40 @@ ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \
@rprr_store_esz_n0 msz=0
ST_zprr 1110010 01 .. ..... 010 ... ..... ..... \
@rprr_store_esz_n0 msz=1
-ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \
- @rprr_store_esz_n0 msz=2
+ST_zprr 1110010 10 10 ..... 010 ... ..... ..... \
+ @rprr_store msz=2 esz=2 nreg=0
+ST_zprr 1110010 10 11 ..... 010 ... ..... ..... \
+ @rprr_store msz=2 esz=3 nreg=0
ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \
@rprr_store msz=3 esz=3 nreg=0
+ST_zprr 1110010 10 00 ..... 010 ... ..... ..... \
+ @rprr_store msz=2 esz=4 nreg=0
+ST_zprr 1110010 11 10 ..... 010 ... ..... ..... \
+ @rprr_store msz=3 esz=4 nreg=0
# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0)
# SVE store multiple structures (scalar plus immediate) (nreg != 0)
ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \
- @rpri_store_msz esz=%size_23
+ @rpri_store msz=%size_23 esz=%size_23
+# ST[234]Q
+ST_zpri 11100100 01 00 .... 000 ... ..... ..... \
+ @rpri_store msz=4 esz=4 nreg=1
+ST_zpri 11100100 10 00 .... 000 ... ..... ..... \
+ @rpri_store msz=4 esz=4 nreg=2
+ST_zpri 11100100 11 00 .... 000 ... ..... ..... \
+ @rpri_store msz=4 esz=4 nreg=3
# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0)
# SVE store multiple structures (scalar plus scalar) (nreg != 0)
-ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \
- @rprr_store esz=%size_23
+ST_zprr 1110010 .. nreg:2 ..... 011 ... ..... ..... \
+ @rprr_store msz=%size_23 esz=%size_23
+# ST[234]Q
+ST_zprr 11100100 01 1 ..... 000 ... ..... ..... \
+ @rprr_store msz=4 esz=4 nreg=1
+ST_zprr 11100100 10 1 ..... 000 ... ..... ..... \
+ @rprr_store msz=4 esz=4 nreg=2
+ST_zprr 11100100 11 1 ..... 000 ... ..... ..... \
+ @rprr_store msz=4 esz=4 nreg=3
# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets)
# Require msz > 0 && msz <= esz.
@@ -1293,6 +1447,10 @@ ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \
ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \
@rprr_scatter_store xs=2 esz=3 scale=0
+# ST1Q
+ST1_zprz 1110 0100 001 rm:5 001 pg:3 rn:5 rd:5 \
+ &rprr_scatter_store xs=2 msz=4 esz=4 scale=0
+
# SVE 64-bit scatter store (vector plus immediate)
ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \
@rpri_scatter_store esz=3
@@ -1450,9 +1608,9 @@ EORTB 01000101 .. 0 ..... 10010 1 ..... ..... @rd_rn_rm
## SVE integer matrix multiply accumulate
-SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0
-USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0
-UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0
+SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_ex esz=2
+USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_ex esz=2
+UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_ex esz=2
## SVE2 bitwise permute
@@ -1504,13 +1662,22 @@ UABA 01000101 .. 0 ..... 11111 1 ..... ..... @rd_rn_rm
#### SVE2 Narrowing
## SVE2 saturating extract narrow
-
# Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0.
-SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl
+
+{
+ SQCVTN_sh 01000101 00 1 10001 010 000 ....0 ..... @rd_rnx2 esz=1
+ SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl
+}
SQXTNT 01000101 .. 1 ..... 010 001 ..... ..... @rd_rn_tszimm_shl
-UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl
+{
+ UQCVTN_sh 01000101 00 1 10001 010 010 ....0 ..... @rd_rnx2 esz=1
+ UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl
+}
UQXTNT 01000101 .. 1 ..... 010 011 ..... ..... @rd_rn_tszimm_shl
-SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl
+{
+ SQCVTUN_sh 01000101 00 1 10001 010 100 ....0 ..... @rd_rnx2 esz=1
+ SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl
+}
SQXTUNT 01000101 .. 1 ..... 010 101 ..... ..... @rd_rn_tszimm_shl
## SVE2 bitwise shift right narrow
@@ -1597,14 +1764,17 @@ UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm
CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx
SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx
-## SVE mixed sign dot product
+## SVE dot product
+
+SDOT_zzzz_2s 01000100 00 0 ..... 110 010 ..... ..... @rda_rn_rm_ex esz=2
+UDOT_zzzz_2s 01000100 00 0 ..... 110 011 ..... ..... @rda_rn_rm_ex esz=2
-USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm
+USDOT_zzzz_4s 01000100 10 0 ..... 011 110 ..... ..... @rda_rn_rm_ex esz=2
### SVE2 floating point matrix multiply accumulate
-BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
-FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
-FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
+BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_ex esz=1
+FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_ex esz=2
+FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_ex esz=3
### SVE2 Memory Gather Load Group
@@ -1654,26 +1824,35 @@ FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz
### SVE2 floating-point multiply-add long (vectors)
-FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
-FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
-FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0
-FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
+FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2
+FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_ex esz=2
+FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_ex esz=2
+FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_ex esz=2
-BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
-BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
+BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2
+BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_ex esz=2
+BFMLSLB_zzzw 01100100 11 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_ex esz=2
+BFMLSLT_zzzw 01100100 11 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_ex esz=2
-### SVE2 floating-point bfloat16 dot-product
-BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
+### SVE2 floating-point dot-product
+FDOT_zzzz 01100100 00 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2
+BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2
### SVE2 floating-point multiply-add long (indexed)
+
FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
+
BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
+BFMLSLB_zzxw 01100100 11 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
+BFMLSLT_zzxw 01100100 11 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
-### SVE2 floating-point bfloat16 dot-product (indexed)
+### SVE2 floating-point dot-product (indexed)
+
+FDOT_zzxz 01100100 00 1 ..... 010000 ..... ..... @rrxr_2 esz=2
BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2
### SVE broadcast predicate element
@@ -1700,3 +1879,55 @@ PSEL 00100101 .1 1 000 .. 01 .... 0 .... 0 .... \
SCLAMP 01000100 .. 0 ..... 110000 ..... ..... @rda_rn_rm
UCLAMP 01000100 .. 0 ..... 110001 ..... ..... @rda_rn_rm
+
+FCLAMP 01100100 .. 1 ..... 001001 ..... ..... @rda_rn_rm
+
+### SVE2p1 multi-vec contiguous load
+
+&zcrr_ldst rd png rn rm esz nreg
+&zcri_ldst rd png rn imm esz nreg
+%png 10:3 !function=plus_8
+%zd_ax2 1:4 !function=times_2
+%zd_ax4 2:3 !function=times_4
+
+LD1_zcrr 10100000000 rm:5 0 esz:2 ... rn:5 .... - \
+ &zcrr_ldst %png rd=%zd_ax2 nreg=2
+LD1_zcrr 10100000000 rm:5 1 esz:2 ... rn:5 ... 0- \
+ &zcrr_ldst %png rd=%zd_ax4 nreg=4
+
+ST1_zcrr 10100000001 rm:5 0 esz:2 ... rn:5 .... - \
+ &zcrr_ldst %png rd=%zd_ax2 nreg=2
+ST1_zcrr 10100000001 rm:5 1 esz:2 ... rn:5 ... 0- \
+ &zcrr_ldst %png rd=%zd_ax4 nreg=4
+
+LD1_zcri 101000000100 imm:s4 0 esz:2 ... rn:5 .... - \
+ &zcri_ldst %png rd=%zd_ax2 nreg=2
+LD1_zcri 101000000100 imm:s4 1 esz:2 ... rn:5 ... 0- \
+ &zcri_ldst %png rd=%zd_ax4 nreg=4
+
+ST1_zcri 101000000110 imm:s4 0 esz:2 ... rn:5 .... - \
+ &zcri_ldst %png rd=%zd_ax2 nreg=2
+ST1_zcri 101000000110 imm:s4 1 esz:2 ... rn:5 ... 0- \
+ &zcri_ldst %png rd=%zd_ax4 nreg=4
+
+# Note: the N bit and the 0 bit (for nreg4) are still encoded within rd.
+# This is handled within gen_ldst_c().
+LD1_zcrr_stride 10100001000 rm:5 0 esz:2 ... rn:5 rd:5 \
+ &zcrr_ldst %png nreg=2
+LD1_zcrr_stride 10100001000 rm:5 1 esz:2 ... rn:5 rd:5 \
+ &zcrr_ldst %png nreg=4
+
+ST1_zcrr_stride 10100001001 rm:5 0 esz:2 ... rn:5 rd:5 \
+ &zcrr_ldst %png nreg=2
+ST1_zcrr_stride 10100001001 rm:5 1 esz:2 ... rn:5 rd:5 \
+ &zcrr_ldst %png nreg=4
+
+LD1_zcri_stride 101000010100 imm:s4 0 esz:2 ... rn:5 rd:5 \
+ &zcri_ldst %png nreg=2
+LD1_zcri_stride 101000010100 imm:s4 1 esz:2 ... rn:5 rd:5 \
+ &zcri_ldst %png nreg=4
+
+ST1_zcri_stride 101000010110 imm:s4 0 esz:2 ... rn:5 rd:5 \
+ &zcri_ldst %png nreg=2
+ST1_zcri_stride 101000010110 imm:s4 1 esz:2 ... rn:5 rd:5 \
+ &zcri_ldst %png nreg=4
diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c
index a2c363a..43b872c 100644
--- a/target/arm/tcg/sve_helper.c
+++ b/target/arm/tcg/sve_helper.c
@@ -123,6 +123,11 @@ static inline uint64_t expand_pred_s(uint8_t byte)
return word[byte & 0x11];
}
+static inline uint64_t expand_pred_d(uint8_t byte)
+{
+ return -(uint64_t)(byte & 1);
+}
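+
+/*
+ * As with expand_pred_b/h/s above, this expands the predicate bit of
+ * a 64-bit element into an all-ones or all-zero mask:
+ * expand_pred_d(1) == -1ull, expand_pred_d(0) == 0.
+ */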
+
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
@@ -206,6 +211,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
+#define DO_ORC(N, M) (N | ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
@@ -527,14 +533,9 @@ DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
-static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
-{
- return val >= max ? max : val <= min ? min : val;
-}
-
-#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
-#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
-#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
+#define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m)
+#define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m)
+#define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m)
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
@@ -551,9 +552,9 @@ DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
-#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
-#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
-#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
+#define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m)
+#define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m)
+#define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m)
static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
@@ -566,9 +567,9 @@ DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
-#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
-#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
-#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
+#define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
+#define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
+#define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)
static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
@@ -585,9 +586,9 @@ DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
-#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
-#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
-#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
+#define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
+#define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
+#define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)
static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
@@ -599,12 +600,9 @@ DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
-#define DO_SUQADD_B(n, m) \
- do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
-#define DO_SUQADD_H(n, m) \
- do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
-#define DO_SUQADD_S(n, m) \
- do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
+#define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m)
+#define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m)
+#define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m)
static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
@@ -634,12 +632,9 @@ DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
-#define DO_USQADD_B(n, m) \
- do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
-#define DO_USQADD_H(n, m) \
- do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
-#define DO_USQADD_S(n, m) \
- do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
+#define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m)
+#define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m)
+#define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m)
static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
@@ -1226,37 +1221,29 @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
} \
}
-#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
-#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
-#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
-
-DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
-DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
-DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
+DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b)
+DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h)
+DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s)
-DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
-DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
-DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
+DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b)
+DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h)
+DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s)
-#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
-#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
-#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
+DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b)
+DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h)
+DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s)
-DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
-DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
-DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
+DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b)
+DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h)
+DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s)
-DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
-DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
-DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
+DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b)
+DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h)
+DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s)
-DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
-DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
-DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
-
-DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
-DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
-DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
+DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b)
+DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h)
+DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s)
#undef DO_XTNB
#undef DO_XTNT
@@ -1833,6 +1820,52 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
#undef DO_VPZ
#undef DO_VPZ_D
+#define DO_VPQ(NAME, TYPE, H, INIT, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \
+ TYPE *n = vn; uint16_t *g = vg; \
+ uintptr_t oprsz = simd_oprsz(desc); \
+ uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \
+ for (uintptr_t s = 0; s < nseg; s++) { \
+ uint16_t pg = g[H2(s)]; \
+ for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \
+ if (pg & 1) { \
+                tmp[H(e)] = OP(tmp[H(e)], n[s * nsegelt + H(e)]);     \
+ } \
+ } \
+ } \
+ memcpy(vd, tmp, 16); \
+ clear_tail(vd, 16, simd_maxsz(desc)); \
+}
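+
+/*
+ * Each 128-bit segment reduces independently into the corresponding
+ * lane of the single 128-bit result: with 32-bit elements and an
+ * all-true predicate, d[e] accumulates n[e], n[4 + e], n[8 + e], ...
+ * starting from INIT.
+ */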
+
+DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD)
+DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD)
+DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD)
+DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD)
+
+DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX)
+DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX)
+DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX)
+DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX)
+
+DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN)
+DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN)
+DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN)
+DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN)
+
+DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX)
+DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX)
+DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX)
+DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX)
+
+DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN)
+DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN)
+DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN)
+DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN)
+
+#undef DO_VPQ
+
/* Two vector operand, one scalar operand, unpredicated. */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
@@ -1873,10 +1906,46 @@ DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
#undef DO_ZZI
+#define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \
+void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ unsigned seg = simd_oprsz(desc) / 16; \
+ uint64_t r0 = INIT, r1 = INIT; \
+ for (unsigned s = 0; s < seg; s++) { \
+ uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \
+ uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \
+ uint64_t v0 = *(uint64_t *)(vn + s * 16); \
+ uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \
+ v0 = POP(v0, p0), v1 = POP(v1, p1); \
+ r0 = VOP(r0, v0), r1 = VOP(r1, v1); \
+ } \
+ *(uint64_t *)(vd + 0) = r0; \
+ *(uint64_t *)(vd + 8) = r1; \
+ clear_tail(vd, 16, simd_maxsz(desc)); \
+}
+
+DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND)
+DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND)
+DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND)
+DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND)
+
+DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND)
+DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND)
+DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND)
+DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND)
+
+DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC)
+DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC)
+DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC)
+DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC)
+
+#undef DO_LOGIC_QV
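+
+/*
+ * Inactive lanes are neutralised before the vector op rather than
+ * skipped: POP masks them to the identity of VOP, i.e. AND with the
+ * expanded predicate (zeros) for ORQV/EORQV, and ORC (all-ones)
+ * for ANDQV.
+ */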
+
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
+#undef DO_ORC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
@@ -2069,27 +2138,6 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
when N is negative, add 2**M-1. */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
-static inline uint64_t do_urshr(uint64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else if (sh == 64) {
- return x >> 63;
- } else {
- return 0;
- }
-}
-
-static inline int64_t do_srshr(int64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else {
- /* Rounding the sign bit always produces 0. */
- return 0;
- }
-}
-
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
@@ -2187,10 +2235,9 @@ DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
-#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
-#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
-#define DO_SQSHRUN_D(x, sh) \
- do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
+#define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh)
+#define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh)
+#define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63))
DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
@@ -2200,9 +2247,9 @@ DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
-#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
-#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
-#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
+#define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh))
+#define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh))
+#define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh))
DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
@@ -2212,9 +2259,9 @@ DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
-#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
-#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
-#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
+#define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh)
+#define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh)
+#define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh)
DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
@@ -2224,9 +2271,9 @@ DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
-#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
-#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
-#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
+#define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh))
+#define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh))
+#define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh))
DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
@@ -2988,6 +3035,56 @@ void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
}
}
+/*
+ * TODO: This could use half_shuffle64 and similar bit tricks to
+ * expand blocks of bits at once.
+ */
+#define DO_PMOV_PV(NAME, ESIZE) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); \
+ unsigned idx = simd_data(desc); \
+ unsigned elements = vl / ESIZE; \
+ ARMPredicateReg *d = vd; \
+ ARMVectorReg *s = vs; \
+ memset(d, 0, sizeof(*d)); \
+ for (unsigned e = 0; e < elements; ++e) { \
+ depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \
+ } \
+}
+
+DO_PMOV_PV(pmov_pv_h, 2)
+DO_PMOV_PV(pmov_pv_s, 4)
+DO_PMOV_PV(pmov_pv_d, 8)
+
+#undef DO_PMOV_PV
+
+/*
+ * TODO: This could use half_unshuffle64 and similar bit tricks to
+ * compress blocks of bits at once.
+ */
+#define DO_PMOV_VP(NAME, ESIZE) \
+void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); \
+ unsigned idx = simd_data(desc); \
+ unsigned elements = vl / ESIZE; \
+ ARMVectorReg *d = vd; \
+ ARMPredicateReg *s = vs; \
+ if (idx == 0) { \
+ memset(d, 0, vl); \
+ } \
+ for (unsigned e = 0; e < elements; ++e) { \
+ depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \
+ } \
+}
+
+DO_PMOV_VP(pmov_vp_h, 2)
+DO_PMOV_VP(pmov_vp_s, 4)
+DO_PMOV_VP(pmov_vp_d, 8)
+
+#undef DO_PMOV_VP
+
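For orientation, PMOV copies one bit per element between the predicate layout
(element e's governing bit sits at bit e*ESIZE) and a packed bitmap occupying
portion idx of the vector; depositn/extractn are just multi-word bit
accessors. The same shuffle on a single 64-bit word, as a sketch (hypothetical
helpers, valid only for a register that fits in one word):

    #include <stdint.h>

    /* Scatter the low `elements` bits of src so that bit e lands at
     * bit e*esize -- the PMOV (to predicate) direction. */
    static uint64_t pmov_to_pred(uint64_t src, unsigned elements,
                                 unsigned esize)
    {
        uint64_t p = 0;

        for (unsigned e = 0; e < elements; e++) {
            p |= ((src >> e) & 1) << (e * esize);
        }
        return p;
    }

    /* Gather bit e*esize back into bit e -- the PMOV (from predicate)
     * direction. */
    static uint64_t pmov_from_pred(uint64_t pred, unsigned elements,
                                   unsigned esize)
    {
        uint64_t v = 0;

        for (unsigned e = 0; e < elements; e++) {
            v |= ((pred >> (e * esize)) & 1) << e;
        }
        return v;
    }
    /* pmov_to_pred(0xb, 4, 2) == 0x45: bits spread to every second bit. */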
typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
@@ -3453,6 +3550,45 @@ DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, H1_8)
DO_UZP(sve2_uzp_q, Int128, )
+typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc);
+
+static void do_perseg_zzz(void *vd, void *vn, void *vm,
+ uint32_t desc, perseg_zzz_fn *fn)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+
+ desc = simd_desc(16, 16, simd_data(desc));
+ for (intptr_t i = 0; i < oprsz; i += 16) {
+ fn(vd + i, vn + i, vm + i, desc);
+ }
+}
+
+#define DO_PERSEG_ZZZ(NAME, FUNC) \
+ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+ { do_perseg_zzz(vd, vn, vm, desc, FUNC); }
+
+DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b)
+DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h)
+DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s)
+DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d)
+
+DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b)
+DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h)
+DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s)
+DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d)
+
+DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b)
+DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h)
+DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s)
+DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d)
+
+DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b)
+DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h)
+DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s)
+DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d)
+
+#undef DO_PERSEG_ZZZ
+
#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
@@ -3993,15 +4129,6 @@ static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
return flags;
}
-static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
-{
- /* It is quicker to zero the whole predicate than loop on OPRSZ.
- * The compiler should turn this into 4 64-bit integer stores.
- */
- memset(d, 0, sizeof(ARMPredicateReg));
- return PREDTEST_INIT;
-}
-
void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
uint32_t pred_desc)
{
@@ -4009,7 +4136,7 @@ void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
compute_brk_z(vd, vm, vg, oprsz, true);
} else {
- do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
}
}
@@ -4020,7 +4147,8 @@ uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
return compute_brks_z(vd, vm, vg, oprsz, true);
} else {
- return do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
}
}
@@ -4031,7 +4159,7 @@ void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
compute_brk_z(vd, vm, vg, oprsz, false);
} else {
- do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
}
}
@@ -4042,7 +4170,8 @@ uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
if (last_active_pred(vn, vg, oprsz)) {
return compute_brks_z(vd, vm, vg, oprsz, false);
} else {
- return do_zero(vd, oprsz);
+ memset(vd, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
}
}
@@ -4098,35 +4227,30 @@ void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (!last_active_pred(vn, vg, oprsz)) {
- do_zero(vd, oprsz);
- }
-}
-
-/* As if PredTest(Ones(PL), D, esz). */
-static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
- uint64_t esz_mask)
-{
- uint32_t flags = PREDTEST_INIT;
- intptr_t i;
-
- for (i = 0; i < oprsz / 8; i++) {
- flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
- }
- if (oprsz & 7) {
- uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
- flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
+ memset(vd, 0, sizeof(ARMPredicateReg));
}
- return flags;
}
uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
- return predtest_ones(vd, oprsz, -1);
- } else {
- return do_zero(vd, oprsz);
+ ARMPredicateReg *d = vd;
+ uint32_t flags = PREDTEST_INIT;
+ intptr_t i;
+
+ /* As if PredTest(Ones(PL), D, MO_8). */
+ for (i = 0; i < oprsz / 8; i++) {
+ flags = iter_predtest_fwd(d->p[i], -1, flags);
+ }
+ if (oprsz & 7) {
+ uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
+ flags = iter_predtest_fwd(d->p[i], mask, flags);
+ }
+ return flags;
}
+ memset(vd, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
}
uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
@@ -4143,66 +4267,200 @@ uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
return sum;
}
-uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
+uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc)
{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- uint64_t esz_mask = pred_esz_masks[esz];
- ARMPredicateReg *d = vd;
- uint32_t flags;
- intptr_t i;
+ int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
+ int vl = pl * 8;
+ unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
+ int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1;
+ DecodeCounter p = decode_counter(png, vl, v_esz);
+ unsigned maxelem = (vl << lg2_width) >> v_esz;
+ unsigned count = p.count;
+
+ if (p.invert) {
+ if (count >= maxelem) {
+ return 0;
+ }
+ count = maxelem - count;
+ } else {
+ count = MIN(count, maxelem);
+ }
+ return count >> p.lg2_stride;
+}
+
+/* Cf. Arm pseudocode EncodePredCount */
+static uint64_t encode_pred_count(uint32_t elements, uint32_t count,
+ uint32_t esz, bool invert)
+{
+ uint32_t pred;
- /* Begin with a zero predicate register. */
- flags = do_zero(d, oprsz);
if (count == 0) {
- return flags;
+ return 0;
+ }
+ if (invert) {
+ count = elements - count;
+ } else if (count == elements) {
+ count = 0;
+ invert = true;
}
- /* Set all of the requested bits. */
- for (i = 0; i < count / 64; ++i) {
- d->p[i] = esz_mask;
+ pred = (count << 1) | 1;
+ pred <<= esz;
+ pred |= invert << 15;
+
+ return pred;
+}
+
+/* Cf. Arm pseudocode PredCountTest */
+static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert)
+{
+ uint32_t flags;
+
+ if (count == 0) {
+ flags = 1; /* !N, Z, C */
+ } else if (!invert) {
+ flags = (1u << 31) | 2; /* N, !Z */
+ flags |= count != elements; /* C */
+ } else {
+ flags = 2; /* !Z, !C */
+ flags |= (count == elements) << 31; /* N */
}
- if (count & 63) {
- d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
+ return flags;
+}
+
+/* D must be cleared on entry. */
+static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask,
+ uint32_t count, uint32_t oprbits)
+{
+ tcg_debug_assert(count <= oprbits);
+ if (count) {
+ uint32_t i;
+
+ /* Set all of the requested bits. */
+ for (i = 0; i < count / 64; ++i) {
+ d->p[i] = esz_mask;
+ }
+ if (count & 63) {
+ d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
+ }
}
+}
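do_whilel fills the low count bits of the predicate, filtered through
esz_mask so only each element's governing bit is set; do_whileg further down
fills the high count bits instead. The single-word case, assuming count is
already scaled by the element size as in the callers:

    #include <stdint.h>

    #define MAKE_64BIT_MASK(shift, length) \
        (((~0ULL) >> (64 - (length))) << (shift))

    /* Low `count` predicate bits of one 64-bit word: the do_whilel fill. */
    static uint64_t whilel_word(uint64_t esz_mask, unsigned count)
    {
        return count ? MAKE_64BIT_MASK(0, count) & esz_mask : 0;
    }
    /* whilel_word(0x5555555555555555, 8) == 0x55: with 16-bit elements
     * (esz_mask selects every other bit), 8 bytes => elements 0..3. */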
- return predtest_ones(d, oprsz, esz_mask);
+uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
+
+ count <<= esz;
+ memset(d, 0, sizeof(*d));
+ do_whilel(d, esz_mask, count, oprbits);
+ return pred_count_test(oprbits, count, false);
}
-uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
+uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc)
{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
uint64_t esz_mask = pred_esz_masks[esz];
ARMPredicateReg *d = vd;
- intptr_t i, invcount, oprbits;
- uint64_t bits;
- if (count == 0) {
- return do_zero(d, oprsz);
+ count <<= esz;
+ memset(d, 0, 2 * sizeof(*d));
+ if (count <= oprbits) {
+ do_whilel(&d[0], esz_mask, count, oprbits);
+ } else {
+ do_whilel(&d[0], esz_mask, oprbits, oprbits);
+ do_whilel(&d[1], esz_mask, count - oprbits, oprbits);
}
- oprbits = oprsz * 8;
+ return pred_count_test(2 * oprbits, count, false);
+}
+
+uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ uint32_t vl = pl * 8;
+ uint32_t elements = (vl >> esz) << scale;
+ ARMPredicateReg *d = vd;
+
+ *d = (ARMPredicateReg) {
+ .p[0] = encode_pred_count(elements, count, esz, false)
+ };
+ return pred_count_test(elements, count, false);
+}
+
+/* D must be cleared on entry. */
+static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask,
+ uint32_t count, uint32_t oprbits)
+{
tcg_debug_assert(count <= oprbits);
+ if (count) {
+ uint32_t i, invcount = oprbits - count;
+ uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64);
- bits = esz_mask;
- if (oprbits & 63) {
- bits &= MAKE_64BIT_MASK(0, oprbits & 63);
+ for (i = invcount / 64; i < oprbits / 64; ++i) {
+ d->p[i] = bits;
+ bits = esz_mask;
+ }
+ if (oprbits & 63) {
+ d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63);
+ }
}
+}
- invcount = oprbits - count;
- for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
- d->p[i] = bits;
- bits = esz_mask;
- }
+uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
- d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
+ count <<= esz;
+ memset(d, 0, sizeof(*d));
+ do_whileg(d, esz_mask, count, oprbits);
+ return pred_count_test(oprbits, count, true);
+}
+
+uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t oprbits = oprsz * 8;
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
- while (--i >= 0) {
- d->p[i] = 0;
+ count <<= esz;
+ memset(d, 0, 2 * sizeof(*d));
+ if (count <= oprbits) {
+ do_whileg(&d[1], esz_mask, count, oprbits);
+ } else {
+ do_whilel(&d[1], esz_mask, oprbits, oprbits);
+ do_whileg(&d[0], esz_mask, count - oprbits, oprbits);
}
- return predtest_ones(d, oprsz, esz_mask);
+ return pred_count_test(2 * oprbits, count, true);
+}
+
+uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ uint32_t vl = pl * 8;
+ uint32_t elements = (vl >> esz) << scale;
+ ARMPredicateReg *d = vd;
+
+ *d = (ARMPredicateReg) {
+ .p[0] = encode_pred_count(elements, count, esz, true)
+ };
+ return pred_count_test(elements, count, true);
}
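The predicate-as-counter format produced by encode_pred_count packs
{count, esz, invert} into one 16-bit value: bit 15 is the invert flag, a
single marker bit sits at position esz, and the element count occupies the
bits above the marker. A round-trip sketch (the decoder here is derived from
the encoder above; QEMU's real decode_counter is not part of this hunk):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Mirror of encode_pred_count above. */
    static uint32_t encode(uint32_t elements, uint32_t count,
                           uint32_t esz, bool invert)
    {
        if (count == 0) {
            return 0;
        }
        if (invert) {
            count = elements - count;
        } else if (count == elements) {
            count = 0;          /* "all active" folds to inverted zero */
            invert = true;
        }
        return (((count << 1) | 1) << esz) | ((uint32_t)invert << 15);
    }

    /* Decoder written from the encoder: the lowest set bit marks esz. */
    static void decode(uint16_t pred, uint32_t *count,
                       uint32_t *esz, bool *invert)
    {
        *invert = pred >> 15;
        pred &= 0x7fff;
        *esz = __builtin_ctz(pred | 0x8000);
        *count = pred >> (*esz + 1);
    }

    int main(void)
    {
        uint32_t c, e;
        bool inv;

        decode(encode(32, 5, 2, false), &c, &e, &inv);
        assert(c == 5 && e == 2 && !inv);
        return 0;
    }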
/* Recursive reduction on a function;
@@ -4213,19 +4471,20 @@ uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
* The recursion is bounded to depth 7 (128 fp16 elements), so there's
* little to gain with a more complex non-recursive form.
*/
-#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
-static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
+#define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \
+static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{ \
if (n == 1) { \
return *data; \
} else { \
uintptr_t half = n / 2; \
- TYPE lo = NAME##_reduce(data, status, half); \
- TYPE hi = NAME##_reduce(data + half, status, half); \
+ TYPE lo = FUNC##_reduce(data, status, half); \
+ TYPE hi = FUNC##_reduce(data + half, status, half); \
return FUNC(lo, hi, status); \
} \
} \
-uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
+uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \
+ float_status *s, uint32_t desc) \
{ \
uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
@@ -4240,39 +4499,54 @@ uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
for (; i < maxsz; i += sizeof(TYPE)) { \
*(TYPE *)((void *)data + i) = IDENT; \
} \
- return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \
+ return FUNC##_reduce(data, s, maxsz / sizeof(TYPE)); \
+} \
+void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \
+ float_status *status, uint32_t desc) \
+{ \
+ unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \
+ for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \
+ TYPE data[ARM_MAX_VQ]; \
+ for (unsigned s = 0; s < segments; s++) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \
+            TYPE nn = *(TYPE *)(vn + s * 16 + H(e));                    \
+ data[s] = (pg >> e) & 1 ? nn : IDENT; \
+ } \
+ *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \
+ } \
+ clear_tail(vd, 16, simd_maxsz(desc)); \
}
-DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
-DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
-DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
+DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero)
+DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero)
+DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero)
/* Identity is floatN_default_nan, without the function call. */
-DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
-DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
-DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
+DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, 0x7E00)
+DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, 0x7FC00000)
+DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
-DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
-DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
-DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
+DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, 0x7E00)
+DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, 0x7FC00000)
+DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
-DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
-DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
-DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
+DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity)
+DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity)
+DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity)
-DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
-DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
-DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
+DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity))
+DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity))
+DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity))
-DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
-DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
-DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
+DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
+DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
+DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
-DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
+DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh,
float16_chs(float16_infinity))
-DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
+DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs,
float32_chs(float32_infinity))
-DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
+DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd,
float64_chs(float64_infinity))
#undef DO_REDUCE
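Both reduction flavours rely on padding the buffer to a power of two with the
operation's identity, so the recursion splits evenly and the association
order is the same for every vector length. The shape of FUNC##_reduce in
plain C (float add, without the float_status plumbing):

    #include <stdio.h>

    /* Pairwise tree reduction over n elements, n a power of two. */
    static float reduce_add(const float *data, unsigned n)
    {
        if (n == 1) {
            return *data;
        }
        return reduce_add(data, n / 2) + reduce_add(data + n / 2, n / 2);
    }

    int main(void)
    {
        /* Three live elements padded with the identity (0.0 for add)
         * up to the next power of two, as the helper pads with IDENT. */
        float data[4] = { 1.0f, 2.0f, 3.0f, 0.0f };

        printf("%g\n", reduce_add(data, 4));   /* (1+2)+(3+0) = 6 */
        return 0;
    }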
@@ -4554,7 +4828,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, \
* FZ16. When converting from fp16, this affects flushing input denormals;
* when converting to fp16, this affects flushing output denormals.
*/
-static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
+float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
bool save = get_flush_inputs_to_zero(fpst);
float32 ret;
@@ -4576,7 +4850,7 @@ static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
return ret;
}
-static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
+float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
bool save = get_flush_to_zero(fpst);
float16 ret;
@@ -6085,6 +6359,9 @@ DO_LD1_2(ld1sds, MO_64, MO_32)
DO_LD1_2(ld1dd, MO_64, MO_64)
+DO_LD1_2(ld1squ, MO_32, MO_128)
+DO_LD1_2(ld1dqu, MO_64, MO_128)
+
#undef DO_LD1_1
#undef DO_LD1_2
@@ -6144,6 +6421,10 @@ DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)
+DO_LDN_2(2, qq, MO_128)
+DO_LDN_2(3, qq, MO_128)
+DO_LDN_2(4, qq, MO_128)
+
#undef DO_LDN_1
#undef DO_LDN_2
@@ -6707,6 +6988,13 @@ DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)
+DO_STN_2(1, sq, MO_128, MO_32)
+DO_STN_2(1, dq, MO_128, MO_64)
+
+DO_STN_2(2, qq, MO_128, MO_128)
+DO_STN_2(3, qq, MO_128, MO_128)
+DO_STN_2(4, qq, MO_128, MO_128)
+
#undef DO_STN_1
#undef DO_STN_2
@@ -6923,6 +7211,9 @@ DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)
+DO_LD1_ZPZ_D(qq_le, zd, MO_128)
+DO_LD1_ZPZ_D(qq_be, zd, MO_128)
+
#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
@@ -7309,9 +7600,505 @@ DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)
+DO_ST1_ZPZ_D(qq_le, zd, MO_128)
+DO_ST1_ZPZ_D(qq_be, zd, MO_128)
+
#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
+/*
+ * SVE2.1 consecutive register load/store
+ */
+
+static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr,
+ uint32_t png, intptr_t reg_max,
+ int N, int v_esz)
+{
+ const int esize = 1 << v_esz;
+ intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
+ DecodeCounter p = decode_counter(png, reg_max, v_esz);
+ unsigned b_count = p.count << v_esz;
+ unsigned b_stride = 1 << (v_esz + p.lg2_stride);
+ intptr_t page_split;
+
+ /* Set all of the element indices to -1, and the TLB data to 0. */
+ memset(info, -1, offsetof(SVEContLdSt, page));
+ memset(info->page, 0, sizeof(info->page));
+
+ if (p.invert) {
+ if (b_count >= reg_max * N) {
+ return 0;
+ }
+ reg_off_first = b_count;
+ reg_off_last = reg_max * N - b_stride;
+ } else {
+ if (b_count == 0) {
+ return 0;
+ }
+ reg_off_first = 0;
+ reg_off_last = MIN(b_count - esize, reg_max * N - b_stride);
+ }
+
+ info->reg_off_first[0] = reg_off_first;
+ info->mem_off_first[0] = reg_off_first;
+
+ page_split = -(addr | TARGET_PAGE_MASK);
+ if (reg_off_last + esize <= page_split || reg_off_first >= page_split) {
+ /* The entire operation fits within a single page. */
+ info->reg_off_last[0] = reg_off_last;
+ return b_stride;
+ }
+
+ info->page_split = page_split;
+ reg_off_split = ROUND_DOWN(page_split, esize);
+
+ /*
+ * This is the last full element on the first page, but it is not
+ * necessarily active. If there is no full element, i.e. the first
+ * active element is the one that's split, this value remains -1.
+     * It is useful as an iteration bound.
+ */
+ if (reg_off_split != 0) {
+ info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride);
+ }
+
+ /* Determine if an unaligned element spans the pages. */
+ if (page_split & (esize - 1)) {
+ /* It is helpful to know if the split element is active. */
+ if ((reg_off_split & (b_stride - 1)) == 0) {
+ info->reg_off_split = reg_off_split;
+ info->mem_off_split = reg_off_split;
+ }
+ reg_off_split += esize;
+ }
+
+ /*
+ * We do want the first active element on the second page, because
+ * this may affect the address reported in an exception.
+ */
+ reg_off_split = ROUND_UP(reg_off_split, b_stride);
+ if (reg_off_split <= reg_off_last) {
+ info->reg_off_first[1] = reg_off_split;
+ info->mem_off_first[1] = reg_off_split;
+ info->reg_off_last[1] = reg_off_last;
+ }
+ return b_stride;
+}
+
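The page_split computation above uses a standard QEMU idiom: OR-ing
TARGET_PAGE_MASK (a negative value such as ~0xfff) into addr sets every bit
above the in-page offset, so the two's-complement negation yields the number
of bytes remaining on the current page. A standalone check, assuming 4 KiB
pages:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_MASK ((uint64_t)-4096)    /* ~0xfff: assumes 4 KiB pages */

    int main(void)
    {
        uint64_t addr = 0x12345678;
        uint64_t left = -(addr | PAGE_MASK);    /* bytes to page boundary */

        assert(left == 4096 - (addr & 0xfff));  /* 0x1000 - 0x678 == 0x988 */
        return 0;
    }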
+static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
+ target_ulong addr, unsigned estride,
+ int esize, int wp_access, uintptr_t ra)
+{
+#ifndef CONFIG_USER_ONLY
+ intptr_t count_off, count_last;
+ int flags0 = info->page[0].flags;
+ int flags1 = info->page[1].flags;
+
+ if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
+ return;
+ }
+
+ /* Indicate that watchpoints are handled. */
+ info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
+ info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
+
+ if (flags0 & TLB_WATCHPOINT) {
+ count_off = info->reg_off_first[0];
+ count_last = info->reg_off_split;
+ if (count_last < 0) {
+ count_last = info->reg_off_last[0];
+ }
+ do {
+ cpu_check_watchpoint(env_cpu(env), addr + count_off,
+ esize, info->page[0].attrs, wp_access, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+
+ count_off = info->reg_off_first[1];
+ if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) {
+ count_last = info->reg_off_last[1];
+ do {
+ cpu_check_watchpoint(env_cpu(env), addr + count_off,
+ esize, info->page[1].attrs,
+ wp_access, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+#endif
+}
+
+static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
+ target_ulong addr, unsigned estride,
+ int esize, uint32_t mtedesc,
+ uintptr_t ra)
+{
+ intptr_t count_off, count_last;
+
+ /*
+ * TODO: estride is always a small power of two, <= 8.
+ * Manipulate the stride within the loops such that
+ * - first iteration hits addr + off, as required,
+ * - second iteration hits ALIGN_UP(addr, 16),
+ * - other iterations advance addr by 16.
+ * This will minimize the probing to once per MTE granule.
+ */
+
+ /* Process the page only if MemAttr == Tagged. */
+ if (info->page[0].tagged) {
+ count_off = info->reg_off_first[0];
+ count_last = info->reg_off_split;
+ if (count_last < 0) {
+ count_last = info->reg_off_last[0];
+ }
+
+ do {
+ mte_check(env, mtedesc, addr + count_off, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+
+ count_off = info->reg_off_first[1];
+ if (count_off >= 0 && info->page[1].tagged) {
+ count_last = info->reg_off_last[1];
+ do {
+ mte_check(env, mtedesc, addr + count_off, ra);
+ count_off += estride;
+ } while (count_off <= count_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
+ uint32_t png, uint32_t desc,
+ const uintptr_t ra, const MemOp esz,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
+ const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const unsigned esize = 1 << esz;
+ intptr_t count_off, count_last;
+ intptr_t reg_off, reg_last, reg_n;
+ SVEContLdSt info;
+ unsigned estride, flags;
+ void *host;
+
+ estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
+ if (estride == 0) {
+ /* The entire predicate was false; no load occurs. */
+ for (unsigned n = 0; n < N; n++) {
+ memset(zd + n * rstride, 0, reg_max);
+ }
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
+
+ /* Handle watchpoints for all active elements. */
+ sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
+ esize, BP_MEM_READ, ra);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+        sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
+ esize, mtedesc, ra);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. Perform the load
+ * into scratch memory to preserve register state until the end.
+ */
+ ARMVectorReg scratch[4] = { };
+
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[1];
+ if (count_last < 0) {
+ count_last = info.reg_off_split;
+ if (count_last < 0) {
+ count_last = info.reg_off_last[0];
+ }
+ }
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+
+ do {
+            reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ for (unsigned n = 0; n < N; ++n) {
+ memcpy(&zd[n * rstride], &scratch[n], reg_max);
+ }
+ return;
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ for (unsigned n = 0; n < N; ++n) {
+ memset(&zd[n * rstride], 0, reg_max);
+ }
+
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[0];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[0].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ count_off = info.reg_off_split;
+ if (unlikely(count_off >= 0)) {
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
+ }
+
+ count_off = info.reg_off_first[1];
+ if (unlikely(count_off >= 0)) {
+ count_last = info.reg_off_last[1];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[1].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+ }
+}
+
+void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
+ uint32_t png, uint32_t desc)
+{
+ sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8,
+ sve_ld1bb_host, sve_ld1bb_tlb);
+}
+
+#define DO_LD1_2(NAME, ESZ) \
+void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+}
+
+DO_LD1_2(ld1hh, MO_16)
+DO_LD1_2(ld1ss, MO_32)
+DO_LD1_2(ld1dd, MO_64)
+
+#undef DO_LD1_2
+
+static inline QEMU_ALWAYS_INLINE
+void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
+ uint32_t png, uint32_t desc,
+ const uintptr_t ra, const int esz,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
+ const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const unsigned esize = 1 << esz;
+ intptr_t count_off, count_last;
+ intptr_t reg_off, reg_last, reg_n;
+ SVEContLdSt info;
+ unsigned estride, flags;
+ void *host;
+
+ estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
+ if (estride == 0) {
+ /* The entire predicate was false; no store occurs. */
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
+
+ /* Handle watchpoints for all active elements. */
+ sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
+ esize, BP_MEM_WRITE, ra);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+        sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
+ esize, mtedesc, ra);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+         * which for ARM will raise SyncExternal. Perform the whole
+         * store via the slow path, element by element.
+ */
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[1];
+ if (count_last < 0) {
+ count_last = info.reg_off_split;
+ if (count_last < 0) {
+ count_last = info.reg_off_last[0];
+ }
+ }
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+
+ do {
+            reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+ return;
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ count_off = info.reg_off_first[0];
+ count_last = info.reg_off_last[0];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[0].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ count_off = info.reg_off_split;
+ if (unlikely(count_off >= 0)) {
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
+ }
+
+ count_off = info.reg_off_first[1];
+ if (unlikely(count_off >= 0)) {
+ count_last = info.reg_off_last[1];
+ reg_off = count_off % reg_max;
+ reg_n = count_off / reg_max;
+ host = info.page[1].host;
+
+ set_helper_retaddr(ra);
+
+ do {
+ reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
+ do {
+ host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
+ reg_off += estride;
+ count_off += estride;
+ } while (reg_off <= reg_last);
+ reg_off = 0;
+ reg_n++;
+ } while (count_off <= count_last);
+
+ clear_helper_retaddr();
+ }
+}
+
+void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
+ uint32_t png, uint32_t desc)
+{
+ sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8,
+ sve_st1bb_host, sve_st1bb_tlb);
+}
+
+#define DO_ST1_2(NAME, ESZ) \
+void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \
+ target_ulong addr, uint32_t png, \
+ uint32_t desc) \
+{ \
+ sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+}
+
+DO_ST1_2(st1hh, MO_16)
+DO_ST1_2(st1ss, MO_32)
+DO_ST1_2(st1dd, MO_64)
+
+#undef DO_ST1_2
+
void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
@@ -7715,3 +8502,31 @@ DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
#undef DO_FCVTLT
#undef DO_FCVTNT
+
+void HELPER(pext)(void *vd, uint32_t png, uint32_t desc)
+{
+ int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
+ int vl = pl * 8;
+ unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
+ int part = FIELD_EX32(desc, PREDDESC, DATA);
+ DecodeCounter p = decode_counter(png, vl, v_esz);
+ uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride];
+ ARMPredicateReg *d = vd;
+
+ /*
+ * Convert from element count to byte count and adjust
+ * for the portion of the 4*VL counter to be extracted.
+ */
+ int b_count = (p.count << v_esz) - vl * part;
+
+ memset(d, 0, sizeof(*d));
+ if (p.invert) {
+ if (b_count <= 0) {
+ do_whilel(vd, mask, vl, vl);
+ } else if (b_count < vl) {
+ do_whileg(vd, mask, vl - b_count, vl);
+ }
+ } else if (b_count > 0) {
+ do_whilel(vd, mask, MIN(b_count, vl), vl);
+ }
+}
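To make the pext arithmetic concrete: the counter logically covers up to
4*VL bytes, and part selects one VL-byte window of it. Subtracting part*VL
from the total active byte count and clamping to [0, VL] gives that window's
active bytes, which do_whilel then expands into predicate bits (do_whileg
fills from the top in the inverted case). The clamp in isolation:

    #include <stdio.h>

    /* PEXT part selection, non-inverted case: clamp the remaining active
     * byte count of window `part` to [0, vl]. */
    static int part_count(int total_bytes, int vl, int part)
    {
        int b = total_bytes - vl * part;

        return b < 0 ? 0 : b > vl ? vl : b;
    }

    int main(void)
    {
        /* vl = 32 bytes, 80 active bytes total: parts get 32 32 16 0. */
        for (int p = 0; p < 4; p++) {
            printf("%d ", part_count(80, 32, p));
        }
        printf("\n");
        return 0;
    }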
diff --git a/target/arm/tcg/sve_ldst_internal.h b/target/arm/tcg/sve_ldst_internal.h
index f2243da..c67cda9 100644
--- a/target/arm/tcg/sve_ldst_internal.h
+++ b/target/arm/tcg/sve_ldst_internal.h
@@ -116,6 +116,94 @@ DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl)
DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq)
DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
+#define DO_LD_PRIM_3(NAME, FUNC) \
+ static inline void sve_##NAME##_host(void *vd, \
+ intptr_t reg_off, void *host) \
+ { sve_##FUNC##_host(vd, reg_off, host); \
+ *(uint64_t *)(vd + reg_off + 8) = 0; } \
+ static inline void sve_##NAME##_tlb(CPUARMState *env, void *vd, \
+ intptr_t reg_off, target_ulong addr, uintptr_t ra) \
+ { sve_##FUNC##_tlb(env, vd, reg_off, addr, ra); \
+ *(uint64_t *)(vd + reg_off + 8) = 0; }
+
+DO_LD_PRIM_3(ld1squ_be, ld1sdu_be)
+DO_LD_PRIM_3(ld1squ_le, ld1sdu_le)
+DO_LD_PRIM_3(ld1dqu_be, ld1dd_be)
+DO_LD_PRIM_3(ld1dqu_le, ld1dd_le)
+
+#define sve_st1sq_be_host sve_st1sd_be_host
+#define sve_st1sq_le_host sve_st1sd_le_host
+#define sve_st1sq_be_tlb sve_st1sd_be_tlb
+#define sve_st1sq_le_tlb sve_st1sd_le_tlb
+
+#define sve_st1dq_be_host sve_st1dd_be_host
+#define sve_st1dq_le_host sve_st1dd_le_host
+#define sve_st1dq_be_tlb sve_st1dd_be_tlb
+#define sve_st1dq_le_tlb sve_st1dd_le_tlb
+
+/*
+ * The ARMVectorReg elements are stored in host-endian 64-bit units.
+ * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
+ * corresponds to storing the two 64-bit pieces in little-endian order.
+ */
+/* FIXME: Nothing in this file makes any effort at atomicity. */
+
+static inline void sve_ld1qq_be_host(void *vd, intptr_t reg_off, void *host)
+{
+ sve_ld1dd_be_host(vd, reg_off + 8, host);
+ sve_ld1dd_be_host(vd, reg_off, host + 8);
+}
+
+static inline void sve_ld1qq_le_host(void *vd, intptr_t reg_off, void *host)
+{
+ sve_ld1dd_le_host(vd, reg_off, host);
+ sve_ld1dd_le_host(vd, reg_off + 8, host + 8);
+}
+
+static inline void
+sve_ld1qq_be_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
+ target_ulong addr, uintptr_t ra)
+{
+ sve_ld1dd_be_tlb(env, vd, reg_off + 8, addr, ra);
+ sve_ld1dd_be_tlb(env, vd, reg_off, addr + 8, ra);
+}
+
+static inline void
+sve_ld1qq_le_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
+ target_ulong addr, uintptr_t ra)
+{
+ sve_ld1dd_le_tlb(env, vd, reg_off, addr, ra);
+ sve_ld1dd_le_tlb(env, vd, reg_off + 8, addr + 8, ra);
+}
+
+static inline void sve_st1qq_be_host(void *vd, intptr_t reg_off, void *host)
+{
+ sve_st1dd_be_host(vd, reg_off + 8, host);
+ sve_st1dd_be_host(vd, reg_off, host + 8);
+}
+
+static inline void sve_st1qq_le_host(void *vd, intptr_t reg_off, void *host)
+{
+ sve_st1dd_le_host(vd, reg_off, host);
+ sve_st1dd_le_host(vd, reg_off + 8, host + 8);
+}
+
+static inline void
+sve_st1qq_be_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
+ target_ulong addr, uintptr_t ra)
+{
+ sve_st1dd_be_tlb(env, vd, reg_off + 8, addr, ra);
+ sve_st1dd_be_tlb(env, vd, reg_off, addr + 8, ra);
+}
+
+static inline void
+sve_st1qq_le_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
+ target_ulong addr, uintptr_t ra)
+{
+ sve_st1dd_le_tlb(env, vd, reg_off, addr, ra);
+ sve_st1dd_le_tlb(env, vd, reg_off + 8, addr + 8, ra);
+}
+
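Why the *_be quadword helpers cross their offsets: Elem[] order keeps the low
64-bit half of a 128-bit element first in the register, while big-endian
memory holds the most-significant half first, so a BE load must byte-swap
each half and also swap the halves. A sketch with plain loads, assuming a
little-endian host (hence the explicit swaps):

    #include <stdint.h>
    #include <string.h>

    /* Load one 128-bit big-endian element into { lo, hi } 64-bit register
     * words, low half first, mirroring the sve_ld1qq_be_* pairing above. */
    static void ld1qq_be(uint64_t dst[2], const uint8_t *mem)
    {
        uint64_t w0, w1;

        memcpy(&w0, mem, 8);       /* first 8 bytes: BE most-significant */
        memcpy(&w1, mem + 8, 8);   /* last 8 bytes: BE least-significant */
        dst[1] = __builtin_bswap64(w0);   /* high half of the element */
        dst[0] = __builtin_bswap64(w1);   /* low half, stored first */
    }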
#undef DO_LD_TLB
#undef DO_ST_TLB
#undef DO_LD_HOST
@@ -123,6 +211,7 @@ DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq)
#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
#undef DO_ST_PRIM_2
+#undef DO_LD_PRIM_3
/*
* Resolve the guest virtual address to info->host and info->flags.
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 815225b..dbf4759 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -1381,11 +1381,8 @@ static bool fp_access_check_only(DisasContext *s)
return true;
}
-static bool fp_access_check(DisasContext *s)
+static bool nonstreaming_check(DisasContext *s)
{
- if (!fp_access_check_only(s)) {
- return false;
- }
if (s->sme_trap_nonstreaming && s->is_nonstreaming) {
gen_exception_insn(s, 0, EXCP_UDEF,
syn_smetrap(SME_ET_Streaming, false));
@@ -1394,6 +1391,11 @@ static bool fp_access_check(DisasContext *s)
return true;
}
+static bool fp_access_check(DisasContext *s)
+{
+ return fp_access_check_only(s) && nonstreaming_check(s);
+}
+
/*
* Return <0 for non-supported element sizes, with MO_16 controlled by
* FEAT_FP16; return 0 for fp disabled; otherwise return >0 for success.
@@ -1444,14 +1446,24 @@ static int fp_access_check_vector_hsd(DisasContext *s, bool is_q, MemOp esz)
*/
bool sve_access_check(DisasContext *s)
{
- if (s->pstate_sm || !dc_isar_feature(aa64_sve, s)) {
+ if (dc_isar_feature(aa64_sme, s)) {
bool ret;
- assert(dc_isar_feature(aa64_sme, s));
- ret = sme_sm_enabled_check(s);
+ if (s->pstate_sm) {
+ ret = sme_enabled_check(s);
+ } else if (dc_isar_feature(aa64_sve, s)) {
+ goto continue_sve;
+ } else {
+ ret = sme_sm_enabled_check(s);
+ }
+ if (ret) {
+ ret = nonstreaming_check(s);
+ }
s->sve_access_checked = (ret ? 1 : -1);
return ret;
}
+
+ continue_sve:
if (s->sve_excp_el) {
/* Assert that we only raise one exception per instruction. */
assert(!s->sve_access_checked);
@@ -1488,7 +1500,8 @@ bool sme_enabled_check(DisasContext *s)
* to be zero when fp_excp_el has priority. This is because we need
* sme_excp_el by itself for cpregs access checks.
*/
- if (!s->fp_excp_el || s->sme_excp_el < s->fp_excp_el) {
+ if (s->sme_excp_el
+ && (!s->fp_excp_el || s->sme_excp_el <= s->fp_excp_el)) {
bool ret = sme_access_check(s);
s->fp_access_checked = (ret ? 1 : -1);
return ret;
@@ -6101,9 +6114,9 @@ static bool do_dot_vector_env(DisasContext *s, arg_qrrr_e *a,
return true;
}
-TRANS_FEAT(SDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_sdot_b)
-TRANS_FEAT(UDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_udot_b)
-TRANS_FEAT(USDOT_v, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_usdot_b)
+TRANS_FEAT(SDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_sdot_4b)
+TRANS_FEAT(UDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_udot_4b)
+TRANS_FEAT(USDOT_v, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_usdot_4b)
TRANS_FEAT(BFDOT_v, aa64_bf16, do_dot_vector_env, a, gen_helper_gvec_bfdot)
TRANS_FEAT(BFMMLA, aa64_bf16, do_dot_vector_env, a, gen_helper_gvec_bfmmla)
TRANS_FEAT(SMMLA, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_smmla_b)
@@ -6863,12 +6876,12 @@ static bool do_dot_vector_idx_env(DisasContext *s, arg_qrrx_e *a,
return true;
}
-TRANS_FEAT(SDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_sdot_idx_b)
-TRANS_FEAT(UDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_udot_idx_b)
+TRANS_FEAT(SDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_sdot_idx_4b)
+TRANS_FEAT(UDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_udot_idx_4b)
TRANS_FEAT(SUDOT_vi, aa64_i8mm, do_dot_vector_idx, a,
- gen_helper_gvec_sudot_idx_b)
+ gen_helper_gvec_sudot_idx_4b)
TRANS_FEAT(USDOT_vi, aa64_i8mm, do_dot_vector_idx, a,
- gen_helper_gvec_usdot_idx_b)
+ gen_helper_gvec_usdot_idx_4b)
TRANS_FEAT(BFDOT_vi, aa64_bf16, do_dot_vector_idx_env, a,
gen_helper_gvec_bfdot_idx)
@@ -10126,8 +10139,10 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase,
dc->trap_eret = EX_TBFLAG_A64(tb_flags, TRAP_ERET);
dc->sve_excp_el = EX_TBFLAG_A64(tb_flags, SVEEXC_EL);
dc->sme_excp_el = EX_TBFLAG_A64(tb_flags, SMEEXC_EL);
+ dc->zt0_excp_el = EX_TBFLAG_A64(tb_flags, ZT0EXC_EL);
dc->vl = (EX_TBFLAG_A64(tb_flags, VL) + 1) * 16;
dc->svl = (EX_TBFLAG_A64(tb_flags, SVL) + 1) * 16;
+ dc->max_svl = arm_cpu->sme_max_vq * 16;
dc->pauth_active = EX_TBFLAG_A64(tb_flags, PAUTH_ACTIVE);
dc->bt = EX_TBFLAG_A64(tb_flags, BT);
dc->btype = EX_TBFLAG_A64(tb_flags, BTYPE);
diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h
index b2420f5..993dde6 100644
--- a/target/arm/tcg/translate-a64.h
+++ b/target/arm/tcg/translate-a64.h
@@ -225,7 +225,13 @@ void gen_gvec_usqadd_qc(unsigned vece, uint32_t rd_ofs,
uint32_t rn_ofs, uint32_t rm_ofs,
uint32_t opr_sz, uint32_t max_sz);
-void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm);
-void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm);
+void gen_gvec_sve2_sqdmulh(unsigned vece, uint32_t rd_ofs,
+ uint32_t rn_ofs, uint32_t rm_ofs,
+ uint32_t opr_sz, uint32_t max_sz);
+
+void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs,
+ int len, int rn, int imm, MemOp align);
+void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs,
+ int len, int rn, int imm, MemOp align);
#endif /* TARGET_ARM_TRANSLATE_A64_H */
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index c4fecb8..844d2e2 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -271,7 +271,7 @@ static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
return false;
}
return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
- gen_helper_gvec_sdot_b);
+ gen_helper_gvec_sdot_4b);
}
static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
@@ -280,7 +280,7 @@ static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
return false;
}
return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
- gen_helper_gvec_udot_b);
+ gen_helper_gvec_udot_4b);
}
static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
@@ -289,7 +289,7 @@ static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
return false;
}
return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
- gen_helper_gvec_usdot_b);
+ gen_helper_gvec_usdot_4b);
}
static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
@@ -356,7 +356,7 @@ static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
return false;
}
return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
- gen_helper_gvec_sdot_idx_b);
+ gen_helper_gvec_sdot_idx_4b);
}
static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
@@ -365,7 +365,7 @@ static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
return false;
}
return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
- gen_helper_gvec_udot_idx_b);
+ gen_helper_gvec_udot_idx_4b);
}
static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
@@ -374,7 +374,7 @@ static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
return false;
}
return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
- gen_helper_gvec_usdot_idx_b);
+ gen_helper_gvec_usdot_idx_4b);
}
static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
@@ -383,7 +383,7 @@ static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
return false;
}
return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
- gen_helper_gvec_sudot_idx_b);
+ gen_helper_gvec_sudot_idx_4b);
}
static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
@@ -1010,8 +1010,8 @@ DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
-DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
-DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
+DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_nf_s, gen_helper_gvec_fmla_nf_h)
+DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_nf_s, gen_helper_gvec_fmls_nf_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index fcbb350..65fc8bc 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -27,16 +27,25 @@
#include "decode-sme.c.inc"
+static bool sme2_zt0_enabled_check(DisasContext *s)
+{
+ if (!sme_za_enabled_check(s)) {
+ return false;
+ }
+ if (s->zt0_excp_el) {
+ gen_exception_insn_el(s, 0, EXCP_UDEF,
+ syn_smetrap(SME_ET_InaccessibleZT0, false),
+ s->zt0_excp_el);
+ return false;
+ }
+ return true;
+}
-/*
- * Resolve tile.size[index] to a host pointer, where tile and index
- * are always decoded together, dependent on the element size.
- */
+/* Resolve tile.size[rs+imm] to a host pointer. */
static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
- int tile_index, bool vertical)
+ int tile, int imm, int div_len,
+ int vec_mod, bool vertical)
{
- int tile = tile_index >> (4 - esz);
- int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz);
int pos, len, offset;
TCGv_i32 tmp;
TCGv_ptr addr;
@@ -44,10 +53,23 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
/* Compute the final index, which is Rs+imm. */
tmp = tcg_temp_new_i32();
tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs));
- tcg_gen_addi_i32(tmp, tmp, index);
+ /*
+ * Round the vector index down to a multiple of vec_mod if necessary.
+ * We do this before adding the offset, to handle cases like
+ * MOVA (tile to vector, 2 registers) where we want to call this
+ * several times in a loop with an increasing offset. We rely on
+ * the instruction encodings always forcing the initial offset in
+ * [rs + offset] to be a multiple of vec_mod. The pseudocode usually
+ * does the round-down after adding the offset rather than before,
+ * but MOVA is an exception.
+ */
+ if (vec_mod > 1) {
+ tcg_gen_andc_i32(tmp, tmp, tcg_constant_i32(vec_mod - 1));
+ }
+ tcg_gen_addi_i32(tmp, tmp, imm);
/* Prepare a power-of-two modulo via extraction of @len bits. */
- len = ctz32(streaming_vec_reg_size(s)) - esz;
+ len = ctz32(streaming_vec_reg_size(s) / div_len) - esz;
if (!len) {
/*
@@ -92,7 +114,7 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
offset = tile * sizeof(ARMVectorReg);
/* Include the byte offset of zarray to make this relative to env. */
- offset += offsetof(CPUARMState, zarray);
+ offset += offsetof(CPUARMState, za_state.za);
tcg_gen_addi_i32(tmp, tmp, offset);
/* Add the byte offset to env to produce the final pointer. */
@@ -103,6 +125,14 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs,
return addr;
}
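Because the number of selectable slices is always a power of two, neither the
Rs+imm wrap-around nor the vec_mod rounding needs a division: with
len = ctz32(svl / div_len) - esz, index mod 2^len is just the low len bits
(the "@len bits" extraction mentioned above), and rounding down to a multiple
of vec_mod is an ANDC with vec_mod - 1. In plain C:

    #include <stdint.h>

    /* Index mod 2^len, as get_tile_rowcol's extraction performs it.
     * Example: svl = 64 bytes, div_len = 1, esz = 2 (32-bit elements)
     * gives 16 slices and len = ctz32(64) - 2 = 4. */
    static uint32_t wrap_slice(uint32_t rs_plus_imm, unsigned len)
    {
        return rs_plus_imm & ((1u << len) - 1);
    }

    /* Round down to a multiple of a power-of-two vec_mod, as the
     * tcg_gen_andc_i32 above does for the multi-register MOVA forms. */
    static uint32_t round_down(uint32_t index, uint32_t vec_mod)
    {
        return index & ~(vec_mod - 1);
    }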
+/* Resolve ZArray[rs+imm] to a host pointer. */
+static TCGv_ptr get_zarray(DisasContext *s, int rs, int imm,
+ int div_len, int vec_mod)
+{
+ /* ZA[n] equates to ZA0H.B[n]. */
+ return get_tile_rowcol(s, MO_8, rs, 0, imm, div_len, vec_mod, false);
+}
+
/*
* Resolve tile.size[0] to a host pointer.
* Used by e.g. outer product insns where we require the entire tile.
@@ -112,7 +142,7 @@ static TCGv_ptr get_tile(DisasContext *s, int esz, int tile)
TCGv_ptr addr = tcg_temp_new_ptr();
int offset;
- offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, zarray);
+ offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, za_state.za);
tcg_gen_addi_ptr(addr, tcg_env, offset);
return addr;
@@ -130,7 +160,40 @@ static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
return true;
}
-static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
+static bool trans_ZERO_zt0(DisasContext *s, arg_ZERO_zt0 *a)
+{
+ if (!dc_isar_feature(aa64_sme2, s)) {
+ return false;
+ }
+ if (sme_enabled_check(s) && sme2_zt0_enabled_check(s)) {
+ tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUARMState, za_state.zt0),
+ sizeof_field(CPUARMState, za_state.zt0),
+ sizeof_field(CPUARMState, za_state.zt0), 0);
+ }
+ return true;
+}
+
+static bool trans_ZERO_za(DisasContext *s, arg_ZERO_za *a)
+{
+ if (!dc_isar_feature(aa64_sme2p1, s)) {
+ return false;
+ }
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int vstride = svl / a->ngrp;
+ TCGv_ptr t_za = get_zarray(s, a->rv, a->off, a->ngrp, a->nvec);
+
+ for (int r = 0; r < a->ngrp; ++r) {
+ for (int i = 0; i < a->nvec; ++i) {
+ int o_za = (r * vstride + i) * sizeof(ARMVectorReg);
+ tcg_gen_gvec_dup_imm_var(MO_64, t_za, o_za, svl, svl, 0);
+ }
+ }
+ }
+ return true;
+}
+
+static bool do_mova_tile(DisasContext *s, arg_mova_p *a, bool to_vec)
{
static gen_helper_gvec_4 * const h_fns[5] = {
gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
@@ -152,14 +215,11 @@ static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
TCGv_i32 t_desc;
int svl;
- if (!dc_isar_feature(aa64_sme, s)) {
- return false;
- }
if (!sme_smza_enabled_check(s)) {
return true;
}
- t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
+ t_za = get_tile_rowcol(s, a->esz, a->rs, a->za, a->off, 1, 0, a->v);
t_zr = vec_full_reg_ptr(s, a->zr);
t_pg = pred_full_reg_ptr(s, a->pg);
@@ -168,14 +228,14 @@ static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
if (a->v) {
/* Vertical slice -- use sme mova helpers. */
- if (a->to_vec) {
+ if (to_vec) {
zc_fns[a->esz](t_zr, t_za, t_pg, t_desc);
} else {
cz_fns[a->esz](t_za, t_zr, t_pg, t_desc);
}
} else {
/* Horizontal slice -- reuse sve sel helpers. */
- if (a->to_vec) {
+ if (to_vec) {
h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc);
} else {
h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc);
@@ -184,6 +244,147 @@ static bool trans_MOVA(DisasContext *s, arg_MOVA *a)
return true;
}
+TRANS_FEAT(MOVA_tz, aa64_sme, do_mova_tile, a, false)
+TRANS_FEAT(MOVA_zt, aa64_sme, do_mova_tile, a, true)
+
+static bool do_mova_tile_n(DisasContext *s, arg_mova_t *a, int n,
+ bool to_vec, bool zero)
+{
+ static gen_helper_gvec_2 * const cz_fns[] = {
+ gen_helper_sme2_mova_cz_b, gen_helper_sme2_mova_cz_h,
+ gen_helper_sme2_mova_cz_s, gen_helper_sme2_mova_cz_d,
+ };
+ static gen_helper_gvec_2 * const zc_fns[] = {
+ gen_helper_sme2_mova_zc_b, gen_helper_sme2_mova_zc_h,
+ gen_helper_sme2_mova_zc_s, gen_helper_sme2_mova_zc_d,
+ };
+ static gen_helper_gvec_2 * const zc_z_fns[] = {
+ gen_helper_sme2p1_movaz_zc_b, gen_helper_sme2p1_movaz_zc_h,
+ gen_helper_sme2p1_movaz_zc_s, gen_helper_sme2p1_movaz_zc_d,
+ gen_helper_sme2p1_movaz_zc_q,
+ };
+ TCGv_ptr t_za;
+ int svl, bytes_per_op = n << a->esz;
+
+ /*
+ * The MaxImplementedSVL check happens in the decode pseudocode,
+ * before the SM+ZA enabled check in the operation pseudocode.
+ * This will (currently) only fail for NREG=4, ESZ=MO_64.
+ */
+ if (s->max_svl < bytes_per_op) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ assert(a->esz <= MO_64 + zero);
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ svl = streaming_vec_reg_size(s);
+
+ /*
+ * The CurrentVL check happens in the operation pseudocode,
+ * after the SM+ZA enabled check.
+ */
+ if (svl < bytes_per_op) {
+ unallocated_encoding(s);
+ return true;
+ }
+
+ if (a->v) {
+ TCGv_i32 t_desc = tcg_constant_i32(simd_desc(svl, svl, 0));
+
+ for (int i = 0; i < n; ++i) {
+ TCGv_ptr t_zr = vec_full_reg_ptr(s, a->zr * n + i);
+ t_za = get_tile_rowcol(s, a->esz, a->rs, a->za,
+ a->off * n + i, 1, n, a->v);
+ if (zero) {
+ zc_z_fns[a->esz](t_zr, t_za, t_desc);
+ } else if (to_vec) {
+ zc_fns[a->esz](t_zr, t_za, t_desc);
+ } else {
+ cz_fns[a->esz](t_za, t_zr, t_desc);
+ }
+ }
+ } else {
+ for (int i = 0; i < n; ++i) {
+ int o_zr = vec_full_reg_offset(s, a->zr * n + i);
+ t_za = get_tile_rowcol(s, a->esz, a->rs, a->za,
+ a->off * n + i, 1, n, a->v);
+ if (to_vec) {
+ tcg_gen_gvec_mov_var(MO_8, tcg_env, o_zr, t_za, 0, svl, svl);
+ if (zero) {
+ tcg_gen_gvec_dup_imm_var(MO_8, t_za, 0, svl, svl, 0);
+ }
+ } else {
+ tcg_gen_gvec_mov_var(MO_8, t_za, 0, tcg_env, o_zr, svl, svl);
+ }
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT(MOVA_tz2, aa64_sme2, do_mova_tile_n, a, 2, false, false)
+TRANS_FEAT(MOVA_tz4, aa64_sme2, do_mova_tile_n, a, 4, false, false)
+TRANS_FEAT(MOVA_zt2, aa64_sme2, do_mova_tile_n, a, 2, true, false)
+TRANS_FEAT(MOVA_zt4, aa64_sme2, do_mova_tile_n, a, 4, true, false)
+
+TRANS_FEAT(MOVAZ_zt, aa64_sme2p1, do_mova_tile_n, a, 1, true, true)
+TRANS_FEAT(MOVAZ_zt2, aa64_sme2p1, do_mova_tile_n, a, 2, true, true)
+TRANS_FEAT(MOVAZ_zt4, aa64_sme2p1, do_mova_tile_n, a, 4, true, true)
+
+static bool do_mova_array_n(DisasContext *s, arg_mova_a *a, int n,
+ bool to_vec, bool zero)
+{
+ TCGv_ptr t_za;
+ int svl;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ svl = streaming_vec_reg_size(s);
+ t_za = get_zarray(s, a->rv, a->off, n, 0);
+
+ for (int i = 0; i < n; ++i) {
+ int o_za = (svl / n * sizeof(ARMVectorReg)) * i;
+ int o_zr = vec_full_reg_offset(s, a->zr * n + i);
+
+ if (to_vec) {
+ tcg_gen_gvec_mov_var(MO_8, tcg_env, o_zr, t_za, o_za, svl, svl);
+ if (zero) {
+ tcg_gen_gvec_dup_imm_var(MO_8, t_za, o_za, svl, svl, 0);
+ }
+ } else {
+ tcg_gen_gvec_mov_var(MO_8, t_za, o_za, tcg_env, o_zr, svl, svl);
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT(MOVA_az2, aa64_sme2, do_mova_array_n, a, 2, false, false)
+TRANS_FEAT(MOVA_az4, aa64_sme2, do_mova_array_n, a, 4, false, false)
+TRANS_FEAT(MOVA_za2, aa64_sme2, do_mova_array_n, a, 2, true, false)
+TRANS_FEAT(MOVA_za4, aa64_sme2, do_mova_array_n, a, 4, true, false)
+
+TRANS_FEAT(MOVAZ_za2, aa64_sme2p1, do_mova_array_n, a, 2, true, true)
+TRANS_FEAT(MOVAZ_za4, aa64_sme2p1, do_mova_array_n, a, 4, true, true)
+
+static bool do_movt(DisasContext *s, arg_MOVT_rzt *a,
+ void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
+{
+ if (sme2_zt0_enabled_check(s)) {
+ func(cpu_reg(s, a->rt), tcg_env,
+ offsetof(CPUARMState, za_state.zt0) + a->off * 8);
+ }
+ return true;
+}
+
+TRANS_FEAT(MOVT_rzt, aa64_sme2, do_movt, a, tcg_gen_ld_i64)
+TRANS_FEAT(MOVT_ztr, aa64_sme2, do_movt, a, tcg_gen_st_i64)
+
static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
{
typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32);
@@ -225,7 +426,7 @@ static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
return true;
}
- t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v);
+ t_za = get_tile_rowcol(s, a->esz, a->rs, a->za, a->off, 1, 0, a->v);
t_pg = pred_full_reg_ptr(s, a->pg);
addr = tcg_temp_new_i64();
@@ -243,28 +444,37 @@ static bool trans_LDST1(DisasContext *s, arg_LDST1 *a)
return true;
}
-typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int);
+typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int, MemOp);
static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn)
{
- int svl = streaming_vec_reg_size(s);
- int imm = a->imm;
- TCGv_ptr base;
+ if (sme_za_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int imm = a->imm;
+ TCGv_ptr base = get_zarray(s, a->rv, imm, 1, 0);

- if (!sme_za_enabled_check(s)) {
- return true;
+ fn(s, base, 0, svl, a->rn, imm * svl,
+ s->align_mem ? MO_ALIGN_16 : MO_UNALN);
}
-
- /* ZA[n] equates to ZA0H.B[n]. */
- base = get_tile_rowcol(s, MO_8, a->rv, imm, false);
-
- fn(s, base, 0, svl, a->rn, imm * svl);
return true;
}
TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr)
TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str)
+static bool do_ldst_zt0(DisasContext *s, arg_ldstzt0 *a, GenLdStR *fn)
+{
+ if (sme2_zt0_enabled_check(s)) {
+ fn(s, tcg_env, offsetof(CPUARMState, za_state.zt0),
+ sizeof_field(CPUARMState, za_state.zt0), a->rn, 0,
+ s->align_mem ? MO_ALIGN_16 : MO_UNALN);
+ }
+ return true;
+}
+
+TRANS_FEAT(LDR_zt0, aa64_sme2, do_ldst_zt0, a, gen_sve_ldr)
+TRANS_FEAT(STR_zt0, aa64_sme2, do_ldst_zt0, a, gen_sve_str)
+
static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
gen_helper_gvec_4 *fn)
{
@@ -316,7 +526,7 @@ static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz,
gen_helper_gvec_5_ptr *fn)
{
int svl = streaming_vec_reg_size(s);
- uint32_t desc = simd_desc(svl, svl, a->sub);
+ uint32_t desc = simd_desc(svl, svl, 0);
TCGv_ptr za, zn, zm, pn, pm, fpst;
if (!sme_smza_enabled_check(s)) {
@@ -338,7 +548,7 @@ static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz,
gen_helper_gvec_5_ptr *fn)
{
int svl = streaming_vec_reg_size(s);
- uint32_t desc = simd_desc(svl, svl, a->sub);
+ uint32_t desc = simd_desc(svl, svl, 0);
TCGv_ptr za, zn, zm, pn, pm;
if (!sme_smza_enabled_check(s)) {
@@ -355,14 +565,32 @@ static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz,
return true;
}
-TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_env, a,
- MO_32, gen_helper_sme_fmopa_h)
-TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a,
- MO_32, FPST_A64, gen_helper_sme_fmopa_s)
-TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a,
- MO_64, FPST_A64, gen_helper_sme_fmopa_d)
+TRANS_FEAT(FMOPA_w_h, aa64_sme, do_outprod_env, a, MO_32,
+ !a->sub ? gen_helper_sme_fmopa_w_h
+ : !s->fpcr_ah ? gen_helper_sme_fmops_w_h
+ : gen_helper_sme_ah_fmops_w_h)
+TRANS_FEAT(FMOPA_h, aa64_sme_f16f16, do_outprod_fpst, a, MO_16, FPST_ZA_F16,
+ !a->sub ? gen_helper_sme_fmopa_h
+ : !s->fpcr_ah ? gen_helper_sme_fmops_h
+ : gen_helper_sme_ah_fmops_h)
+TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, FPST_ZA,
+ !a->sub ? gen_helper_sme_fmopa_s
+ : !s->fpcr_ah ? gen_helper_sme_fmops_s
+ : gen_helper_sme_ah_fmops_s)
+TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, FPST_ZA,
+ !a->sub ? gen_helper_sme_fmopa_d
+ : !s->fpcr_ah ? gen_helper_sme_fmops_d
+ : gen_helper_sme_ah_fmops_d)
+
+TRANS_FEAT(BFMOPA, aa64_sme_b16b16, do_outprod_fpst, a, MO_16, FPST_ZA,
+ !a->sub ? gen_helper_sme_bfmopa
+ : !s->fpcr_ah ? gen_helper_sme_bfmops
+ : gen_helper_sme_ah_bfmops)

-TRANS_FEAT(BFMOPA, aa64_sme, do_outprod_env, a, MO_32, gen_helper_sme_bfmopa)
+TRANS_FEAT(BFMOPA_w, aa64_sme, do_outprod_env, a, MO_32,
+ !a->sub ? gen_helper_sme_bfmopa_w
+ : !s->fpcr_ah ? gen_helper_sme_bfmops_w
+ : gen_helper_sme_ah_bfmops_w)
TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s)
TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s)
@@ -373,3 +601,1173 @@ TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_
TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d)
TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d)
TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d)
+
+TRANS_FEAT(BMOPA, aa64_sme2, do_outprod, a, MO_32, gen_helper_sme2_bmopa_s)
+TRANS_FEAT(SMOPA2_s, aa64_sme2, do_outprod, a, MO_32, gen_helper_sme2_smopa2_s)
+TRANS_FEAT(UMOPA2_s, aa64_sme2, do_outprod, a, MO_32, gen_helper_sme2_umopa2_s)
+
+static bool do_z2z_n1(DisasContext *s, arg_z2z_en *a, GVecGen3Fn *fn)
+{
+ int esz, dn, vsz, mofs, n;
+ bool overlap = false;
+
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+
+ esz = a->esz;
+ n = a->n;
+ dn = a->zdn;
+ mofs = vec_full_reg_offset(s, a->zm);
+ vsz = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; i++) {
+ int dofs = vec_full_reg_offset(s, dn + i);
+ if (dofs == mofs) {
+ overlap = true;
+ } else {
+ fn(esz, dofs, dofs, mofs, vsz, vsz);
+ }
+ }
+ if (overlap) {
+ fn(esz, mofs, mofs, mofs, vsz, vsz);
+ }
+ return true;
+}
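do_z2z_n1 defers the iteration whose destination aliases Zm until after the loop, so earlier writes to Zdn+i cannot clobber the shared source. A toy scalar sketch of why the deferral matters; plain arrays stand in for the Z registers:

    /* Not QEMU code: demonstrates the overlap deferral above. */
    #include <stdio.h>

    static void add_vec(int *d, const int *n, const int *m) /* d = n + m */
    {
        for (int i = 0; i < 4; i++) {
            d[i] = n[i] + m[i];
        }
    }

    int main(void)
    {
        int z[4][4] = { {1,1,1,1}, {2,2,2,2} };
        int dn = 0, zm = 1, n = 2;  /* Zdn = {Z0,Z1}, Zm = Z1: overlap */
        int overlap = -1;

        for (int i = 0; i < n; i++) {
            if (dn + i == zm) {
                overlap = dn + i;   /* defer Z1 += Z1 until last */
            } else {
                add_vec(z[dn + i], z[dn + i], z[zm]);
            }
        }
        if (overlap >= 0) {
            add_vec(z[overlap], z[overlap], z[overlap]);
        }
        printf("Z0[0]=%d Z1[0]=%d\n", z[0][0], z[1][0]);  /* 3 and 4 */
        return 0;
    }

Had Z1 been doubled before Z0 consumed it, Z0 would read 5 rather than 3.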
+
+static void gen_sme2_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_3 * const fns[] = {
+ gen_helper_gvec_srshl_b, gen_helper_sme2_srshl_h,
+ gen_helper_sme2_srshl_s, gen_helper_sme2_srshl_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
+}
+
+static void gen_sme2_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_3 * const fns[] = {
+ gen_helper_gvec_urshl_b, gen_helper_sme2_urshl_h,
+ gen_helper_sme2_urshl_s, gen_helper_sme2_urshl_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
+}
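The SRSHL/URSHL expansions shift each element by a signed, per-element count; a negative count is a rounding shift right. A scalar model of that rounding step, with semantics assumed from the helper names and shift counts kept small so the arithmetic stays obviously in range:

    /* Round-half-up shift right, as *RSHL applies for negative counts
     * (assumed semantics; arithmetic >> of a negative value is
     * implementation-defined in ISO C but behaves as shown on the
     * usual two's-complement compilers). */
    #include <stdio.h>
    #include <stdint.h>

    static int32_t rshr32(int32_t x, int sh)    /* 1 <= sh <= 8 */
    {
        return (x + (1 << (sh - 1))) >> sh;
    }

    int main(void)
    {
        printf("%d\n", rshr32(5, 1));   /* (5 + 1) >> 1 == 3, not 2 */
        printf("%d\n", rshr32(-5, 1));  /* (-5 + 1) >> 1 == -2 */
        return 0;
    }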
+
+TRANS_FEAT(ADD_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_add)
+TRANS_FEAT(SMAX_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_smax)
+TRANS_FEAT(SMIN_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_smin)
+TRANS_FEAT(UMAX_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_umax)
+TRANS_FEAT(UMIN_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_umin)
+TRANS_FEAT(SRSHL_n1, aa64_sme2, do_z2z_n1, a, gen_sme2_srshl)
+TRANS_FEAT(URSHL_n1, aa64_sme2, do_z2z_n1, a, gen_sme2_urshl)
+TRANS_FEAT(SQDMULH_n1, aa64_sme2, do_z2z_n1, a, gen_gvec_sve2_sqdmulh)
+
+static bool do_z2z_nn(DisasContext *s, arg_z2z_en *a, GVecGen3Fn *fn)
+{
+ int esz, dn, dm, vsz, n;
+
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+
+ esz = a->esz;
+ n = a->n;
+ dn = a->zdn;
+ dm = a->zm;
+ vsz = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; i++) {
+ int dofs = vec_full_reg_offset(s, dn + i);
+ int mofs = vec_full_reg_offset(s, dm + i);
+
+ fn(esz, dofs, dofs, mofs, vsz, vsz);
+ }
+ return true;
+}
+
+TRANS_FEAT(SMAX_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_smax)
+TRANS_FEAT(SMIN_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_smin)
+TRANS_FEAT(UMAX_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_umax)
+TRANS_FEAT(UMIN_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_umin)
+TRANS_FEAT(SRSHL_nn, aa64_sme2, do_z2z_nn, a, gen_sme2_srshl)
+TRANS_FEAT(URSHL_nn, aa64_sme2, do_z2z_nn, a, gen_sme2_urshl)
+TRANS_FEAT(SQDMULH_nn, aa64_sme2, do_z2z_nn, a, gen_gvec_sve2_sqdmulh)
+
+static bool do_z2z_n1_fpst(DisasContext *s, arg_z2z_en *a,
+ gen_helper_gvec_3_ptr * const fns[4])
+{
+ int esz = a->esz, n, dn, vsz, mofs;
+ bool overlap = false;
+ gen_helper_gvec_3_ptr *fn;
+ TCGv_ptr fpst;
+
+ /* These insns use MO_8 to encode BFloat16. */
+ if (esz == MO_8 && !dc_isar_feature(aa64_sme_b16b16, s)) {
+ return false;
+ }
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(esz == MO_16 ? FPST_A64_F16 : FPST_A64);
+ fn = fns[esz];
+ n = a->n;
+ dn = a->zdn;
+ mofs = vec_full_reg_offset(s, a->zm);
+ vsz = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; i++) {
+ int dofs = vec_full_reg_offset(s, dn + i);
+ if (dofs == mofs) {
+ overlap = true;
+ } else {
+ tcg_gen_gvec_3_ptr(dofs, dofs, mofs, fpst, vsz, vsz, 0, fn);
+ }
+ }
+ if (overlap) {
+ tcg_gen_gvec_3_ptr(mofs, mofs, mofs, fpst, vsz, vsz, 0, fn);
+ }
+ return true;
+}
+
+static bool do_z2z_nn_fpst(DisasContext *s, arg_z2z_en *a,
+ gen_helper_gvec_3_ptr * const fns[4])
+{
+ int esz = a->esz, n, dn, dm, vsz;
+ gen_helper_gvec_3_ptr *fn;
+ TCGv_ptr fpst;
+
+ if (esz == MO_8 && !dc_isar_feature(aa64_sme_b16b16, s)) {
+ return false;
+ }
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(esz == MO_16 ? FPST_A64_F16 : FPST_A64);
+ fn = fns[esz];
+ n = a->n;
+ dn = a->zdn;
+ dm = a->zm;
+ vsz = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; i++) {
+ int dofs = vec_full_reg_offset(s, dn + i);
+ int mofs = vec_full_reg_offset(s, dm + i);
+
+ tcg_gen_gvec_3_ptr(dofs, dofs, mofs, fpst, vsz, vsz, 0, fn);
+ }
+ return true;
+}
+
+static gen_helper_gvec_3_ptr * const f_vector_fmax[2][4] = {
+ { gen_helper_gvec_fmax_b16,
+ gen_helper_gvec_fmax_h,
+ gen_helper_gvec_fmax_s,
+ gen_helper_gvec_fmax_d },
+ { gen_helper_gvec_ah_fmax_b16,
+ gen_helper_gvec_ah_fmax_h,
+ gen_helper_gvec_ah_fmax_s,
+ gen_helper_gvec_ah_fmax_d },
+};
+TRANS_FEAT(FMAX_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fmax[s->fpcr_ah])
+TRANS_FEAT(FMAX_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fmax[s->fpcr_ah])
+
+static gen_helper_gvec_3_ptr * const f_vector_fmin[2][4] = {
+ { gen_helper_gvec_fmin_b16,
+ gen_helper_gvec_fmin_h,
+ gen_helper_gvec_fmin_s,
+ gen_helper_gvec_fmin_d },
+ { gen_helper_gvec_ah_fmin_b16,
+ gen_helper_gvec_ah_fmin_h,
+ gen_helper_gvec_ah_fmin_s,
+ gen_helper_gvec_ah_fmin_d },
+};
+TRANS_FEAT(FMIN_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fmin[s->fpcr_ah])
+TRANS_FEAT(FMIN_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fmin[s->fpcr_ah])
+
+static gen_helper_gvec_3_ptr * const f_vector_fmaxnm[4] = {
+ gen_helper_gvec_fmaxnum_b16,
+ gen_helper_gvec_fmaxnum_h,
+ gen_helper_gvec_fmaxnum_s,
+ gen_helper_gvec_fmaxnum_d,
+};
+TRANS_FEAT(FMAXNM_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fmaxnm)
+TRANS_FEAT(FMAXNM_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fmaxnm)
+
+static gen_helper_gvec_3_ptr * const f_vector_fminnm[4] = {
+ gen_helper_gvec_fminnum_b16,
+ gen_helper_gvec_fminnum_h,
+ gen_helper_gvec_fminnum_s,
+ gen_helper_gvec_fminnum_d,
+};
+TRANS_FEAT(FMINNM_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fminnm)
+TRANS_FEAT(FMINNM_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fminnm)
+
+/* Add/Sub vector Z[m] to each Z[n*N] with result in ZA[d*N]. */
+static bool do_azz_n1(DisasContext *s, arg_azz_n *a, int esz,
+ GVecGen3FnVar *fn)
+{
+ TCGv_ptr t_za;
+ int svl, n, o_zm;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ n = a->n;
+ t_za = get_zarray(s, a->rv, a->off, n, 0);
+ o_zm = vec_full_reg_offset(s, a->zm);
+ svl = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; ++i) {
+ int o_za = (svl / n * sizeof(ARMVectorReg)) * i;
+ int o_zn = vec_full_reg_offset(s, (a->zn + i) % 32);
+
+ fn(esz, t_za, o_za, tcg_env, o_zn, tcg_env, o_zm, svl, svl);
+ }
+ return true;
+}
+
+TRANS_FEAT(ADD_azz_n1_s, aa64_sme2, do_azz_n1, a, MO_32, tcg_gen_gvec_add_var)
+TRANS_FEAT(SUB_azz_n1_s, aa64_sme2, do_azz_n1, a, MO_32, tcg_gen_gvec_sub_var)
+TRANS_FEAT(ADD_azz_n1_d, aa64_sme2_i16i64, do_azz_n1, a, MO_64, tcg_gen_gvec_add_var)
+TRANS_FEAT(SUB_azz_n1_d, aa64_sme2_i16i64, do_azz_n1, a, MO_64, tcg_gen_gvec_sub_var)
+
+/* Add/Sub each vector Z[m*N] to each Z[n*N] with result in ZA[d*N]. */
+static bool do_azz_nn(DisasContext *s, arg_azz_n *a, int esz,
+ GVecGen3FnVar *fn)
+{
+ TCGv_ptr t_za;
+ int svl, n;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ n = a->n;
+ t_za = get_zarray(s, a->rv, a->off, n, 1);
+ svl = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; ++i) {
+ int o_za = (svl / n * sizeof(ARMVectorReg)) * i;
+ int o_zn = vec_full_reg_offset(s, a->zn + i);
+ int o_zm = vec_full_reg_offset(s, a->zm + i);
+
+ fn(esz, t_za, o_za, tcg_env, o_zn, tcg_env, o_zm, svl, svl);
+ }
+ return true;
+}
+
+TRANS_FEAT(ADD_azz_nn_s, aa64_sme2, do_azz_nn, a, MO_32, tcg_gen_gvec_add_var)
+TRANS_FEAT(SUB_azz_nn_s, aa64_sme2, do_azz_nn, a, MO_32, tcg_gen_gvec_sub_var)
+TRANS_FEAT(ADD_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_add_var)
+TRANS_FEAT(SUB_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_sub_var)
+
+/* Add/Sub each vector Z[m*N] into the accumulator ZA[d*N]. */
+static bool do_aaz(DisasContext *s, arg_az_n *a, int esz, GVecGen3FnVar *fn)
+{
+ TCGv_ptr t_za;
+ int svl, n;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ n = a->n;
+ t_za = get_zarray(s, a->rv, a->off, n, 0);
+ svl = streaming_vec_reg_size(s);
+
+ for (int i = 0; i < n; ++i) {
+ int o_za = (svl / n * sizeof(ARMVectorReg)) * i;
+ int o_zm = vec_full_reg_offset(s, a->zm + i);
+
+ fn(esz, t_za, o_za, t_za, o_za, tcg_env, o_zm, svl, svl);
+ }
+ return true;
+}
+
+TRANS_FEAT(ADD_aaz_s, aa64_sme2, do_aaz, a, MO_32, tcg_gen_gvec_add_var)
+TRANS_FEAT(SUB_aaz_s, aa64_sme2, do_aaz, a, MO_32, tcg_gen_gvec_sub_var)
+TRANS_FEAT(ADD_aaz_d, aa64_sme2_i16i64, do_aaz, a, MO_64, tcg_gen_gvec_add_var)
+TRANS_FEAT(SUB_aaz_d, aa64_sme2_i16i64, do_aaz, a, MO_64, tcg_gen_gvec_sub_var)
+
+/*
+ * Expand array multi-vector single (n1), array multi-vector (nn),
+ * and array multi-vector indexed (nx), for floating-point accumulate.
+ * multi: true for nn, false for n1.
+ * fpst: >= 0 to set ptr argument for FPST_*, < 0 for ENV.
+ * data: value passed via simd_data, including any index.
+ */
+#define FPST_ENV -1
+
+static bool do_azz_fp(DisasContext *s, int nreg, int nsel,
+ int rv, int off, int zn, int zm,
+ int data, int shsel, bool multi, int fpst,
+ gen_helper_gvec_3_ptr *fn)
+{
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int vstride = svl / nreg;
+ TCGv_ptr t_za = get_zarray(s, rv, off, nreg, nsel);
+ TCGv_ptr t, ptr;
+
+ if (fpst >= 0) {
+ ptr = fpstatus_ptr(fpst);
+ } else {
+ ptr = tcg_env;
+ }
+ t = tcg_temp_new_ptr();
+
+ for (int r = 0; r < nreg; ++r) {
+ TCGv_ptr t_zn = vec_full_reg_ptr(s, zn);
+ TCGv_ptr t_zm = vec_full_reg_ptr(s, zm);
+
+ for (int i = 0; i < nsel; ++i) {
+ int o_za = (r * vstride + i) * sizeof(ARMVectorReg);
+ int desc = simd_desc(svl, svl, data | (i << shsel));
+
+ tcg_gen_addi_ptr(t, t_za, o_za);
+ fn(t, t_zn, t_zm, ptr, tcg_constant_i32(desc));
+ }
+
+ /*
+ * For multiple-and-single vectors, Zn may wrap.
+ * For multiple vectors, both Zn and Zm are aligned.
+ */
+ zn = (zn + 1) % 32;
+ zm += multi;
+ }
+ }
+ return true;
+}
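This expander (and the accumulating variant that follows) packs the per-iteration state into the simd_data field as data | (i << shsel), so a single helper can recover its parameters from the descriptor. For the FMLAL expansion below, data is (1 << 2) | sub with shsel = 1. A sketch of the packing; the bit assignments the real helpers use are an assumption here:

    /* Illustrates the data | (i << shsel) composition only. */
    #include <stdio.h>

    int main(void)
    {
        int sub = 1, shsel = 1;
        int data = (1 << 2) | sub;      /* as passed by do_fmlal below */

        for (int i = 0; i < 2; i++) {   /* nsel == 2 halves */
            int word = data | (i << shsel);
            printf("i=%d word=0x%x (sub=%d, sel=%d)\n",
                   i, word, word & 1, (word >> shsel) & 1);
        }
        return 0;
    }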
+
+static bool do_azz_acc_fp(DisasContext *s, int nreg, int nsel,
+ int rv, int off, int zn, int zm,
+ int data, int shsel, bool multi, int fpst,
+ gen_helper_gvec_4_ptr *fn)
+{
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int vstride = svl / nreg;
+ TCGv_ptr t_za = get_zarray(s, rv, off, nreg, nsel);
+ TCGv_ptr t, ptr;
+
+ if (fpst >= 0) {
+ ptr = fpstatus_ptr(fpst);
+ } else {
+ ptr = tcg_env;
+ }
+ t = tcg_temp_new_ptr();
+
+ for (int r = 0; r < nreg; ++r) {
+ TCGv_ptr t_zn = vec_full_reg_ptr(s, zn);
+ TCGv_ptr t_zm = vec_full_reg_ptr(s, zm);
+
+ for (int i = 0; i < nsel; ++i) {
+ int o_za = (r * vstride + i) * sizeof(ARMVectorReg);
+ int desc = simd_desc(svl, svl, data | (i << shsel));
+
+ tcg_gen_addi_ptr(t, t_za, o_za);
+ fn(t, t_zn, t_zm, t, ptr, tcg_constant_i32(desc));
+ }
+
+ /*
+ * For multiple-and-single vectors, Zn may wrap.
+ * For multiple vectors, both Zn and Zm are aligned.
+ */
+ zn = (zn + 1) % 32;
+ zm += multi;
+ }
+ }
+ return true;
+}
+
+static bool do_fmlal(DisasContext *s, arg_azz_n *a, bool sub, bool multi)
+{
+ return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+ (1 << 2) | sub, 1,
+ multi, FPST_ENV, gen_helper_sve2_fmlal_zzzw_s);
+}
+
+TRANS_FEAT(FMLAL_n1, aa64_sme2, do_fmlal, a, false, false)
+TRANS_FEAT(FMLSL_n1, aa64_sme2, do_fmlal, a, true, false)
+TRANS_FEAT(FMLAL_nn, aa64_sme2, do_fmlal, a, false, true)
+TRANS_FEAT(FMLSL_nn, aa64_sme2, do_fmlal, a, true, true)
+
+static bool do_fmlal_nx(DisasContext *s, arg_azx_n *a, bool sub)
+{
+ return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+ (a->idx << 3) | (1 << 2) | sub, 1,
+ false, FPST_ENV, gen_helper_sve2_fmlal_zzxw_s);
+}
+
+TRANS_FEAT(FMLAL_nx, aa64_sme2, do_fmlal_nx, a, false)
+TRANS_FEAT(FMLSL_nx, aa64_sme2, do_fmlal_nx, a, true)
+
+static bool do_bfmlal(DisasContext *s, arg_azz_n *a, bool sub, bool multi)
+{
+ return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+ 0, 0, multi, FPST_ZA,
+ (!sub ? gen_helper_gvec_bfmlal
+ : s->fpcr_ah ? gen_helper_gvec_ah_bfmlsl
+ : gen_helper_gvec_bfmlsl));
+}
+
+TRANS_FEAT(BFMLAL_n1, aa64_sme2, do_bfmlal, a, false, false)
+TRANS_FEAT(BFMLSL_n1, aa64_sme2, do_bfmlal, a, true, false)
+TRANS_FEAT(BFMLAL_nn, aa64_sme2, do_bfmlal, a, false, true)
+TRANS_FEAT(BFMLSL_nn, aa64_sme2, do_bfmlal, a, true, true)
+
+static bool do_bfmlal_nx(DisasContext *s, arg_azx_n *a, bool sub)
+{
+ return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+ a->idx << 1, 0, false, FPST_ZA,
+ !sub ? gen_helper_gvec_bfmlal_idx
+ : s->fpcr_ah ? gen_helper_gvec_ah_bfmlsl_idx
+ : gen_helper_gvec_bfmlsl_idx);
+}
+
+TRANS_FEAT(BFMLAL_nx, aa64_sme2, do_bfmlal_nx, a, false)
+TRANS_FEAT(BFMLSL_nx, aa64_sme2, do_bfmlal_nx, a, true)
+
+static bool do_fdot(DisasContext *s, arg_azz_n *a, bool multi)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, 1, 0,
+ multi, FPST_ENV, gen_helper_sme2_fdot_h);
+}
+
+TRANS_FEAT(FDOT_n1, aa64_sme2, do_fdot, a, false)
+TRANS_FEAT(FDOT_nn, aa64_sme2, do_fdot, a, true)
+
+static bool do_fdot_nx(DisasContext *s, arg_azx_n *a)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ a->idx | (1 << 2), 0, false, FPST_ENV,
+ gen_helper_sme2_fdot_idx_h);
+}
+
+TRANS_FEAT(FDOT_nx, aa64_sme2, do_fdot_nx, a)
+
+static bool do_bfdot(DisasContext *s, arg_azz_n *a, bool multi)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, 0, 0,
+ multi, FPST_ENV, gen_helper_gvec_bfdot);
+}
+
+TRANS_FEAT(BFDOT_n1, aa64_sme2, do_bfdot, a, false)
+TRANS_FEAT(BFDOT_nn, aa64_sme2, do_bfdot, a, true)
+
+static bool do_bfdot_nx(DisasContext *s, arg_azx_n *a)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, a->idx, 0,
+ false, FPST_ENV, gen_helper_gvec_bfdot_idx);
+}
+
+TRANS_FEAT(BFDOT_nx, aa64_sme2, do_bfdot_nx, a)
+
+static bool do_vdot(DisasContext *s, arg_azx_n *a, gen_helper_gvec_4_ptr *fn)
+{
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int vstride = svl / 2;
+ TCGv_ptr t_za = get_zarray(s, a->rv, a->off, 2, 1);
+ TCGv_ptr t_zn = vec_full_reg_ptr(s, a->zn);
+ TCGv_ptr t_zm = vec_full_reg_ptr(s, a->zm);
+ TCGv_ptr t = tcg_temp_new_ptr();
+
+ for (int i = 0; i < 2; ++i) {
+ int o_za = i * vstride * sizeof(ARMVectorReg);
+ int desc = simd_desc(svl, svl, a->idx | (i << 2));
+
+ tcg_gen_addi_ptr(t, t_za, o_za);
+ fn(t, t_zn, t_zm, t, tcg_env, tcg_constant_i32(desc));
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT(FVDOT, aa64_sme2, do_vdot, a, gen_helper_sme2_fvdot_idx_h)
+TRANS_FEAT(BFVDOT, aa64_sme2, do_vdot, a, gen_helper_sme2_bfvdot_idx)
+
+static bool do_fmla(DisasContext *s, arg_azz_n *a, bool multi,
+ ARMFPStatusFlavour fpst, gen_helper_gvec_3_ptr *fn)
+{
+ return do_azz_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ 0, 0, multi, fpst, fn);
+}
+
+TRANS_FEAT(FMLA_n1_h, aa64_sme_f16f16, do_fmla, a, false, FPST_ZA_F16,
+ gen_helper_gvec_vfma_h)
+TRANS_FEAT(FMLS_n1_h, aa64_sme_f16f16, do_fmla, a, false, FPST_ZA_F16,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_h : gen_helper_gvec_vfms_h)
+TRANS_FEAT(FMLA_nn_h, aa64_sme_f16f16, do_fmla, a, true, FPST_ZA_F16,
+ gen_helper_gvec_vfma_h)
+TRANS_FEAT(FMLS_nn_h, aa64_sme_f16f16, do_fmla, a, true, FPST_ZA_F16,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_h : gen_helper_gvec_vfms_h)
+
+TRANS_FEAT(FMLA_n1_s, aa64_sme2, do_fmla, a, false, FPST_ZA,
+ gen_helper_gvec_vfma_s)
+TRANS_FEAT(FMLS_n1_s, aa64_sme2, do_fmla, a, false, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_s : gen_helper_gvec_vfms_s)
+TRANS_FEAT(FMLA_nn_s, aa64_sme2, do_fmla, a, true, FPST_ZA,
+ gen_helper_gvec_vfma_s)
+TRANS_FEAT(FMLS_nn_s, aa64_sme2, do_fmla, a, true, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_s : gen_helper_gvec_vfms_s)
+
+TRANS_FEAT(FMLA_n1_d, aa64_sme2_f64f64, do_fmla, a, false, FPST_ZA,
+ gen_helper_gvec_vfma_d)
+TRANS_FEAT(FMLS_n1_d, aa64_sme2_f64f64, do_fmla, a, false, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d)
+TRANS_FEAT(FMLA_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA,
+ gen_helper_gvec_vfma_d)
+TRANS_FEAT(FMLS_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d)
+
+TRANS_FEAT(BFMLA_n1, aa64_sme_b16b16, do_fmla, a, false, FPST_ZA,
+ gen_helper_gvec_bfmla)
+TRANS_FEAT(BFMLS_n1, aa64_sme_b16b16, do_fmla, a, false, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_bfmls : gen_helper_gvec_bfmls)
+TRANS_FEAT(BFMLA_nn, aa64_sme_b16b16, do_fmla, a, true, FPST_ZA,
+ gen_helper_gvec_bfmla)
+TRANS_FEAT(BFMLS_nn, aa64_sme_b16b16, do_fmla, a, true, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_bfmls : gen_helper_gvec_bfmls)
+
+static bool do_fmla_nx(DisasContext *s, arg_azx_n *a,
+ ARMFPStatusFlavour fpst, gen_helper_gvec_4_ptr *fn)
+{
+ return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ a->idx, 0, false, fpst, fn);
+}
+
+TRANS_FEAT(FMLA_nx_h, aa64_sme_f16f16, do_fmla_nx, a, FPST_ZA_F16,
+ gen_helper_gvec_fmla_idx_h)
+TRANS_FEAT(FMLS_nx_h, aa64_sme_f16f16, do_fmla_nx, a, FPST_ZA_F16,
+ s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_h : gen_helper_gvec_fmls_idx_h)
+TRANS_FEAT(FMLA_nx_s, aa64_sme2, do_fmla_nx, a, FPST_ZA,
+ gen_helper_gvec_fmla_idx_s)
+TRANS_FEAT(FMLS_nx_s, aa64_sme2, do_fmla_nx, a, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_s : gen_helper_gvec_fmls_idx_s)
+TRANS_FEAT(FMLA_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA,
+ gen_helper_gvec_fmla_idx_d)
+TRANS_FEAT(FMLS_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_d : gen_helper_gvec_fmls_idx_d)
+
+TRANS_FEAT(BFMLA_nx, aa64_sme_b16b16, do_fmla_nx, a, FPST_ZA,
+ gen_helper_gvec_bfmla_idx)
+TRANS_FEAT(BFMLS_nx, aa64_sme_b16b16, do_fmla_nx, a, FPST_ZA,
+ s->fpcr_ah ? gen_helper_gvec_ah_bfmls_idx : gen_helper_gvec_bfmls_idx)
+
+static bool do_faddsub(DisasContext *s, arg_az_n *a, ARMFPStatusFlavour fpst,
+ gen_helper_gvec_3_ptr *fn)
+{
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int n = a->n;
+ int zm = a->zm;
+ int vstride = svl / n;
+ TCGv_ptr t_za = get_zarray(s, a->rv, a->off, n, 0);
+ TCGv_ptr ptr = fpstatus_ptr(fpst);
+ TCGv_ptr t = tcg_temp_new_ptr();
+
+ for (int r = 0; r < n; ++r) {
+ TCGv_ptr t_zm = vec_full_reg_ptr(s, zm + r);
+ int o_za = r * vstride * sizeof(ARMVectorReg);
+ int desc = simd_desc(svl, svl, 0);
+
+ tcg_gen_addi_ptr(t, t_za, o_za);
+ fn(t, t, t_zm, ptr, tcg_constant_i32(desc));
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT(FADD_nn_h, aa64_sme_f16f16, do_faddsub, a,
+ FPST_ZA_F16, gen_helper_gvec_fadd_h)
+TRANS_FEAT(FSUB_nn_h, aa64_sme_f16f16, do_faddsub, a,
+ FPST_ZA_F16, gen_helper_gvec_fsub_h)
+
+TRANS_FEAT(FADD_nn_s, aa64_sme2, do_faddsub, a,
+ FPST_ZA, gen_helper_gvec_fadd_s)
+TRANS_FEAT(FSUB_nn_s, aa64_sme2, do_faddsub, a,
+ FPST_ZA, gen_helper_gvec_fsub_s)
+
+TRANS_FEAT(FADD_nn_d, aa64_sme2_f64f64, do_faddsub, a,
+ FPST_ZA, gen_helper_gvec_fadd_d)
+TRANS_FEAT(FSUB_nn_d, aa64_sme2_f64f64, do_faddsub, a,
+ FPST_ZA, gen_helper_gvec_fsub_d)
+
+TRANS_FEAT(BFADD_nn, aa64_sme_b16b16, do_faddsub, a,
+ FPST_ZA, gen_helper_gvec_bfadd)
+TRANS_FEAT(BFSUB_nn, aa64_sme_b16b16, do_faddsub, a,
+ FPST_ZA, gen_helper_gvec_bfsub)
+
+/*
+ * Expand array multi-vector single (n1), array multi-vector (nn),
+ * and array multi-vector indexed (nx), for integer accumulate.
+ * multi: true for nn, false for n1.
+ * data: value passed via simd_data, including any index.
+ */
+static bool do_azz_acc(DisasContext *s, int nreg, int nsel,
+ int rv, int off, int zn, int zm,
+ int data, int shsel, bool multi,
+ gen_helper_gvec_4 *fn)
+{
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ int vstride = svl / nreg;
+ TCGv_ptr t_za = get_zarray(s, rv, off, nreg, nsel);
+ TCGv_ptr t = tcg_temp_new_ptr();
+
+ for (int r = 0; r < nreg; ++r) {
+ TCGv_ptr t_zn = vec_full_reg_ptr(s, zn);
+ TCGv_ptr t_zm = vec_full_reg_ptr(s, zm);
+
+ for (int i = 0; i < nsel; ++i) {
+ int o_za = (r * vstride + i) * sizeof(ARMVectorReg);
+ int desc = simd_desc(svl, svl, data | (i << shsel));
+
+ tcg_gen_addi_ptr(t, t_za, o_za);
+ fn(t, t_zn, t_zm, t, tcg_constant_i32(desc));
+ }
+
+ /*
+ * For multiple-and-single vectors, Zn may wrap.
+ * For multiple vectors, both Zn and Zm are aligned.
+ */
+ zn = (zn + 1) % 32;
+ zm += multi;
+ }
+ }
+ return true;
+}
+
+static bool do_dot(DisasContext *s, arg_azz_n *a, bool multi,
+ gen_helper_gvec_4 *fn)
+{
+ return do_azz_acc(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ 0, 0, multi, fn);
+}
+
+static void gen_helper_gvec_sudot_4b(TCGv_ptr d, TCGv_ptr n, TCGv_ptr m,
+ TCGv_ptr a, TCGv_i32 desc)
+{
+ gen_helper_gvec_usdot_4b(d, m, n, a, desc);
+}
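The wrapper above implements SUDOT by invoking the USDOT helper with the multiplicand pointers swapped: the dot product is symmetric, so exchanging the operands merely flips which one is treated as signed. A scalar check of the identity:

    /* One 4-way lane: SUDOT(signed, unsigned) == USDOT(unsigned, signed). */
    #include <stdio.h>
    #include <stdint.h>

    static int32_t usdot4(const uint8_t *u, const int8_t *s, int32_t acc)
    {
        for (int i = 0; i < 4; i++) {
            acc += u[i] * s[i];
        }
        return acc;
    }

    int main(void)
    {
        int8_t  sn[4] = { -1, 2, -3, 4 };   /* signed operand */
        uint8_t um[4] = { 10, 20, 30, 40 }; /* unsigned operand */

        /* -1*10 + 2*20 - 3*30 + 4*40 == 100 */
        printf("%d\n", usdot4(um, sn, 0));
        return 0;
    }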
+
+TRANS_FEAT(USDOT_n1, aa64_sme2, do_dot, a, false, gen_helper_gvec_usdot_4b)
+TRANS_FEAT(SUDOT_n1, aa64_sme2, do_dot, a, false, gen_helper_gvec_sudot_4b)
+TRANS_FEAT(SDOT_n1_2h, aa64_sme2, do_dot, a, false, gen_helper_gvec_sdot_2h)
+TRANS_FEAT(UDOT_n1_2h, aa64_sme2, do_dot, a, false, gen_helper_gvec_udot_2h)
+TRANS_FEAT(SDOT_n1_4b, aa64_sme2, do_dot, a, false, gen_helper_gvec_sdot_4b)
+TRANS_FEAT(UDOT_n1_4b, aa64_sme2, do_dot, a, false, gen_helper_gvec_udot_4b)
+TRANS_FEAT(SDOT_n1_4h, aa64_sme2_i16i64, do_dot, a, false, gen_helper_gvec_sdot_4h)
+TRANS_FEAT(UDOT_n1_4h, aa64_sme2_i16i64, do_dot, a, false, gen_helper_gvec_udot_4h)
+
+TRANS_FEAT(USDOT_nn, aa64_sme2, do_dot, a, true, gen_helper_gvec_usdot_4b)
+TRANS_FEAT(SDOT_nn_2h, aa64_sme2, do_dot, a, true, gen_helper_gvec_sdot_2h)
+TRANS_FEAT(UDOT_nn_2h, aa64_sme2, do_dot, a, true, gen_helper_gvec_udot_2h)
+TRANS_FEAT(SDOT_nn_4b, aa64_sme2, do_dot, a, true, gen_helper_gvec_sdot_4b)
+TRANS_FEAT(UDOT_nn_4b, aa64_sme2, do_dot, a, true, gen_helper_gvec_udot_4b)
+TRANS_FEAT(SDOT_nn_4h, aa64_sme2_i16i64, do_dot, a, true, gen_helper_gvec_sdot_4h)
+TRANS_FEAT(UDOT_nn_4h, aa64_sme2_i16i64, do_dot, a, true, gen_helper_gvec_udot_4h)
+
+static bool do_dot_nx(DisasContext *s, arg_azx_n *a, gen_helper_gvec_4 *fn)
+{
+ return do_azz_acc(s, a->n, 1, a->rv, a->off, a->zn, a->zm,
+ a->idx, 0, false, fn);
+}
+
+TRANS_FEAT(USDOT_nx, aa64_sme2, do_dot_nx, a, gen_helper_gvec_usdot_idx_4b)
+TRANS_FEAT(SUDOT_nx, aa64_sme2, do_dot_nx, a, gen_helper_gvec_sudot_idx_4b)
+TRANS_FEAT(SDOT_nx_2h, aa64_sme2, do_dot_nx, a, gen_helper_gvec_sdot_idx_2h)
+TRANS_FEAT(UDOT_nx_2h, aa64_sme2, do_dot_nx, a, gen_helper_gvec_udot_idx_2h)
+TRANS_FEAT(SDOT_nx_4b, aa64_sme2, do_dot_nx, a, gen_helper_gvec_sdot_idx_4b)
+TRANS_FEAT(UDOT_nx_4b, aa64_sme2, do_dot_nx, a, gen_helper_gvec_udot_idx_4b)
+TRANS_FEAT(SDOT_nx_4h, aa64_sme2_i16i64, do_dot_nx, a, gen_helper_gvec_sdot_idx_4h)
+TRANS_FEAT(UDOT_nx_4h, aa64_sme2_i16i64, do_dot_nx, a, gen_helper_gvec_udot_idx_4h)
+
+static bool do_vdot_nx(DisasContext *s, arg_azx_n *a, gen_helper_gvec_3 *fn)
+{
+ if (sme_smza_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ fn(get_zarray(s, a->rv, a->off, a->n, 0),
+ vec_full_reg_ptr(s, a->zn),
+ vec_full_reg_ptr(s, a->zm),
+ tcg_constant_i32(simd_desc(svl, svl, a->idx)));
+ }
+ return true;
+}
+
+TRANS_FEAT(SVDOT_nx_2h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_svdot_idx_2h)
+TRANS_FEAT(SVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_svdot_idx_4b)
+TRANS_FEAT(SVDOT_nx_4h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_svdot_idx_4h)
+
+TRANS_FEAT(UVDOT_nx_2h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_2h)
+TRANS_FEAT(UVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_4b)
+TRANS_FEAT(UVDOT_nx_4h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_4h)
+
+TRANS_FEAT(SUVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_suvdot_idx_4b)
+TRANS_FEAT(USVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_usvdot_idx_4b)
+
+static bool do_smlal(DisasContext *s, arg_azz_n *a, bool multi,
+ gen_helper_gvec_4 *fn)
+{
+ return do_azz_acc(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+ 0, 0, multi, fn);
+}
+
+TRANS_FEAT(SMLAL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_smlal_zzzw_s)
+TRANS_FEAT(SMLSL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_smlsl_zzzw_s)
+TRANS_FEAT(UMLAL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_umlal_zzzw_s)
+TRANS_FEAT(UMLSL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_umlsl_zzzw_s)
+
+TRANS_FEAT(SMLAL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_smlal_zzzw_s)
+TRANS_FEAT(SMLSL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_smlsl_zzzw_s)
+TRANS_FEAT(UMLAL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_umlal_zzzw_s)
+TRANS_FEAT(UMLSL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_umlsl_zzzw_s)
+
+static bool do_smlal_nx(DisasContext *s, arg_azx_n *a,
+ gen_helper_gvec_4 *fn)
+{
+ return do_azz_acc(s, a->n, 2, a->rv, a->off, a->zn, a->zm,
+ a->idx << 1, 0, false, fn);
+}
+
+TRANS_FEAT(SMLAL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_smlal_idx_s)
+TRANS_FEAT(SMLSL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_smlsl_idx_s)
+TRANS_FEAT(UMLAL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_umlal_idx_s)
+TRANS_FEAT(UMLSL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_umlsl_idx_s)
+
+static bool do_smlall(DisasContext *s, arg_azz_n *a, bool multi,
+ gen_helper_gvec_4 *fn)
+{
+ return do_azz_acc(s, a->n, 4, a->rv, a->off, a->zn, a->zm,
+ 0, 0, multi, fn);
+}
+
+static void gen_helper_sme2_sumlall_s(TCGv_ptr d, TCGv_ptr n, TCGv_ptr m,
+ TCGv_ptr a, TCGv_i32 desc)
+{
+ gen_helper_sme2_usmlall_s(d, m, n, a, desc);
+}
+
+TRANS_FEAT(SMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_smlall_s)
+TRANS_FEAT(SMLSLL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_smlsll_s)
+TRANS_FEAT(UMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_umlall_s)
+TRANS_FEAT(UMLSLL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_umlsll_s)
+TRANS_FEAT(USMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_usmlall_s)
+TRANS_FEAT(SUMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_sumlall_s)
+
+TRANS_FEAT(SMLALL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_smlall_d)
+TRANS_FEAT(SMLSLL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_smlsll_d)
+TRANS_FEAT(UMLALL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_umlall_d)
+TRANS_FEAT(UMLSLL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_umlsll_d)
+
+TRANS_FEAT(SMLALL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_smlall_s)
+TRANS_FEAT(SMLSLL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_smlsll_s)
+TRANS_FEAT(UMLALL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_umlall_s)
+TRANS_FEAT(UMLSLL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_umlsll_s)
+TRANS_FEAT(USMLALL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_usmlall_s)
+
+TRANS_FEAT(SMLALL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_smlall_d)
+TRANS_FEAT(SMLSLL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_smlsll_d)
+TRANS_FEAT(UMLALL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_umlall_d)
+TRANS_FEAT(UMLSLL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_umlsll_d)
+
+static bool do_smlall_nx(DisasContext *s, arg_azx_n *a,
+ gen_helper_gvec_4 *fn)
+{
+ return do_azz_acc(s, a->n, 4, a->rv, a->off, a->zn, a->zm,
+ a->idx << 2, 0, false, fn);
+}
+
+TRANS_FEAT(SMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_smlall_idx_s)
+TRANS_FEAT(SMLSLL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_smlsll_idx_s)
+TRANS_FEAT(UMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_umlall_idx_s)
+TRANS_FEAT(UMLSLL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_umlsll_idx_s)
+TRANS_FEAT(USMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_usmlall_idx_s)
+TRANS_FEAT(SUMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_sumlall_idx_s)
+
+TRANS_FEAT(SMLALL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_smlall_idx_d)
+TRANS_FEAT(SMLSLL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_smlsll_idx_d)
+TRANS_FEAT(UMLALL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_umlall_idx_d)
+TRANS_FEAT(UMLSLL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_umlsll_idx_d)
+
+static bool do_zz_fpst(DisasContext *s, arg_zz_n *a, int data,
+ ARMFPStatusFlavour type, gen_helper_gvec_2_ptr *fn)
+{
+ if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ TCGv_ptr fpst = fpstatus_ptr(type);
+
+ for (int i = 0, n = a->n; i < n; ++i) {
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->zd + i),
+ vec_full_reg_offset(s, a->zn + i),
+ fpst, svl, svl, data, fn);
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT(BFCVT, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_sme2_bfcvt)
+TRANS_FEAT(BFCVTN, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_sme2_bfcvtn)
+TRANS_FEAT(FCVT_n, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_sme2_fcvt_n)
+TRANS_FEAT(FCVTN, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_sme2_fcvtn)
+
+TRANS_FEAT(FCVT_w, aa64_sme_f16f16, do_zz_fpst, a, 0,
+ FPST_A64_F16, gen_helper_sme2_fcvt_w)
+TRANS_FEAT(FCVTL, aa64_sme_f16f16, do_zz_fpst, a, 0,
+ FPST_A64_F16, gen_helper_sme2_fcvtl)
+
+TRANS_FEAT(FCVTZS, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_gvec_vcvt_rz_fs)
+TRANS_FEAT(FCVTZU, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_gvec_vcvt_rz_fu)
+
+TRANS_FEAT(SCVTF, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_sme2_scvtf)
+TRANS_FEAT(UCVTF, aa64_sme2, do_zz_fpst, a, 0,
+ FPST_A64, gen_helper_sme2_ucvtf)
+
+TRANS_FEAT(FRINTN, aa64_sme2, do_zz_fpst, a, float_round_nearest_even,
+ FPST_A64, gen_helper_gvec_vrint_rm_s)
+TRANS_FEAT(FRINTP, aa64_sme2, do_zz_fpst, a, float_round_up,
+ FPST_A64, gen_helper_gvec_vrint_rm_s)
+TRANS_FEAT(FRINTM, aa64_sme2, do_zz_fpst, a, float_round_down,
+ FPST_A64, gen_helper_gvec_vrint_rm_s)
+TRANS_FEAT(FRINTA, aa64_sme2, do_zz_fpst, a, float_round_ties_away,
+ FPST_A64, gen_helper_gvec_vrint_rm_s)
+
+static bool do_zz(DisasContext *s, arg_zz_n *a, int data,
+ gen_helper_gvec_2 *fn)
+{
+ if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+
+ for (int i = 0, n = a->n; i < n; ++i) {
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->zd + i),
+ vec_full_reg_offset(s, a->zn + i),
+ svl, svl, data, fn);
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT(SQCVT_sh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvt_sh)
+TRANS_FEAT(UQCVT_sh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvt_sh)
+TRANS_FEAT(SQCVTU_sh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtu_sh)
+
+TRANS_FEAT(SQCVT_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvt_sb)
+TRANS_FEAT(UQCVT_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvt_sb)
+TRANS_FEAT(SQCVTU_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtu_sb)
+
+TRANS_FEAT(SQCVT_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvt_dh)
+TRANS_FEAT(UQCVT_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvt_dh)
+TRANS_FEAT(SQCVTU_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtu_dh)
+
+TRANS_FEAT(SQCVTN_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtn_sb)
+TRANS_FEAT(UQCVTN_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvtn_sb)
+TRANS_FEAT(SQCVTUN_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtun_sb)
+
+TRANS_FEAT(SQCVTN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtn_dh)
+TRANS_FEAT(UQCVTN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvtn_dh)
+TRANS_FEAT(SQCVTUN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtun_dh)
+
+TRANS_FEAT(SUNPK_2bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_bh)
+TRANS_FEAT(SUNPK_2hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_hs)
+TRANS_FEAT(SUNPK_2sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_sd)
+
+TRANS_FEAT(SUNPK_4bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_bh)
+TRANS_FEAT(SUNPK_4hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_hs)
+TRANS_FEAT(SUNPK_4sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_sd)
+
+TRANS_FEAT(UUNPK_2bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_bh)
+TRANS_FEAT(UUNPK_2hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_hs)
+TRANS_FEAT(UUNPK_2sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_sd)
+
+TRANS_FEAT(UUNPK_4bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_bh)
+TRANS_FEAT(UUNPK_4hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_hs)
+TRANS_FEAT(UUNPK_4sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_sd)
+
+static bool do_zipuzp_4(DisasContext *s, arg_zz_e *a,
+ gen_helper_gvec_2 * const fn[5])
+{
+ int bytes_per_op = 4 << a->esz;
+
+ /* Both MO_64 and MO_128 can fail the size test. */
+ if (s->max_svl < bytes_per_op) {
+ unallocated_encoding(s);
+ } else if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ if (svl < bytes_per_op) {
+ unallocated_encoding(s);
+ } else {
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ svl, svl, 0, fn[a->esz]);
+ }
+ }
+ return true;
+}
+
+static gen_helper_gvec_2 * const zip4_fns[] = {
+ gen_helper_sme2_zip4_b,
+ gen_helper_sme2_zip4_h,
+ gen_helper_sme2_zip4_s,
+ gen_helper_sme2_zip4_d,
+ gen_helper_sme2_zip4_q,
+};
+TRANS_FEAT(ZIP_4, aa64_sme2, do_zipuzp_4, a, zip4_fns)
+
+static gen_helper_gvec_2 * const uzp4_fns[] = {
+ gen_helper_sme2_uzp4_b,
+ gen_helper_sme2_uzp4_h,
+ gen_helper_sme2_uzp4_s,
+ gen_helper_sme2_uzp4_d,
+ gen_helper_sme2_uzp4_q,
+};
+TRANS_FEAT(UZP_4, aa64_sme2, do_zipuzp_4, a, uzp4_fns)
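ZIP with four registers interleaves one element from each source in turn; UZP is the inverse de-interleave. A toy byte-level model of the zip ordering (the element order is an assumption, and the real helpers operate on quadruples of whole vector registers):

    /* 4-way interleave of bytes (conceptual only). */
    #include <stdio.h>

    int main(void)
    {
        const char *src[4] = { "aaaa", "bbbb", "cccc", "dddd" };
        char dst[17];

        for (int i = 0; i < 4; i++) {       /* element index */
            for (int r = 0; r < 4; r++) {   /* register index */
                dst[i * 4 + r] = src[r][i];
            }
        }
        dst[16] = 0;
        printf("%s\n", dst);                /* abcdabcdabcdabcd */
        return 0;
    }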
+
+static bool do_zz_rshr(DisasContext *s, arg_rshr *a, gen_helper_gvec_2 *fn)
+{
+ if (sve_access_check(s)) {
+ int vl = vec_full_reg_size(s);
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ vl, vl, a->shift, fn);
+ }
+ return true;
+}
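The SQRSHR/UQRSHR group below performs a round-half-up right shift by a->shift and then saturates into the narrower element type. A scalar model of the signed 32-to-16 case, with semantics assumed from the helper naming:

    /* SQRSHR 32 -> 16: round, shift, saturate (assumed semantics). */
    #include <stdio.h>
    #include <stdint.h>

    static int16_t sqrshr_sh(int32_t x, int sh)     /* 1 <= sh <= 16 */
    {
        int64_t t = ((int64_t)x + ((int64_t)1 << (sh - 1))) >> sh;
        if (t > INT16_MAX) {
            t = INT16_MAX;
        } else if (t < INT16_MIN) {
            t = INT16_MIN;
        }
        return (int16_t)t;
    }

    int main(void)
    {
        printf("%d\n", sqrshr_sh(0x7fffffff, 1));   /* saturates: 32767 */
        printf("%d\n", sqrshr_sh(100, 3));          /* (100 + 4) >> 3 == 13 */
        return 0;
    }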
+
+TRANS_FEAT(SQRSHR_sh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshr_sh)
+TRANS_FEAT(UQRSHR_sh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshr_sh)
+TRANS_FEAT(SQRSHRU_sh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshru_sh)
+
+TRANS_FEAT(SQRSHR_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshr_sb)
+TRANS_FEAT(SQRSHR_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshr_dh)
+TRANS_FEAT(UQRSHR_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshr_sb)
+TRANS_FEAT(UQRSHR_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshr_dh)
+TRANS_FEAT(SQRSHRU_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshru_sb)
+TRANS_FEAT(SQRSHRU_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshru_dh)
+
+TRANS_FEAT(SQRSHRN_sh, aa64_sme2_or_sve2p1, do_zz_rshr, a, gen_helper_sme2_sqrshrn_sh)
+TRANS_FEAT(UQRSHRN_sh, aa64_sme2_or_sve2p1, do_zz_rshr, a, gen_helper_sme2_uqrshrn_sh)
+TRANS_FEAT(SQRSHRUN_sh, aa64_sme2_or_sve2p1, do_zz_rshr, a, gen_helper_sme2_sqrshrun_sh)
+
+TRANS_FEAT(SQRSHRN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrn_sb)
+TRANS_FEAT(SQRSHRN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrn_dh)
+TRANS_FEAT(UQRSHRN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshrn_sb)
+TRANS_FEAT(UQRSHRN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshrn_dh)
+TRANS_FEAT(SQRSHRUN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrun_sb)
+TRANS_FEAT(SQRSHRUN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrun_dh)
+
+static bool do_zipuzp_2(DisasContext *s, arg_zzz_e *a,
+ gen_helper_gvec_3 * const fn[5])
+{
+ int bytes_per_op = 2 << a->esz;
+
+ /* MO_128 can fail the size test. */
+ if (s->max_svl < bytes_per_op) {
+ unallocated_encoding(s);
+ } else if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ if (svl < bytes_per_op) {
+ unallocated_encoding(s);
+ } else {
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ vec_full_reg_offset(s, a->zm),
+ svl, svl, 0, fn[a->esz]);
+ }
+ }
+ return true;
+}
+
+static gen_helper_gvec_3 * const zip2_fns[] = {
+ gen_helper_sme2_zip2_b,
+ gen_helper_sme2_zip2_h,
+ gen_helper_sme2_zip2_s,
+ gen_helper_sme2_zip2_d,
+ gen_helper_sme2_zip2_q,
+};
+TRANS_FEAT(ZIP_2, aa64_sme2, do_zipuzp_2, a, zip2_fns)
+
+static gen_helper_gvec_3 * const uzp2_fns[] = {
+ gen_helper_sme2_uzp2_b,
+ gen_helper_sme2_uzp2_h,
+ gen_helper_sme2_uzp2_s,
+ gen_helper_sme2_uzp2_d,
+ gen_helper_sme2_uzp2_q,
+};
+TRANS_FEAT(UZP_2, aa64_sme2, do_zipuzp_2, a, uzp2_fns)
+
+static bool trans_FCLAMP(DisasContext *s, arg_zzz_en *a)
+{
+ static gen_helper_gvec_3_ptr * const fn[] = {
+ gen_helper_sme2_bfclamp,
+ gen_helper_sme2_fclamp_h,
+ gen_helper_sme2_fclamp_s,
+ gen_helper_sme2_fclamp_d,
+ };
+ TCGv_ptr fpst;
+ int vl;
+
+ if (!dc_isar_feature(aa64_sme2, s)) {
+ return false;
+ }
+ /* This insn uses MO_8 to encode BFloat16. */
+ if (a->esz == MO_8 && !dc_isar_feature(aa64_sme_b16b16, s)) {
+ return false;
+ }
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(a->esz == MO_16 ? FPST_A64_F16 : FPST_A64);
+ vl = vec_full_reg_size(s);
+
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ vec_full_reg_offset(s, a->zm),
+ fpst, vl, vl, a->n, fn[a->esz]);
+ return true;
+}
+
+static bool do_clamp(DisasContext *s, arg_zzz_en *a,
+ gen_helper_gvec_3 * const fn[4])
+{
+ int vl;
+
+ if (!dc_isar_feature(aa64_sme2, s)) {
+ return false;
+ }
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+
+ /*
+ * Clamp is just a min+max, easily supported by most host
+ * vector operations -- we already have such an expansion in
+ * translate-sve.c for a single output.
+ * TODO: Add support in gvec for multiple simultaneous outputs,
+ * and/or copy to temporary upon overlap.
+ */
+ vl = vec_full_reg_size(s);
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ vec_full_reg_offset(s, a->zm),
+ vl, vl, a->n, fn[a->esz]);
+ return true;
+}
+
+static gen_helper_gvec_3 * const sclamp_fns[] = {
+ gen_helper_sme2_sclamp_b,
+ gen_helper_sme2_sclamp_h,
+ gen_helper_sme2_sclamp_s,
+ gen_helper_sme2_sclamp_d,
+};
+TRANS(SCLAMP, do_clamp, a, sclamp_fns)
+
+static gen_helper_gvec_3 * const uclamp_fns[] = {
+ gen_helper_sme2_uclamp_b,
+ gen_helper_sme2_uclamp_h,
+ gen_helper_sme2_uclamp_s,
+ gen_helper_sme2_uclamp_d,
+};
+TRANS(UCLAMP, do_clamp, a, uclamp_fns)
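As the comment in do_clamp says, clamp is only a min+max composition, which is why a host-vector expansion is feasible. The scalar identity for the signed case, as a toy reference:

    /* CLAMP(x, lo, hi) == min(max(x, lo), hi). */
    #include <stdio.h>

    static int sclamp(int x, int lo, int hi)
    {
        return x < lo ? lo : x > hi ? hi : x;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               sclamp(-9, -5, 5), sclamp(3, -5, 5), sclamp(9, -5, 5));
        /* prints: -5 3 5 */
        return 0;
    }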
+
+static bool trans_SEL(DisasContext *s, arg_SEL *a)
+{
+ typedef void sme_sel_fn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32);
+ static sme_sel_fn * const fns[4] = {
+ gen_helper_sme2_sel_b, gen_helper_sme2_sel_h,
+ gen_helper_sme2_sel_s, gen_helper_sme2_sel_d
+ };
+
+ if (!dc_isar_feature(aa64_sme2, s)) {
+ return false;
+ }
+ if (sme_sm_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ uint32_t desc = simd_desc(svl, svl, a->n);
+ TCGv_ptr t_d = tcg_temp_new_ptr();
+ TCGv_ptr t_n = tcg_temp_new_ptr();
+ TCGv_ptr t_m = tcg_temp_new_ptr();
+ TCGv_i32 png = tcg_temp_new_i32();
+
+ tcg_gen_addi_ptr(t_d, tcg_env, vec_full_reg_offset(s, a->zd));
+ tcg_gen_addi_ptr(t_n, tcg_env, vec_full_reg_offset(s, a->zn));
+ tcg_gen_addi_ptr(t_m, tcg_env, vec_full_reg_offset(s, a->zm));
+
+ tcg_gen_ld16u_i32(png, tcg_env, pred_full_reg_offset(s, a->pg)
+ ^ (HOST_BIG_ENDIAN ? 6 : 0));
+
+ fns[a->esz](t_d, t_n, t_m, png, tcg_constant_i32(desc));
+ }
+ return true;
+}
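The ld16u above XORs the predicate offset with 6 on big-endian hosts: predicates are stored as host-endian 64-bit words, so the bytes holding the numerically low 16 bits sit at byte offset 6 rather than 0 when the word is big-endian. A standalone sketch of that addressing:

    /* Locating the low 16 bits of a 64-bit field by byte address. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint64_t png = 0x1234;          /* counter in the low 16 bits */
        unsigned char bytes[8];
        uint16_t lo16;
        int big_endian;

        memcpy(bytes, &png, 8);
        big_endian = (bytes[0] != 0x34);    /* crude host test */

        memcpy(&lo16, bytes + (big_endian ? 6 : 0), 2);
        printf("0x%x\n", lo16);             /* 0x1234 on either host */
        return 0;
    }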
+
+static bool do_lut(DisasContext *s, arg_lut *a,
+ gen_helper_gvec_2_ptr *fn, bool strided)
+{
+ if (sme_sm_enabled_check(s) && sme2_zt0_enabled_check(s)) {
+ int svl = streaming_vec_reg_size(s);
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->zd),
+ vec_full_reg_offset(s, a->zn),
+ tcg_env, svl, svl, strided | (a->idx << 1), fn);
+ }
+ return true;
+}
+
+TRANS_FEAT(LUTI2_c_1b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1b, false)
+TRANS_FEAT(LUTI2_c_1h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1h, false)
+TRANS_FEAT(LUTI2_c_1s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1s, false)
+
+TRANS_FEAT(LUTI2_c_2b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2b, false)
+TRANS_FEAT(LUTI2_c_2h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2h, false)
+TRANS_FEAT(LUTI2_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2s, false)
+
+TRANS_FEAT(LUTI2_c_4b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4b, false)
+TRANS_FEAT(LUTI2_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4h, false)
+TRANS_FEAT(LUTI2_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4s, false)
+
+TRANS_FEAT(LUTI4_c_1b, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1b, false)
+TRANS_FEAT(LUTI4_c_1h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1h, false)
+TRANS_FEAT(LUTI4_c_1s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1s, false)
+
+TRANS_FEAT(LUTI4_c_2b, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2b, false)
+TRANS_FEAT(LUTI4_c_2h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2h, false)
+TRANS_FEAT(LUTI4_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2s, false)
+
+TRANS_FEAT(LUTI4_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4h, false)
+TRANS_FEAT(LUTI4_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4s, false)
+
+static bool do_lut_s4(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn)
+{
+ return !(a->zd & 0b01100) && do_lut(s, a, fn, true);
+}
+
+static bool do_lut_s8(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn)
+{
+ return !(a->zd & 0b01000) && do_lut(s, a, fn, true);
+}
+
+TRANS_FEAT(LUTI2_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti2_2b)
+TRANS_FEAT(LUTI2_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti2_2h)
+
+TRANS_FEAT(LUTI2_s_4b, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti2_4b)
+TRANS_FEAT(LUTI2_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti2_4h)
+
+TRANS_FEAT(LUTI4_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2b)
+TRANS_FEAT(LUTI4_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2h)
+
+TRANS_FEAT(LUTI4_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti4_4h)
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index f3cf028..7b57573 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -89,7 +89,7 @@ static inline int expand_imm_sh8u(DisasContext *s, int x)
*/
static inline int msz_dtype(DisasContext *s, int msz)
{
- static const uint8_t dtype[4] = { 0, 5, 10, 15 };
+ static const uint8_t dtype[5] = { 0, 5, 10, 15, 18 };
return dtype[msz];
}
@@ -778,6 +778,9 @@ DO_ZPZ(NOT_zpz, aa64_sve, sve_not_zpz)
DO_ZPZ(ABS, aa64_sve, sve_abs)
DO_ZPZ(NEG, aa64_sve, sve_neg)
DO_ZPZ(RBIT, aa64_sve, sve_rbit)
+DO_ZPZ(ORQV, aa64_sme2p1_or_sve2p1, sve2p1_orqv)
+DO_ZPZ(EORQV, aa64_sme2p1_or_sve2p1, sve2p1_eorqv)
+DO_ZPZ(ANDQV, aa64_sme2p1_or_sve2p1, sve2p1_andqv)
static gen_helper_gvec_3 * const fabs_fns[4] = {
NULL, gen_helper_sve_fabs_h,
@@ -828,6 +831,41 @@ TRANS_FEAT(SXTW, aa64_sve, gen_gvec_ool_arg_zpz,
TRANS_FEAT(UXTW, aa64_sve, gen_gvec_ool_arg_zpz,
a->esz == 3 ? gen_helper_sve_uxtw_d : NULL, a, 0)
+static gen_helper_gvec_3 * const addqv_fns[4] = {
+ gen_helper_sve2p1_addqv_b, gen_helper_sve2p1_addqv_h,
+ gen_helper_sve2p1_addqv_s, gen_helper_sve2p1_addqv_d,
+};
+TRANS_FEAT(ADDQV, aa64_sme2p1_or_sve2p1,
+ gen_gvec_ool_arg_zpz, addqv_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const smaxqv_fns[4] = {
+ gen_helper_sve2p1_smaxqv_b, gen_helper_sve2p1_smaxqv_h,
+ gen_helper_sve2p1_smaxqv_s, gen_helper_sve2p1_smaxqv_d,
+};
+TRANS_FEAT(SMAXQV, aa64_sme2p1_or_sve2p1,
+ gen_gvec_ool_arg_zpz, smaxqv_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const sminqv_fns[4] = {
+ gen_helper_sve2p1_sminqv_b, gen_helper_sve2p1_sminqv_h,
+ gen_helper_sve2p1_sminqv_s, gen_helper_sve2p1_sminqv_d,
+};
+TRANS_FEAT(SMINQV, aa64_sme2p1_or_sve2p1,
+ gen_gvec_ool_arg_zpz, sminqv_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const umaxqv_fns[4] = {
+ gen_helper_sve2p1_umaxqv_b, gen_helper_sve2p1_umaxqv_h,
+ gen_helper_sve2p1_umaxqv_s, gen_helper_sve2p1_umaxqv_d,
+};
+TRANS_FEAT(UMAXQV, aa64_sme2p1_or_sve2p1,
+ gen_gvec_ool_arg_zpz, umaxqv_fns[a->esz], a, 0)
+
+static gen_helper_gvec_3 * const uminqv_fns[4] = {
+ gen_helper_sve2p1_uminqv_b, gen_helper_sve2p1_uminqv_h,
+ gen_helper_sve2p1_uminqv_s, gen_helper_sve2p1_uminqv_d,
+};
+TRANS_FEAT(UMINQV, aa64_sme2p1_or_sve2p1,
+ gen_gvec_ool_arg_zpz, uminqv_fns[a->esz], a, 0)
+
/*
*** SVE Integer Reduction Group
*/
@@ -1679,6 +1717,22 @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag)
TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s)
+static bool trans_PTRUE_cnt(DisasContext *s, arg_PTRUE_cnt *a)
+{
+ if (!dc_isar_feature(aa64_sme2_or_sve2p1, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ /* Canonical TRUE is 0 count, invert bit, plus element size. */
+ int val = (1 << 15) | (1 << a->esz);
+
+ /* Write val to the first uint64_t; clear all of the rest. */
+ tcg_gen_gvec_dup_imm(MO_64, pred_full_reg_offset(s, a->rd),
+ 8, size_for_gvec(pred_full_reg_size(s)), val);
+ }
+ return true;
+}
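For reference, the canonical all-true values this writes, computed exactly as above; the reading of the fields (bit 15 as the invert flag, 1 << esz as the element-size tag, a count of zero) follows the comment in the function:

    /* Values written into the first uint64_t of Pd by PTRUE (counter). */
    #include <stdio.h>

    int main(void)
    {
        for (int esz = 0; esz < 4; esz++) {
            int val = (1 << 15) | (1 << esz);
            printf("esz=%d -> 0x%04x\n", esz, val);   /* 0x8001 .. 0x8008 */
        }
        return 0;
    }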
+
/* Note pat == 31 is #all, to set all elements. */
TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve,
do_predset, 0, FFR_PRED_NUM, 31, false)
@@ -2148,6 +2202,55 @@ static bool do_EXT(DisasContext *s, int rd, int rn, int rm, int imm)
TRANS_FEAT(EXT, aa64_sve, do_EXT, a->rd, a->rn, a->rm, a->imm)
TRANS_FEAT(EXT_sve2, aa64_sve2, do_EXT, a->rd, a->rn, (a->rn + 1) % 32, a->imm)
+static bool trans_EXTQ(DisasContext *s, arg_EXTQ *a)
+{
+ unsigned vl, dofs, sofs0, sofs1, sofs2, imm;
+
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ imm = a->imm;
+ if (imm == 0) {
+ /* So far we never optimize Zdn with MOVPRFX, so zd = zn is a nop. */
+ return true;
+ }
+
+ vl = vec_full_reg_size(s);
+ dofs = vec_full_reg_offset(s, a->rd);
+ sofs2 = vec_full_reg_offset(s, a->rn);
+
+ if (imm & 8) {
+ sofs0 = dofs + 8;
+ sofs1 = sofs2;
+ sofs2 += 8;
+ } else {
+ sofs0 = dofs;
+ sofs1 = dofs + 8;
+ }
+ imm = (imm & 7) << 3;
+
+ for (unsigned i = 0; i < vl; i += 16) {
+ TCGv_i64 s0 = tcg_temp_new_i64();
+ TCGv_i64 s1 = tcg_temp_new_i64();
+ TCGv_i64 s2 = tcg_temp_new_i64();
+
+ tcg_gen_ld_i64(s0, tcg_env, sofs0 + i);
+ tcg_gen_ld_i64(s1, tcg_env, sofs1 + i);
+ tcg_gen_ld_i64(s2, tcg_env, sofs2 + i);
+
+ tcg_gen_extract2_i64(s0, s0, s1, imm);
+ tcg_gen_extract2_i64(s1, s1, s2, imm);
+
+ tcg_gen_st_i64(s0, tcg_env, dofs + i);
+ tcg_gen_st_i64(s1, tcg_env, dofs + i + 8);
+ }
+ return true;
+}
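Each 128-bit lane is funnel-shifted with a pair of extract2 operations: tcg_gen_extract2_i64(d, lo, hi, ofs) yields bits [ofs, ofs+63] of the concatenation hi:lo. A scalar equivalent, covering the byte offsets this code can produce (imm == 0 has already returned, but imm == 8 still reaches here with a bit offset of 0):

    /* Scalar extract2: bits [ofs, ofs+63] of hi:lo. */
    #include <stdio.h>
    #include <stdint.h>

    static uint64_t extract2(uint64_t lo, uint64_t hi, unsigned ofs)
    {
        /* ofs == 0 is a plain copy; avoids the undefined hi << 64. */
        return ofs ? (lo >> ofs) | (hi << (64 - ofs)) : lo;
    }

    int main(void)
    {
        /* EXTQ imm = 3 -> bit offset 24 within each 16-byte lane. */
        uint64_t s0 = 0x1111111111111111ull;
        uint64_t s1 = 0x2222222222222222ull;

        printf("%016llx\n",
               (unsigned long long)extract2(s0, s1, 3 << 3));
        /* prints: 2222221111111111 */
        return 0;
    }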
+
/*
*** SVE Permute - Unpredicated Group
*/
@@ -2195,6 +2298,27 @@ static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a)
return true;
}
+static bool trans_DUPQ(DisasContext *s, arg_DUPQ *a)
+{
+ unsigned vl, dofs, nofs;
+
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vl = vec_full_reg_size(s);
+ dofs = vec_full_reg_offset(s, a->rd);
+ nofs = vec_reg_offset(s, a->rn, a->imm, a->esz);
+
+ for (unsigned i = 0; i < vl; i += 16) {
+ tcg_gen_gvec_dup_mem(a->esz, dofs + i, nofs + i, 16, 16);
+ }
+ return true;
+}
+
static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val)
{
typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
@@ -2256,12 +2380,124 @@ static gen_helper_gvec_4 * const sve2_tbl_fns[4] = {
TRANS_FEAT(TBL_sve2, aa64_sve2, gen_gvec_ool_zzzz, sve2_tbl_fns[a->esz],
a->rd, a->rn, (a->rn + 1) % 32, a->rm, 0)
+static gen_helper_gvec_3 * const tblq_fns[4] = {
+ gen_helper_sve2p1_tblq_b, gen_helper_sve2p1_tblq_h,
+ gen_helper_sve2p1_tblq_s, gen_helper_sve2p1_tblq_d
+};
+TRANS_FEAT(TBLQ, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz,
+ tblq_fns[a->esz], a, 0)
+
static gen_helper_gvec_3 * const tbx_fns[4] = {
gen_helper_sve2_tbx_b, gen_helper_sve2_tbx_h,
gen_helper_sve2_tbx_s, gen_helper_sve2_tbx_d
};
TRANS_FEAT(TBX, aa64_sve2, gen_gvec_ool_arg_zzz, tbx_fns[a->esz], a, 0)
+static gen_helper_gvec_3 * const tbxq_fns[4] = {
+ gen_helper_sve2p1_tbxq_b, gen_helper_sve2p1_tbxq_h,
+ gen_helper_sve2p1_tbxq_s, gen_helper_sve2p1_tbxq_d
+};
+TRANS_FEAT(TBXQ, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz,
+ tbxq_fns[a->esz], a, 0)
+
+static bool trans_PMOV_pv(DisasContext *s, arg_PMOV_pv *a)
+{
+ static gen_helper_gvec_2 * const fns[4] = {
+ NULL, gen_helper_pmov_pv_h,
+ gen_helper_pmov_pv_s, gen_helper_pmov_pv_d
+ };
+ unsigned vl, pl, vofs, pofs;
+ TCGv_i64 tmp;
+
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vl = vec_full_reg_size(s);
+ if (a->esz != MO_8) {
+ tcg_gen_gvec_2_ool(pred_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vl, vl, a->imm, fns[a->esz]);
+ return true;
+ }
+
+ /*
+ * Copy the low PL bytes from vector Zn, zero-extending to a
+ * multiple of 8 bytes, so that Pd is properly cleared.
+ */
+
+ pl = vl / 8;
+ pofs = pred_full_reg_offset(s, a->rd);
+ vofs = vec_full_reg_offset(s, a->rn);
+
+ QEMU_BUILD_BUG_ON(sizeof(ARMPredicateReg) != 32);
+ for (unsigned i = 32; i >= 8; i >>= 1) {
+ if (pl & i) {
+ tcg_gen_gvec_mov(MO_64, pofs, vofs, i, i);
+ pofs += i;
+ vofs += i;
+ }
+ }
+ switch (pl & 7) {
+ case 0:
+ return true;
+ case 2:
+ tmp = tcg_temp_new_i64();
+ tcg_gen_ld16u_i64(tmp, tcg_env, vofs + (HOST_BIG_ENDIAN ? 6 : 0));
+ break;
+ case 4:
+ tmp = tcg_temp_new_i64();
+ tcg_gen_ld32u_i64(tmp, tcg_env, vofs + (HOST_BIG_ENDIAN ? 4 : 0));
+ break;
+ case 6:
+ tmp = tcg_temp_new_i64();
+ tcg_gen_ld_i64(tmp, tcg_env, vofs);
+ tcg_gen_extract_i64(tmp, tmp, 0, 48);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_gen_st_i64(tmp, tcg_env, pofs);
+ return true;
+}
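The MO_8 path copies PL bytes (PL = VL/8, always even) as power-of-two gvec moves followed by a 2-, 4- or 6-byte tail, which is why only those switch cases exist. A sketch of the decomposition for one illustrative PL:

    /* Power-of-two copy decomposition used by trans_PMOV_pv. */
    #include <stdio.h>

    int main(void)
    {
        int pl = 22, copied = 0;    /* VL = 176 bytes -> PL = 22 */

        for (int i = 32; i >= 8; i >>= 1) {
            if (pl & i) {
                printf("gvec move of %d bytes at offset %d\n", i, copied);
                copied += i;
            }
        }
        if (pl & 7) {
            printf("tail of %d bytes at offset %d\n", pl & 7, copied);
        }
        return 0;
    }

For PL = 22 this prints a 16-byte move followed by a 6-byte tail, the case handled with the 48-bit extract above.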
+
+static bool trans_PMOV_vp(DisasContext *s, arg_PMOV_pv *a)
+{
+ static gen_helper_gvec_2 * const fns[4] = {
+ NULL, gen_helper_pmov_vp_h,
+ gen_helper_pmov_vp_s, gen_helper_pmov_vp_d
+ };
+ unsigned vl;
+
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ vl = vec_full_reg_size(s);
+
+ if (a->esz == MO_8) {
+ /*
+ * The low PL bytes are copied from Pn to Zd unchanged.
+ * We know that the unused portion of Pn is zero, and
+ * that imm == 0, so the balance of Zd must be zeroed.
+ */
+ tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, a->rd),
+ pred_full_reg_offset(s, a->rn),
+ size_for_gvec(vl / 8), vl);
+ } else {
+ tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd),
+ pred_full_reg_offset(s, a->rn),
+ vl, vl, a->imm, fns[a->esz]);
+ }
+ return true;
+}
+
static bool trans_UNPK(DisasContext *s, arg_UNPK *a)
{
static gen_helper_gvec_2 * const fns[4][2] = {
@@ -2352,6 +2588,23 @@ TRANS_FEAT(PUNPKHI, aa64_sve, do_perm_pred2, a, 1, gen_helper_sve_punpk_p)
*** SVE Permute - Interleaving Group
*/
+static bool do_interleave_q(DisasContext *s, gen_helper_gvec_3 *fn,
+ arg_rrr_esz *a, int data)
+{
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ if (vsz < 32) {
+ unallocated_encoding(s);
+ } else {
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vsz, vsz, data, fn);
+ }
+ }
+ return true;
+}
+
static gen_helper_gvec_3 * const zip_fns[4] = {
gen_helper_sve_zip_b, gen_helper_sve_zip_h,
gen_helper_sve_zip_s, gen_helper_sve_zip_d,
@@ -2361,26 +2614,43 @@ TRANS_FEAT(ZIP1_z, aa64_sve, gen_gvec_ool_arg_zzz,
TRANS_FEAT(ZIP2_z, aa64_sve, gen_gvec_ool_arg_zzz,
zip_fns[a->esz], a, vec_full_reg_size(s) / 2)
-TRANS_FEAT(ZIP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
- gen_helper_sve2_zip_q, a, 0)
-TRANS_FEAT(ZIP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
- gen_helper_sve2_zip_q, a,
- QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2)
+TRANS_FEAT_NONSTREAMING(ZIP1_q, aa64_sve_f64mm, do_interleave_q,
+ gen_helper_sve2_zip_q, a, 0)
+TRANS_FEAT_NONSTREAMING(ZIP2_q, aa64_sve_f64mm, do_interleave_q,
+ gen_helper_sve2_zip_q, a,
+ QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2)
+
+static gen_helper_gvec_3 * const zipq_fns[4] = {
+ gen_helper_sve2p1_zipq_b, gen_helper_sve2p1_zipq_h,
+ gen_helper_sve2p1_zipq_s, gen_helper_sve2p1_zipq_d,
+};
+TRANS_FEAT(ZIPQ1, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz,
+ zipq_fns[a->esz], a, 0)
+TRANS_FEAT(ZIPQ2, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz,
+ zipq_fns[a->esz], a, 16 / 2)
static gen_helper_gvec_3 * const uzp_fns[4] = {
gen_helper_sve_uzp_b, gen_helper_sve_uzp_h,
gen_helper_sve_uzp_s, gen_helper_sve_uzp_d,
};
-
TRANS_FEAT(UZP1_z, aa64_sve, gen_gvec_ool_arg_zzz,
uzp_fns[a->esz], a, 0)
TRANS_FEAT(UZP2_z, aa64_sve, gen_gvec_ool_arg_zzz,
uzp_fns[a->esz], a, 1 << a->esz)
-TRANS_FEAT(UZP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
- gen_helper_sve2_uzp_q, a, 0)
-TRANS_FEAT(UZP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
- gen_helper_sve2_uzp_q, a, 16)
+TRANS_FEAT_NONSTREAMING(UZP1_q, aa64_sve_f64mm, do_interleave_q,
+ gen_helper_sve2_uzp_q, a, 0)
+TRANS_FEAT_NONSTREAMING(UZP2_q, aa64_sve_f64mm, do_interleave_q,
+ gen_helper_sve2_uzp_q, a, 16)
+
+static gen_helper_gvec_3 * const uzpq_fns[4] = {
+ gen_helper_sve2p1_uzpq_b, gen_helper_sve2p1_uzpq_h,
+ gen_helper_sve2p1_uzpq_s, gen_helper_sve2p1_uzpq_d,
+};
+TRANS_FEAT(UZPQ1, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz,
+ uzpq_fns[a->esz], a, 0)
+TRANS_FEAT(UZPQ2, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz,
+ uzpq_fns[a->esz], a, 1 << a->esz)
static gen_helper_gvec_3 * const trn_fns[4] = {
gen_helper_sve_trn_b, gen_helper_sve_trn_h,
@@ -2392,10 +2662,10 @@ TRANS_FEAT(TRN1_z, aa64_sve, gen_gvec_ool_arg_zzz,
TRANS_FEAT(TRN2_z, aa64_sve, gen_gvec_ool_arg_zzz,
trn_fns[a->esz], a, 1 << a->esz)
-TRANS_FEAT(TRN1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
- gen_helper_sve2_trn_q, a, 0)
-TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz,
- gen_helper_sve2_trn_q, a, 16)
+TRANS_FEAT_NONSTREAMING(TRN1_q, aa64_sve_f64mm, do_interleave_q,
+ gen_helper_sve2_trn_q, a, 0)
+TRANS_FEAT_NONSTREAMING(TRN2_q, aa64_sve_f64mm, do_interleave_q,
+ gen_helper_sve2_trn_q, a, 16)
/*
*** SVE Permute Vector - Predicated Group
@@ -2981,6 +3251,36 @@ static bool trans_CNTP(DisasContext *s, arg_CNTP *a)
return true;
}
+static bool trans_CNTP_c(DisasContext *s, arg_CNTP_c *a)
+{
+ TCGv_i32 t_png;
+ uint32_t desc = 0;
+
+ if (dc_isar_feature(aa64_sve2p1, s)) {
+ if (!sve_access_check(s)) {
+ return true;
+ }
+ } else if (dc_isar_feature(aa64_sme2, s)) {
+ if (!sme_sm_enabled_check(s)) {
+ return true;
+ }
+ } else {
+ return false;
+ }
+
+ t_png = tcg_temp_new_i32();
+ tcg_gen_ld16u_i32(t_png, tcg_env,
+ pred_full_reg_offset(s, a->rn) ^
+ (HOST_BIG_ENDIAN ? 6 : 0));
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s));
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+ desc = FIELD_DP32(desc, PREDDESC, DATA, a->vl);
+
+ gen_helper_sve2p1_cntp_c(cpu_reg(s, a->rd), t_png, tcg_constant_i32(desc));
+ return true;
+}
+
static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a)
{
if (!dc_isar_feature(aa64_sve, s)) {
@@ -3091,7 +3391,9 @@ static bool trans_CTERM(DisasContext *s, arg_CTERM *a)
return true;
}
-static bool trans_WHILE(DisasContext *s, arg_WHILE *a)
+typedef void gen_while_fn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);
+static bool do_WHILE(DisasContext *s, arg_while *a,
+ bool lt, int scale, int data, gen_while_fn *fn)
{
TCGv_i64 op0, op1, t0, t1, tmax;
TCGv_i32 t2;
@@ -3101,14 +3403,8 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a)
TCGCond cond;
uint64_t maxval;
/* Note that GE/HS has a->eq == 0 and GT/HI has a->eq == 1. */
- bool eq = a->eq == a->lt;
+ bool eq = a->eq == lt;
- /* The greater-than conditions are all SVE2. */
- if (a->lt
- ? !dc_isar_feature(aa64_sve, s)
- : !dc_isar_feature(aa64_sve2, s)) {
- return false;
- }
if (!sve_access_check(s)) {
return true;
}
@@ -3132,7 +3428,7 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a)
t0 = tcg_temp_new_i64();
t1 = tcg_temp_new_i64();
- if (a->lt) {
+ if (lt) {
tcg_gen_sub_i64(t0, op1, op0);
if (a->u) {
maxval = a->sf ? UINT64_MAX : UINT32_MAX;
@@ -3152,7 +3448,7 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a)
}
}
- tmax = tcg_constant_i64(vsz >> a->esz);
+ tmax = tcg_constant_i64((vsz << scale) >> a->esz);
if (eq) {
/* Equality means one more iteration. */
tcg_gen_addi_i64(t0, t0, 1);
@@ -3181,24 +3477,38 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a)
t2 = tcg_temp_new_i32();
tcg_gen_extrl_i64_i32(t2, t0);
- /* Scale elements to bits. */
- tcg_gen_shli_i32(t2, t2, a->esz);
-
desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8);
desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+ desc = FIELD_DP32(desc, PREDDESC, DATA, data);
ptr = tcg_temp_new_ptr();
tcg_gen_addi_ptr(ptr, tcg_env, pred_full_reg_offset(s, a->rd));
- if (a->lt) {
- gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc));
- } else {
- gen_helper_sve_whileg(t2, ptr, t2, tcg_constant_i32(desc));
- }
+ fn(t2, ptr, t2, tcg_constant_i32(desc));
+
do_pred_flags(t2);
return true;
}
+TRANS_FEAT(WHILE_lt, aa64_sve, do_WHILE,
+ a, true, 0, 0, gen_helper_sve_whilel)
+TRANS_FEAT(WHILE_gt, aa64_sve2, do_WHILE,
+ a, false, 0, 0, gen_helper_sve_whileg)
+
+TRANS_FEAT(WHILE_lt_pair, aa64_sme2_or_sve2p1, do_WHILE,
+ a, true, 1, 0, gen_helper_sve_while2l)
+TRANS_FEAT(WHILE_gt_pair, aa64_sme2_or_sve2p1, do_WHILE,
+ a, false, 1, 0, gen_helper_sve_while2g)
+
+TRANS_FEAT(WHILE_lt_cnt2, aa64_sme2_or_sve2p1, do_WHILE,
+ a, true, 1, 1, gen_helper_sve_whilecl)
+TRANS_FEAT(WHILE_lt_cnt4, aa64_sme2_or_sve2p1, do_WHILE,
+ a, true, 2, 2, gen_helper_sve_whilecl)
+TRANS_FEAT(WHILE_gt_cnt2, aa64_sme2_or_sve2p1, do_WHILE,
+ a, false, 1, 1, gen_helper_sve_whilecg)
+TRANS_FEAT(WHILE_gt_cnt4, aa64_sme2_or_sve2p1, do_WHILE,
+ a, false, 2, 2, gen_helper_sve_whilecg)
+
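The (vsz << scale) >> a->esz budget above is what distinguishes the new variants; restated as plain arithmetic for VL = 128 bits (vsz = 16 bytes) and 16-bit elements:

/* Sketch: element budget for the WHILE variants. */
static unsigned while_tmax(unsigned vsz, unsigned scale, unsigned esz)
{
    return (vsz << scale) >> esz;
}
/* while_tmax(16, 0, 1) ->  8 : classic WHILE, one predicate
 * while_tmax(16, 1, 1) -> 16 : pair / x2 count forms
 * while_tmax(16, 2, 1) -> 32 : x4 count forms */
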
static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a)
{
TCGv_i64 op0, op1, diff, t1, tmax;
@@ -3217,7 +3527,7 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a)
op0 = read_cpu_reg(s, a->rn, 1);
op1 = read_cpu_reg(s, a->rm, 1);
- tmax = tcg_constant_i64(vsz);
+ tmax = tcg_constant_i64(vsz >> a->esz);
diff = tcg_temp_new_i64();
if (a->rw) {
@@ -3227,15 +3537,15 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a)
tcg_gen_sub_i64(diff, op0, op1);
tcg_gen_sub_i64(t1, op1, op0);
tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, diff, t1);
- /* Round down to a multiple of ESIZE. */
- tcg_gen_andi_i64(diff, diff, -1 << a->esz);
+ /* Divide, rounding down, by ESIZE. */
+ tcg_gen_shri_i64(diff, diff, a->esz);
/* If op1 == op0, diff == 0, and the condition is always true. */
tcg_gen_movcond_i64(TCG_COND_EQ, diff, op0, op1, tmax, diff);
} else {
/* WHILEWR */
tcg_gen_sub_i64(diff, op1, op0);
- /* Round down to a multiple of ESIZE. */
- tcg_gen_andi_i64(diff, diff, -1 << a->esz);
+ /* Divide, rounding down, by ESIZE. */
+ tcg_gen_shri_i64(diff, diff, a->esz);
/* If op0 >= op1, diff <= 0, the condition is always true. */
tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, tmax, diff);
}
@@ -3258,6 +3568,42 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a)
return true;
}
+static bool do_pext(DisasContext *s, arg_pext *a, int n)
+{
+ TCGv_i32 t_png;
+ TCGv_ptr t_pd;
+ int pl;
+
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ t_png = tcg_temp_new_i32();
+ tcg_gen_ld16u_i32(t_png, tcg_env,
+ pred_full_reg_offset(s, a->rn) ^
+ (HOST_BIG_ENDIAN ? 6 : 0));
+
+ t_pd = tcg_temp_new_ptr();
+ pl = pred_full_reg_size(s);
+
+ for (int i = 0; i < n; ++i) {
+ int rd = (a->rd + i) % 16;
+ int part = a->imm * n + i;
+ unsigned desc = 0;
+
+ desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pl);
+ desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz);
+ desc = FIELD_DP32(desc, PREDDESC, DATA, part);
+
+ tcg_gen_addi_ptr(t_pd, tcg_env, pred_full_reg_offset(s, rd));
+ gen_helper_pext(t_pd, t_png, tcg_constant_i32(desc));
+ }
+ return true;
+}
+
+TRANS_FEAT(PEXT_1, aa64_sme2_or_sve2p1, do_pext, a, 1)
+TRANS_FEAT(PEXT_2, aa64_sme2_or_sve2p1, do_pext, a, 2)
+
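do_pext above, trans_CNTP_c, and gen_ldst_c below all load the predicate-as-counter with ld16u plus an XOR of 6 on big-endian hosts; a short sketch of why that offset finds the same 16 bits:

#include <stdint.h>

/* Sketch: the counter occupies bits [15:0] of the first 64-bit word
 * of the predicate register in CPUARMState. */
union pred0 { uint64_t q; uint8_t b[8]; };
/* little-endian host: bits [15:0] are at byte offset 0     -> ofs ^ 0
 * big-endian host:    bits [15:0] are at byte offsets 6..7 -> ofs ^ 6 */
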
/*
*** SVE Integer Wide Immediate - Unpredicated Group
*/
@@ -3385,8 +3731,8 @@ DO_ZZI(UMIN, umin)
#undef DO_ZZI
static gen_helper_gvec_4 * const dot_fns[2][2] = {
- { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h },
- { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h }
+ { gen_helper_gvec_sdot_4b, gen_helper_gvec_sdot_4h },
+ { gen_helper_gvec_udot_4b, gen_helper_gvec_udot_4h }
};
TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz,
dot_fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0)
@@ -3395,19 +3741,24 @@ TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz,
* SVE Multiply - Indexed
*/
-TRANS_FEAT(SDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz,
- gen_helper_gvec_sdot_idx_b, a)
-TRANS_FEAT(SDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz,
- gen_helper_gvec_sdot_idx_h, a)
-TRANS_FEAT(UDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz,
- gen_helper_gvec_udot_idx_b, a)
-TRANS_FEAT(UDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz,
- gen_helper_gvec_udot_idx_h, a)
-
-TRANS_FEAT(SUDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz,
- gen_helper_gvec_sudot_idx_b, a)
-TRANS_FEAT(USDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz,
- gen_helper_gvec_usdot_idx_b, a)
+TRANS_FEAT(SDOT_zzxw_4s, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sdot_idx_4b, a)
+TRANS_FEAT(SDOT_zzxw_4d, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sdot_idx_4h, a)
+TRANS_FEAT(UDOT_zzxw_4s, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_udot_idx_4b, a)
+TRANS_FEAT(UDOT_zzxw_4d, aa64_sve, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_udot_idx_4h, a)
+
+TRANS_FEAT(SUDOT_zzxw_4s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sudot_idx_4b, a)
+TRANS_FEAT(USDOT_zzxw_4s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_usdot_idx_4b, a)
+
+TRANS_FEAT(SDOT_zzxw_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_sdot_idx_2h, a)
+TRANS_FEAT(UDOT_zzxw_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzxz,
+ gen_helper_gvec_udot_idx_2h, a)
#define DO_SVE2_RRX(NAME, FUNC) \
TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \
@@ -3621,6 +3972,54 @@ DO_VPZ_AH(FMAXV, fmaxv)
#undef DO_VPZ
+static gen_helper_gvec_3_ptr * const faddqv_fns[4] = {
+ NULL, gen_helper_sve2p1_faddqv_h,
+ gen_helper_sve2p1_faddqv_s, gen_helper_sve2p1_faddqv_d,
+};
+TRANS_FEAT(FADDQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz,
+ faddqv_fns[a->esz], a, 0,
+ a->esz == MO_16 ? FPST_A64_F16 : FPST_A64)
+
+static gen_helper_gvec_3_ptr * const fmaxnmqv_fns[4] = {
+ NULL, gen_helper_sve2p1_fmaxnmqv_h,
+ gen_helper_sve2p1_fmaxnmqv_s, gen_helper_sve2p1_fmaxnmqv_d,
+};
+TRANS_FEAT(FMAXNMQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz,
+ fmaxnmqv_fns[a->esz], a, 0,
+ a->esz == MO_16 ? FPST_A64_F16 : FPST_A64)
+
+static gen_helper_gvec_3_ptr * const fminnmqv_fns[4] = {
+ NULL, gen_helper_sve2p1_fminnmqv_h,
+ gen_helper_sve2p1_fminnmqv_s, gen_helper_sve2p1_fminnmqv_d,
+};
+TRANS_FEAT(FMINNMQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz,
+ fminnmqv_fns[a->esz], a, 0,
+ a->esz == MO_16 ? FPST_A64_F16 : FPST_A64)
+
+static gen_helper_gvec_3_ptr * const fmaxqv_fns[4] = {
+ NULL, gen_helper_sve2p1_fmaxqv_h,
+ gen_helper_sve2p1_fmaxqv_s, gen_helper_sve2p1_fmaxqv_d,
+};
+static gen_helper_gvec_3_ptr * const fmaxqv_ah_fns[4] = {
+ NULL, gen_helper_sve2p1_ah_fmaxqv_h,
+ gen_helper_sve2p1_ah_fmaxqv_s, gen_helper_sve2p1_ah_fmaxqv_d,
+};
+TRANS_FEAT(FMAXQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz,
+           (s->fpcr_ah ? fmaxqv_ah_fns : fmaxqv_fns)[a->esz], a, 0,
+ a->esz == MO_16 ? FPST_A64_F16 : FPST_A64)
+
+static gen_helper_gvec_3_ptr * const fminqv_fns[4] = {
+ NULL, gen_helper_sve2p1_fminqv_h,
+ gen_helper_sve2p1_fminqv_s, gen_helper_sve2p1_fminqv_d,
+};
+static gen_helper_gvec_3_ptr * const fminqv_ah_fns[4] = {
+ NULL, gen_helper_sve2p1_ah_fminqv_h,
+ gen_helper_sve2p1_ah_fminqv_s, gen_helper_sve2p1_ah_fminqv_d,
+};
+TRANS_FEAT(FMINQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz,
+           (s->fpcr_ah ? fminqv_ah_fns : fminqv_fns)[a->esz], a, 0,
+ a->esz == MO_16 ? FPST_A64_F16 : FPST_A64)
+
/*
*** SVE Floating Point Unary Operations - Unpredicated Group
*/
@@ -4143,7 +4542,7 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
*/
void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
- int len, int rn, int imm)
+ int len, int rn, int imm, MemOp align)
{
int len_align = QEMU_ALIGN_DOWN(len, 16);
int len_remain = len % 16;
@@ -4172,12 +4571,15 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
for (i = 0; i < len_align; i += 16) {
tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
- MO_LE | MO_128 | MO_ATOM_NONE);
+ MO_LE | MO_128 | MO_ATOM_NONE | align);
tcg_gen_extr_i128_i64(t0, t1, t16);
tcg_gen_st_i64(t0, base, vofs + i);
tcg_gen_st_i64(t1, base, vofs + i + 8);
tcg_gen_addi_i64(clean_addr, clean_addr, 16);
}
+ if (len_align) {
+ align = MO_UNALN;
+ }
} else {
TCGLabel *loop = gen_new_label();
TCGv_ptr tp, i = tcg_temp_new_ptr();
@@ -4187,7 +4589,7 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
t16 = tcg_temp_new_i128();
tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
- MO_LE | MO_128 | MO_ATOM_NONE);
+ MO_LE | MO_128 | MO_ATOM_NONE | align);
tcg_gen_addi_i64(clean_addr, clean_addr, 16);
tp = tcg_temp_new_ptr();
@@ -4202,6 +4604,7 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
tcg_gen_st_i64(t1, tp, vofs + 8);
tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
+ align = MO_UNALN;
}
/*
@@ -4210,7 +4613,9 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
*/
if (len_remain >= 8) {
t0 = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
+ MO_LEUQ | MO_ATOM_NONE | align);
+ align = MO_UNALN;
tcg_gen_st_i64(t0, base, vofs + len_align);
len_remain -= 8;
len_align += 8;
@@ -4225,12 +4630,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
case 4:
case 8:
tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
- MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
+ MO_LE | ctz32(len_remain)
+ | MO_ATOM_NONE | align);
break;
case 6:
t1 = tcg_temp_new_i64();
- tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
+ tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
+ MO_LEUL | MO_ATOM_NONE | align);
tcg_gen_addi_i64(clean_addr, clean_addr, 4);
tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
@@ -4245,7 +4652,7 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
/* Similarly for stores. */
void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
- int len, int rn, int imm)
+ int len, int rn, int imm, MemOp align)
{
int len_align = QEMU_ALIGN_DOWN(len, 16);
int len_remain = len % 16;
@@ -4277,9 +4684,12 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
tcg_gen_ld_i64(t1, base, vofs + i + 8);
tcg_gen_concat_i64_i128(t16, t0, t1);
tcg_gen_qemu_st_i128(t16, clean_addr, midx,
- MO_LE | MO_128 | MO_ATOM_NONE);
+ MO_LE | MO_128 | MO_ATOM_NONE | align);
tcg_gen_addi_i64(clean_addr, clean_addr, 16);
}
+ if (len_align) {
+ align = MO_UNALN;
+ }
} else {
TCGLabel *loop = gen_new_label();
TCGv_ptr tp, i = tcg_temp_new_ptr();
@@ -4303,13 +4713,16 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
tcg_gen_addi_i64(clean_addr, clean_addr, 16);
tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
+ align = MO_UNALN;
}
/* Predicate register stores can be any multiple of 2. */
if (len_remain >= 8) {
t0 = tcg_temp_new_i64();
tcg_gen_ld_i64(t0, base, vofs + len_align);
- tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx,
+ MO_LEUQ | MO_ATOM_NONE | align);
+ align = MO_UNALN;
len_remain -= 8;
len_align += 8;
if (len_remain) {
@@ -4325,11 +4738,13 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
case 4:
case 8:
tcg_gen_qemu_st_i64(t0, clean_addr, midx,
- MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
+ MO_LE | ctz32(len_remain)
+ | MO_ATOM_NONE | align);
break;
case 6:
- tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx,
+ MO_LEUL | MO_ATOM_NONE | align);
tcg_gen_addi_i64(clean_addr, clean_addr, 4);
tcg_gen_shri_i64(t0, t0, 32);
tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
@@ -4349,7 +4764,8 @@ static bool trans_LDR_zri(DisasContext *s, arg_rri *a)
if (sve_access_check(s)) {
int size = vec_full_reg_size(s);
int off = vec_full_reg_offset(s, a->rd);
- gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size);
+ gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size,
+ s->align_mem ? MO_ALIGN_16 : MO_UNALN);
}
return true;
}
@@ -4362,7 +4778,8 @@ static bool trans_LDR_pri(DisasContext *s, arg_rri *a)
if (sve_access_check(s)) {
int size = pred_full_reg_size(s);
int off = pred_full_reg_offset(s, a->rd);
- gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size);
+ gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size,
+ s->align_mem ? MO_ALIGN_2 : MO_UNALN);
}
return true;
}
@@ -4375,7 +4792,8 @@ static bool trans_STR_zri(DisasContext *s, arg_rri *a)
if (sve_access_check(s)) {
int size = vec_full_reg_size(s);
int off = vec_full_reg_offset(s, a->rd);
- gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size);
+ gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size,
+ s->align_mem ? MO_ALIGN_16 : MO_UNALN);
}
return true;
}
@@ -4388,7 +4806,8 @@ static bool trans_STR_pri(DisasContext *s, arg_rri *a)
if (sve_access_check(s)) {
int size = pred_full_reg_size(s);
int off = pred_full_reg_offset(s, a->rd);
- gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size);
+ gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size,
+ s->align_mem ? MO_ALIGN_2 : MO_UNALN);
}
return true;
}
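
The align argument threaded through gen_sve_ldr/gen_sve_str above implements a check-once policy: only the first access of the register verifies alignment, after which align is reset to MO_UNALN, since the remaining accesses step through the register sequentially. A minimal sketch of the pattern (emit_ld128 is a hypothetical stand-in, not a QEMU API):

/* Sketch: alignment is validated on the first access only. */
MemOp align = align_mem ? MO_ALIGN_16 : MO_UNALN;
for (int off = 0; off < len_align; off += 16) {
    emit_ld128(addr + off, MO_LE | MO_128 | MO_ATOM_NONE | align);
    align = MO_UNALN;   /* later accesses are covered by the first check */
}
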
@@ -4398,21 +4817,25 @@ static bool trans_STR_pri(DisasContext *s, arg_rri *a)
*/
/* The memory mode of the dtype. */
-static const MemOp dtype_mop[16] = {
+static const MemOp dtype_mop[19] = {
MO_UB, MO_UB, MO_UB, MO_UB,
MO_SL, MO_UW, MO_UW, MO_UW,
MO_SW, MO_SW, MO_UL, MO_UL,
- MO_SB, MO_SB, MO_SB, MO_UQ
+ MO_SB, MO_SB, MO_SB, MO_UQ,
+ /* Artificial values used by decode */
+ MO_UL, MO_UQ, MO_128,
};
#define dtype_msz(x) (dtype_mop[x] & MO_SIZE)
/* The vector element size of dtype. */
-static const uint8_t dtype_esz[16] = {
+static const uint8_t dtype_esz[19] = {
0, 1, 2, 3,
3, 1, 2, 3,
3, 2, 2, 3,
- 3, 2, 1, 3
+ 3, 2, 1, 3,
+ /* Artificial values used by decode */
+ 4, 4, 4,
};
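
A worked instance of the artificial entries (sketch): dtype 16 models a load of 32-bit memory elements into 128-bit vector elements, so:

int msz      = dtype_mop[16] & MO_SIZE;  /* MO_32: 4-byte memory accesses */
int esz      = dtype_esz[16];            /* 4: 16-byte vector elements    */
int elements = vsz >> esz;               /* e.g. vsz = 32 -> 2 elements   */
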
uint32_t make_svemte_desc(DisasContext *s, unsigned vsz, uint32_t nregs,
@@ -4463,7 +4886,7 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
}
/* Indexed by [mte][be][dtype][nreg] */
-static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = {
+static gen_helper_gvec_mem * const ldr_fns[2][2][19][4] = {
{ /* mte inactive, little-endian */
{ { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r,
gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r },
@@ -4487,7 +4910,13 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = {
{ gen_helper_sve_ld1bss_r, NULL, NULL, NULL },
{ gen_helper_sve_ld1bhs_r, NULL, NULL, NULL },
{ gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r,
- gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } },
+ gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r },
+
+ { gen_helper_sve_ld1squ_le_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dqu_le_r, NULL, NULL, NULL },
+ { NULL, gen_helper_sve_ld2qq_le_r,
+ gen_helper_sve_ld3qq_le_r, gen_helper_sve_ld4qq_le_r },
+ },
/* mte inactive, big-endian */
{ { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r,
@@ -4512,7 +4941,14 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = {
{ gen_helper_sve_ld1bss_r, NULL, NULL, NULL },
{ gen_helper_sve_ld1bhs_r, NULL, NULL, NULL },
{ gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r,
- gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } },
+ gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r },
+
+ { gen_helper_sve_ld1squ_be_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dqu_be_r, NULL, NULL, NULL },
+ { NULL, gen_helper_sve_ld2qq_be_r,
+ gen_helper_sve_ld3qq_be_r, gen_helper_sve_ld4qq_be_r },
+ },
+ },
{ /* mte active, little-endian */
{ { gen_helper_sve_ld1bb_r_mte,
@@ -4545,7 +4981,15 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = {
{ gen_helper_sve_ld1dd_le_r_mte,
gen_helper_sve_ld2dd_le_r_mte,
gen_helper_sve_ld3dd_le_r_mte,
- gen_helper_sve_ld4dd_le_r_mte } },
+ gen_helper_sve_ld4dd_le_r_mte },
+
+ { gen_helper_sve_ld1squ_le_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dqu_le_r_mte, NULL, NULL, NULL },
+ { NULL,
+ gen_helper_sve_ld2qq_le_r_mte,
+ gen_helper_sve_ld3qq_le_r_mte,
+ gen_helper_sve_ld4qq_le_r_mte },
+ },
/* mte active, big-endian */
{ { gen_helper_sve_ld1bb_r_mte,
@@ -4578,7 +5022,16 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = {
{ gen_helper_sve_ld1dd_be_r_mte,
gen_helper_sve_ld2dd_be_r_mte,
gen_helper_sve_ld3dd_be_r_mte,
- gen_helper_sve_ld4dd_be_r_mte } } },
+ gen_helper_sve_ld4dd_be_r_mte },
+
+ { gen_helper_sve_ld1squ_be_r_mte, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dqu_be_r_mte, NULL, NULL, NULL },
+ { NULL,
+ gen_helper_sve_ld2qq_be_r_mte,
+ gen_helper_sve_ld3qq_be_r_mte,
+ gen_helper_sve_ld4qq_be_r_mte },
+ },
+ },
};
static void do_ld_zpa(DisasContext *s, int zt, int pg,
@@ -4597,9 +5050,32 @@ static void do_ld_zpa(DisasContext *s, int zt, int pg,
static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a)
{
- if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) {
+ if (a->rm == 31) {
return false;
}
+
+    /* dtypes 16-18 are artificial, representing 128-bit elements */
+ switch (a->dtype) {
+ case 0 ... 15:
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ break;
+ case 16: case 17:
+ if (!dc_isar_feature(aa64_sve2p1, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ break;
+ case 18:
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
if (sve_access_check(s)) {
TCGv_i64 addr = tcg_temp_new_i64();
tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
@@ -4611,9 +5087,28 @@ static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a)
static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a)
{
- if (!dc_isar_feature(aa64_sve, s)) {
- return false;
+    /* dtypes 16-18 are artificial, representing 128-bit elements */
+ switch (a->dtype) {
+ case 0 ... 15:
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ break;
+ case 16: case 17:
+ if (!dc_isar_feature(aa64_sve2p1, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ break;
+ case 18:
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ break;
+ default:
+ g_assert_not_reached();
}
+
if (sve_access_check(s)) {
int vsz = vec_full_reg_size(s);
int elements = vsz >> dtype_esz[a->dtype];
@@ -5060,7 +5555,7 @@ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a)
static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
int msz, int esz, int nreg)
{
- static gen_helper_gvec_mem * const fn_single[2][2][4][4] = {
+ static gen_helper_gvec_mem * const fn_single[2][2][4][5] = {
{ { { gen_helper_sve_st1bb_r,
gen_helper_sve_st1bh_r,
gen_helper_sve_st1bs_r,
@@ -5071,9 +5566,11 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
gen_helper_sve_st1hd_le_r },
{ NULL, NULL,
gen_helper_sve_st1ss_le_r,
- gen_helper_sve_st1sd_le_r },
+ gen_helper_sve_st1sd_le_r,
+ gen_helper_sve_st1sq_le_r, },
{ NULL, NULL, NULL,
- gen_helper_sve_st1dd_le_r } },
+ gen_helper_sve_st1dd_le_r,
+ gen_helper_sve_st1dq_le_r, } },
{ { gen_helper_sve_st1bb_r,
gen_helper_sve_st1bh_r,
gen_helper_sve_st1bs_r,
@@ -5084,9 +5581,11 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
gen_helper_sve_st1hd_be_r },
{ NULL, NULL,
gen_helper_sve_st1ss_be_r,
- gen_helper_sve_st1sd_be_r },
+ gen_helper_sve_st1sd_be_r,
+ gen_helper_sve_st1sq_be_r },
{ NULL, NULL, NULL,
- gen_helper_sve_st1dd_be_r } } },
+ gen_helper_sve_st1dd_be_r,
+ gen_helper_sve_st1dq_be_r } } },
{ { { gen_helper_sve_st1bb_r_mte,
gen_helper_sve_st1bh_r_mte,
@@ -5098,9 +5597,11 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
gen_helper_sve_st1hd_le_r_mte },
{ NULL, NULL,
gen_helper_sve_st1ss_le_r_mte,
- gen_helper_sve_st1sd_le_r_mte },
+ gen_helper_sve_st1sd_le_r_mte,
+ gen_helper_sve_st1sq_le_r_mte },
{ NULL, NULL, NULL,
- gen_helper_sve_st1dd_le_r_mte } },
+ gen_helper_sve_st1dd_le_r_mte,
+ gen_helper_sve_st1dq_le_r_mte } },
{ { gen_helper_sve_st1bb_r_mte,
gen_helper_sve_st1bh_r_mte,
gen_helper_sve_st1bs_r_mte,
@@ -5111,59 +5612,73 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
gen_helper_sve_st1hd_be_r_mte },
{ NULL, NULL,
gen_helper_sve_st1ss_be_r_mte,
- gen_helper_sve_st1sd_be_r_mte },
+ gen_helper_sve_st1sd_be_r_mte,
+ gen_helper_sve_st1sq_be_r_mte },
{ NULL, NULL, NULL,
- gen_helper_sve_st1dd_be_r_mte } } },
+ gen_helper_sve_st1dd_be_r_mte,
+ gen_helper_sve_st1dq_be_r_mte } } },
};
- static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = {
+ static gen_helper_gvec_mem * const fn_multiple[2][2][3][5] = {
{ { { gen_helper_sve_st2bb_r,
gen_helper_sve_st2hh_le_r,
gen_helper_sve_st2ss_le_r,
- gen_helper_sve_st2dd_le_r },
+ gen_helper_sve_st2dd_le_r,
+ gen_helper_sve_st2qq_le_r },
{ gen_helper_sve_st3bb_r,
gen_helper_sve_st3hh_le_r,
gen_helper_sve_st3ss_le_r,
- gen_helper_sve_st3dd_le_r },
+ gen_helper_sve_st3dd_le_r,
+ gen_helper_sve_st3qq_le_r },
{ gen_helper_sve_st4bb_r,
gen_helper_sve_st4hh_le_r,
gen_helper_sve_st4ss_le_r,
- gen_helper_sve_st4dd_le_r } },
+ gen_helper_sve_st4dd_le_r,
+ gen_helper_sve_st4qq_le_r } },
{ { gen_helper_sve_st2bb_r,
gen_helper_sve_st2hh_be_r,
gen_helper_sve_st2ss_be_r,
- gen_helper_sve_st2dd_be_r },
+ gen_helper_sve_st2dd_be_r,
+ gen_helper_sve_st2qq_be_r },
{ gen_helper_sve_st3bb_r,
gen_helper_sve_st3hh_be_r,
gen_helper_sve_st3ss_be_r,
- gen_helper_sve_st3dd_be_r },
+ gen_helper_sve_st3dd_be_r,
+ gen_helper_sve_st3qq_be_r },
{ gen_helper_sve_st4bb_r,
gen_helper_sve_st4hh_be_r,
gen_helper_sve_st4ss_be_r,
- gen_helper_sve_st4dd_be_r } } },
+ gen_helper_sve_st4dd_be_r,
+ gen_helper_sve_st4qq_be_r } } },
{ { { gen_helper_sve_st2bb_r_mte,
gen_helper_sve_st2hh_le_r_mte,
gen_helper_sve_st2ss_le_r_mte,
- gen_helper_sve_st2dd_le_r_mte },
+ gen_helper_sve_st2dd_le_r_mte,
+ gen_helper_sve_st2qq_le_r_mte },
{ gen_helper_sve_st3bb_r_mte,
gen_helper_sve_st3hh_le_r_mte,
gen_helper_sve_st3ss_le_r_mte,
- gen_helper_sve_st3dd_le_r_mte },
+ gen_helper_sve_st3dd_le_r_mte,
+ gen_helper_sve_st3qq_le_r_mte },
{ gen_helper_sve_st4bb_r_mte,
gen_helper_sve_st4hh_le_r_mte,
gen_helper_sve_st4ss_le_r_mte,
- gen_helper_sve_st4dd_le_r_mte } },
+ gen_helper_sve_st4dd_le_r_mte,
+ gen_helper_sve_st4qq_le_r_mte } },
{ { gen_helper_sve_st2bb_r_mte,
gen_helper_sve_st2hh_be_r_mte,
gen_helper_sve_st2ss_be_r_mte,
- gen_helper_sve_st2dd_be_r_mte },
+ gen_helper_sve_st2dd_be_r_mte,
+ gen_helper_sve_st2qq_be_r_mte },
{ gen_helper_sve_st3bb_r_mte,
gen_helper_sve_st3hh_be_r_mte,
gen_helper_sve_st3ss_be_r_mte,
- gen_helper_sve_st3dd_be_r_mte },
+ gen_helper_sve_st3dd_be_r_mte,
+ gen_helper_sve_st3qq_be_r_mte },
{ gen_helper_sve_st4bb_r_mte,
gen_helper_sve_st4hh_be_r_mte,
gen_helper_sve_st4ss_be_r_mte,
- gen_helper_sve_st4dd_be_r_mte } } },
+ gen_helper_sve_st4dd_be_r_mte,
+ gen_helper_sve_st4qq_be_r_mte } } },
};
gen_helper_gvec_mem *fn;
int be = s->be_data == MO_BE;
@@ -5182,12 +5697,32 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a)
{
- if (!dc_isar_feature(aa64_sve, s)) {
- return false;
- }
if (a->rm == 31 || a->msz > a->esz) {
return false;
}
+ switch (a->esz) {
+ case MO_8 ... MO_64:
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ break;
+ case MO_128:
+ if (a->nreg == 0) {
+ assert(a->msz < a->esz);
+ if (!dc_isar_feature(aa64_sve2p1, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ } else {
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
if (sve_access_check(s)) {
TCGv_i64 addr = tcg_temp_new_i64();
tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->msz);
@@ -5199,12 +5734,32 @@ static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a)
static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a)
{
- if (!dc_isar_feature(aa64_sve, s)) {
- return false;
- }
if (a->msz > a->esz) {
return false;
}
+ switch (a->esz) {
+ case MO_8 ... MO_64:
+ if (!dc_isar_feature(aa64_sve, s)) {
+ return false;
+ }
+ break;
+ case MO_128:
+ if (a->nreg == 0) {
+ assert(a->msz < a->esz);
+ if (!dc_isar_feature(aa64_sve2p1, s)) {
+ return false;
+ }
+ s->is_nonstreaming = true;
+ } else {
+ if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) {
+ return false;
+ }
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
if (sve_access_check(s)) {
int vsz = vec_full_reg_size(s);
int elements = vsz >> a->esz;
@@ -5566,13 +6121,23 @@ gather_load_fn64[2][2][2][3][2][4] = {
gen_helper_sve_ldffdd_be_zd_mte, } } } } },
};
+static gen_helper_gvec_mem_scatter * const
+gather_load_fn128[2][2] = {
+ { gen_helper_sve_ldqq_le_zd,
+ gen_helper_sve_ldqq_be_zd },
+ { gen_helper_sve_ldqq_le_zd_mte,
+ gen_helper_sve_ldqq_be_zd_mte }
+};
+
static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a)
{
gen_helper_gvec_mem_scatter *fn = NULL;
bool be = s->be_data == MO_BE;
bool mte = s->mte_active[0];
- if (!dc_isar_feature(aa64_sve, s)) {
+ if (a->esz < MO_128
+ ? !dc_isar_feature(aa64_sve, s)
+ : !dc_isar_feature(aa64_sve2p1, s)) {
return false;
}
s->is_nonstreaming = true;
@@ -5587,6 +6152,12 @@ static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a)
case MO_64:
fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz];
break;
+ case MO_128:
+ assert(!a->ff && a->u && a->xs == 2 && a->msz == MO_128);
+ fn = gather_load_fn128[mte][be];
+ break;
+ default:
+ g_assert_not_reached();
}
assert(fn != NULL);
@@ -5754,6 +6325,14 @@ static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = {
gen_helper_sve_stdd_be_zd_mte, } } },
};
+static gen_helper_gvec_mem_scatter * const
+scatter_store_fn128[2][2] = {
+ { gen_helper_sve_stqq_le_zd,
+ gen_helper_sve_stqq_be_zd },
+ { gen_helper_sve_stqq_le_zd_mte,
+ gen_helper_sve_stqq_be_zd_mte }
+};
+
static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a)
{
gen_helper_gvec_mem_scatter *fn;
@@ -5763,7 +6342,9 @@ static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a)
if (a->esz < a->msz || (a->msz == 0 && a->scale)) {
return false;
}
- if (!dc_isar_feature(aa64_sve, s)) {
+ if (a->esz < MO_128
+ ? !dc_isar_feature(aa64_sve, s)
+ : !dc_isar_feature(aa64_sve2p1, s)) {
return false;
}
s->is_nonstreaming = true;
@@ -5777,6 +6358,10 @@ static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a)
case MO_64:
fn = scatter_store_fn64[mte][be][a->xs][a->msz];
break;
+ case MO_128:
+ assert(a->xs == 2 && a->msz == MO_128);
+ fn = scatter_store_fn128[mte][be];
+ break;
default:
g_assert_not_reached();
}
@@ -5911,6 +6496,7 @@ TRANS_FEAT(MOVPRFX_z, aa64_sve, do_movz_zpz, a->rd, a->rn, a->pg, a->esz, false)
*/
TRANS_FEAT(MUL_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, tcg_gen_gvec_mul, a)
+TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_sve2_sqdmulh, a)
static gen_helper_gvec_3 * const smulh_zzz_fns[4] = {
gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h,
@@ -5929,13 +6515,6 @@ TRANS_FEAT(UMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
TRANS_FEAT(PMUL_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
gen_helper_gvec_pmul_b, a, 0)
-static gen_helper_gvec_3 * const sqdmulh_zzz_fns[4] = {
- gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h,
- gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d,
-};
-TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz,
- sqdmulh_zzz_fns[a->esz], a, 0)
-
static gen_helper_gvec_3 * const sqrdmulh_zzz_fns[4] = {
gen_helper_sve2_sqrdmulh_b, gen_helper_sve2_sqrdmulh_h,
gen_helper_sve2_sqrdmulh_s, gen_helper_sve2_sqrdmulh_d,
@@ -7008,17 +7587,26 @@ DO_ZPZZ_FP(FMINNMP, aa64_sve2, sve2_fminnmp_zpzz)
DO_ZPZZ_FP(FMAXP, aa64_sve2, sve2_fmaxp_zpzz)
DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz)
+static bool do_fmmla(DisasContext *s, arg_rrrr_esz *a,
+ gen_helper_gvec_4_ptr *fn)
+{
+ if (sve_access_check(s)) {
+ if (vec_full_reg_size(s) < 4 * memop_size(a->esz)) {
+ unallocated_encoding(s);
+ } else {
+ gen_gvec_fpst_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, 0, FPST_A64);
+ }
+ }
+ return true;
+}
+
+TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, do_fmmla, a, gen_helper_fmmla_s)
+TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, do_fmmla, a, gen_helper_fmmla_d)
+
/*
* SVE Integer Multiply-Add (unpredicated)
*/
-TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz,
- gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra,
- 0, FPST_A64)
-TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz,
- gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra,
- 0, FPST_A64)
-
static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = {
NULL, gen_helper_sve2_sqdmlal_zzzw_h,
gen_helper_sve2_sqdmlal_zzzw_s, gen_helper_sve2_sqdmlal_zzzw_d,
@@ -7111,8 +7699,13 @@ static gen_helper_gvec_4 * const sqrdcmlah_fns[] = {
TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz,
sqrdcmlah_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot)
-TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
- a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0)
+TRANS_FEAT(USDOT_zzzz_4s, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_usdot_4b, a, 0)
+
+TRANS_FEAT(SDOT_zzzz_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_sdot_2h, a, 0)
+TRANS_FEAT(UDOT_zzzz_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzzz,
+ gen_helper_gvec_udot_2h, a, 0)
TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz,
gen_helper_crypto_aesmc, a->rd, a->rd, 0)
@@ -7174,7 +7767,7 @@ static bool do_FMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sub, bool sel)
{
return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzxw_s,
a->rd, a->rn, a->rm, a->ra,
- (a->index << 2) | (sel << 1) | sub, tcg_env);
+ (a->index << 3) | (sel << 1) | sub, tcg_env);
}
TRANS_FEAT(FMLALB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, false)
@@ -7189,6 +7782,11 @@ TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz,
gen_helper_gvec_ummla_b, a, 0)
+TRANS_FEAT(FDOT_zzzz, aa64_sme2_or_sve2p1, gen_gvec_env_arg_zzzz,
+ gen_helper_sme2_fdot_h, a, 0)
+TRANS_FEAT(FDOT_zzxz, aa64_sme2_or_sve2p1, gen_gvec_env_arg_zzxz,
+ gen_helper_sme2_fdot_idx_h, a)
+
TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_env_arg_zzzz,
gen_helper_gvec_bfdot, a, 0)
TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_env_arg_zzxz,
@@ -7218,6 +7816,36 @@ static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel)
TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false)
TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true)
+static bool do_BFMLSL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel)
+{
+ if (s->fpcr_ah) {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_ah_bfmlsl,
+ a->rd, a->rn, a->rm, a->ra, sel, FPST_AH);
+ } else {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlsl,
+ a->rd, a->rn, a->rm, a->ra, sel, FPST_A64);
+ }
+}
+
+TRANS_FEAT(BFMLSLB_zzzw, aa64_sme2_or_sve2p1, do_BFMLSL_zzzw, a, false)
+TRANS_FEAT(BFMLSLT_zzzw, aa64_sme2_or_sve2p1, do_BFMLSL_zzzw, a, true)
+
+static bool do_BFMLSL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel)
+{
+ if (s->fpcr_ah) {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_ah_bfmlsl_idx,
+ a->rd, a->rn, a->rm, a->ra,
+ (a->index << 1) | sel, FPST_AH);
+ } else {
+ return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlsl_idx,
+ a->rd, a->rn, a->rm, a->ra,
+ (a->index << 1) | sel, FPST_A64);
+ }
+}
+
+TRANS_FEAT(BFMLSLB_zzxw, aa64_sme2_or_sve2p1, do_BFMLSL_zzxw, a, false)
+TRANS_FEAT(BFMLSLT_zzxw, aa64_sme2_or_sve2p1, do_BFMLSL_zzxw, a, true)
+
static bool trans_PSEL(DisasContext *s, arg_psel *a)
{
int vl = vec_full_reg_size(s);
@@ -7226,7 +7854,7 @@ static bool trans_PSEL(DisasContext *s, arg_psel *a)
TCGv_i64 tmp, didx, dbit;
TCGv_ptr ptr;
- if (!dc_isar_feature(aa64_sme, s)) {
+ if (!dc_isar_feature(aa64_sme_or_sve2p1, s)) {
return false;
}
if (!sve_access_check(s)) {
@@ -7265,6 +7893,7 @@ static bool trans_PSEL(DisasContext *s, arg_psel *a)
tcg_gen_neg_i64(tmp, tmp);
/* Apply to either copy the source, or write zeros. */
+ pl = size_for_gvec(pl);
tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd),
pred_full_reg_offset(s, a->pn), tmp, pl, pl);
return true;
@@ -7319,7 +7948,7 @@ static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]);
}
-TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a)
+TRANS_FEAT(SCLAMP, aa64_sme_or_sve2p1, gen_gvec_fn_arg_zzzz, gen_sclamp, a)
static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a)
{
@@ -7370,4 +7999,136 @@ static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]);
}
-TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a)
+TRANS_FEAT(UCLAMP, aa64_sme_or_sve2p1, gen_gvec_fn_arg_zzzz, gen_uclamp, a)
+
+static bool trans_FCLAMP(DisasContext *s, arg_FCLAMP *a)
+{
+ static gen_helper_gvec_3_ptr * const fn[] = {
+ gen_helper_sme2_bfclamp,
+ gen_helper_sme2_fclamp_h,
+ gen_helper_sme2_fclamp_s,
+ gen_helper_sme2_fclamp_d,
+ };
+
+ /* This insn uses MO_8 to encode BFloat16. */
+ if (a->esz == MO_8
+ ? !dc_isar_feature(aa64_sve_b16b16, s)
+ : !dc_isar_feature(aa64_sme2_or_sve2p1, s)) {
+ return false;
+ }
+
+ /* So far we never optimize rda with MOVPRFX */
+ assert(a->rd == a->ra);
+ return gen_gvec_fpst_zzz(s, fn[a->esz], a->rd, a->rn, a->rm, 1,
+ a->esz == MO_16 ? FPST_A64_F16 : FPST_A64);
+}
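For reference, the clamp these helpers compute reduces per element to min(max(d, n), m), with d as the accumulator since rd == ra. A plain-C sketch of the shape (the exact min/max flavour follows the architected FPMaxNum/FPMinNum and NaN rules, which fminf/fmaxf only approximate):

#include <math.h>

/* Sketch: clamp element d into the [lo, hi] range given by Zn/Zm. */
static float fclamp_ref(float d, float lo, float hi)
{
    return fminf(fmaxf(d, lo), hi);
}
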
+
+TRANS_FEAT(SQCVTN_sh, aa64_sme2_or_sve2p1, gen_gvec_ool_zz,
+ gen_helper_sme2_sqcvtn_sh, a->rd, a->rn, 0)
+TRANS_FEAT(UQCVTN_sh, aa64_sme2_or_sve2p1, gen_gvec_ool_zz,
+ gen_helper_sme2_uqcvtn_sh, a->rd, a->rn, 0)
+TRANS_FEAT(SQCVTUN_sh, aa64_sme2_or_sve2p1, gen_gvec_ool_zz,
+ gen_helper_sme2_sqcvtun_sh, a->rd, a->rn, 0)
+
+static bool gen_ldst_c(DisasContext *s, TCGv_i64 addr, int zd, int png,
+ MemOp esz, bool is_write, int n, bool strided)
+{
+ typedef void ldst_c_fn(TCGv_env, TCGv_ptr, TCGv_i64,
+ TCGv_i32, TCGv_i32);
+ static ldst_c_fn * const f_ldst[2][2][4] = {
+ { { gen_helper_sve2p1_ld1bb_c,
+ gen_helper_sve2p1_ld1hh_le_c,
+ gen_helper_sve2p1_ld1ss_le_c,
+ gen_helper_sve2p1_ld1dd_le_c, },
+ { gen_helper_sve2p1_ld1bb_c,
+ gen_helper_sve2p1_ld1hh_be_c,
+ gen_helper_sve2p1_ld1ss_be_c,
+ gen_helper_sve2p1_ld1dd_be_c, } },
+
+ { { gen_helper_sve2p1_st1bb_c,
+ gen_helper_sve2p1_st1hh_le_c,
+ gen_helper_sve2p1_st1ss_le_c,
+ gen_helper_sve2p1_st1dd_le_c, },
+ { gen_helper_sve2p1_st1bb_c,
+ gen_helper_sve2p1_st1hh_be_c,
+ gen_helper_sve2p1_st1ss_be_c,
+ gen_helper_sve2p1_st1dd_be_c, } }
+ };
+
+ TCGv_i32 t_png, t_desc;
+ TCGv_ptr t_zd;
+ uint32_t desc, lg2_rstride = 0;
+ bool be = s->be_data == MO_BE;
+
+ assert(n == 2 || n == 4);
+ if (strided) {
+ lg2_rstride = 3;
+ if (n == 4) {
+ /* Validate ZD alignment. */
+ if (zd & 4) {
+ return false;
+ }
+ lg2_rstride = 2;
+ }
+ /* Ignore non-temporal bit */
+ zd &= ~8;
+ }
+
+ if (strided || !dc_isar_feature(aa64_sve2p1, s)
+ ? !sme_sm_enabled_check(s)
+ : !sve_access_check(s)) {
+ return true;
+ }
+
+ if (!s->mte_active[0]) {
+ addr = clean_data_tbi(s, addr);
+ }
+
+ desc = n == 2 ? 0 : 1;
+ desc = desc | (lg2_rstride << 1);
+ desc = make_svemte_desc(s, vec_full_reg_size(s), 1, esz, is_write, desc);
+ t_desc = tcg_constant_i32(desc);
+
+ t_png = tcg_temp_new_i32();
+ tcg_gen_ld16u_i32(t_png, tcg_env,
+ pred_full_reg_offset(s, png) ^
+ (HOST_BIG_ENDIAN ? 6 : 0));
+
+ t_zd = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_zd, tcg_env, vec_full_reg_offset(s, zd));
+
+ f_ldst[is_write][be][esz](tcg_env, t_zd, addr, t_png, t_desc);
+ return true;
+}
+
+static bool gen_ldst_zcrr_c(DisasContext *s, arg_zcrr_ldst *a,
+ bool is_write, bool strided)
+{
+ TCGv_i64 addr = tcg_temp_new_i64();
+
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ return gen_ldst_c(s, addr, a->rd, a->png, a->esz, is_write,
+ a->nreg, strided);
+}
+
+static bool gen_ldst_zcri_c(DisasContext *s, arg_zcri_ldst *a,
+ bool is_write, bool strided)
+{
+ TCGv_i64 addr = tcg_temp_new_i64();
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn),
+ a->imm * a->nreg * vec_full_reg_size(s));
+ return gen_ldst_c(s, addr, a->rd, a->png, a->esz, is_write,
+ a->nreg, strided);
+}
+
+TRANS_FEAT(LD1_zcrr, aa64_sme2_or_sve2p1, gen_ldst_zcrr_c, a, false, false)
+TRANS_FEAT(LD1_zcri, aa64_sme2_or_sve2p1, gen_ldst_zcri_c, a, false, false)
+TRANS_FEAT(ST1_zcrr, aa64_sme2_or_sve2p1, gen_ldst_zcrr_c, a, true, false)
+TRANS_FEAT(ST1_zcri, aa64_sme2_or_sve2p1, gen_ldst_zcri_c, a, true, false)
+
+TRANS_FEAT(LD1_zcrr_stride, aa64_sme2, gen_ldst_zcrr_c, a, false, true)
+TRANS_FEAT(LD1_zcri_stride, aa64_sme2, gen_ldst_zcri_c, a, false, true)
+TRANS_FEAT(ST1_zcrr_stride, aa64_sme2, gen_ldst_zcrr_c, a, true, true)
+TRANS_FEAT(ST1_zcri_stride, aa64_sme2, gen_ldst_zcri_c, a, true, true)
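
The lg2_rstride values chosen in gen_ldst_c give the strided forms their register spacing; a sketch of the resulting register list (assumption: the helper advances by 1 << lg2_rstride; use_zreg is illustrative):

/* Sketch: Z registers touched by a strided LD1/ST1. */
static void strided_regs(int zt, int n, int lg2_rstride)
{
    for (int r = 0; r < n; ++r) {
        use_zreg(zt + (r << lg2_rstride));
    }
}
/* n = 2, lg2_rstride = 3: Zt, Zt+8
 * n = 4, lg2_rstride = 2: Zt, Zt+4, Zt+8, Zt+12
 * (hence the rejection of a 4-register Zt with bit 2 set) */
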
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index 0004a97..f974996 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -70,8 +70,10 @@ typedef struct DisasContext {
int fp_excp_el; /* FP exception EL or 0 if enabled */
int sve_excp_el; /* SVE exception EL or 0 if enabled */
int sme_excp_el; /* SME exception EL or 0 if enabled */
+ int zt0_excp_el; /* ZT0 exception EL or 0 if enabled */
int vl; /* current vector length in bytes */
int svl; /* current streaming vector length in bytes */
+ int max_svl; /* maximum implemented streaming vector length */
bool vfp_enabled; /* FP enabled via FPSCR.EN */
int vec_len;
int vec_stride;
@@ -208,6 +210,11 @@ static inline int plus_2(DisasContext *s, int x)
return x + 2;
}
+static inline int plus_8(DisasContext *s, int x)
+{
+ return x + 8;
+}
+
static inline int plus_12(DisasContext *s, int x)
{
return x + 12;
@@ -636,6 +643,8 @@ typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t);
typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t);
+typedef void GVecGen3FnVar(unsigned, TCGv_ptr, uint32_t, TCGv_ptr, uint32_t,
+ TCGv_ptr, uint32_t, uint32_t, uint32_t);
/* Function prototype for gen_ functions for calling Neon helpers */
typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32);
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 986eaf8..0603db0 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -825,11 +825,11 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
-DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
-DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
-DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
-DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
-DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
+DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t)
+DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t)
+DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t)
+DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t)
+DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t)
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
@@ -865,12 +865,63 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
-DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
-DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
-DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
-DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
-DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
-DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
+DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4)
+DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4)
+DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4)
+DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4)
+DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8)
+DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8)
+
+#undef DO_DOT
+#undef DO_DOT_IDX
+
+/* Similar for 2-way dot product */
+#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPED *d = vd, *a = va; \
+ TYPEN *n = vn; \
+ TYPEM *m = vm; \
+ for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
+ d[i] = (a[i] + \
+ (TYPED)n[i * 2 + 0] * m[i * 2 + 0] + \
+ (TYPED)n[i * 2 + 1] * m[i * 2 + 1]); \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i = 0, opr_sz = simd_oprsz(desc); \
+ intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
+ intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
+ intptr_t index = simd_data(desc); \
+ TYPED *d = vd, *a = va; \
+ TYPEN *n = vn; \
+ TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2; \
+ do { \
+ TYPED m0 = m_indexed[i * 2 + 0]; \
+ TYPED m1 = m_indexed[i * 2 + 1]; \
+ do { \
+ d[i] = (a[i] + \
+ n[i * 2 + 0] * m0 + \
+ n[i * 2 + 1] * m1); \
+ } while (++i < segend); \
+ segend = i + (16 / sizeof(TYPED)); \
+ } while (i < opr_sz_n); \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_DOT(gvec_sdot_2h, int32_t, int16_t, int16_t)
+DO_DOT(gvec_udot_2h, uint32_t, uint16_t, uint16_t)
+
+DO_DOT_IDX(gvec_sdot_idx_2h, int32_t, int16_t, int16_t, H4)
+DO_DOT_IDX(gvec_udot_idx_2h, uint32_t, uint16_t, uint16_t, H4)
+
+#undef DO_DOT
+#undef DO_DOT_IDX
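
Spelled out for a single 32-bit lane, the 2-way form accumulates two adjacent 16-bit products (plain-C restatement of the DO_DOT body above):

#include <stdint.h>

static int32_t sdot_2h_lane(int32_t acc, const int16_t n[2],
                            const int16_t m[2])
{
    return acc + (int32_t)n[0] * m[0] + (int32_t)n[1] * m[1];
}
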
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
float_status *fpst, uint32_t desc)
@@ -1419,10 +1470,12 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \
DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)
+DO_3OP(gvec_bfadd, bfloat16_add, bfloat16)
DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)
+DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16)
DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
@@ -1515,6 +1568,13 @@ DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
+DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
+DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
+DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
+DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
+DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
+DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)
+
#endif
#undef DO_3OP
@@ -1550,6 +1610,12 @@ static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(op1, op2, dest, 0, stat);
}
+static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
+ float_status *stat)
+{
+ return bfloat16_muladd(op1, op2, dest, 0, stat);
+}
+
static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1568,6 +1634,12 @@ static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}
+static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
+ float_status *stat)
+{
+ return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat);
+}
+
static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1586,6 +1658,12 @@ static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}
+static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
+ float_status *stat)
+{
+ return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
+}
+
static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
@@ -1610,23 +1688,28 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \
clear_tail(d, oprsz, simd_maxsz(desc)); \
}
-DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
-DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
+DO_MULADD(gvec_fmla_nf_h, float16_muladd_nf, float16)
+DO_MULADD(gvec_fmla_nf_s, float32_muladd_nf, float32)
-DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
-DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
+DO_MULADD(gvec_fmls_nf_h, float16_mulsub_nf, float16)
+DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32)
DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
+DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16)
DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
+DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16)
DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
+DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16)
+
+#undef DO_MULADD
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
* For AdvSIMD, there is of course only one such vector segment.
@@ -1745,14 +1828,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
+DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0)
DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
+DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0)
DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
+DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product)
#undef DO_FMLA_IDX
@@ -2184,7 +2270,8 @@ void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
intptr_t i, oprsz = simd_oprsz(desc);
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
- float_status *status = &env->vfp.fp_status[FPST_A64];
+ bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
bool fz16 = env->vfp.fpcr & FPCR_FZ16;
int negx = 0, negf = 0;
@@ -2267,8 +2354,9 @@ void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
intptr_t i, j, oprsz = simd_oprsz(desc);
bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
- intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
- float_status *status = &env->vfp.fp_status[FPST_A64];
+ bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16);
+ float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
bool fz16 = env->vfp.fpcr & FPCR_FZ16;
int negx = 0, negf = 0;
@@ -2989,31 +3077,62 @@ float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
float_status *fpst, float_status *fpst_odd)
{
- /*
- * Compare f16_dotadd() in sme_helper.c, but here we have
- * bfloat16 inputs. In particular that means that we do not
- * want the FPCR.FZ16 flush semantics, so we use the normal
- * float_status for the input handling here.
- */
- float64 e1r = float32_to_float64(e1 << 16, fpst);
- float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
- float64 e2r = float32_to_float64(e2 << 16, fpst);
- float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
- float64 t64;
+ float32 s1r = e1 << 16;
+ float32 s1c = e1 & 0xffff0000u;
+ float32 s2r = e2 << 16;
+ float32 s2c = e2 & 0xffff0000u;
float32 t32;
- /*
- * The ARM pseudocode function FPDot performs both multiplies
- * and the add with a single rounding operation. Emulate this
- * by performing the first multiply in round-to-odd, then doing
- * the second multiply as fused multiply-add, and rounding to
- * float32 all in one step.
- */
- t64 = float64_mul(e1r, e2r, fpst_odd);
- t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
+    /* Cf. FPProcessNaNs4 */
+ if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) ||
+ float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) {
+ if (float32_is_signaling_nan(s1r, fpst)) {
+ t32 = s1r;
+ } else if (float32_is_signaling_nan(s1c, fpst)) {
+ t32 = s1c;
+ } else if (float32_is_signaling_nan(s2r, fpst)) {
+ t32 = s2r;
+ } else if (float32_is_signaling_nan(s2c, fpst)) {
+ t32 = s2c;
+ } else if (float32_is_any_nan(s1r)) {
+ t32 = s1r;
+ } else if (float32_is_any_nan(s1c)) {
+ t32 = s1c;
+ } else if (float32_is_any_nan(s2r)) {
+ t32 = s2r;
+ } else {
+ t32 = s2c;
+ }
+ /*
+ * FPConvertNaN(FPProcessNaN(t32)) will be done as part
+ * of the final addition below.
+ */
+ } else {
+ /*
+ * Compare f16_dotadd() in sme_helper.c, but here we have
+ * bfloat16 inputs. In particular that means that we do not
+ * want the FPCR.FZ16 flush semantics, so we use the normal
+ * float_status for the input handling here.
+ */
+ float64 e1r = float32_to_float64(s1r, fpst);
+ float64 e1c = float32_to_float64(s1c, fpst);
+ float64 e2r = float32_to_float64(s2r, fpst);
+ float64 e2c = float32_to_float64(s2c, fpst);
+ float64 t64;
+
+ /*
+ * The ARM pseudocode function FPDot performs both multiplies
+ * and the add with a single rounding operation. Emulate this
+ * by performing the first multiply in round-to-odd, then doing
+ * the second multiply as fused multiply-add, and rounding to
+ * float32 all in one step.
+ */
+ t64 = float64_mul(e1r, e2r, fpst_odd);
+ t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
- /* This conversion is exact, because we've already rounded. */
- t32 = float64_to_float32(t64, fpst);
+ /* This conversion is exact, because we've already rounded. */
+ t32 = float64_to_float32(t64, fpst);
+ }
/* The final accumulation step is not fused. */
return float32_add(sum, t32, fpst);
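
The selection order in the NaN block above follows the pseudocode's FPProcessNaNs4: the first signaling NaN in operand order wins, then the first quiet NaN. A sketch, with is_snan standing in for float32_is_signaling_nan:

/* Sketch: NaN-propagation priority over {s1r, s1c, s2r, s2c}. */
static float pick_nan4(const float ops[4])
{
    for (int i = 0; i < 4; ++i) {
        if (is_snan(ops[i])) {      /* is_snan: illustrative predicate */
            return ops[i];
        }
    }
    for (int i = 0; i < 3; ++i) {
        if (ops[i] != ops[i]) {     /* quiet NaN test */
            return ops[i];
        }
    }
    return ops[3];                  /* at least one operand is a NaN */
}
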
@@ -3070,6 +3189,45 @@ void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm,
+ void *va, CPUARMState *env, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2);
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t elements = opr_sz / 4;
+ intptr_t eltspersegment = MIN(16 / 4, elements);
+ float32 *d = vd, *a = va;
+ uint16_t *n0 = vn;
+ uint16_t *n1 = vn + sizeof(ARMVectorReg);
+ uint32_t *m = vm;
+ float_status fpst, fpst_odd;
+
+ if (is_ebf(env, &fpst, &fpst_odd)) {
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t m_idx = m[i + H4(idx)];
+
+ for (j = 0; j < eltspersegment; j++) {
+ uint32_t nn = (n0[H2(2 * (i + j) + sel)])
+ | (n1[H2(2 * (i + j) + sel)] << 16);
+ d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx,
+ &fpst, &fpst_odd);
+ }
+ }
+ } else {
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t m_idx = m[i + H4(idx)];
+
+ for (j = 0; j < eltspersegment; j++) {
+ uint32_t nn = (n0[H2(2 * (i + j) + sel)])
+ | (n1[H2(2 * (i + j) + sel)] << 16);
+ d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst);
+ }
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
CPUARMState *env, uint32_t desc)
{
@@ -3146,44 +3304,76 @@ void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
- float_status *stat, uint32_t desc)
+static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
+ float_status *stat, uint32_t desc, int negx, int negf)
{
intptr_t i, opr_sz = simd_oprsz(desc);
- intptr_t sel = simd_data(desc);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
for (i = 0; i < opr_sz / 4; ++i) {
- float32 nn = n[H2(i * 2 + sel)] << 16;
+ float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16;
float32 mm = m[H2(i * 2 + sel)] << 16;
- d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
+ d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
- void *va, float_status *stat, uint32_t desc)
+void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0);
+}
+
+void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0);
+}
+
+void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
+}
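/*
 * The two subtraction flavours differ only in where the negation is
 * applied. gvec_bfmlsl flips the sign bit of the bfloat16 operand
 * itself (negx = 0x8000; e.g. 0x3f80, i.e. 1.0, becomes 0xbf80,
 * i.e. -1.0, before the widening shift), which would also flip the
 * sign of a NaN input. gvec_ah_bfmlsl instead negates the product
 * inside the fused multiply-add, leaving NaN operands untouched as
 * FPCR.AH = 1 semantics require.
 */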
+
+static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
+ float_status *stat, uint32_t desc, int negx, int negf)
{
intptr_t i, j, opr_sz = simd_oprsz(desc);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
intptr_t elements = opr_sz / 4;
intptr_t eltspersegment = MIN(16 / 4, elements);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
for (i = 0; i < elements; i += eltspersegment) {
float32 m_idx = m[H2(2 * i + index)] << 16;
for (j = i; j < i + eltspersegment; j++) {
- float32 n_j = n[H2(2 * j + sel)] << 16;
- d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
+ float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16;
+ d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat);
}
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0);
+}
+
+void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0);
+}
+
+void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
+}
+
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
{ \
@@ -3253,3 +3443,91 @@ void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table,
+ unsigned elements, unsigned segbase,
+ unsigned dstride, unsigned isize,
+ unsigned tsize, unsigned nreg)
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint8_t *dst = zd + dstride * r;
+ unsigned base = segbase + r * elements;
+
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize);
+ dst[H1(e)] = extractn(table, index * tsize, 8);
+ }
+ }
+}
+
+static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table,
+ unsigned elements, unsigned segbase,
+ unsigned dstride, unsigned isize,
+ unsigned tsize, unsigned nreg)
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint16_t *dst = zd + dstride * r;
+ unsigned base = segbase + r * elements;
+
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize);
+ dst[H2(e)] = extractn(table, index * tsize, 16);
+ }
+ }
+}
+
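/*
 * Unlike the byte and halfword cases, a 32-bit destination element
 * consumes a whole 32-bit ZT0 entry, so the next helper indexes the
 * table directly as uint32_t; its tsize argument is unused and kept
 * only for symmetry with do_lut_b/do_lut_h.
 */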
+static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table,
+ unsigned elements, unsigned segbase,
+ unsigned dstride, unsigned isize,
+ unsigned tsize, unsigned nreg)
+{
+ for (unsigned r = 0; r < nreg; ++r) {
+ uint32_t *dst = zd + dstride * r;
+ unsigned base = segbase + r * elements;
+
+ for (unsigned e = 0; e < elements; ++e) {
+ unsigned index = extractn(indexes, (base + e) * isize, isize);
+ dst[H4(e)] = table[H4(index)];
+ }
+ }
+}
+
+#define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \
+void helper_sme2_luti##ISIZE##_##NREG##SUFF \
+ (void *zd, void *zn, CPUARMState *env, uint32_t desc) \
+{ \
+ unsigned vl = simd_oprsz(desc); \
+ unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1); \
+ unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4); \
+ unsigned elements = vl / ESIZE; \
+ unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8); \
+ unsigned segments = (ESIZE * 8) / (ISIZE * NREG); \
+ unsigned segment = idx & (segments - 1); \
+ ARMVectorReg indexes; \
+ memcpy(&indexes, zn, vl); \
+ do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, \
+ segment * NREG * elements, \
+ dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG); \
+}
+
+DO_SME2_LUT(2,1,b, 1)
+DO_SME2_LUT(2,1,h, 2)
+DO_SME2_LUT(2,1,s, 4)
+DO_SME2_LUT(2,2,b, 1)
+DO_SME2_LUT(2,2,h, 2)
+DO_SME2_LUT(2,2,s, 4)
+DO_SME2_LUT(2,4,b, 1)
+DO_SME2_LUT(2,4,h, 2)
+DO_SME2_LUT(2,4,s, 4)
+
+DO_SME2_LUT(4,1,b, 1)
+DO_SME2_LUT(4,1,h, 2)
+DO_SME2_LUT(4,1,s, 4)
+DO_SME2_LUT(4,2,b, 1)
+DO_SME2_LUT(4,2,h, 2)
+DO_SME2_LUT(4,2,s, 4)
+DO_SME2_LUT(4,4,b, 1)
+DO_SME2_LUT(4,4,h, 2)
+DO_SME2_LUT(4,4,s, 4)
+
+#undef DO_SME2_LUT
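/*
 * Worked example with hypothetical numbers: helper_sme2_luti4_2h has
 * ISIZE = 4, NREG = 2, ESIZE = 2. With vl = 32 this gives
 * elements = 16 halfwords per register and
 * segments = (2 * 8) / (4 * 2) = 2, so the low bit of idx selects
 * which half of the 4-bit index stream in zn is consumed; each index
 * then picks one of the sixteen 32-bit ZT0 entries, whose low 16 bits
 * are written to the destination.
 */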
diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
index c02f9c3..cf41b03 100644
--- a/target/arm/tcg/vec_internal.h
+++ b/target/arm/tcg/vec_internal.h
@@ -223,6 +223,34 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
+#define do_ssat_b(val) MIN(MAX(val, INT8_MIN), INT8_MAX)
+#define do_ssat_h(val) MIN(MAX(val, INT16_MIN), INT16_MAX)
+#define do_ssat_s(val) MIN(MAX(val, INT32_MIN), INT32_MAX)
+#define do_usat_b(val) MIN(MAX(val, 0), UINT8_MAX)
+#define do_usat_h(val) MIN(MAX(val, 0), UINT16_MAX)
+#define do_usat_s(val) MIN(MAX(val, 0), UINT32_MAX)
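/*
 * E.g. do_ssat_b(300) clamps to 127, do_ssat_b(-300) to -128, and
 * do_usat_h(-5) to 0 -- plain MIN/MAX clamping to the type's bounds.
 */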
+
+static inline uint64_t do_urshr(uint64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else if (sh == 64) {
+ return x >> 63;
+ } else {
+ return 0;
+ }
+}
+
+static inline int64_t do_srshr(int64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else {
+ /* For sh >= 64, rounding the sign bit always produces 0. */
+ return 0;
+ }
+}
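/*
 * Both helpers add back the last bit shifted out, rounding half
 * upward: e.g. do_urshr(7, 1) = (7 >> 1) + (7 & 1) = 4, and
 * do_srshr(-7, 1) = (-7 >> 1) + (-7 & 1) = -4 + 1 = -3. The sh < 64
 * path assumes sh >= 1; sh == 0 would shift by -1.
 */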
+
/**
* bfdotadd:
* @sum: addend
@@ -272,6 +300,11 @@ bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp);
/*
* Negate as for FPCR.AH=1 -- do not negate NaNs.
*/
+static inline float16 bfloat16_ah_chs(float16 a)
+{
+ return bfloat16_is_any_nan(a) ? a : bfloat16_chs(a);
+}
+
static inline float16 float16_ah_chs(float16 a)
{
return float16_is_any_nan(a) ? a : float16_chs(a);
@@ -302,4 +335,119 @@ static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah)
return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a);
}
+/* Not actually called directly as a helper, but uses similar machinery. */
+bfloat16 helper_sme2_ah_fmax_b16(bfloat16 a, bfloat16 b, float_status *fpst);
+bfloat16 helper_sme2_ah_fmin_b16(bfloat16 a, bfloat16 b, float_status *fpst);
+
+float32 sve_f16_to_f32(float16 f, float_status *fpst);
+float16 sve_f32_to_f16(float32 f, float_status *fpst);
+
+/*
+ * Decode helper functions for predicate as counter.
+ */
+
+typedef struct {
+ unsigned count;
+ unsigned lg2_stride;
+ bool invert;
+} DecodeCounter;
+
+static inline DecodeCounter
+decode_counter(unsigned png, unsigned vl, unsigned v_esz)
+{
+ DecodeCounter ret = { };
+
+ /* Cf. Arm pseudocode CounterToPredicate. */
+ if (likely(png & 0xf)) {
+ unsigned p_esz = ctz32(png);
+
+ /*
+ * maxbit = log2(pl(bits) * 4)
+ * = log2(vl(bytes) * 4)
+ * = log2(vl) + 2
+ * maxbit_mask = ones<maxbit:0>
+ * = (1 << (maxbit + 1)) - 1
+ * = (1 << (log2(vl) + 2 + 1)) - 1
+ * = (1 << (log2(vl) + 3)) - 1
+ * = (pow2ceil(vl) << 3) - 1
+ */
+ ret.count = png & (((unsigned)pow2ceil(vl) << 3) - 1);
+ ret.count >>= p_esz + 1;
+
+ ret.invert = (png >> 15) & 1;
+
+ /*
+ * The Arm pseudocode for CounterToPredicate expands the count to
+ * a set of bits, and then the operation proceeds as for the original
+ * interpretation of predicates as a set of bits.
+ *
+ * We can avoid the expansion by adjusting the count and supplying
+ * an element stride.
+ */
+ if (unlikely(p_esz != v_esz)) {
+ if (p_esz < v_esz) {
+ /*
+ * For predicate esz < vector esz, the expanded predicate
+ * will have more bits set than will be consumed.
+ * Adjust the count down, rounding up.
+ * Consider p_esz = MO_8, v_esz = MO_64, count 14:
+ * The expanded predicate would be
+ * 0011 1111 1111 1111
+ * The significant bits are
+ * ...1 ...1 ...1 ...1
+ */
+ unsigned shift = v_esz - p_esz;
+ unsigned trunc = ret.count >> shift;
+ ret.count = trunc + (ret.count != (trunc << shift));
+ } else {
+ /*
+ * For predicate esz > vector esz, the expanded predicate
+ * will have bits set only at power-of-two multiples of
+ * the vector esz. Bits at other multiples will all be
+ * false. Adjust the count up, and supply the caller
+ * with a stride of elements to skip.
+ */
+ unsigned shift = p_esz - v_esz;
+ ret.count <<= shift;
+ ret.lg2_stride = shift;
+ }
+ }
+ }
+ return ret;
+}
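/*
 * Worked example with hypothetical values: png = 0x001d, vl = 16.
 * png & 0xf is nonzero and p_esz = ctz32(0x1d) = 0 (byte predicate);
 * maxbit_mask = (pow2ceil(16) << 3) - 1 = 0x7f, so
 * count = (0x1d & 0x7f) >> 1 = 14 and invert = 0. For v_esz = MO_64
 * this is exactly the count-14 case illustrated above: shift = 3,
 * trunc = 1, and the count rounds up to 2 doubleword elements.
 */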
+
+/* Extract @len bits from an array of uint64_t at offset @pos bits. */
+static inline uint64_t extractn(uint64_t *p, unsigned pos, unsigned len)
+{
+ uint64_t x;
+
+ p += pos / 64;
+ pos = pos % 64;
+
+ x = p[0];
+ if (pos + len > 64) {
+ x = (x >> pos) | (p[1] << (-pos & 63));
+ pos = 0;
+ }
+ return extract64(x, pos, len);
+}
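/*
 * E.g. extractn(p, 60, 8) spans a word boundary: it combines bits
 * 63..60 of p[0] with bits 3..0 of p[1] as (p[0] >> 60) | (p[1] << 4)
 * before the final extract64(x, 0, 8).
 */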
+
+/* Deposit @len bits into an array of uint64_t at offset @pos bits. */
+static inline void depositn(uint64_t *p, unsigned pos,
+ unsigned len, uint64_t val)
+{
+ p += pos / 64;
+ pos = pos % 64;
+
+ if (pos + len <= 64) {
+ p[0] = deposit64(p[0], pos, len, val);
+ } else {
+ unsigned len0 = 64 - pos;
+ unsigned len1 = len - len0;
+
+ p[0] = deposit64(p[0], pos, len0, val);
+ p[1] = deposit64(p[1], 0, len1, val >> len0);
+ }
+}
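/*
 * E.g. depositn(p, 60, 8, val) splits at the same boundary: the low
 * len0 = 4 bits of val go into bits 63..60 of p[0] and the remaining
 * len1 = 4 bits (val >> 4) into bits 3..0 of p[1].
 */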
+
#endif /* TARGET_ARM_VEC_INTERNAL_H */
diff --git a/target/arm/tcg/vfp_helper.c b/target/arm/tcg/vfp_helper.c
index b1324c5..e156e37 100644
--- a/target/arm/tcg/vfp_helper.c
+++ b/target/arm/tcg/vfp_helper.c
@@ -123,7 +123,7 @@ uint32_t vfp_get_fpsr_from_host(CPUARMState *env)
a64_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A64_F16])
& ~(float_flag_input_denormal_flushed | float_flag_input_denormal_used));
/*
- * We do not merge in flags from FPST_AH or FPST_AH_F16, because
+ * We do not merge in flags from FPST_{AH,ZA} or FPST_{AH,ZA}_F16, because
* they are used for insns that must not set the cumulative exception bits.
*/
@@ -196,6 +196,8 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64]);
set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32_F16]);
set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64_F16]);
+ set_float_rounding_mode(i, &env->vfp.fp_status[FPST_ZA]);
+ set_float_rounding_mode(i, &env->vfp.fp_status[FPST_ZA_F16]);
}
if (changed & FPCR_FZ16) {
bool ftz_enabled = val & FPCR_FZ16;
@@ -203,15 +205,18 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]);
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]);
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_ZA_F16]);
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]);
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]);
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]);
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]);
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_ZA_F16]);
}
if (changed & FPCR_FZ) {
bool ftz_enabled = val & FPCR_FZ;
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]);
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64]);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_ZA]);
/* FIZ is A64 only so FZ always makes A32 code flush inputs to zero */
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]);
}
@@ -223,6 +228,7 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
bool fitz_enabled = (val & FPCR_FIZ) ||
(val & (FPCR_FZ | FPCR_AH)) == FPCR_FZ;
set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_A64]);
+ set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_ZA]);
}
if (changed & FPCR_DN) {
bool dnan_enabled = val & FPCR_DN;
@@ -240,9 +246,13 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
/* Change behaviours for A64 FP operations */
arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
+ arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_ZA]);
+ arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_ZA_F16]);
} else {
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
+ arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_ZA]);
+ arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_ZA_F16]);
}
}
/*