diff options
Diffstat (limited to 'target/arm/tcg')
37 files changed, 10867 insertions, 2147 deletions
diff --git a/target/arm/tcg/arith_helper.c b/target/arm/tcg/arith_helper.c index 9a555c7..6701398 100644 --- a/target/arm/tcg/arith_helper.c +++ b/target/arm/tcg/arith_helper.c @@ -6,11 +6,12 @@ * SPDX-License-Identifier: GPL-2.0-or-later */ #include "qemu/osdep.h" -#include "cpu.h" -#include "exec/helper-proto.h" #include "qemu/crc32c.h" #include <zlib.h> /* for crc32 */ +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" + /* * Note that signed overflow is undefined in C. The following routines are * careful to use unsigned types where modulo arithmetic is required. diff --git a/target/arm/tcg/cpregs-at.c b/target/arm/tcg/cpregs-at.c new file mode 100644 index 0000000..398a61d --- /dev/null +++ b/target/arm/tcg/cpregs-at.c @@ -0,0 +1,519 @@ +/* + * System instructions for address translation + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "cpu-features.h" +#include "internals.h" +#include "cpregs.h" + + +static int par_el1_shareability(GetPhysAddrResult *res) +{ + /* + * The PAR_EL1.SH field must be 0b10 for Device or Normal-NC + * memory -- see pseudocode PAREncodeShareability(). + */ + if (((res->cacheattrs.attrs & 0xf0) == 0) || + res->cacheattrs.attrs == 0x44 || res->cacheattrs.attrs == 0x40) { + return 2; + } + return res->cacheattrs.shareability; +} + +static uint64_t do_ats_write(CPUARMState *env, uint64_t value, + MMUAccessType access_type, ARMMMUIdx mmu_idx, + ARMSecuritySpace ss) +{ + bool ret; + uint64_t par64; + bool format64 = false; + ARMMMUFaultInfo fi = {}; + GetPhysAddrResult res = {}; + + /* + * I_MXTJT: Granule protection checks are not performed on the final + * address of a successful translation. This is a translation not a + * memory reference, so "memop = none = 0". + */ + ret = get_phys_addr_with_space_nogpc(env, value, access_type, 0, + mmu_idx, ss, &res, &fi); + + /* + * ATS operations only do S1 or S1+S2 translations, so we never + * have to deal with the ARMCacheAttrs format for S2 only. + */ + assert(!res.cacheattrs.is_s2_format); + + if (ret) { + /* + * Some kinds of translation fault must cause exceptions rather + * than being reported in the PAR. + */ + int current_el = arm_current_el(env); + int target_el; + uint32_t syn, fsr, fsc; + bool take_exc = false; + + if (fi.s1ptw && current_el == 1 + && arm_mmu_idx_is_stage1_of_2(mmu_idx)) { + /* + * Synchronous stage 2 fault on an access made as part of the + * translation table walk for AT S1E0* or AT S1E1* insn + * executed from NS EL1. If this is a synchronous external abort + * and SCR_EL3.EA == 1, then we take a synchronous external abort + * to EL3. Otherwise the fault is taken as an exception to EL2, + * and HPFAR_EL2 holds the faulting IPA. + */ + if (fi.type == ARMFault_SyncExternalOnWalk && + (env->cp15.scr_el3 & SCR_EA)) { + target_el = 3; + } else { + env->cp15.hpfar_el2 = extract64(fi.s2addr, 12, 47) << 4; + if (arm_is_secure_below_el3(env) && fi.s1ns) { + env->cp15.hpfar_el2 |= HPFAR_NS; + } + target_el = 2; + } + take_exc = true; + } else if (fi.type == ARMFault_SyncExternalOnWalk) { + /* + * Synchronous external aborts during a translation table walk + * are taken as Data Abort exceptions. + */ + if (fi.stage2) { + if (current_el == 3) { + target_el = 3; + } else { + target_el = 2; + } + } else { + target_el = exception_target_el(env); + } + take_exc = true; + } + + if (take_exc) { + /* Construct FSR and FSC using same logic as arm_deliver_fault() */ + if (target_el == 2 || arm_el_is_aa64(env, target_el) || + arm_s1_regime_using_lpae_format(env, mmu_idx)) { + fsr = arm_fi_to_lfsc(&fi); + fsc = extract32(fsr, 0, 6); + } else { + fsr = arm_fi_to_sfsc(&fi); + fsc = 0x3f; + } + /* + * Report exception with ESR indicating a fault due to a + * translation table walk for a cache maintenance instruction. + */ + syn = syn_data_abort_no_iss(current_el == target_el, 0, + fi.ea, 1, fi.s1ptw, 1, fsc); + env->exception.vaddress = value; + env->exception.fsr = fsr; + raise_exception(env, EXCP_DATA_ABORT, syn, target_el); + } + } + + if (is_a64(env)) { + format64 = true; + } else if (arm_feature(env, ARM_FEATURE_LPAE)) { + /* + * ATS1Cxx: + * * TTBCR.EAE determines whether the result is returned using the + * 32-bit or the 64-bit PAR format + * * Instructions executed in Hyp mode always use the 64bit format + * + * ATS1S2NSOxx uses the 64bit format if any of the following is true: + * * The Non-secure TTBCR.EAE bit is set to 1 + * * The implementation includes EL2, and the value of HCR.VM is 1 + * + * (Note that HCR.DC makes HCR.VM behave as if it is 1.) + * + * ATS1Hx always uses the 64bit format. + */ + format64 = arm_s1_regime_using_lpae_format(env, mmu_idx); + + if (arm_feature(env, ARM_FEATURE_EL2)) { + if (mmu_idx == ARMMMUIdx_E10_0 || + mmu_idx == ARMMMUIdx_E10_1 || + mmu_idx == ARMMMUIdx_E10_1_PAN) { + format64 |= env->cp15.hcr_el2 & (HCR_VM | HCR_DC); + } else { + format64 |= arm_current_el(env) == 2; + } + } + } + + if (format64) { + /* Create a 64-bit PAR */ + par64 = (1 << 11); /* LPAE bit always set */ + if (!ret) { + par64 |= res.f.phys_addr & ~0xfffULL; + if (!res.f.attrs.secure) { + par64 |= (1 << 9); /* NS */ + } + par64 |= (uint64_t)res.cacheattrs.attrs << 56; /* ATTR */ + par64 |= par_el1_shareability(&res) << 7; /* SH */ + } else { + uint32_t fsr = arm_fi_to_lfsc(&fi); + + par64 |= 1; /* F */ + par64 |= (fsr & 0x3f) << 1; /* FS */ + if (fi.stage2) { + par64 |= (1 << 9); /* S */ + } + if (fi.s1ptw) { + par64 |= (1 << 8); /* PTW */ + } + } + } else { + /* + * fsr is a DFSR/IFSR value for the short descriptor + * translation table format (with WnR always clear). + * Convert it to a 32-bit PAR. + */ + if (!ret) { + /* We do not set any attribute bits in the PAR */ + if (res.f.lg_page_size == 24 + && arm_feature(env, ARM_FEATURE_V7)) { + par64 = (res.f.phys_addr & 0xff000000) | (1 << 1); + } else { + par64 = res.f.phys_addr & 0xfffff000; + } + if (!res.f.attrs.secure) { + par64 |= (1 << 9); /* NS */ + } + } else { + uint32_t fsr = arm_fi_to_sfsc(&fi); + + par64 = ((fsr & (1 << 10)) >> 5) | ((fsr & (1 << 12)) >> 6) | + ((fsr & 0xf) << 1) | 1; + } + } + return par64; +} + +static void ats_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) +{ + MMUAccessType access_type = ri->opc2 & 1 ? MMU_DATA_STORE : MMU_DATA_LOAD; + uint64_t par64; + ARMMMUIdx mmu_idx; + int el = arm_current_el(env); + ARMSecuritySpace ss = arm_security_space(env); + + switch (ri->opc2 & 6) { + case 0: + /* stage 1 current state PL1: ATS1CPR, ATS1CPW, ATS1CPRP, ATS1CPWP */ + switch (el) { + case 3: + if (ri->crm == 9 && arm_pan_enabled(env)) { + mmu_idx = ARMMMUIdx_E30_3_PAN; + } else { + mmu_idx = ARMMMUIdx_E3; + } + break; + case 2: + g_assert(ss != ARMSS_Secure); /* ARMv8.4-SecEL2 is 64-bit only */ + /* fall through */ + case 1: + if (ri->crm == 9 && arm_pan_enabled(env)) { + mmu_idx = ARMMMUIdx_Stage1_E1_PAN; + } else { + mmu_idx = ARMMMUIdx_Stage1_E1; + } + break; + default: + g_assert_not_reached(); + } + break; + case 2: + /* stage 1 current state PL0: ATS1CUR, ATS1CUW */ + switch (el) { + case 3: + mmu_idx = ARMMMUIdx_E30_0; + break; + case 2: + g_assert(ss != ARMSS_Secure); /* ARMv8.4-SecEL2 is 64-bit only */ + mmu_idx = ARMMMUIdx_Stage1_E0; + break; + case 1: + mmu_idx = ARMMMUIdx_Stage1_E0; + break; + default: + g_assert_not_reached(); + } + break; + case 4: + /* stage 1+2 NonSecure PL1: ATS12NSOPR, ATS12NSOPW */ + mmu_idx = ARMMMUIdx_E10_1; + ss = ARMSS_NonSecure; + break; + case 6: + /* stage 1+2 NonSecure PL0: ATS12NSOUR, ATS12NSOUW */ + mmu_idx = ARMMMUIdx_E10_0; + ss = ARMSS_NonSecure; + break; + default: + g_assert_not_reached(); + } + + par64 = do_ats_write(env, value, access_type, mmu_idx, ss); + + A32_BANKED_CURRENT_REG_SET(env, par, par64); +} + +static void ats1h_write(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ + MMUAccessType access_type = ri->opc2 & 1 ? MMU_DATA_STORE : MMU_DATA_LOAD; + uint64_t par64; + + /* There is no SecureEL2 for AArch32. */ + par64 = do_ats_write(env, value, access_type, ARMMMUIdx_E2, + ARMSS_NonSecure); + + A32_BANKED_CURRENT_REG_SET(env, par, par64); +} + +static CPAccessResult at_e012_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + /* + * R_NYXTL: instruction is UNDEFINED if it applies to an Exception level + * lower than EL3 and the combination SCR_EL3.{NSE,NS} is reserved. This can + * only happen when executing at EL3 because that combination also causes an + * illegal exception return. We don't need to check FEAT_RME either, because + * scr_write() ensures that the NSE bit is not set otherwise. + */ + if ((env->cp15.scr_el3 & (SCR_NSE | SCR_NS)) == SCR_NSE) { + return CP_ACCESS_UNDEFINED; + } + return CP_ACCESS_OK; +} + +static CPAccessResult at_s1e2_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if (arm_current_el(env) == 3 && + !(env->cp15.scr_el3 & (SCR_NS | SCR_EEL2))) { + return CP_ACCESS_UNDEFINED; + } + return at_e012_access(env, ri, isread); +} + +static CPAccessResult at_s1e01_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if (arm_current_el(env) == 1 && (arm_hcr_el2_eff(env) & HCR_AT)) { + return CP_ACCESS_TRAP_EL2; + } + return at_e012_access(env, ri, isread); +} + +static void ats_write64(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ + MMUAccessType access_type = ri->opc2 & 1 ? MMU_DATA_STORE : MMU_DATA_LOAD; + ARMMMUIdx mmu_idx; + uint64_t hcr_el2 = arm_hcr_el2_eff(env); + bool regime_e20 = (hcr_el2 & (HCR_E2H | HCR_TGE)) == (HCR_E2H | HCR_TGE); + bool for_el3 = false; + ARMSecuritySpace ss; + + switch (ri->opc2 & 6) { + case 0: + switch (ri->opc1) { + case 0: /* AT S1E1R, AT S1E1W, AT S1E1RP, AT S1E1WP */ + if (ri->crm == 9 && arm_pan_enabled(env)) { + mmu_idx = regime_e20 ? + ARMMMUIdx_E20_2_PAN : ARMMMUIdx_Stage1_E1_PAN; + } else { + mmu_idx = regime_e20 ? ARMMMUIdx_E20_2 : ARMMMUIdx_Stage1_E1; + } + break; + case 4: /* AT S1E2R, AT S1E2W */ + mmu_idx = hcr_el2 & HCR_E2H ? ARMMMUIdx_E20_2 : ARMMMUIdx_E2; + break; + case 6: /* AT S1E3R, AT S1E3W */ + mmu_idx = ARMMMUIdx_E3; + for_el3 = true; + break; + default: + g_assert_not_reached(); + } + break; + case 2: /* AT S1E0R, AT S1E0W */ + mmu_idx = regime_e20 ? ARMMMUIdx_E20_0 : ARMMMUIdx_Stage1_E0; + break; + case 4: /* AT S12E1R, AT S12E1W */ + mmu_idx = regime_e20 ? ARMMMUIdx_E20_2 : ARMMMUIdx_E10_1; + break; + case 6: /* AT S12E0R, AT S12E0W */ + mmu_idx = regime_e20 ? ARMMMUIdx_E20_0 : ARMMMUIdx_E10_0; + break; + default: + g_assert_not_reached(); + } + + ss = for_el3 ? arm_security_space(env) : arm_security_space_below_el3(env); + env->cp15.par_el[1] = do_ats_write(env, value, access_type, mmu_idx, ss); +} + +static CPAccessResult ats_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if (ri->opc2 & 4) { + /* + * The ATS12NSO* operations must trap to EL3 or EL2 if executed in + * Secure EL1 (which can only happen if EL3 is AArch64). + * They are simply UNDEF if executed from NS EL1. + * They function normally from EL2 or EL3. + */ + if (arm_current_el(env) == 1) { + if (arm_is_secure_below_el3(env)) { + if (env->cp15.scr_el3 & SCR_EEL2) { + return CP_ACCESS_TRAP_EL2; + } + return CP_ACCESS_TRAP_EL3; + } + return CP_ACCESS_UNDEFINED; + } + } + return CP_ACCESS_OK; +} + +static const ARMCPRegInfo vapa_ats_reginfo[] = { + /* This underdecoding is safe because the reginfo is NO_RAW. */ + { .name = "ATS", .cp = 15, .crn = 7, .crm = 8, .opc1 = 0, .opc2 = CP_ANY, + .access = PL1_W, .accessfn = ats_access, + .writefn = ats_write, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC }, +}; + +static const ARMCPRegInfo v8_ats_reginfo[] = { + /* 64 bit address translation operations */ + { .name = "AT_S1E1R", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 0, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .fgt = FGT_ATS1E1R, + .accessfn = at_s1e01_access, .writefn = ats_write64 }, + { .name = "AT_S1E1W", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 1, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .fgt = FGT_ATS1E1W, + .accessfn = at_s1e01_access, .writefn = ats_write64 }, + { .name = "AT_S1E0R", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 2, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .fgt = FGT_ATS1E0R, + .accessfn = at_s1e01_access, .writefn = ats_write64 }, + { .name = "AT_S1E0W", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 8, .opc2 = 3, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .fgt = FGT_ATS1E0W, + .accessfn = at_s1e01_access, .writefn = ats_write64 }, + { .name = "AT_S12E1R", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 4, + .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .accessfn = at_e012_access, .writefn = ats_write64 }, + { .name = "AT_S12E1W", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 5, + .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .accessfn = at_e012_access, .writefn = ats_write64 }, + { .name = "AT_S12E0R", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 6, + .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .accessfn = at_e012_access, .writefn = ats_write64 }, + { .name = "AT_S12E0W", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 7, + .access = PL2_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .accessfn = at_e012_access, .writefn = ats_write64 }, + /* AT S1E2* are elsewhere as they UNDEF from EL3 if EL2 is not present */ + { .name = "AT_S1E3R", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 6, .crn = 7, .crm = 8, .opc2 = 0, + .access = PL3_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .writefn = ats_write64 }, + { .name = "AT_S1E3W", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 6, .crn = 7, .crm = 8, .opc2 = 1, + .access = PL3_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .writefn = ats_write64 }, +}; + +static const ARMCPRegInfo el2_ats_reginfo[] = { + /* + * Unlike the other EL2-related AT operations, these must + * UNDEF from EL3 if EL2 is not implemented, which is why we + * define them here rather than with the rest of the AT ops. + */ + { .name = "AT_S1E2R", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 0, + .access = PL2_W, .accessfn = at_s1e2_access, + .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC | ARM_CP_EL3_NO_EL2_UNDEF, + .writefn = ats_write64 }, + { .name = "AT_S1E2W", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 1, + .access = PL2_W, .accessfn = at_s1e2_access, + .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC | ARM_CP_EL3_NO_EL2_UNDEF, + .writefn = ats_write64 }, + /* + * The AArch32 ATS1H* operations are CONSTRAINED UNPREDICTABLE + * if EL2 is not implemented; we choose to UNDEF. Behaviour at EL3 + * with SCR.NS == 0 outside Monitor mode is UNPREDICTABLE; we choose + * to behave as if SCR.NS was 1. + */ + { .name = "ATS1HR", .cp = 15, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 0, + .access = PL2_W, + .writefn = ats1h_write, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC }, + { .name = "ATS1HW", .cp = 15, .opc1 = 4, .crn = 7, .crm = 8, .opc2 = 1, + .access = PL2_W, + .writefn = ats1h_write, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC }, +}; + +static const ARMCPRegInfo ats1e1_reginfo[] = { + { .name = "AT_S1E1RP", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 9, .opc2 = 0, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .fgt = FGT_ATS1E1RP, + .accessfn = at_s1e01_access, .writefn = ats_write64 }, + { .name = "AT_S1E1WP", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 9, .opc2 = 1, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .fgt = FGT_ATS1E1WP, + .accessfn = at_s1e01_access, .writefn = ats_write64 }, +}; + +static const ARMCPRegInfo ats1cp_reginfo[] = { + { .name = "ATS1CPRP", + .cp = 15, .opc1 = 0, .crn = 7, .crm = 9, .opc2 = 0, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .writefn = ats_write }, + { .name = "ATS1CPWP", + .cp = 15, .opc1 = 0, .crn = 7, .crm = 9, .opc2 = 1, + .access = PL1_W, .type = ARM_CP_NO_RAW | ARM_CP_RAISES_EXC, + .writefn = ats_write }, +}; + +void define_at_insn_regs(ARMCPU *cpu) +{ + CPUARMState *env = &cpu->env; + + if (arm_feature(env, ARM_FEATURE_VAPA)) { + define_arm_cp_regs(cpu, vapa_ats_reginfo); + } + if (arm_feature(env, ARM_FEATURE_V8)) { + define_arm_cp_regs(cpu, v8_ats_reginfo); + } + if (arm_feature(env, ARM_FEATURE_EL2) + || (arm_feature(env, ARM_FEATURE_EL3) + && arm_feature(env, ARM_FEATURE_V8))) { + define_arm_cp_regs(cpu, el2_ats_reginfo); + } + if (cpu_isar_feature(aa64_ats1e1, cpu)) { + define_arm_cp_regs(cpu, ats1e1_reginfo); + } + if (cpu_isar_feature(aa32_ats1e1, cpu)) { + define_arm_cp_regs(cpu, ats1cp_reginfo); + } +} diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c index c4dd309..dc249ce 100644 --- a/target/arm/tcg/cpu-v7m.c +++ b/target/arm/tcg/cpu-v7m.c @@ -45,6 +45,7 @@ static bool arm_v7m_cpu_exec_interrupt(CPUState *cs, int interrupt_request) static void cortex_m0_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V6); set_feature(&cpu->env, ARM_FEATURE_M); @@ -58,51 +59,53 @@ static void cortex_m0_initfn(Object *obj) * by looking at ID register fields. We use the same values as * for the M3. */ - cpu->isar.id_pfr0 = 0x00000030; - cpu->isar.id_pfr1 = 0x00000200; - cpu->isar.id_dfr0 = 0x00100000; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00000030; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x00000000; - cpu->isar.id_mmfr3 = 0x00000000; - cpu->isar.id_isar0 = 0x01141110; - cpu->isar.id_isar1 = 0x02111000; - cpu->isar.id_isar2 = 0x21112231; - cpu->isar.id_isar3 = 0x01111110; - cpu->isar.id_isar4 = 0x01310102; - cpu->isar.id_isar5 = 0x00000000; - cpu->isar.id_isar6 = 0x00000000; + SET_IDREG(isar, ID_PFR0, 0x00000030); + SET_IDREG(isar, ID_PFR1, 0x00000200); + SET_IDREG(isar, ID_DFR0, 0x00100000); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00000030); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x00000000); + SET_IDREG(isar, ID_MMFR3, 0x00000000); + SET_IDREG(isar, ID_ISAR0, 0x01141110); + SET_IDREG(isar, ID_ISAR1, 0x02111000); + SET_IDREG(isar, ID_ISAR2, 0x21112231); + SET_IDREG(isar, ID_ISAR3, 0x01111110); + SET_IDREG(isar, ID_ISAR4, 0x01310102); + SET_IDREG(isar, ID_ISAR5, 0x00000000); + SET_IDREG(isar, ID_ISAR6, 0x00000000); } static void cortex_m3_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V7); set_feature(&cpu->env, ARM_FEATURE_M); set_feature(&cpu->env, ARM_FEATURE_M_MAIN); cpu->midr = 0x410fc231; cpu->pmsav7_dregion = 8; - cpu->isar.id_pfr0 = 0x00000030; - cpu->isar.id_pfr1 = 0x00000200; - cpu->isar.id_dfr0 = 0x00100000; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00000030; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x00000000; - cpu->isar.id_mmfr3 = 0x00000000; - cpu->isar.id_isar0 = 0x01141110; - cpu->isar.id_isar1 = 0x02111000; - cpu->isar.id_isar2 = 0x21112231; - cpu->isar.id_isar3 = 0x01111110; - cpu->isar.id_isar4 = 0x01310102; - cpu->isar.id_isar5 = 0x00000000; - cpu->isar.id_isar6 = 0x00000000; + SET_IDREG(isar, ID_PFR0, 0x00000030); + SET_IDREG(isar, ID_PFR1, 0x00000200); + SET_IDREG(isar, ID_DFR0, 0x00100000); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00000030); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x00000000); + SET_IDREG(isar, ID_MMFR3, 0x00000000); + SET_IDREG(isar, ID_ISAR0, 0x01141110); + SET_IDREG(isar, ID_ISAR1, 0x02111000); + SET_IDREG(isar, ID_ISAR2, 0x21112231); + SET_IDREG(isar, ID_ISAR3, 0x01111110); + SET_IDREG(isar, ID_ISAR4, 0x01310102); + SET_IDREG(isar, ID_ISAR5, 0x00000000); + SET_IDREG(isar, ID_ISAR6, 0x00000000); } static void cortex_m4_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V7); set_feature(&cpu->env, ARM_FEATURE_M); @@ -113,26 +116,27 @@ static void cortex_m4_initfn(Object *obj) cpu->isar.mvfr0 = 0x10110021; cpu->isar.mvfr1 = 0x11000011; cpu->isar.mvfr2 = 0x00000000; - cpu->isar.id_pfr0 = 0x00000030; - cpu->isar.id_pfr1 = 0x00000200; - cpu->isar.id_dfr0 = 0x00100000; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00000030; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x00000000; - cpu->isar.id_mmfr3 = 0x00000000; - cpu->isar.id_isar0 = 0x01141110; - cpu->isar.id_isar1 = 0x02111000; - cpu->isar.id_isar2 = 0x21112231; - cpu->isar.id_isar3 = 0x01111110; - cpu->isar.id_isar4 = 0x01310102; - cpu->isar.id_isar5 = 0x00000000; - cpu->isar.id_isar6 = 0x00000000; + SET_IDREG(isar, ID_PFR0, 0x00000030); + SET_IDREG(isar, ID_PFR1, 0x00000200); + SET_IDREG(isar, ID_DFR0, 0x00100000); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00000030); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x00000000); + SET_IDREG(isar, ID_MMFR3, 0x00000000); + SET_IDREG(isar, ID_ISAR0, 0x01141110); + SET_IDREG(isar, ID_ISAR1, 0x02111000); + SET_IDREG(isar, ID_ISAR2, 0x21112231); + SET_IDREG(isar, ID_ISAR3, 0x01111110); + SET_IDREG(isar, ID_ISAR4, 0x01310102); + SET_IDREG(isar, ID_ISAR5, 0x00000000); + SET_IDREG(isar, ID_ISAR6, 0x00000000); } static void cortex_m7_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V7); set_feature(&cpu->env, ARM_FEATURE_M); @@ -143,26 +147,27 @@ static void cortex_m7_initfn(Object *obj) cpu->isar.mvfr0 = 0x10110221; cpu->isar.mvfr1 = 0x12000011; cpu->isar.mvfr2 = 0x00000040; - cpu->isar.id_pfr0 = 0x00000030; - cpu->isar.id_pfr1 = 0x00000200; - cpu->isar.id_dfr0 = 0x00100000; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00100030; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x01000000; - cpu->isar.id_mmfr3 = 0x00000000; - cpu->isar.id_isar0 = 0x01101110; - cpu->isar.id_isar1 = 0x02112000; - cpu->isar.id_isar2 = 0x20232231; - cpu->isar.id_isar3 = 0x01111131; - cpu->isar.id_isar4 = 0x01310132; - cpu->isar.id_isar5 = 0x00000000; - cpu->isar.id_isar6 = 0x00000000; + SET_IDREG(isar, ID_PFR0, 0x00000030); + SET_IDREG(isar, ID_PFR1, 0x00000200); + SET_IDREG(isar, ID_DFR0, 0x00100000); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00100030); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x01000000); + SET_IDREG(isar, ID_MMFR3, 0x00000000); + SET_IDREG(isar, ID_ISAR0, 0x01101110); + SET_IDREG(isar, ID_ISAR1, 0x02112000); + SET_IDREG(isar, ID_ISAR2, 0x20232231); + SET_IDREG(isar, ID_ISAR3, 0x01111131); + SET_IDREG(isar, ID_ISAR4, 0x01310132); + SET_IDREG(isar, ID_ISAR5, 0x00000000); + SET_IDREG(isar, ID_ISAR6, 0x00000000); } static void cortex_m33_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V8); set_feature(&cpu->env, ARM_FEATURE_M); @@ -175,28 +180,29 @@ static void cortex_m33_initfn(Object *obj) cpu->isar.mvfr0 = 0x10110021; cpu->isar.mvfr1 = 0x11000011; cpu->isar.mvfr2 = 0x00000040; - cpu->isar.id_pfr0 = 0x00000030; - cpu->isar.id_pfr1 = 0x00000210; - cpu->isar.id_dfr0 = 0x00200000; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00101F40; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x01000000; - cpu->isar.id_mmfr3 = 0x00000000; - cpu->isar.id_isar0 = 0x01101110; - cpu->isar.id_isar1 = 0x02212000; - cpu->isar.id_isar2 = 0x20232232; - cpu->isar.id_isar3 = 0x01111131; - cpu->isar.id_isar4 = 0x01310132; - cpu->isar.id_isar5 = 0x00000000; - cpu->isar.id_isar6 = 0x00000000; - cpu->clidr = 0x00000000; + SET_IDREG(isar, ID_PFR0, 0x00000030); + SET_IDREG(isar, ID_PFR1, 0x00000210); + SET_IDREG(isar, ID_DFR0, 0x00200000); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00101F40); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x01000000); + SET_IDREG(isar, ID_MMFR3, 0x00000000); + SET_IDREG(isar, ID_ISAR0, 0x01101110); + SET_IDREG(isar, ID_ISAR1, 0x02212000); + SET_IDREG(isar, ID_ISAR2, 0x20232232); + SET_IDREG(isar, ID_ISAR3, 0x01111131); + SET_IDREG(isar, ID_ISAR4, 0x01310132); + SET_IDREG(isar, ID_ISAR5, 0x00000000); + SET_IDREG(isar, ID_ISAR6, 0x00000000); + SET_IDREG(isar, CLIDR, 0x00000000); cpu->ctr = 0x8000c000; } static void cortex_m55_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V8); set_feature(&cpu->env, ARM_FEATURE_V8_1M); @@ -212,39 +218,47 @@ static void cortex_m55_initfn(Object *obj) cpu->isar.mvfr0 = 0x10110221; cpu->isar.mvfr1 = 0x12100211; cpu->isar.mvfr2 = 0x00000040; - cpu->isar.id_pfr0 = 0x20000030; - cpu->isar.id_pfr1 = 0x00000230; - cpu->isar.id_dfr0 = 0x10200000; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00111040; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x01000000; - cpu->isar.id_mmfr3 = 0x00000011; - cpu->isar.id_isar0 = 0x01103110; - cpu->isar.id_isar1 = 0x02212000; - cpu->isar.id_isar2 = 0x20232232; - cpu->isar.id_isar3 = 0x01111131; - cpu->isar.id_isar4 = 0x01310132; - cpu->isar.id_isar5 = 0x00000000; - cpu->isar.id_isar6 = 0x00000000; - cpu->clidr = 0x00000000; /* caches not implemented */ + SET_IDREG(isar, ID_PFR0, 0x20000030); + SET_IDREG(isar, ID_PFR1, 0x00000230); + SET_IDREG(isar, ID_DFR0, 0x10200000); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00111040); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x01000000); + SET_IDREG(isar, ID_MMFR3, 0x00000011); + SET_IDREG(isar, ID_ISAR0, 0x01103110); + SET_IDREG(isar, ID_ISAR1, 0x02212000); + SET_IDREG(isar, ID_ISAR2, 0x20232232); + SET_IDREG(isar, ID_ISAR3, 0x01111131); + SET_IDREG(isar, ID_ISAR4, 0x01310132); + SET_IDREG(isar, ID_ISAR5, 0x00000000); + SET_IDREG(isar, ID_ISAR6, 0x00000000); + SET_IDREG(isar, CLIDR, 0x00000000); /* caches not implemented */ cpu->ctr = 0x8303c003; } static const TCGCPUOps arm_v7m_tcg_ops = { + /* ARM processors have a weak memory model */ + .guest_default_memory_order = 0, + .mttcg_supported = true, + .initialize = arm_translate_init, .translate_code = arm_translate_code, + .get_tb_cpu_state = arm_get_tb_cpu_state, .synchronize_from_tb = arm_cpu_synchronize_from_tb, .debug_excp_handler = arm_debug_excp_handler, .restore_state_to_opc = arm_restore_state_to_opc, + .mmu_index = arm_cpu_mmu_index, #ifdef CONFIG_USER_ONLY .record_sigsegv = arm_cpu_record_sigsegv, .record_sigbus = arm_cpu_record_sigbus, #else .tlb_fill_align = arm_cpu_tlb_fill_align, + .pointer_wrap = cpu_pointer_wrap_uint32, .cpu_exec_interrupt = arm_v7m_cpu_exec_interrupt, .cpu_exec_halt = arm_cpu_exec_halt, + .cpu_exec_reset = cpu_reset, .do_interrupt = arm_v7m_cpu_do_interrupt, .do_transaction_failed = arm_cpu_do_transaction_failed, .do_unaligned_access = arm_cpu_do_unaligned_access, @@ -254,14 +268,13 @@ static const TCGCPUOps arm_v7m_tcg_ops = { #endif /* !CONFIG_USER_ONLY */ }; -static void arm_v7m_class_init(ObjectClass *oc, void *data) +static void arm_v7m_class_init(ObjectClass *oc, const void *data) { ARMCPUClass *acc = ARM_CPU_CLASS(oc); CPUClass *cc = CPU_CLASS(oc); acc->info = data; cc->tcg_ops = &arm_v7m_tcg_ops; - cc->gdb_core_xml_file = "arm-m-profile.xml"; } static const ARMCPUInfo arm_v7m_cpus[] = { diff --git a/target/arm/tcg/cpu32.c b/target/arm/tcg/cpu32.c index 2c45b7e..a2a23ea 100644 --- a/target/arm/tcg/cpu32.c +++ b/target/arm/tcg/cpu32.c @@ -23,18 +23,19 @@ void aa32_max_features(ARMCPU *cpu) { uint32_t t; + ARMISARegisters *isar = &cpu->isar; /* Add additional features supported by QEMU */ - t = cpu->isar.id_isar5; + t = GET_IDREG(isar, ID_ISAR5); t = FIELD_DP32(t, ID_ISAR5, AES, 2); /* FEAT_PMULL */ t = FIELD_DP32(t, ID_ISAR5, SHA1, 1); /* FEAT_SHA1 */ t = FIELD_DP32(t, ID_ISAR5, SHA2, 1); /* FEAT_SHA256 */ t = FIELD_DP32(t, ID_ISAR5, CRC32, 1); t = FIELD_DP32(t, ID_ISAR5, RDM, 1); /* FEAT_RDM */ t = FIELD_DP32(t, ID_ISAR5, VCMA, 1); /* FEAT_FCMA */ - cpu->isar.id_isar5 = t; + SET_IDREG(isar, ID_ISAR5, t); - t = cpu->isar.id_isar6; + t = GET_IDREG(isar, ID_ISAR6); t = FIELD_DP32(t, ID_ISAR6, JSCVT, 1); /* FEAT_JSCVT */ t = FIELD_DP32(t, ID_ISAR6, DP, 1); /* Feat_DotProd */ t = FIELD_DP32(t, ID_ISAR6, FHM, 1); /* FEAT_FHM */ @@ -42,7 +43,7 @@ void aa32_max_features(ARMCPU *cpu) t = FIELD_DP32(t, ID_ISAR6, SPECRES, 1); /* FEAT_SPECRES */ t = FIELD_DP32(t, ID_ISAR6, BF16, 1); /* FEAT_AA32BF16 */ t = FIELD_DP32(t, ID_ISAR6, I8MM, 1); /* FEAT_AA32I8MM */ - cpu->isar.id_isar6 = t; + SET_IDREG(isar, ID_ISAR6, t); t = cpu->isar.mvfr1; t = FIELD_DP32(t, MVFR1, FPHP, 3); /* FEAT_FP16 */ @@ -54,38 +55,34 @@ void aa32_max_features(ARMCPU *cpu) t = FIELD_DP32(t, MVFR2, FPMISC, 4); /* FP MaxNum */ cpu->isar.mvfr2 = t; - t = cpu->isar.id_mmfr3; - t = FIELD_DP32(t, ID_MMFR3, PAN, 2); /* FEAT_PAN2 */ - cpu->isar.id_mmfr3 = t; + FIELD_DP32_IDREG(isar, ID_MMFR3, PAN, 2); /* FEAT_PAN2 */ - t = cpu->isar.id_mmfr4; + t = GET_IDREG(isar, ID_MMFR4); t = FIELD_DP32(t, ID_MMFR4, HPDS, 2); /* FEAT_HPDS2 */ t = FIELD_DP32(t, ID_MMFR4, AC2, 1); /* ACTLR2, HACTLR2 */ t = FIELD_DP32(t, ID_MMFR4, CNP, 1); /* FEAT_TTCNP */ t = FIELD_DP32(t, ID_MMFR4, XNX, 1); /* FEAT_XNX */ t = FIELD_DP32(t, ID_MMFR4, EVT, 2); /* FEAT_EVT */ - cpu->isar.id_mmfr4 = t; + SET_IDREG(isar, ID_MMFR4, t); - t = cpu->isar.id_mmfr5; - t = FIELD_DP32(t, ID_MMFR5, ETS, 2); /* FEAT_ETS2 */ - cpu->isar.id_mmfr5 = t; + FIELD_DP32_IDREG(isar, ID_MMFR5, ETS, 2); /* FEAT_ETS2 */ - t = cpu->isar.id_pfr0; + t = GET_IDREG(isar, ID_PFR0); t = FIELD_DP32(t, ID_PFR0, CSV2, 2); /* FEAT_CSV2 */ t = FIELD_DP32(t, ID_PFR0, DIT, 1); /* FEAT_DIT */ t = FIELD_DP32(t, ID_PFR0, RAS, 1); /* FEAT_RAS */ - cpu->isar.id_pfr0 = t; + SET_IDREG(isar, ID_PFR0, t); - t = cpu->isar.id_pfr2; + t = GET_IDREG(isar, ID_PFR2); t = FIELD_DP32(t, ID_PFR2, CSV3, 1); /* FEAT_CSV3 */ t = FIELD_DP32(t, ID_PFR2, SSBS, 1); /* FEAT_SSBS */ - cpu->isar.id_pfr2 = t; + SET_IDREG(isar, ID_PFR2, t); - t = cpu->isar.id_dfr0; + t = GET_IDREG(isar, ID_DFR0); t = FIELD_DP32(t, ID_DFR0, COPDBG, 10); /* FEAT_Debugv8p8 */ t = FIELD_DP32(t, ID_DFR0, COPSDBG, 10); /* FEAT_Debugv8p8 */ t = FIELD_DP32(t, ID_DFR0, PERFMON, 6); /* FEAT_PMUv3p5 */ - cpu->isar.id_dfr0 = t; + SET_IDREG(isar, ID_DFR0, t); /* Debug ID registers. */ @@ -115,9 +112,7 @@ void aa32_max_features(ARMCPU *cpu) t = FIELD_DP32(t, DBGDEVID1, PCSROFFSET, 2); cpu->isar.dbgdevid1 = t; - t = cpu->isar.id_dfr1; - t = FIELD_DP32(t, ID_DFR1, HPMN0, 1); /* FEAT_HPMN0 */ - cpu->isar.id_dfr1 = t; + FIELD_DP32_IDREG(isar, ID_DFR1, HPMN0, 1); /* FEAT_HPMN0 */ } /* CPU models. These are not needed for the AArch64 linux-user build. */ @@ -140,7 +135,7 @@ static void arm926_initfn(Object *obj) * ARMv5 does not have the ID_ISAR registers, but we can still * set the field to indicate Jazelle support within QEMU. */ - cpu->isar.id_isar1 = FIELD_DP32(cpu->isar.id_isar1, ID_ISAR1, JAZELLE, 1); + FIELD_DP32_IDREG(&cpu->isar, ID_ISAR1, JAZELLE, 1); /* * Similarly, we need to set MVFR0 fields to enable vfp and short vector * support even though ARMv5 doesn't have this register. @@ -182,7 +177,7 @@ static void arm1026_initfn(Object *obj) * ARMv5 does not have the ID_ISAR registers, but we can still * set the field to indicate Jazelle support within QEMU. */ - cpu->isar.id_isar1 = FIELD_DP32(cpu->isar.id_isar1, ID_ISAR1, JAZELLE, 1); + FIELD_DP32_IDREG(&cpu->isar, ID_ISAR1, JAZELLE, 1); /* * Similarly, we need to set MVFR0 fields to enable vfp and short vector * support even though ARMv5 doesn't have this register. @@ -206,6 +201,7 @@ static void arm1026_initfn(Object *obj) static void arm1136_r2_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; /* * What qemu calls "arm1136_r2" is actually the 1136 r0p2, ie an * older core than plain "arm1136". In particular this does not @@ -226,24 +222,25 @@ static void arm1136_r2_initfn(Object *obj) cpu->isar.mvfr1 = 0x00000000; cpu->ctr = 0x1dd20d2; cpu->reset_sctlr = 0x00050078; - cpu->isar.id_pfr0 = 0x111; - cpu->isar.id_pfr1 = 0x1; - cpu->isar.id_dfr0 = 0x2; - cpu->id_afr0 = 0x3; - cpu->isar.id_mmfr0 = 0x01130003; - cpu->isar.id_mmfr1 = 0x10030302; - cpu->isar.id_mmfr2 = 0x01222110; - cpu->isar.id_isar0 = 0x00140011; - cpu->isar.id_isar1 = 0x12002111; - cpu->isar.id_isar2 = 0x11231111; - cpu->isar.id_isar3 = 0x01102131; - cpu->isar.id_isar4 = 0x141; + SET_IDREG(isar, ID_PFR0, 0x111); + SET_IDREG(isar, ID_PFR1, 0x1); + SET_IDREG(isar, ID_DFR0, 0x2); + SET_IDREG(isar, ID_AFR0, 0x3); + SET_IDREG(isar, ID_MMFR0, 0x01130003); + SET_IDREG(isar, ID_MMFR1, 0x10030302); + SET_IDREG(isar, ID_MMFR2, 0x01222110); + SET_IDREG(isar, ID_ISAR0, 0x00140011); + SET_IDREG(isar, ID_ISAR1, 0x12002111); + SET_IDREG(isar, ID_ISAR2, 0x11231111); + SET_IDREG(isar, ID_ISAR3, 0x01102131); + SET_IDREG(isar, ID_ISAR4, 0x141); cpu->reset_auxcr = 7; } static void arm1136_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,arm1136"; set_feature(&cpu->env, ARM_FEATURE_V6K); @@ -257,24 +254,25 @@ static void arm1136_initfn(Object *obj) cpu->isar.mvfr1 = 0x00000000; cpu->ctr = 0x1dd20d2; cpu->reset_sctlr = 0x00050078; - cpu->isar.id_pfr0 = 0x111; - cpu->isar.id_pfr1 = 0x1; - cpu->isar.id_dfr0 = 0x2; - cpu->id_afr0 = 0x3; - cpu->isar.id_mmfr0 = 0x01130003; - cpu->isar.id_mmfr1 = 0x10030302; - cpu->isar.id_mmfr2 = 0x01222110; - cpu->isar.id_isar0 = 0x00140011; - cpu->isar.id_isar1 = 0x12002111; - cpu->isar.id_isar2 = 0x11231111; - cpu->isar.id_isar3 = 0x01102131; - cpu->isar.id_isar4 = 0x141; + SET_IDREG(isar, ID_PFR0, 0x111); + SET_IDREG(isar, ID_PFR1, 0x1); + SET_IDREG(isar, ID_DFR0, 0x2); + SET_IDREG(isar, ID_AFR0, 0x3); + SET_IDREG(isar, ID_MMFR0, 0x01130003); + SET_IDREG(isar, ID_MMFR1, 0x10030302); + SET_IDREG(isar, ID_MMFR2, 0x01222110); + SET_IDREG(isar, ID_ISAR0, 0x00140011); + SET_IDREG(isar, ID_ISAR1, 0x12002111); + SET_IDREG(isar, ID_ISAR2, 0x11231111); + SET_IDREG(isar, ID_ISAR3, 0x01102131); + SET_IDREG(isar, ID_ISAR4, 0x141); cpu->reset_auxcr = 7; } static void arm1176_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,arm1176"; set_feature(&cpu->env, ARM_FEATURE_V6K); @@ -289,24 +287,25 @@ static void arm1176_initfn(Object *obj) cpu->isar.mvfr1 = 0x00000000; cpu->ctr = 0x1dd20d2; cpu->reset_sctlr = 0x00050078; - cpu->isar.id_pfr0 = 0x111; - cpu->isar.id_pfr1 = 0x11; - cpu->isar.id_dfr0 = 0x33; - cpu->id_afr0 = 0; - cpu->isar.id_mmfr0 = 0x01130003; - cpu->isar.id_mmfr1 = 0x10030302; - cpu->isar.id_mmfr2 = 0x01222100; - cpu->isar.id_isar0 = 0x0140011; - cpu->isar.id_isar1 = 0x12002111; - cpu->isar.id_isar2 = 0x11231121; - cpu->isar.id_isar3 = 0x01102131; - cpu->isar.id_isar4 = 0x01141; + SET_IDREG(isar, ID_PFR0, 0x111); + SET_IDREG(isar, ID_PFR1, 0x11); + SET_IDREG(isar, ID_DFR0, 0x33); + SET_IDREG(isar, ID_AFR0, 0); + SET_IDREG(isar, ID_MMFR0, 0x01130003); + SET_IDREG(isar, ID_MMFR1, 0x10030302); + SET_IDREG(isar, ID_MMFR2, 0x01222100); + SET_IDREG(isar, ID_ISAR0, 0x0140011); + SET_IDREG(isar, ID_ISAR1, 0x12002111); + SET_IDREG(isar, ID_ISAR2, 0x11231121); + SET_IDREG(isar, ID_ISAR3, 0x01102131); + SET_IDREG(isar, ID_ISAR4, 0x01141); cpu->reset_auxcr = 7; } static void arm11mpcore_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,arm11mpcore"; set_feature(&cpu->env, ARM_FEATURE_V6K); @@ -318,18 +317,18 @@ static void arm11mpcore_initfn(Object *obj) cpu->isar.mvfr0 = 0x11111111; cpu->isar.mvfr1 = 0x00000000; cpu->ctr = 0x1d192992; /* 32K icache 32K dcache */ - cpu->isar.id_pfr0 = 0x111; - cpu->isar.id_pfr1 = 0x1; - cpu->isar.id_dfr0 = 0; - cpu->id_afr0 = 0x2; - cpu->isar.id_mmfr0 = 0x01100103; - cpu->isar.id_mmfr1 = 0x10020302; - cpu->isar.id_mmfr2 = 0x01222000; - cpu->isar.id_isar0 = 0x00100011; - cpu->isar.id_isar1 = 0x12002111; - cpu->isar.id_isar2 = 0x11221011; - cpu->isar.id_isar3 = 0x01102131; - cpu->isar.id_isar4 = 0x141; + SET_IDREG(isar, ID_PFR0, 0x111); + SET_IDREG(isar, ID_PFR1, 0x1); + SET_IDREG(isar, ID_DFR0, 0); + SET_IDREG(isar, ID_AFR0, 0x2); + SET_IDREG(isar, ID_MMFR0, 0x01100103); + SET_IDREG(isar, ID_MMFR1, 0x10020302); + SET_IDREG(isar, ID_MMFR2, 0x01222000); + SET_IDREG(isar, ID_ISAR0, 0x00100011); + SET_IDREG(isar, ID_ISAR1, 0x12002111); + SET_IDREG(isar, ID_ISAR2, 0x11221011); + SET_IDREG(isar, ID_ISAR3, 0x01102131); + SET_IDREG(isar, ID_ISAR4, 0x141); cpu->reset_auxcr = 1; } @@ -343,6 +342,7 @@ static const ARMCPRegInfo cortexa8_cp_reginfo[] = { static void cortex_a8_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a8"; set_feature(&cpu->env, ARM_FEATURE_V7); @@ -357,21 +357,21 @@ static void cortex_a8_initfn(Object *obj) cpu->isar.mvfr1 = 0x00011111; cpu->ctr = 0x82048004; cpu->reset_sctlr = 0x00c50078; - cpu->isar.id_pfr0 = 0x1031; - cpu->isar.id_pfr1 = 0x11; - cpu->isar.id_dfr0 = 0x400; - cpu->id_afr0 = 0; - cpu->isar.id_mmfr0 = 0x31100003; - cpu->isar.id_mmfr1 = 0x20000000; - cpu->isar.id_mmfr2 = 0x01202000; - cpu->isar.id_mmfr3 = 0x11; - cpu->isar.id_isar0 = 0x00101111; - cpu->isar.id_isar1 = 0x12112111; - cpu->isar.id_isar2 = 0x21232031; - cpu->isar.id_isar3 = 0x11112131; - cpu->isar.id_isar4 = 0x00111142; + SET_IDREG(isar, ID_PFR0, 0x1031); + SET_IDREG(isar, ID_PFR1, 0x11); + SET_IDREG(isar, ID_DFR0, 0x400); + SET_IDREG(isar, ID_AFR0, 0); + SET_IDREG(isar, ID_MMFR0, 0x31100003); + SET_IDREG(isar, ID_MMFR1, 0x20000000); + SET_IDREG(isar, ID_MMFR2, 0x01202000); + SET_IDREG(isar, ID_MMFR3, 0x11); + SET_IDREG(isar, ID_ISAR0, 0x00101111); + SET_IDREG(isar, ID_ISAR1, 0x12112111); + SET_IDREG(isar, ID_ISAR2, 0x21232031); + SET_IDREG(isar, ID_ISAR3, 0x11112131); + SET_IDREG(isar, ID_ISAR4, 0x00111142); cpu->isar.dbgdidr = 0x15141000; - cpu->clidr = (1 << 27) | (2 << 24) | 3; + SET_IDREG(isar, CLIDR, (1 << 27) | (2 << 24) | 3); cpu->ccsidr[0] = 0xe007e01a; /* 16k L1 dcache. */ cpu->ccsidr[1] = 0x2007e01a; /* 16k L1 icache. */ cpu->ccsidr[2] = 0xf0000000; /* No L2 icache. */ @@ -412,6 +412,7 @@ static const ARMCPRegInfo cortexa9_cp_reginfo[] = { static void cortex_a9_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a9"; set_feature(&cpu->env, ARM_FEATURE_V7); @@ -432,21 +433,21 @@ static void cortex_a9_initfn(Object *obj) cpu->isar.mvfr1 = 0x01111111; cpu->ctr = 0x80038003; cpu->reset_sctlr = 0x00c50078; - cpu->isar.id_pfr0 = 0x1031; - cpu->isar.id_pfr1 = 0x11; - cpu->isar.id_dfr0 = 0x000; - cpu->id_afr0 = 0; - cpu->isar.id_mmfr0 = 0x00100103; - cpu->isar.id_mmfr1 = 0x20000000; - cpu->isar.id_mmfr2 = 0x01230000; - cpu->isar.id_mmfr3 = 0x00002111; - cpu->isar.id_isar0 = 0x00101111; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232041; - cpu->isar.id_isar3 = 0x11112131; - cpu->isar.id_isar4 = 0x00111142; + SET_IDREG(isar, ID_PFR0, 0x1031); + SET_IDREG(isar, ID_PFR1, 0x11); + SET_IDREG(isar, ID_DFR0, 0x000); + SET_IDREG(isar, ID_AFR0, 0); + SET_IDREG(isar, ID_MMFR0, 0x00100103); + SET_IDREG(isar, ID_MMFR1, 0x20000000); + SET_IDREG(isar, ID_MMFR2, 0x01230000); + SET_IDREG(isar, ID_MMFR3, 0x00002111); + SET_IDREG(isar, ID_ISAR0, 0x00101111); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232041); + SET_IDREG(isar, ID_ISAR3, 0x11112131); + SET_IDREG(isar, ID_ISAR4, 0x00111142); cpu->isar.dbgdidr = 0x35141000; - cpu->clidr = (1 << 27) | (1 << 24) | 3; + SET_IDREG(isar, CLIDR, (1 << 27) | (1 << 24) | 3); cpu->ccsidr[0] = 0xe00fe019; /* 16k L1 dcache. */ cpu->ccsidr[1] = 0x200fe019; /* 16k L1 icache. */ cpu->isar.reset_pmcr_el0 = 0x41093000; @@ -479,6 +480,7 @@ static const ARMCPRegInfo cortexa15_cp_reginfo[] = { static void cortex_a7_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a7"; set_feature(&cpu->env, ARM_FEATURE_V7VE); @@ -497,27 +499,27 @@ static void cortex_a7_initfn(Object *obj) cpu->isar.mvfr1 = 0x11111111; cpu->ctr = 0x84448003; cpu->reset_sctlr = 0x00c50078; - cpu->isar.id_pfr0 = 0x00001131; - cpu->isar.id_pfr1 = 0x00011011; - cpu->isar.id_dfr0 = 0x02010555; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x10101105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01240000; - cpu->isar.id_mmfr3 = 0x02102211; + SET_IDREG(isar, ID_PFR0, 0x00001131); + SET_IDREG(isar, ID_PFR1, 0x00011011); + SET_IDREG(isar, ID_DFR0, 0x02010555); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x10101105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01240000); + SET_IDREG(isar, ID_MMFR3, 0x02102211); /* * a7_mpcore_r0p5_trm, page 4-4 gives 0x01101110; but * table 4-41 gives 0x02101110, which includes the arm div insns. */ - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232041; - cpu->isar.id_isar3 = 0x11112131; - cpu->isar.id_isar4 = 0x10011142; + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232041); + SET_IDREG(isar, ID_ISAR3, 0x11112131); + SET_IDREG(isar, ID_ISAR4, 0x10011142); cpu->isar.dbgdidr = 0x3515f005; cpu->isar.dbgdevid = 0x01110f13; cpu->isar.dbgdevid1 = 0x1; - cpu->clidr = 0x0a200023; + SET_IDREG(isar, CLIDR, 0x0a200023); cpu->ccsidr[0] = 0x701fe00a; /* 32K L1 dcache */ cpu->ccsidr[1] = 0x201fe00a; /* 32K L1 icache */ cpu->ccsidr[2] = 0x711fe07a; /* 4096K L2 unified cache */ @@ -528,6 +530,7 @@ static void cortex_a7_initfn(Object *obj) static void cortex_a15_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a15"; set_feature(&cpu->env, ARM_FEATURE_V7VE); @@ -548,23 +551,23 @@ static void cortex_a15_initfn(Object *obj) cpu->isar.mvfr1 = 0x11111111; cpu->ctr = 0x8444c004; cpu->reset_sctlr = 0x00c50078; - cpu->isar.id_pfr0 = 0x00001131; - cpu->isar.id_pfr1 = 0x00011011; - cpu->isar.id_dfr0 = 0x02010555; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x20000000; - cpu->isar.id_mmfr2 = 0x01240000; - cpu->isar.id_mmfr3 = 0x02102211; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232041; - cpu->isar.id_isar3 = 0x11112131; - cpu->isar.id_isar4 = 0x10011142; + SET_IDREG(isar, ID_PFR0, 0x00001131); + SET_IDREG(isar, ID_PFR1, 0x00011011); + SET_IDREG(isar, ID_DFR0, 0x02010555); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x20000000); + SET_IDREG(isar, ID_MMFR2, 0x01240000); + SET_IDREG(isar, ID_MMFR3, 0x02102211); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232041); + SET_IDREG(isar, ID_ISAR3, 0x11112131); + SET_IDREG(isar, ID_ISAR4, 0x10011142); cpu->isar.dbgdidr = 0x3515f021; cpu->isar.dbgdevid = 0x01110f13; cpu->isar.dbgdevid1 = 0x0; - cpu->clidr = 0x0a200023; + SET_IDREG(isar, CLIDR, 0x0a200023); cpu->ccsidr[0] = 0x701fe00a; /* 32K L1 dcache */ cpu->ccsidr[1] = 0x201fe00a; /* 32K L1 icache */ cpu->ccsidr[2] = 0x711fe07a; /* 4096K L2 unified cache */ @@ -585,27 +588,28 @@ static const ARMCPRegInfo cortexr5_cp_reginfo[] = { static void cortex_r5_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V7); set_feature(&cpu->env, ARM_FEATURE_V7MP); set_feature(&cpu->env, ARM_FEATURE_PMSA); set_feature(&cpu->env, ARM_FEATURE_PMU); cpu->midr = 0x411fc153; /* r1p3 */ - cpu->isar.id_pfr0 = 0x0131; - cpu->isar.id_pfr1 = 0x001; - cpu->isar.id_dfr0 = 0x010400; - cpu->id_afr0 = 0x0; - cpu->isar.id_mmfr0 = 0x0210030; - cpu->isar.id_mmfr1 = 0x00000000; - cpu->isar.id_mmfr2 = 0x01200000; - cpu->isar.id_mmfr3 = 0x0211; - cpu->isar.id_isar0 = 0x02101111; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232141; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x0010142; - cpu->isar.id_isar5 = 0x0; - cpu->isar.id_isar6 = 0x0; + SET_IDREG(isar, ID_PFR0, 0x0131); + SET_IDREG(isar, ID_PFR1, 0x001); + SET_IDREG(isar, ID_DFR0, 0x010400); + SET_IDREG(isar, ID_AFR0, 0x0); + SET_IDREG(isar, ID_MMFR0, 0x0210030); + SET_IDREG(isar, ID_MMFR1, 0x00000000); + SET_IDREG(isar, ID_MMFR2, 0x01200000); + SET_IDREG(isar, ID_MMFR3, 0x0211); + SET_IDREG(isar, ID_ISAR0, 0x02101111); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232141); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x0010142); + SET_IDREG(isar, ID_ISAR5, 0x0); + SET_IDREG(isar, ID_ISAR6, 0x0); cpu->mp_is_up = true; cpu->pmsav7_dregion = 16; cpu->isar.reset_pmcr_el0 = 0x41151800; @@ -720,6 +724,7 @@ static const ARMCPRegInfo cortex_r52_cp_reginfo[] = { static void cortex_r52_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; set_feature(&cpu->env, ARM_FEATURE_V8); set_feature(&cpu->env, ARM_FEATURE_EL2); @@ -737,23 +742,23 @@ static void cortex_r52_initfn(Object *obj) cpu->isar.mvfr2 = 0x00000043; cpu->ctr = 0x8144c004; cpu->reset_sctlr = 0x30c50838; - cpu->isar.id_pfr0 = 0x00000131; - cpu->isar.id_pfr1 = 0x10111001; - cpu->isar.id_dfr0 = 0x03010006; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x00211040; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01200000; - cpu->isar.id_mmfr3 = 0xf0102211; - cpu->isar.id_mmfr4 = 0x00000010; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232142; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00010142; - cpu->isar.id_isar5 = 0x00010001; + SET_IDREG(isar, ID_PFR0, 0x00000131); + SET_IDREG(isar, ID_PFR1, 0x10111001); + SET_IDREG(isar, ID_DFR0, 0x03010006); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x00211040); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01200000); + SET_IDREG(isar, ID_MMFR3, 0xf0102211); + SET_IDREG(isar, ID_MMFR4, 0x00000010); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232142); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00010142); + SET_IDREG(isar, ID_ISAR5, 0x00010001); cpu->isar.dbgdidr = 0x77168000; - cpu->clidr = (1 << 27) | (1 << 24) | 0x3; + SET_IDREG(isar, CLIDR, (1 << 27) | (1 << 24) | 0x3); cpu->ccsidr[0] = 0x700fe01a; /* 32KB L1 dcache */ cpu->ccsidr[1] = 0x201fe00a; /* 32KB L1 icache */ @@ -949,6 +954,7 @@ static void pxa270c5_initfn(Object *obj) static void arm_max_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; /* aarch64_a57_initfn, advertising none of the aarch64 features */ cpu->dtb_compatible = "arm,cortex-a57"; @@ -968,23 +974,23 @@ static void arm_max_initfn(Object *obj) cpu->isar.mvfr2 = 0x00000043; cpu->ctr = 0x8444c004; cpu->reset_sctlr = 0x00c50838; - cpu->isar.id_pfr0 = 0x00000131; - cpu->isar.id_pfr1 = 0x00011011; - cpu->isar.id_dfr0 = 0x03010066; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x10101105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02102211; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00011142; - cpu->isar.id_isar5 = 0x00011121; - cpu->isar.id_isar6 = 0; + SET_IDREG(isar, ID_PFR0, 0x00000131); + SET_IDREG(isar, ID_PFR1, 0x00011011); + SET_IDREG(isar, ID_DFR0, 0x03010066); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x10101105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02102211); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00011142); + SET_IDREG(isar, ID_ISAR5, 0x00011121); + SET_IDREG(isar, ID_ISAR6, 0); cpu->isar.reset_pmcr_el0 = 0x41013000; - cpu->clidr = 0x0a200023; + SET_IDREG(isar, CLIDR, 0x0a200023); cpu->ccsidr[0] = 0x701fe00a; /* 32KB L1 dcache */ cpu->ccsidr[1] = 0x201fe012; /* 48KB L1 icache */ cpu->ccsidr[2] = 0x70ffe07a; /* 2048KB L2 cache */ diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c index 29ab0ac..35cddba 100644 --- a/target/arm/tcg/cpu64.c +++ b/target/arm/tcg/cpu64.c @@ -32,6 +32,7 @@ static void aarch64_a35_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a35"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -48,29 +49,29 @@ static void aarch64_a35_initfn(Object *obj) cpu->midr = 0x411fd040; cpu->revidr = 0; cpu->ctr = 0x84448004; - cpu->isar.id_pfr0 = 0x00000131; - cpu->isar.id_pfr1 = 0x00011011; - cpu->isar.id_dfr0 = 0x03010066; - cpu->id_afr0 = 0; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02102211; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00011142; - cpu->isar.id_isar5 = 0x00011121; - cpu->isar.id_aa64pfr0 = 0x00002222; - cpu->isar.id_aa64pfr1 = 0; - cpu->isar.id_aa64dfr0 = 0x10305106; - cpu->isar.id_aa64dfr1 = 0; - cpu->isar.id_aa64isar0 = 0x00011120; - cpu->isar.id_aa64isar1 = 0; - cpu->isar.id_aa64mmfr0 = 0x00101122; - cpu->isar.id_aa64mmfr1 = 0; - cpu->clidr = 0x0a200023; + SET_IDREG(isar, ID_PFR0, 0x00000131); + SET_IDREG(isar, ID_PFR1, 0x00011011); + SET_IDREG(isar, ID_DFR0, 0x03010066); + SET_IDREG(isar, ID_AFR0, 0); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02102211); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00011142); + SET_IDREG(isar, ID_ISAR5, 0x00011121); + SET_IDREG(isar, ID_AA64PFR0, 0x00002222); + SET_IDREG(isar, ID_AA64PFR1, 0); + SET_IDREG(isar, ID_AA64DFR0, 0x10305106); + SET_IDREG(isar, ID_AA64DFR1, 0); + SET_IDREG(isar, ID_AA64ISAR0, 0x00011120); + SET_IDREG(isar, ID_AA64ISAR1, 0); + SET_IDREG(isar, ID_AA64MMFR0, 0x00101122); + SET_IDREG(isar, ID_AA64MMFR1, 0); + SET_IDREG(isar, CLIDR, 0x0a200023); cpu->dcz_blocksize = 4; /* From B2.4 AArch64 Virtual Memory control registers */ @@ -157,11 +158,8 @@ static bool cpu_arm_get_rme(Object *obj, Error **errp) static void cpu_arm_set_rme(Object *obj, bool value, Error **errp) { ARMCPU *cpu = ARM_CPU(obj); - uint64_t t; - t = cpu->isar.id_aa64pfr0; - t = FIELD_DP64(t, ID_AA64PFR0, RME, value); - cpu->isar.id_aa64pfr0 = t; + FIELD_DP64_IDREG(&cpu->isar, ID_AA64PFR0, RME, value); } static void cpu_max_set_l0gptsz(Object *obj, Visitor *v, const char *name, @@ -204,6 +202,7 @@ static const Property arm_cpu_lpa2_property = static void aarch64_a55_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a55"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -217,34 +216,34 @@ static void aarch64_a55_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_PMU); /* Ordered by B2.4 AArch64 registers by functional group */ - cpu->clidr = 0x82000023; + SET_IDREG(isar, CLIDR, 0x82000023); cpu->ctr = 0x84448004; /* L1Ip = VIPT */ cpu->dcz_blocksize = 4; /* 64 bytes */ - cpu->isar.id_aa64dfr0 = 0x0000000010305408ull; - cpu->isar.id_aa64isar0 = 0x0000100010211120ull; - cpu->isar.id_aa64isar1 = 0x0000000000100001ull; - cpu->isar.id_aa64mmfr0 = 0x0000000000101122ull; - cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull; - cpu->isar.id_aa64mmfr2 = 0x0000000000001011ull; - cpu->isar.id_aa64pfr0 = 0x0000000010112222ull; - cpu->isar.id_aa64pfr1 = 0x0000000000000010ull; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_dfr0 = 0x04010088; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00011142; - cpu->isar.id_isar5 = 0x01011121; - cpu->isar.id_isar6 = 0x00000010; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02122211; - cpu->isar.id_mmfr4 = 0x00021110; - cpu->isar.id_pfr0 = 0x10010131; - cpu->isar.id_pfr1 = 0x00011011; - cpu->isar.id_pfr2 = 0x00000011; + SET_IDREG(isar, ID_AA64DFR0, 0x0000000010305408ull); + SET_IDREG(isar, ID_AA64ISAR0, 0x0000100010211120ull); + SET_IDREG(isar, ID_AA64ISAR1, 0x0000000000100001ull); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000000000101122ull); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000010212122ull); + SET_IDREG(isar, ID_AA64MMFR2, 0x0000000000001011ull); + SET_IDREG(isar, ID_AA64PFR0, 0x0000000010112222ull); + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000010ull); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_DFR0, 0x04010088); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00011142); + SET_IDREG(isar, ID_ISAR5, 0x01011121); + SET_IDREG(isar, ID_ISAR6, 0x00000010); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02122211); + SET_IDREG(isar, ID_MMFR4, 0x00021110); + SET_IDREG(isar, ID_PFR0, 0x10010131); + SET_IDREG(isar, ID_PFR1, 0x00011011); + SET_IDREG(isar, ID_PFR2, 0x00000011); cpu->midr = 0x412FD050; /* r2p0 */ cpu->revidr = 0; @@ -276,6 +275,7 @@ static void aarch64_a55_initfn(Object *obj) static void aarch64_a72_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a72"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -295,29 +295,29 @@ static void aarch64_a72_initfn(Object *obj) cpu->isar.mvfr2 = 0x00000043; cpu->ctr = 0x8444c004; cpu->reset_sctlr = 0x00c50838; - cpu->isar.id_pfr0 = 0x00000131; - cpu->isar.id_pfr1 = 0x00011011; - cpu->isar.id_dfr0 = 0x03010066; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02102211; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00011142; - cpu->isar.id_isar5 = 0x00011121; - cpu->isar.id_aa64pfr0 = 0x00002222; - cpu->isar.id_aa64dfr0 = 0x10305106; - cpu->isar.id_aa64isar0 = 0x00011120; - cpu->isar.id_aa64mmfr0 = 0x00001124; + SET_IDREG(isar, ID_PFR0, 0x00000131); + SET_IDREG(isar, ID_PFR1, 0x00011011); + SET_IDREG(isar, ID_DFR0, 0x03010066); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02102211); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00011142); + SET_IDREG(isar, ID_ISAR5, 0x00011121); + SET_IDREG(isar, ID_AA64PFR0, 0x00002222); + SET_IDREG(isar, ID_AA64DFR0, 0x10305106); + SET_IDREG(isar, ID_AA64ISAR0, 0x00011120); + SET_IDREG(isar, ID_AA64MMFR0, 0x00001124); cpu->isar.dbgdidr = 0x3516d000; cpu->isar.dbgdevid = 0x01110f13; cpu->isar.dbgdevid1 = 0x2; cpu->isar.reset_pmcr_el0 = 0x41023000; - cpu->clidr = 0x0a200023; + SET_IDREG(isar, CLIDR, 0x0a200023); /* 32KB L1 dcache */ cpu->ccsidr[0] = make_ccsidr(CCSIDR_FORMAT_LEGACY, 4, 64, 32 * KiB, 7); /* 48KB L1 dcache */ @@ -335,6 +335,7 @@ static void aarch64_a72_initfn(Object *obj) static void aarch64_a76_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a76"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -348,34 +349,34 @@ static void aarch64_a76_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_PMU); /* Ordered by B2.4 AArch64 registers by functional group */ - cpu->clidr = 0x82000023; + SET_IDREG(isar, CLIDR, 0x82000023); cpu->ctr = 0x8444C004; cpu->dcz_blocksize = 4; - cpu->isar.id_aa64dfr0 = 0x0000000010305408ull; - cpu->isar.id_aa64isar0 = 0x0000100010211120ull; - cpu->isar.id_aa64isar1 = 0x0000000000100001ull; - cpu->isar.id_aa64mmfr0 = 0x0000000000101122ull; - cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull; - cpu->isar.id_aa64mmfr2 = 0x0000000000001011ull; - cpu->isar.id_aa64pfr0 = 0x1100000010111112ull; /* GIC filled in later */ - cpu->isar.id_aa64pfr1 = 0x0000000000000010ull; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_dfr0 = 0x04010088; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00010142; - cpu->isar.id_isar5 = 0x01011121; - cpu->isar.id_isar6 = 0x00000010; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02122211; - cpu->isar.id_mmfr4 = 0x00021110; - cpu->isar.id_pfr0 = 0x10010131; - cpu->isar.id_pfr1 = 0x00010000; /* GIC filled in later */ - cpu->isar.id_pfr2 = 0x00000011; + SET_IDREG(isar, ID_AA64DFR0, 0x0000000010305408ull); + SET_IDREG(isar, ID_AA64ISAR0, 0x0000100010211120ull); + SET_IDREG(isar, ID_AA64ISAR1, 0x0000000000100001ull); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000000000101122ull); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000010212122ull); + SET_IDREG(isar, ID_AA64MMFR2, 0x0000000000001011ull); + SET_IDREG(isar, ID_AA64PFR0, 0x1100000010111112ull); /* GIC filled in later */ + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000010ull); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_DFR0, 0x04010088); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00010142); + SET_IDREG(isar, ID_ISAR5, 0x01011121); + SET_IDREG(isar, ID_ISAR6, 0x00000010); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02122211); + SET_IDREG(isar, ID_MMFR4, 0x00021110); + SET_IDREG(isar, ID_PFR0, 0x10010131); + SET_IDREG(isar, ID_PFR1, 0x00010000); /* GIC filled in later */ + SET_IDREG(isar, ID_PFR2, 0x00000011); cpu->midr = 0x414fd0b1; /* r4p1 */ cpu->revidr = 0; @@ -408,6 +409,7 @@ static void aarch64_a76_initfn(Object *obj) static void aarch64_a64fx_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,a64fx"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -422,19 +424,19 @@ static void aarch64_a64fx_initfn(Object *obj) cpu->revidr = 0x00000000; cpu->ctr = 0x86668006; cpu->reset_sctlr = 0x30000180; - cpu->isar.id_aa64pfr0 = 0x0000000101111111; /* No RAS Extensions */ - cpu->isar.id_aa64pfr1 = 0x0000000000000000; - cpu->isar.id_aa64dfr0 = 0x0000000010305408; - cpu->isar.id_aa64dfr1 = 0x0000000000000000; - cpu->id_aa64afr0 = 0x0000000000000000; - cpu->id_aa64afr1 = 0x0000000000000000; - cpu->isar.id_aa64mmfr0 = 0x0000000000001122; - cpu->isar.id_aa64mmfr1 = 0x0000000011212100; - cpu->isar.id_aa64mmfr2 = 0x0000000000001011; - cpu->isar.id_aa64isar0 = 0x0000000010211120; - cpu->isar.id_aa64isar1 = 0x0000000000010001; - cpu->isar.id_aa64zfr0 = 0x0000000000000000; - cpu->clidr = 0x0000000080000023; + SET_IDREG(isar, ID_AA64PFR0, 0x0000000101111111); /* No RAS Extensions */ + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000000); + SET_IDREG(isar, ID_AA64DFR0, 0x0000000010305408); + SET_IDREG(isar, ID_AA64DFR1, 0x0000000000000000); + SET_IDREG(isar, ID_AA64AFR0, 0x0000000000000000); + SET_IDREG(isar, ID_AA64AFR1, 0x0000000000000000); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000000000001122); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000011212100); + SET_IDREG(isar, ID_AA64MMFR2, 0x0000000000001011); + SET_IDREG(isar, ID_AA64ISAR0, 0x0000000010211120); + SET_IDREG(isar, ID_AA64ISAR1, 0x0000000000010001); + SET_IDREG(isar, ID_AA64ZFR0, 0x0000000000000000); + SET_IDREG(isar, CLIDR, 0x0000000080000023); /* 64KB L1 dcache */ cpu->ccsidr[0] = make_ccsidr(CCSIDR_FORMAT_LEGACY, 4, 256, 64 * KiB, 7); /* 64KB L1 icache */ @@ -581,6 +583,7 @@ static void define_neoverse_v1_cp_reginfo(ARMCPU *cpu) static void aarch64_neoverse_n1_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,neoverse-n1"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -594,34 +597,34 @@ static void aarch64_neoverse_n1_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_PMU); /* Ordered by B2.4 AArch64 registers by functional group */ - cpu->clidr = 0x82000023; + SET_IDREG(isar, CLIDR, 0x82000023); cpu->ctr = 0x8444c004; cpu->dcz_blocksize = 4; - cpu->isar.id_aa64dfr0 = 0x0000000110305408ull; - cpu->isar.id_aa64isar0 = 0x0000100010211120ull; - cpu->isar.id_aa64isar1 = 0x0000000000100001ull; - cpu->isar.id_aa64mmfr0 = 0x0000000000101125ull; - cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull; - cpu->isar.id_aa64mmfr2 = 0x0000000000001011ull; - cpu->isar.id_aa64pfr0 = 0x1100000010111112ull; /* GIC filled in later */ - cpu->isar.id_aa64pfr1 = 0x0000000000000020ull; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_dfr0 = 0x04010088; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00010142; - cpu->isar.id_isar5 = 0x01011121; - cpu->isar.id_isar6 = 0x00000010; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02122211; - cpu->isar.id_mmfr4 = 0x00021110; - cpu->isar.id_pfr0 = 0x10010131; - cpu->isar.id_pfr1 = 0x00010000; /* GIC filled in later */ - cpu->isar.id_pfr2 = 0x00000011; + SET_IDREG(isar, ID_AA64DFR0, 0x0000000110305408ull); + SET_IDREG(isar, ID_AA64ISAR0, 0x0000100010211120ull); + SET_IDREG(isar, ID_AA64ISAR1, 0x0000000000100001ull); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000000000101125ull); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000010212122ull); + SET_IDREG(isar, ID_AA64MMFR2, 0x0000000000001011ull); + SET_IDREG(isar, ID_AA64PFR0, 0x1100000010111112ull); /* GIC filled in later */ + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000020ull); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_DFR0, 0x04010088); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00010142); + SET_IDREG(isar, ID_ISAR5, 0x01011121); + SET_IDREG(isar, ID_ISAR6, 0x00000010); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02122211); + SET_IDREG(isar, ID_MMFR4, 0x00021110); + SET_IDREG(isar, ID_PFR0, 0x10010131); + SET_IDREG(isar, ID_PFR1, 0x00010000); /* GIC filled in later */ + SET_IDREG(isar, ID_PFR2, 0x00000011); cpu->midr = 0x414fd0c1; /* r4p1 */ cpu->revidr = 0; @@ -656,6 +659,7 @@ static void aarch64_neoverse_n1_initfn(Object *obj) static void aarch64_neoverse_v1_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,neoverse-v1"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -669,37 +673,37 @@ static void aarch64_neoverse_v1_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_PMU); /* Ordered by 3.2.4 AArch64 registers by functional group */ - cpu->clidr = 0x82000023; + SET_IDREG(isar, CLIDR, 0x82000023); cpu->ctr = 0xb444c004; /* With DIC and IDC set */ cpu->dcz_blocksize = 4; - cpu->id_aa64afr0 = 0x00000000; - cpu->id_aa64afr1 = 0x00000000; - cpu->isar.id_aa64dfr0 = 0x000001f210305519ull; - cpu->isar.id_aa64dfr1 = 0x00000000; - cpu->isar.id_aa64isar0 = 0x1011111110212120ull; /* with FEAT_RNG */ - cpu->isar.id_aa64isar1 = 0x0011100001211032ull; - cpu->isar.id_aa64mmfr0 = 0x0000000000101125ull; - cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull; - cpu->isar.id_aa64mmfr2 = 0x0220011102101011ull; - cpu->isar.id_aa64pfr0 = 0x1101110120111112ull; /* GIC filled in later */ - cpu->isar.id_aa64pfr1 = 0x0000000000000020ull; - cpu->id_afr0 = 0x00000000; - cpu->isar.id_dfr0 = 0x15011099; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00010142; - cpu->isar.id_isar5 = 0x11011121; - cpu->isar.id_isar6 = 0x01100111; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02122211; - cpu->isar.id_mmfr4 = 0x01021110; - cpu->isar.id_pfr0 = 0x21110131; - cpu->isar.id_pfr1 = 0x00010000; /* GIC filled in later */ - cpu->isar.id_pfr2 = 0x00000011; + SET_IDREG(isar, ID_AA64AFR0, 0x00000000); + SET_IDREG(isar, ID_AA64AFR1, 0x00000000); + SET_IDREG(isar, ID_AA64DFR0, 0x000001f210305519ull); + SET_IDREG(isar, ID_AA64DFR1, 0x00000000); + SET_IDREG(isar, ID_AA64ISAR0, 0x1011111110212120ull); /* with FEAT_RNG */ + SET_IDREG(isar, ID_AA64ISAR1, 0x0011000001211032ull); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000000000101125ull); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000010212122ull); + SET_IDREG(isar, ID_AA64MMFR2, 0x0220011102101011ull); + SET_IDREG(isar, ID_AA64PFR0, 0x1101110120111112ull); /* GIC filled in later */ + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000020ull); + SET_IDREG(isar, ID_AFR0, 0x00000000); + SET_IDREG(isar, ID_DFR0, 0x15011099); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00010142); + SET_IDREG(isar, ID_ISAR5, 0x11011121); + SET_IDREG(isar, ID_ISAR6, 0x01100111); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02122211); + SET_IDREG(isar, ID_MMFR4, 0x01021110); + SET_IDREG(isar, ID_PFR0, 0x21110131); + SET_IDREG(isar, ID_PFR1, 0x00010000); /* GIC filled in later */ + SET_IDREG(isar, ID_PFR2, 0x00000011); cpu->midr = 0x411FD402; /* r1p2 */ cpu->revidr = 0; @@ -735,7 +739,7 @@ static void aarch64_neoverse_v1_initfn(Object *obj) cpu->isar.mvfr2 = 0x00000043; /* From 3.7.5 ID_AA64ZFR0_EL1 */ - cpu->isar.id_aa64zfr0 = 0x0000100000100000; + SET_IDREG(isar, ID_AA64ZFR0, 0x0000100000100000); cpu->sve_vq.supported = (1 << 0) /* 128bit */ | (1 << 1); /* 256bit */ @@ -882,6 +886,7 @@ static const ARMCPRegInfo cortex_a710_cp_reginfo[] = { static void aarch64_a710_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,cortex-a710"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -897,39 +902,39 @@ static void aarch64_a710_initfn(Object *obj) /* Ordered by Section B.4: AArch64 registers */ cpu->midr = 0x412FD471; /* r2p1 */ cpu->revidr = 0; - cpu->isar.id_pfr0 = 0x21110131; - cpu->isar.id_pfr1 = 0x00010000; /* GIC filled in later */ - cpu->isar.id_dfr0 = 0x16011099; - cpu->id_afr0 = 0; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02122211; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00010142; - cpu->isar.id_isar5 = 0x11011121; /* with Crypto */ - cpu->isar.id_mmfr4 = 0x21021110; - cpu->isar.id_isar6 = 0x01111111; + SET_IDREG(isar, ID_PFR0, 0x21110131); + SET_IDREG(isar, ID_PFR1, 0x00010000); /* GIC filled in later */ + SET_IDREG(isar, ID_DFR0, 0x16011099); + SET_IDREG(isar, ID_AFR0, 0); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02122211); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00010142); + SET_IDREG(isar, ID_ISAR5, 0x11011121); /* with Crypto */ + SET_IDREG(isar, ID_MMFR4, 0x21021110); + SET_IDREG(isar, ID_ISAR6, 0x01111111); cpu->isar.mvfr0 = 0x10110222; cpu->isar.mvfr1 = 0x13211111; cpu->isar.mvfr2 = 0x00000043; - cpu->isar.id_pfr2 = 0x00000011; - cpu->isar.id_aa64pfr0 = 0x1201111120111112ull; /* GIC filled in later */ - cpu->isar.id_aa64pfr1 = 0x0000000000000221ull; - cpu->isar.id_aa64zfr0 = 0x0000110100110021ull; /* with Crypto */ - cpu->isar.id_aa64dfr0 = 0x000011f010305619ull; - cpu->isar.id_aa64dfr1 = 0; - cpu->id_aa64afr0 = 0; - cpu->id_aa64afr1 = 0; - cpu->isar.id_aa64isar0 = 0x0221111110212120ull; /* with Crypto */ - cpu->isar.id_aa64isar1 = 0x0010111101211052ull; - cpu->isar.id_aa64mmfr0 = 0x0000022200101122ull; - cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull; - cpu->isar.id_aa64mmfr2 = 0x1221011110101011ull; - cpu->clidr = 0x0000001482000023ull; + SET_IDREG(isar, ID_PFR2, 0x00000011); + SET_IDREG(isar, ID_AA64PFR0, 0x1201111120111112ull); /* GIC filled in later */ + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000221ull); + SET_IDREG(isar, ID_AA64ZFR0, 0x0000110100110021ull); /* with Crypto */ + SET_IDREG(isar, ID_AA64DFR0, 0x000011f010305619ull); + SET_IDREG(isar, ID_AA64DFR1, 0); + SET_IDREG(isar, ID_AA64AFR0, 0); + SET_IDREG(isar, ID_AA64AFR1, 0); + SET_IDREG(isar, ID_AA64ISAR0, 0x0221111110212120ull); /* with Crypto */ + SET_IDREG(isar, ID_AA64ISAR1, 0x0010111101211052ull); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000022200101122ull); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000010212122ull); + SET_IDREG(isar, ID_AA64MMFR2, 0x1221011110101011ull); + SET_IDREG(isar, CLIDR, 0x0000001482000023ull); cpu->gm_blocksize = 4; cpu->ctr = 0x000000049444c004ull; cpu->dcz_blocksize = 4; @@ -983,6 +988,7 @@ static const ARMCPRegInfo neoverse_n2_cp_reginfo[] = { static void aarch64_neoverse_n2_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; cpu->dtb_compatible = "arm,neoverse-n2"; set_feature(&cpu->env, ARM_FEATURE_V8); @@ -998,39 +1004,39 @@ static void aarch64_neoverse_n2_initfn(Object *obj) /* Ordered by Section B.5: AArch64 ID registers */ cpu->midr = 0x410FD493; /* r0p3 */ cpu->revidr = 0; - cpu->isar.id_pfr0 = 0x21110131; - cpu->isar.id_pfr1 = 0x00010000; /* GIC filled in later */ - cpu->isar.id_dfr0 = 0x16011099; - cpu->id_afr0 = 0; - cpu->isar.id_mmfr0 = 0x10201105; - cpu->isar.id_mmfr1 = 0x40000000; - cpu->isar.id_mmfr2 = 0x01260000; - cpu->isar.id_mmfr3 = 0x02122211; - cpu->isar.id_isar0 = 0x02101110; - cpu->isar.id_isar1 = 0x13112111; - cpu->isar.id_isar2 = 0x21232042; - cpu->isar.id_isar3 = 0x01112131; - cpu->isar.id_isar4 = 0x00010142; - cpu->isar.id_isar5 = 0x11011121; /* with Crypto */ - cpu->isar.id_mmfr4 = 0x01021110; - cpu->isar.id_isar6 = 0x01111111; + SET_IDREG(isar, ID_PFR0, 0x21110131); + SET_IDREG(isar, ID_PFR1, 0x00010000); /* GIC filled in later */ + SET_IDREG(isar, ID_DFR0, 0x16011099); + SET_IDREG(isar, ID_AFR0, 0); + SET_IDREG(isar, ID_MMFR0, 0x10201105); + SET_IDREG(isar, ID_MMFR1, 0x40000000); + SET_IDREG(isar, ID_MMFR2, 0x01260000); + SET_IDREG(isar, ID_MMFR3, 0x02122211); + SET_IDREG(isar, ID_ISAR0, 0x02101110); + SET_IDREG(isar, ID_ISAR1, 0x13112111); + SET_IDREG(isar, ID_ISAR2, 0x21232042); + SET_IDREG(isar, ID_ISAR3, 0x01112131); + SET_IDREG(isar, ID_ISAR4, 0x00010142); + SET_IDREG(isar, ID_ISAR5, 0x11011121); /* with Crypto */ + SET_IDREG(isar, ID_MMFR4, 0x01021110); + SET_IDREG(isar, ID_ISAR6, 0x01111111); cpu->isar.mvfr0 = 0x10110222; cpu->isar.mvfr1 = 0x13211111; cpu->isar.mvfr2 = 0x00000043; - cpu->isar.id_pfr2 = 0x00000011; - cpu->isar.id_aa64pfr0 = 0x1201111120111112ull; /* GIC filled in later */ - cpu->isar.id_aa64pfr1 = 0x0000000000000221ull; - cpu->isar.id_aa64zfr0 = 0x0000110100110021ull; /* with Crypto */ - cpu->isar.id_aa64dfr0 = 0x000011f210305619ull; - cpu->isar.id_aa64dfr1 = 0; - cpu->id_aa64afr0 = 0; - cpu->id_aa64afr1 = 0; - cpu->isar.id_aa64isar0 = 0x1221111110212120ull; /* with Crypto and FEAT_RNG */ - cpu->isar.id_aa64isar1 = 0x0011111101211052ull; - cpu->isar.id_aa64mmfr0 = 0x0000022200101125ull; - cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull; - cpu->isar.id_aa64mmfr2 = 0x1221011112101011ull; - cpu->clidr = 0x0000001482000023ull; + SET_IDREG(isar, ID_PFR2, 0x00000011); + SET_IDREG(isar, ID_AA64PFR0, 0x1201111120111112ull); /* GIC filled in later */ + SET_IDREG(isar, ID_AA64PFR1, 0x0000000000000221ull); + SET_IDREG(isar, ID_AA64ZFR0, 0x0000110100110021ull); /* with Crypto */ + SET_IDREG(isar, ID_AA64DFR0, 0x000011f210305619ull); + SET_IDREG(isar, ID_AA64DFR1, 0); + SET_IDREG(isar, ID_AA64AFR0, 0); + SET_IDREG(isar, ID_AA64AFR1, 0); + SET_IDREG(isar, ID_AA64ISAR0, 0x1221111110212120ull); /* with Crypto and FEAT_RNG */ + SET_IDREG(isar, ID_AA64ISAR1, 0x0011111101211052ull); + SET_IDREG(isar, ID_AA64MMFR0, 0x0000022200101125ull); + SET_IDREG(isar, ID_AA64MMFR1, 0x0000000010212122ull); + SET_IDREG(isar, ID_AA64MMFR2, 0x1221011112101011ull); + SET_IDREG(isar, CLIDR, 0x0000001482000023ull); cpu->gm_blocksize = 4; cpu->ctr = 0x00000004b444c004ull; cpu->dcz_blocksize = 4; @@ -1083,6 +1089,7 @@ static void aarch64_neoverse_n2_initfn(Object *obj) void aarch64_max_tcg_initfn(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); + ARMISARegisters *isar = &cpu->isar; uint64_t t; uint32_t u; @@ -1118,10 +1125,10 @@ void aarch64_max_tcg_initfn(Object *obj) * We're going to set FEAT_S2FWB, which mandates that CLIDR_EL1.{LoUU,LoUIS} * are zero. */ - u = cpu->clidr; + u = GET_IDREG(isar, CLIDR); u = FIELD_DP32(u, CLIDR_EL1, LOUIS, 0); u = FIELD_DP32(u, CLIDR_EL1, LOUU, 0); - cpu->clidr = u; + SET_IDREG(isar, CLIDR, u); /* * Set CTR_EL0.DIC and IDC to tell the guest it doesnt' need to @@ -1133,7 +1140,7 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, CTR_EL0, DIC, 1); cpu->ctr = t; - t = cpu->isar.id_aa64isar0; + t = GET_IDREG(isar, ID_AA64ISAR0); t = FIELD_DP64(t, ID_AA64ISAR0, AES, 2); /* FEAT_PMULL */ t = FIELD_DP64(t, ID_AA64ISAR0, SHA1, 1); /* FEAT_SHA1 */ t = FIELD_DP64(t, ID_AA64ISAR0, SHA2, 2); /* FEAT_SHA512 */ @@ -1148,9 +1155,9 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64ISAR0, TS, 2); /* FEAT_FlagM2 */ t = FIELD_DP64(t, ID_AA64ISAR0, TLB, 2); /* FEAT_TLBIRANGE */ t = FIELD_DP64(t, ID_AA64ISAR0, RNDR, 1); /* FEAT_RNG */ - cpu->isar.id_aa64isar0 = t; + SET_IDREG(isar, ID_AA64ISAR0, t); - t = cpu->isar.id_aa64isar1; + t = GET_IDREG(isar, ID_AA64ISAR1); t = FIELD_DP64(t, ID_AA64ISAR1, DPB, 2); /* FEAT_DPB2 */ t = FIELD_DP64(t, ID_AA64ISAR1, APA, PauthFeat_FPACCOMBINED); t = FIELD_DP64(t, ID_AA64ISAR1, API, 1); @@ -1164,16 +1171,16 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64ISAR1, DGH, 1); /* FEAT_DGH */ t = FIELD_DP64(t, ID_AA64ISAR1, I8MM, 1); /* FEAT_I8MM */ t = FIELD_DP64(t, ID_AA64ISAR1, XS, 1); /* FEAT_XS */ - cpu->isar.id_aa64isar1 = t; + SET_IDREG(isar, ID_AA64ISAR1, t); - t = cpu->isar.id_aa64isar2; + t = GET_IDREG(isar, ID_AA64ISAR2); t = FIELD_DP64(t, ID_AA64ISAR2, RPRES, 1); /* FEAT_RPRES */ t = FIELD_DP64(t, ID_AA64ISAR2, MOPS, 1); /* FEAT_MOPS */ t = FIELD_DP64(t, ID_AA64ISAR2, BC, 1); /* FEAT_HBC */ t = FIELD_DP64(t, ID_AA64ISAR2, WFXT, 2); /* FEAT_WFxT */ - cpu->isar.id_aa64isar2 = t; + SET_IDREG(isar, ID_AA64ISAR2, t); - t = cpu->isar.id_aa64pfr0; + t = GET_IDREG(isar, ID_AA64PFR0); t = FIELD_DP64(t, ID_AA64PFR0, FP, 1); /* FEAT_FP16 */ t = FIELD_DP64(t, ID_AA64PFR0, ADVSIMD, 1); /* FEAT_FP16 */ t = FIELD_DP64(t, ID_AA64PFR0, RAS, 2); /* FEAT_RASv1p1 + FEAT_DoubleFault */ @@ -1182,9 +1189,9 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64PFR0, DIT, 1); /* FEAT_DIT */ t = FIELD_DP64(t, ID_AA64PFR0, CSV2, 3); /* FEAT_CSV2_3 */ t = FIELD_DP64(t, ID_AA64PFR0, CSV3, 1); /* FEAT_CSV3 */ - cpu->isar.id_aa64pfr0 = t; + SET_IDREG(isar, ID_AA64PFR0, t); - t = cpu->isar.id_aa64pfr1; + t = GET_IDREG(isar, ID_AA64PFR1); t = FIELD_DP64(t, ID_AA64PFR1, BT, 1); /* FEAT_BTI */ t = FIELD_DP64(t, ID_AA64PFR1, SSBS, 2); /* FEAT_SSBS2 */ /* @@ -1194,12 +1201,12 @@ void aarch64_max_tcg_initfn(Object *obj) */ t = FIELD_DP64(t, ID_AA64PFR1, MTE, 3); /* FEAT_MTE3 */ t = FIELD_DP64(t, ID_AA64PFR1, RAS_FRAC, 0); /* FEAT_RASv1p1 + FEAT_DoubleFault */ - t = FIELD_DP64(t, ID_AA64PFR1, SME, 1); /* FEAT_SME */ + t = FIELD_DP64(t, ID_AA64PFR1, SME, 2); /* FEAT_SME2 */ t = FIELD_DP64(t, ID_AA64PFR1, CSV2_FRAC, 0); /* FEAT_CSV2_3 */ t = FIELD_DP64(t, ID_AA64PFR1, NMI, 1); /* FEAT_NMI */ - cpu->isar.id_aa64pfr1 = t; + SET_IDREG(isar, ID_AA64PFR1, t); - t = cpu->isar.id_aa64mmfr0; + t = GET_IDREG(isar, ID_AA64MMFR0); t = FIELD_DP64(t, ID_AA64MMFR0, PARANGE, 6); /* FEAT_LPA: 52 bits */ t = FIELD_DP64(t, ID_AA64MMFR0, TGRAN16, 1); /* 16k pages supported */ t = FIELD_DP64(t, ID_AA64MMFR0, TGRAN16_2, 2); /* 16k stage2 supported */ @@ -1207,9 +1214,9 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64MMFR0, TGRAN4_2, 2); /* 4k stage2 supported */ t = FIELD_DP64(t, ID_AA64MMFR0, FGT, 1); /* FEAT_FGT */ t = FIELD_DP64(t, ID_AA64MMFR0, ECV, 2); /* FEAT_ECV */ - cpu->isar.id_aa64mmfr0 = t; + SET_IDREG(isar, ID_AA64MMFR0, t); - t = cpu->isar.id_aa64mmfr1; + t = GET_IDREG(isar, ID_AA64MMFR1); t = FIELD_DP64(t, ID_AA64MMFR1, HAFDBS, 2); /* FEAT_HAFDBS */ t = FIELD_DP64(t, ID_AA64MMFR1, VMIDBITS, 2); /* FEAT_VMID16 */ t = FIELD_DP64(t, ID_AA64MMFR1, VH, 1); /* FEAT_VHE */ @@ -1222,9 +1229,9 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64MMFR1, AFP, 1); /* FEAT_AFP */ t = FIELD_DP64(t, ID_AA64MMFR1, TIDCP1, 1); /* FEAT_TIDCP1 */ t = FIELD_DP64(t, ID_AA64MMFR1, CMOW, 1); /* FEAT_CMOW */ - cpu->isar.id_aa64mmfr1 = t; + SET_IDREG(isar, ID_AA64MMFR1, t); - t = cpu->isar.id_aa64mmfr2; + t = GET_IDREG(isar, ID_AA64MMFR2); t = FIELD_DP64(t, ID_AA64MMFR2, CNP, 1); /* FEAT_TTCNP */ t = FIELD_DP64(t, ID_AA64MMFR2, UAO, 1); /* FEAT_UAO */ t = FIELD_DP64(t, ID_AA64MMFR2, IESB, 1); /* FEAT_IESB */ @@ -1238,39 +1245,43 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64MMFR2, BBM, 2); /* FEAT_BBM at level 2 */ t = FIELD_DP64(t, ID_AA64MMFR2, EVT, 2); /* FEAT_EVT */ t = FIELD_DP64(t, ID_AA64MMFR2, E0PD, 1); /* FEAT_E0PD */ - cpu->isar.id_aa64mmfr2 = t; + SET_IDREG(isar, ID_AA64MMFR2, t); - t = cpu->isar.id_aa64mmfr3; - t = FIELD_DP64(t, ID_AA64MMFR3, SPEC_FPACC, 1); /* FEAT_FPACC_SPEC */ - cpu->isar.id_aa64mmfr3 = t; + FIELD_DP64_IDREG(isar, ID_AA64MMFR3, SPEC_FPACC, 1); /* FEAT_FPACC_SPEC */ - t = cpu->isar.id_aa64zfr0; - t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 1); + t = GET_IDREG(isar, ID_AA64ZFR0); + t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 2); /* FEAT_SVE2p1 */ t = FIELD_DP64(t, ID_AA64ZFR0, AES, 2); /* FEAT_SVE_PMULL128 */ t = FIELD_DP64(t, ID_AA64ZFR0, BITPERM, 1); /* FEAT_SVE_BitPerm */ t = FIELD_DP64(t, ID_AA64ZFR0, BFLOAT16, 2); /* FEAT_BF16, FEAT_EBF16 */ + t = FIELD_DP64(t, ID_AA64ZFR0, B16B16, 1); /* FEAT_SVE_B16B16 */ t = FIELD_DP64(t, ID_AA64ZFR0, SHA3, 1); /* FEAT_SVE_SHA3 */ t = FIELD_DP64(t, ID_AA64ZFR0, SM4, 1); /* FEAT_SVE_SM4 */ t = FIELD_DP64(t, ID_AA64ZFR0, I8MM, 1); /* FEAT_I8MM */ t = FIELD_DP64(t, ID_AA64ZFR0, F32MM, 1); /* FEAT_F32MM */ t = FIELD_DP64(t, ID_AA64ZFR0, F64MM, 1); /* FEAT_F64MM */ - cpu->isar.id_aa64zfr0 = t; + SET_IDREG(isar, ID_AA64ZFR0, t); - t = cpu->isar.id_aa64dfr0; + t = GET_IDREG(isar, ID_AA64DFR0); t = FIELD_DP64(t, ID_AA64DFR0, DEBUGVER, 10); /* FEAT_Debugv8p8 */ t = FIELD_DP64(t, ID_AA64DFR0, PMUVER, 6); /* FEAT_PMUv3p5 */ t = FIELD_DP64(t, ID_AA64DFR0, HPMN0, 1); /* FEAT_HPMN0 */ - cpu->isar.id_aa64dfr0 = t; + SET_IDREG(isar, ID_AA64DFR0, t); - t = cpu->isar.id_aa64smfr0; + t = GET_IDREG(isar, ID_AA64SMFR0); t = FIELD_DP64(t, ID_AA64SMFR0, F32F32, 1); /* FEAT_SME */ + t = FIELD_DP64(t, ID_AA64SMFR0, BI32I32, 1); /* FEAT_SME2 */ t = FIELD_DP64(t, ID_AA64SMFR0, B16F32, 1); /* FEAT_SME */ t = FIELD_DP64(t, ID_AA64SMFR0, F16F32, 1); /* FEAT_SME */ t = FIELD_DP64(t, ID_AA64SMFR0, I8I32, 0xf); /* FEAT_SME */ + t = FIELD_DP64(t, ID_AA64SMFR0, F16F16, 1); /* FEAT_SME_F16F16 */ + t = FIELD_DP64(t, ID_AA64SMFR0, B16B16, 1); /* FEAT_SME_B16B16 */ + t = FIELD_DP64(t, ID_AA64SMFR0, I16I32, 5); /* FEAT_SME2 */ t = FIELD_DP64(t, ID_AA64SMFR0, F64F64, 1); /* FEAT_SME_F64F64 */ t = FIELD_DP64(t, ID_AA64SMFR0, I16I64, 0xf); /* FEAT_SME_I16I64 */ + t = FIELD_DP64(t, ID_AA64SMFR0, SMEVER, 2); /* FEAT_SME2p1 */ t = FIELD_DP64(t, ID_AA64SMFR0, FA64, 1); /* FEAT_SME_FA64 */ - cpu->isar.id_aa64smfr0 = t; + SET_IDREG(isar, ID_AA64SMFR0, t); /* Replicate the same data to the 32-bit id registers. */ aa32_max_features(cpu); @@ -1316,7 +1327,7 @@ static void aarch64_cpu_register_types(void) size_t i; for (i = 0; i < ARRAY_SIZE(aarch64_cpus); ++i) { - aarch64_cpu_register(&aarch64_cpus[i]); + arm_cpu_register(&aarch64_cpus[i]); } } diff --git a/target/arm/tcg/crypto_helper.c b/target/arm/tcg/crypto_helper.c index 7cadd61..3428bd1 100644 --- a/target/arm/tcg/crypto_helper.c +++ b/target/arm/tcg/crypto_helper.c @@ -10,14 +10,16 @@ */ #include "qemu/osdep.h" +#include "qemu/bitops.h" -#include "cpu.h" -#include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" #include "crypto/aes-round.h" #include "crypto/sm4.h" #include "vec_internal.h" +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" + union CRYPTO_STATE { uint8_t bytes[16]; uint32_t words[4]; diff --git a/target/arm/tcg/gengvec64.c b/target/arm/tcg/gengvec64.c index 2617cde..2429cab 100644 --- a/target/arm/tcg/gengvec64.c +++ b/target/arm/tcg/gengvec64.c @@ -369,3 +369,14 @@ void gen_gvec_usqadd_qc(unsigned vece, uint32_t rd_ofs, tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); } + +void gen_gvec_sve2_sqdmulh(unsigned vece, uint32_t rd_ofs, + uint32_t rn_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h, + gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d, + }; + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]); +} diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c index 9244848..71c6c44 100644 --- a/target/arm/tcg/helper-a64.c +++ b/target/arm/tcg/helper-a64.c @@ -29,8 +29,11 @@ #include "internals.h" #include "qemu/crc32c.h" #include "exec/cpu-common.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/helper-retaddr.h" +#include "accel/tcg/probe.h" +#include "exec/target_page.h" +#include "exec/tlb-flags.h" #include "qemu/int128.h" #include "qemu/atomic128.h" #include "fpu/softfloat.h" @@ -399,6 +402,8 @@ AH_MINMAX_HELPER(vfp_ah_mind, float64, float64, min) AH_MINMAX_HELPER(vfp_ah_maxh, dh_ctype_f16, float16, max) AH_MINMAX_HELPER(vfp_ah_maxs, float32, float32, max) AH_MINMAX_HELPER(vfp_ah_maxd, float64, float64, max) +AH_MINMAX_HELPER(sme2_ah_fmax_b16, bfloat16, bfloat16, max) +AH_MINMAX_HELPER(sme2_ah_fmin_b16, bfloat16, bfloat16, min) /* 64-bit versions of the CRC helpers. Note that although the operation * (and the prototypes of crc32c() and crc32() mean that only the bottom @@ -653,15 +658,6 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) spsr &= ~PSTATE_SS; } - /* - * FEAT_RME forbids return from EL3 with an invalid security state. - * We don't need an explicit check for FEAT_RME here because we enforce - * in scr_write() that you can't set the NSE bit without it. - */ - if (cur_el == 3 && (env->cp15.scr_el3 & (SCR_NS | SCR_NSE)) == SCR_NSE) { - goto illegal_return; - } - new_el = el_from_spsr(spsr); if (new_el == -1) { goto illegal_return; @@ -673,6 +669,17 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) goto illegal_return; } + /* + * FEAT_RME forbids return from EL3 to a lower exception level + * with an invalid security state. + * We don't need an explicit check for FEAT_RME here because we enforce + * in scr_write() that you can't set the NSE bit without it. + */ + if (cur_el == 3 && new_el < 3 && + (env->cp15.scr_el3 & (SCR_NS | SCR_NSE)) == SCR_NSE) { + goto illegal_return; + } + if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) { /* Return to an EL which is configured for a different register width */ goto illegal_return; @@ -1147,7 +1154,6 @@ static void do_setp(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc, env->ZF = 1; /* our env->ZF encoding is inverted */ env->CF = 0; env->VF = 0; - return; } void HELPER(setp)(CPUARMState *env, uint32_t syndrome, uint32_t mtedesc) @@ -1547,7 +1553,6 @@ static void do_cpyp(CPUARMState *env, uint32_t syndrome, uint32_t wdesc, env->ZF = 1; /* our env->ZF encoding is inverted */ env->CF = 0; env->VF = 0; - return; } void HELPER(cpyp)(CPUARMState *env, uint32_t syndrome, uint32_t wdesc, diff --git a/target/arm/tcg/helper-sme.h b/target/arm/tcg/helper-sme.h index 858d691..c551797 100644 --- a/target/arm/tcg/helper-sme.h +++ b/target/arm/tcg/helper-sme.h @@ -33,101 +33,147 @@ DEF_HELPER_FLAGS_4(sme_mova_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sme_mova_cz_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sme_mova_zc_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) -DEF_HELPER_FLAGS_5(sme_ld1b_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1b_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1b_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1b_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_ld1h_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1h_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_ld1s_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1s_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_ld1d_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1d_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_ld1q_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_ld1q_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_st1b_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1b_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1b_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1b_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_st1h_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1h_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_st1s_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1s_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_st1d_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1d_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) - -DEF_HELPER_FLAGS_5(sme_st1q_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) -DEF_HELPER_FLAGS_5(sme_st1q_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_3(sme2_mova_cz_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_zc_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_cz_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_zc_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_cz_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_zc_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_cz_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_mova_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2p1_movaz_zc_q, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sme_ld1b_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1b_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1b_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1b_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_ld1h_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1h_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_ld1s_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1s_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_ld1d_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1d_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_ld1q_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_ld1q_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_st1b_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1b_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1b_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1b_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_st1h_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1h_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_st1s_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1s_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_st1d_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1d_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) + +DEF_HELPER_FLAGS_5(sme_st1q_be_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_le_h, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_be_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_le_v, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_be_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_le_h_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_be_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_5(sme_st1q_le_v_mte, TCG_CALL_NO_WG, void, env, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_5(sme_addha_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sme_addva_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sme_addha_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sme_addva_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) -DEF_HELPER_FLAGS_7(sme_fmopa_h, TCG_CALL_NO_RWG, +DEF_HELPER_FLAGS_7(sme_fmopa_w_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_7(sme_fmopa_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sme_fmopa_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sme_fmopa_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_bfmopa_w, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, env, i32) DEF_HELPER_FLAGS_7(sme_bfmopa, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_7(sme_fmops_w_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_7(sme_fmops_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_fmops_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_fmops_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_bfmops_w, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_7(sme_bfmops, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_7(sme_ah_fmops_w_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_7(sme_ah_fmops_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_ah_fmops_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_ah_fmops_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sme_ah_bfmops_w, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_7(sme_ah_bfmops, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) + DEF_HELPER_FLAGS_6(sme_smopa_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_6(sme_umopa_s, TCG_CALL_NO_RWG, @@ -144,3 +190,168 @@ DEF_HELPER_FLAGS_6(sme_sumopa_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_6(sme_usmopa_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sme2_bmopa_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sme2_smopa2_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sme2_umopa2_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fmax_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmin_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_fmax_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_fmin_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxnum_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminnum_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_6(sme2_fdot_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_6(sme2_fdot_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_6(sme2_fvdot_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(sme2_svdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_suvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_usvdot_idx_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_svdot_idx_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uvdot_idx_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_svdot_idx_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uvdot_idx_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sme2_smlall_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_smlall_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_smlsll_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_smlsll_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlall_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlall_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlsll_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlsll_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_usmlall_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sme2_smlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_smlall_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_smlsll_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_smlsll_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlall_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlsll_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_umlsll_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_usmlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sme2_sumlall_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_bfcvt, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_bfcvtn, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_fcvt_n, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_fcvtn, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_fcvt_w, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_fcvtl, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_scvtf, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(sme2_ucvtf, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_3(sme2_sqcvt_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqcvt_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtu_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtu_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvt_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqcvt_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtu_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2_sqcvtn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqcvtn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtun_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqcvtn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtun_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqcvtn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqcvtun_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2_sunpk2_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sunpk2_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sunpk2_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sunpk4_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uunpk2_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uunpk2_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uunpk2_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uunpk4_bh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_zip2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_zip2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_zip2_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_zip2_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_zip2_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_uzp2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uzp2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uzp2_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uzp2_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uzp2_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2_zip4_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_zip4_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_zip4_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_zip4_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_zip4_q, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2_uzp4_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uzp4_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uzp4_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uzp4_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uzp4_q, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2_sqrshr_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqrshr_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshru_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshr_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqrshr_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshru_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshr_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqrshr_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshru_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sme2_sqrshrn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqrshrn_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshrun_sh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshrn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqrshrn_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshrun_sb, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshrn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_uqrshrn_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sme2_sqrshrun_dh, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_sclamp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_sclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_sclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_sclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_uclamp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_uclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sme2_fclamp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sme2_fclamp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sme2_fclamp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sme2_bfclamp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sme2_sel_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32) +DEF_HELPER_FLAGS_5(sme2_sel_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32) +DEF_HELPER_FLAGS_5(sme2_sel_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32) +DEF_HELPER_FLAGS_5(sme2_sel_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32) diff --git a/target/arm/tcg/helper-sve.h b/target/arm/tcg/helper-sve.h index 0b1b588..c3541a8 100644 --- a/target/arm/tcg/helper-sve.h +++ b/target/arm/tcg/helper-sve.h @@ -676,11 +676,21 @@ DEF_HELPER_FLAGS_5(sve2_tbl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve2_tbl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve2_tbl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tblq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tblq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tblq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tblq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(sve2_tbx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_tbx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_tbx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_tbx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tbxq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tbxq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tbxq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_tbxq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_3(sve_sunpk_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) DEF_HELPER_FLAGS_3(sve_sunpk_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) DEF_HELPER_FLAGS_3(sve_sunpk_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) @@ -701,12 +711,22 @@ DEF_HELPER_FLAGS_4(sve_zip_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_zip_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_zip_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_zipq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_zipq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_zipq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_zipq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(sve_uzp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_uzp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_uzp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_uzp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_uzp_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uzpq_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uzpq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uzpq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uzpq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(sve_trn_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_trn_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_trn_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) @@ -937,10 +957,17 @@ DEF_HELPER_FLAGS_4(sve_brkn, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_brkns, TCG_CALL_NO_RWG, i32, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_3(sve_cntp, TCG_CALL_NO_RWG, i64, ptr, ptr, i32) +DEF_HELPER_FLAGS_2(sve2p1_cntp_c, TCG_CALL_NO_RWG_SE, i64, i32, i32) DEF_HELPER_FLAGS_3(sve_whilel, TCG_CALL_NO_RWG, i32, ptr, i32, i32) DEF_HELPER_FLAGS_3(sve_whileg, TCG_CALL_NO_RWG, i32, ptr, i32, i32) +DEF_HELPER_FLAGS_3(sve_while2l, TCG_CALL_NO_RWG, i32, ptr, i32, i32) +DEF_HELPER_FLAGS_3(sve_while2g, TCG_CALL_NO_RWG, i32, ptr, i32, i32) + +DEF_HELPER_FLAGS_3(sve_whilecl, TCG_CALL_NO_RWG, i32, ptr, i32, i32) +DEF_HELPER_FLAGS_3(sve_whilecg, TCG_CALL_NO_RWG, i32, ptr, i32, i32) + DEF_HELPER_FLAGS_4(sve_subri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) DEF_HELPER_FLAGS_4(sve_subri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) DEF_HELPER_FLAGS_4(sve_subri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) @@ -1071,6 +1098,55 @@ DEF_HELPER_FLAGS_4(sve_ah_fminv_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_4(sve_ah_fminv_d, TCG_CALL_NO_RWG, i64, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_faddqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_faddqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_faddqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sve2p1_fmaxnmqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fmaxnmqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fmaxnmqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sve2p1_fminnmqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fminnmqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fminnmqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sve2p1_fmaxqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fmaxqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fmaxqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sve2p1_fminqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fminqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_fminqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sve2p1_ah_fmaxqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_ah_fmaxqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_ah_fmaxqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(sve2p1_ah_fminqv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_ah_fminqv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(sve2p1_ah_fminqv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG, i64, i64, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG, @@ -1120,6 +1196,8 @@ DEF_HELPER_FLAGS_5(sve_fcmne0_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(sve_fcmne0_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fadd_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG, @@ -1127,6 +1205,8 @@ DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fsub_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fsub_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fsub_s, TCG_CALL_NO_RWG, @@ -1134,6 +1214,8 @@ DEF_HELPER_FLAGS_6(sve_fsub_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fmul_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmul_s, TCG_CALL_NO_RWG, @@ -1148,6 +1230,8 @@ DEF_HELPER_FLAGS_6(sve_fdiv_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fdiv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fmin_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmin_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmin_s, TCG_CALL_NO_RWG, @@ -1155,6 +1239,8 @@ DEF_HELPER_FLAGS_6(sve_fmin_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fmin_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fmax_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmax_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmax_s, TCG_CALL_NO_RWG, @@ -1162,6 +1248,8 @@ DEF_HELPER_FLAGS_6(sve_fmax_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fmax_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_ah_fmin_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_ah_fmin_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_ah_fmin_s, TCG_CALL_NO_RWG, @@ -1169,6 +1257,8 @@ DEF_HELPER_FLAGS_6(sve_ah_fmin_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_ah_fmin_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_ah_fmax_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_ah_fmax_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_ah_fmax_s, TCG_CALL_NO_RWG, @@ -1176,6 +1266,8 @@ DEF_HELPER_FLAGS_6(sve_ah_fmax_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_ah_fmax_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fminnum_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fminnum_s, TCG_CALL_NO_RWG, @@ -1183,6 +1275,8 @@ DEF_HELPER_FLAGS_6(sve_fminnum_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fminnum_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(sve_fmaxnum_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmaxnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_6(sve_fmaxnum_s, TCG_CALL_NO_RWG, @@ -1447,6 +1541,8 @@ DEF_HELPER_FLAGS_6(sve_fcadd_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_6(sve_fcadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_fmla_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fmla_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fmla_zpzzz_s, TCG_CALL_NO_RWG, @@ -1454,6 +1550,8 @@ DEF_HELPER_FLAGS_7(sve_fmla_zpzzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_7(sve_fmla_zpzzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_fmls_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fmls_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fmls_zpzzz_s, TCG_CALL_NO_RWG, @@ -1461,6 +1559,8 @@ DEF_HELPER_FLAGS_7(sve_fmls_zpzzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_7(sve_fmls_zpzzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_fnmla_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fnmla_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fnmla_zpzzz_s, TCG_CALL_NO_RWG, @@ -1468,6 +1568,8 @@ DEF_HELPER_FLAGS_7(sve_fnmla_zpzzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_7(sve_fnmla_zpzzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_s, TCG_CALL_NO_RWG, @@ -1475,6 +1577,8 @@ DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_s, TCG_CALL_NO_RWG, @@ -1482,6 +1586,8 @@ DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_s, TCG_CALL_NO_RWG, @@ -1489,6 +1595,8 @@ DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_7(sve_ah_fnmls_zpzzz_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_ah_fnmls_zpzzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, fpst, i32) DEF_HELPER_FLAGS_7(sve_ah_fnmls_zpzzz_s, TCG_CALL_NO_RWG, @@ -1547,945 +1655,1015 @@ DEF_HELPER_FLAGS_4(sve2_usubw_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_usubw_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_usubw_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) -DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hsu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hsu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hsu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1hsu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1hds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1sds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ld1sds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hsu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hsu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldff1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldff1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld2qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld2qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1squ_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1dqu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1squ_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1dqu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld2qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld2qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld3qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld4qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1hds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1sds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1squ_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1dqu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1sds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ld1squ_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ld1dqu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hsu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hsu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldff1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldff1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hh_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hsu_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hdu_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hss_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hds_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hh_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hsu_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hdu_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hss_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1hds_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1ss_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1sdu_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1sds_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1ss_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1sdu_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1sds_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1dd_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldff1dd_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hsu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hsu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_ldnf1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_ldnf1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1hds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hh_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hsu_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hdu_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hss_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hds_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hh_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hsu_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hdu_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hss_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1hds_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1ss_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1sdu_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1sds_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1ss_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1sdu_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1sds_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1dd_le_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) + void, env, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve_ldnf1dd_be_r_mte, TCG_CALL_NO_WG, - void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1bh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1bs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1bd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1hs_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1hd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1hs_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1hd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1sd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1sd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1bh_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1bs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1bd_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1hs_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1hd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1hs_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1hd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) - -DEF_HELPER_FLAGS_4(sve_st1sd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) -DEF_HELPER_FLAGS_4(sve_st1sd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4hh_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4hh_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4ss_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4ss_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st2qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4qq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st2qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4qq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1bh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1bs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1bd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1hs_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1hd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1hs_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1hd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1sd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1sd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1sq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1sq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1dq_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1dq_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st2qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4qq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st2qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st3qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st4qq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1bh_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1bs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1bd_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1hs_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1hd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1hs_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1hd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1sd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1sd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) + +DEF_HELPER_FLAGS_4(sve_st1sq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1sq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1dq_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) +DEF_HELPER_FLAGS_4(sve_st1dq_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbsu_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbss_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbsu_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbss_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbdu_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbds_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbdu_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbds_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbdu_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbds_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_ldqq_le_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_ldqq_be_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbsu_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbss_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbsu_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhsu_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldss_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbss_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhss_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbdu_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbds_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbdu_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbds_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbdu_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhdu_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsdu_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_lddd_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldbds_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldhds_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldsds_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_ldqq_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_ldqq_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbss_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbsu_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbss_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbdu_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbds_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbdu_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbds_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbdu_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbds_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbss_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbsu_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhsu_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffss_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbss_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhss_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbdu_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbds_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbdu_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbds_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbdu_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffdd_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffbds_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffhds_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_ldffsds_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbs_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbd_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_le_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_be_zsu, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbd_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_le_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_be_zss, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbd_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_stqq_le_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_stqq_be_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbs_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbs_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sths_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stss_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbd_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_le_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_be_zsu_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbd_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_le_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_be_zss_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stbd_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_sthd_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stsd_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_le_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_6(sve_stdd_be_zd_mte, TCG_CALL_NO_WG, - void, env, ptr, ptr, ptr, tl, i32) + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_stqq_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) +DEF_HELPER_FLAGS_6(sve_stqq_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i64) DEF_HELPER_FLAGS_4(sve2_sqdmull_zzz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) @@ -2922,3 +3100,69 @@ DEF_HELPER_FLAGS_4(sve2_sqshlu_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_sqshlu_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_sqshlu_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve2_sqshlu_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_addqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_addqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_addqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_addqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_smaxqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_smaxqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_smaxqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_smaxqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_sminqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_sminqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_sminqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_sminqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_umaxqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_umaxqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_umaxqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_umaxqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_uminqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uminqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uminqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_uminqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(pext, TCG_CALL_NO_RWG, void, ptr, i32, i32) + +DEF_HELPER_FLAGS_4(sve2p1_orqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_orqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_orqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_orqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_eorqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_eorqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_eorqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_eorqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2p1_andqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_andqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_andqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2p1_andqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(pmov_pv_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(pmov_pv_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(pmov_pv_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(pmov_vp_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(pmov_vp_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(pmov_vp_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve2p1_ld1bb_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_ld1hh_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_ld1hh_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_ld1ss_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_ld1ss_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_ld1dd_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_ld1dd_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) + +DEF_HELPER_FLAGS_5(sve2p1_st1bb_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_st1hh_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_st1hh_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_st1ss_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_st1ss_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_st1dd_le_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) +DEF_HELPER_FLAGS_5(sve2p1_st1dd_be_c, TCG_CALL_NO_WG, void, env, ptr, tl, i32, i64) diff --git a/target/arm/tcg/helper.h b/target/arm/tcg/helper.h new file mode 100644 index 0000000..4da32db --- /dev/null +++ b/target/arm/tcg/helper.h @@ -0,0 +1,1218 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +DEF_HELPER_FLAGS_1(sxtb16, TCG_CALL_NO_RWG_SE, i32, i32) +DEF_HELPER_FLAGS_1(uxtb16, TCG_CALL_NO_RWG_SE, i32, i32) + +DEF_HELPER_3(add_setq, i32, env, i32, i32) +DEF_HELPER_3(add_saturate, i32, env, i32, i32) +DEF_HELPER_3(sub_saturate, i32, env, i32, i32) +DEF_HELPER_3(add_usaturate, i32, env, i32, i32) +DEF_HELPER_3(sub_usaturate, i32, env, i32, i32) +DEF_HELPER_FLAGS_3(sdiv, TCG_CALL_NO_RWG, s32, env, s32, s32) +DEF_HELPER_FLAGS_3(udiv, TCG_CALL_NO_RWG, i32, env, i32, i32) +DEF_HELPER_FLAGS_1(rbit, TCG_CALL_NO_RWG_SE, i32, i32) + +#define PAS_OP(pfx) \ + DEF_HELPER_3(pfx ## add8, i32, i32, i32, ptr) \ + DEF_HELPER_3(pfx ## sub8, i32, i32, i32, ptr) \ + DEF_HELPER_3(pfx ## sub16, i32, i32, i32, ptr) \ + DEF_HELPER_3(pfx ## add16, i32, i32, i32, ptr) \ + DEF_HELPER_3(pfx ## addsubx, i32, i32, i32, ptr) \ + DEF_HELPER_3(pfx ## subaddx, i32, i32, i32, ptr) + +PAS_OP(s) +PAS_OP(u) +#undef PAS_OP + +#define PAS_OP(pfx) \ + DEF_HELPER_2(pfx ## add8, i32, i32, i32) \ + DEF_HELPER_2(pfx ## sub8, i32, i32, i32) \ + DEF_HELPER_2(pfx ## sub16, i32, i32, i32) \ + DEF_HELPER_2(pfx ## add16, i32, i32, i32) \ + DEF_HELPER_2(pfx ## addsubx, i32, i32, i32) \ + DEF_HELPER_2(pfx ## subaddx, i32, i32, i32) +PAS_OP(q) +PAS_OP(sh) +PAS_OP(uq) +PAS_OP(uh) +#undef PAS_OP + +DEF_HELPER_3(ssat, i32, env, i32, i32) +DEF_HELPER_3(usat, i32, env, i32, i32) +DEF_HELPER_3(ssat16, i32, env, i32, i32) +DEF_HELPER_3(usat16, i32, env, i32, i32) + +DEF_HELPER_FLAGS_2(usad8, TCG_CALL_NO_RWG_SE, i32, i32, i32) + +DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_NO_RWG_SE, + i32, i32, i32, i32) +DEF_HELPER_2(exception_internal, noreturn, env, i32) +DEF_HELPER_3(exception_with_syndrome, noreturn, env, i32, i32) +DEF_HELPER_4(exception_with_syndrome_el, noreturn, env, i32, i32, i32) +DEF_HELPER_2(exception_bkpt_insn, noreturn, env, i32) +DEF_HELPER_2(exception_swstep, noreturn, env, i32) +DEF_HELPER_2(exception_pc_alignment, noreturn, env, vaddr) +DEF_HELPER_1(setend, void, env) +DEF_HELPER_2(wfi, void, env, i32) +DEF_HELPER_1(wfe, void, env) +DEF_HELPER_2(wfit, void, env, i64) +DEF_HELPER_1(yield, void, env) +DEF_HELPER_1(pre_hvc, void, env) +DEF_HELPER_2(pre_smc, void, env, i32) +DEF_HELPER_1(vesb, void, env) + +DEF_HELPER_3(cpsr_write, void, env, i32, i32) +DEF_HELPER_2(cpsr_write_eret, void, env, i32) +DEF_HELPER_1(cpsr_read, i32, env) + +DEF_HELPER_3(v7m_msr, void, env, i32, i32) +DEF_HELPER_2(v7m_mrs, i32, env, i32) + +DEF_HELPER_2(v7m_bxns, void, env, i32) +DEF_HELPER_2(v7m_blxns, void, env, i32) + +DEF_HELPER_3(v7m_tt, i32, env, i32, i32) + +DEF_HELPER_1(v7m_preserve_fp_state, void, env) + +DEF_HELPER_2(v7m_vlstm, void, env, i32) +DEF_HELPER_2(v7m_vlldm, void, env, i32) + +DEF_HELPER_2(v8m_stackcheck, void, env, i32) + +DEF_HELPER_FLAGS_2(check_bxj_trap, TCG_CALL_NO_WG, void, env, i32) + +DEF_HELPER_4(access_check_cp_reg, cptr, env, i32, i32, i32) +DEF_HELPER_FLAGS_2(lookup_cp_reg, TCG_CALL_NO_RWG_SE, cptr, env, i32) +DEF_HELPER_FLAGS_2(tidcp_el0, TCG_CALL_NO_WG, void, env, i32) +DEF_HELPER_FLAGS_2(tidcp_el1, TCG_CALL_NO_WG, void, env, i32) +DEF_HELPER_3(set_cp_reg, void, env, cptr, i32) +DEF_HELPER_2(get_cp_reg, i32, env, cptr) +DEF_HELPER_3(set_cp_reg64, void, env, cptr, i64) +DEF_HELPER_2(get_cp_reg64, i64, env, cptr) + +DEF_HELPER_2(get_r13_banked, i32, env, i32) +DEF_HELPER_3(set_r13_banked, void, env, i32, i32) + +DEF_HELPER_3(mrs_banked, i32, env, i32, i32) +DEF_HELPER_4(msr_banked, void, env, i32, i32, i32) + +DEF_HELPER_2(get_user_reg, i32, env, i32) +DEF_HELPER_3(set_user_reg, void, env, i32, i32) + +DEF_HELPER_FLAGS_1(rebuild_hflags_m32_newel, TCG_CALL_NO_RWG, void, env) +DEF_HELPER_FLAGS_2(rebuild_hflags_m32, TCG_CALL_NO_RWG, void, env, int) +DEF_HELPER_FLAGS_1(rebuild_hflags_a32_newel, TCG_CALL_NO_RWG, void, env) +DEF_HELPER_FLAGS_2(rebuild_hflags_a32, TCG_CALL_NO_RWG, void, env, int) +DEF_HELPER_FLAGS_2(rebuild_hflags_a64, TCG_CALL_NO_RWG, void, env, int) + +DEF_HELPER_FLAGS_5(probe_access, TCG_CALL_NO_WG, void, env, vaddr, i32, i32, i32) + +DEF_HELPER_1(vfp_get_fpscr, i32, env) +DEF_HELPER_2(vfp_set_fpscr, void, env, i32) + +DEF_HELPER_3(vfp_addh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_adds, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_addd, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_subh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_subs, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_subd, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_mulh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_muls, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_muld, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_divh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_divs, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_divd, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_maxh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_maxs, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_maxd, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_minh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_mins, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_mind, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_maxnumh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_maxnums, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_maxnumd, f64, f64, f64, fpst) +DEF_HELPER_3(vfp_minnumh, f16, f16, f16, fpst) +DEF_HELPER_3(vfp_minnums, f32, f32, f32, fpst) +DEF_HELPER_3(vfp_minnumd, f64, f64, f64, fpst) +DEF_HELPER_2(vfp_sqrth, f16, f16, fpst) +DEF_HELPER_2(vfp_sqrts, f32, f32, fpst) +DEF_HELPER_2(vfp_sqrtd, f64, f64, fpst) +DEF_HELPER_3(vfp_cmph, void, f16, f16, env) +DEF_HELPER_3(vfp_cmps, void, f32, f32, env) +DEF_HELPER_3(vfp_cmpd, void, f64, f64, env) +DEF_HELPER_3(vfp_cmpeh, void, f16, f16, env) +DEF_HELPER_3(vfp_cmpes, void, f32, f32, env) +DEF_HELPER_3(vfp_cmped, void, f64, f64, env) + +DEF_HELPER_2(vfp_fcvtds, f64, f32, fpst) +DEF_HELPER_2(vfp_fcvtsd, f32, f64, fpst) +DEF_HELPER_FLAGS_2(bfcvt, TCG_CALL_NO_RWG, i32, f32, fpst) +DEF_HELPER_FLAGS_2(bfcvt_pair, TCG_CALL_NO_RWG, i32, i64, fpst) + +DEF_HELPER_2(vfp_uitoh, f16, i32, fpst) +DEF_HELPER_2(vfp_uitos, f32, i32, fpst) +DEF_HELPER_2(vfp_uitod, f64, i32, fpst) +DEF_HELPER_2(vfp_sitoh, f16, i32, fpst) +DEF_HELPER_2(vfp_sitos, f32, i32, fpst) +DEF_HELPER_2(vfp_sitod, f64, i32, fpst) + +DEF_HELPER_2(vfp_touih, i32, f16, fpst) +DEF_HELPER_2(vfp_touis, i32, f32, fpst) +DEF_HELPER_2(vfp_touid, i32, f64, fpst) +DEF_HELPER_2(vfp_touizh, i32, f16, fpst) +DEF_HELPER_2(vfp_touizs, i32, f32, fpst) +DEF_HELPER_2(vfp_touizd, i32, f64, fpst) +DEF_HELPER_2(vfp_tosih, s32, f16, fpst) +DEF_HELPER_2(vfp_tosis, s32, f32, fpst) +DEF_HELPER_2(vfp_tosid, s32, f64, fpst) +DEF_HELPER_2(vfp_tosizh, s32, f16, fpst) +DEF_HELPER_2(vfp_tosizs, s32, f32, fpst) +DEF_HELPER_2(vfp_tosizd, s32, f64, fpst) + +DEF_HELPER_3(vfp_toshh_round_to_zero, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_toslh_round_to_zero, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_touhh_round_to_zero, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_toulh_round_to_zero, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_toshs_round_to_zero, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_tosls_round_to_zero, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_touhs_round_to_zero, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_touls_round_to_zero, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_toshd_round_to_zero, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_tosld_round_to_zero, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_tosqd_round_to_zero, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_touhd_round_to_zero, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_tould_round_to_zero, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_touqd_round_to_zero, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_touhh, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_toshh, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_toulh, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_toslh, i32, f16, i32, fpst) +DEF_HELPER_3(vfp_touqh, i64, f16, i32, fpst) +DEF_HELPER_3(vfp_tosqh, i64, f16, i32, fpst) +DEF_HELPER_3(vfp_toshs, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_tosls, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_tosqs, i64, f32, i32, fpst) +DEF_HELPER_3(vfp_touhs, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_touls, i32, f32, i32, fpst) +DEF_HELPER_3(vfp_touqs, i64, f32, i32, fpst) +DEF_HELPER_3(vfp_toshd, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_tosld, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_tosqd, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_touhd, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_tould, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_touqd, i64, f64, i32, fpst) +DEF_HELPER_3(vfp_shtos, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_sltos, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_sqtos, f32, i64, i32, fpst) +DEF_HELPER_3(vfp_uhtos, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_ultos, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_uqtos, f32, i64, i32, fpst) +DEF_HELPER_3(vfp_shtod, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_sltod, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_sqtod, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_uhtod, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_ultod, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_uqtod, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_shtoh, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_uhtoh, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_sltoh, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_ultoh, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_sqtoh, f16, i64, i32, fpst) +DEF_HELPER_3(vfp_uqtoh, f16, i64, i32, fpst) + +DEF_HELPER_3(vfp_shtos_round_to_nearest, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_sltos_round_to_nearest, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_uhtos_round_to_nearest, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_ultos_round_to_nearest, f32, i32, i32, fpst) +DEF_HELPER_3(vfp_shtod_round_to_nearest, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_sltod_round_to_nearest, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_uhtod_round_to_nearest, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_ultod_round_to_nearest, f64, i64, i32, fpst) +DEF_HELPER_3(vfp_shtoh_round_to_nearest, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_uhtoh_round_to_nearest, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_sltoh_round_to_nearest, f16, i32, i32, fpst) +DEF_HELPER_3(vfp_ultoh_round_to_nearest, f16, i32, i32, fpst) + +DEF_HELPER_FLAGS_2(set_rmode, TCG_CALL_NO_RWG, i32, i32, fpst) + +DEF_HELPER_FLAGS_3(vfp_fcvt_f16_to_f32, TCG_CALL_NO_RWG, f32, f16, fpst, i32) +DEF_HELPER_FLAGS_3(vfp_fcvt_f32_to_f16, TCG_CALL_NO_RWG, f16, f32, fpst, i32) +DEF_HELPER_FLAGS_3(vfp_fcvt_f16_to_f64, TCG_CALL_NO_RWG, f64, f16, fpst, i32) +DEF_HELPER_FLAGS_3(vfp_fcvt_f64_to_f16, TCG_CALL_NO_RWG, f16, f64, fpst, i32) + +DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, fpst) +DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, fpst) +DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, fpst) + +DEF_HELPER_FLAGS_2(recpe_f16, TCG_CALL_NO_RWG, f16, f16, fpst) +DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(recpe_rpres_f32, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, fpst) +DEF_HELPER_FLAGS_2(rsqrte_f16, TCG_CALL_NO_RWG, f16, f16, fpst) +DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(rsqrte_rpres_f32, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, fpst) +DEF_HELPER_FLAGS_1(recpe_u32, TCG_CALL_NO_RWG, i32, i32) +DEF_HELPER_FLAGS_1(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32) +DEF_HELPER_FLAGS_4(neon_tbl, TCG_CALL_NO_RWG, i64, env, i32, i64, i64) + +DEF_HELPER_3(shl_cc, i32, env, i32, i32) +DEF_HELPER_3(shr_cc, i32, env, i32, i32) +DEF_HELPER_3(sar_cc, i32, env, i32, i32) +DEF_HELPER_3(ror_cc, i32, env, i32, i32) + +DEF_HELPER_FLAGS_2(rinth_exact, TCG_CALL_NO_RWG, f16, f16, fpst) +DEF_HELPER_FLAGS_2(rints_exact, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(rintd_exact, TCG_CALL_NO_RWG, f64, f64, fpst) +DEF_HELPER_FLAGS_2(rinth, TCG_CALL_NO_RWG, f16, f16, fpst) +DEF_HELPER_FLAGS_2(rints, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(rintd, TCG_CALL_NO_RWG, f64, f64, fpst) + +DEF_HELPER_FLAGS_2(vjcvt, TCG_CALL_NO_RWG, i32, f64, env) +DEF_HELPER_FLAGS_2(fjcvtzs, TCG_CALL_NO_RWG, i64, f64, fpst) + +DEF_HELPER_FLAGS_3(check_hcr_el2_trap, TCG_CALL_NO_WG, void, env, i32, i32) + +/* neon_helper.c */ +DEF_HELPER_2(neon_pmin_u8, i32, i32, i32) +DEF_HELPER_2(neon_pmin_s8, i32, i32, i32) +DEF_HELPER_2(neon_pmin_u16, i32, i32, i32) +DEF_HELPER_2(neon_pmin_s16, i32, i32, i32) +DEF_HELPER_2(neon_pmax_u8, i32, i32, i32) +DEF_HELPER_2(neon_pmax_s8, i32, i32, i32) +DEF_HELPER_2(neon_pmax_u16, i32, i32, i32) +DEF_HELPER_2(neon_pmax_s16, i32, i32, i32) + +DEF_HELPER_2(neon_shl_u16, i32, i32, i32) +DEF_HELPER_2(neon_shl_s16, i32, i32, i32) +DEF_HELPER_2(neon_rshl_u8, i32, i32, i32) +DEF_HELPER_2(neon_rshl_s8, i32, i32, i32) +DEF_HELPER_2(neon_rshl_u16, i32, i32, i32) +DEF_HELPER_2(neon_rshl_s16, i32, i32, i32) +DEF_HELPER_2(neon_rshl_u32, i32, i32, i32) +DEF_HELPER_2(neon_rshl_s32, i32, i32, i32) +DEF_HELPER_2(neon_rshl_u64, i64, i64, i64) +DEF_HELPER_2(neon_rshl_s64, i64, i64, i64) +DEF_HELPER_3(neon_qshl_u8, i32, env, i32, i32) +DEF_HELPER_3(neon_qshl_s8, i32, env, i32, i32) +DEF_HELPER_3(neon_qshl_u16, i32, env, i32, i32) +DEF_HELPER_3(neon_qshl_s16, i32, env, i32, i32) +DEF_HELPER_3(neon_qshl_u32, i32, env, i32, i32) +DEF_HELPER_3(neon_qshl_s32, i32, env, i32, i32) +DEF_HELPER_3(neon_qshl_u64, i64, env, i64, i64) +DEF_HELPER_3(neon_qshl_s64, i64, env, i64, i64) +DEF_HELPER_3(neon_qshlu_s8, i32, env, i32, i32) +DEF_HELPER_3(neon_qshlu_s16, i32, env, i32, i32) +DEF_HELPER_3(neon_qshlu_s32, i32, env, i32, i32) +DEF_HELPER_3(neon_qshlu_s64, i64, env, i64, i64) +DEF_HELPER_3(neon_qrshl_u8, i32, env, i32, i32) +DEF_HELPER_3(neon_qrshl_s8, i32, env, i32, i32) +DEF_HELPER_3(neon_qrshl_u16, i32, env, i32, i32) +DEF_HELPER_3(neon_qrshl_s16, i32, env, i32, i32) +DEF_HELPER_3(neon_qrshl_u32, i32, env, i32, i32) +DEF_HELPER_3(neon_qrshl_s32, i32, env, i32, i32) +DEF_HELPER_3(neon_qrshl_u64, i64, env, i64, i64) +DEF_HELPER_3(neon_qrshl_s64, i64, env, i64, i64) +DEF_HELPER_FLAGS_5(neon_sqshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqrshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqrshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqrshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_sqrshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqrshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqrshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqrshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(neon_uqrshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_uqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_uqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_uqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_uqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshlui_b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshlui_h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshlui_s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(neon_sqshlui_d, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(gvec_srshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_srshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_srshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_urshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_urshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_urshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_urshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_srshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_srshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_urshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_urshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sme2_urshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_2(neon_add_u8, i32, i32, i32) +DEF_HELPER_2(neon_add_u16, i32, i32, i32) +DEF_HELPER_2(neon_sub_u8, i32, i32, i32) +DEF_HELPER_2(neon_sub_u16, i32, i32, i32) +DEF_HELPER_2(neon_mul_u8, i32, i32, i32) +DEF_HELPER_2(neon_mul_u16, i32, i32, i32) + +DEF_HELPER_2(neon_tst_u8, i32, i32, i32) +DEF_HELPER_2(neon_tst_u16, i32, i32, i32) +DEF_HELPER_2(neon_tst_u32, i32, i32, i32) + +DEF_HELPER_1(neon_clz_u8, i32, i32) +DEF_HELPER_1(neon_clz_u16, i32, i32) +DEF_HELPER_1(neon_cls_s8, i32, i32) +DEF_HELPER_1(neon_cls_s16, i32, i32) +DEF_HELPER_1(neon_cls_s32, i32, i32) +DEF_HELPER_FLAGS_3(gvec_cnt_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_rbit_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_3(neon_qdmulh_s16, i32, env, i32, i32) +DEF_HELPER_3(neon_qrdmulh_s16, i32, env, i32, i32) +DEF_HELPER_4(neon_qrdmlah_s16, i32, env, i32, i32, i32) +DEF_HELPER_4(neon_qrdmlsh_s16, i32, env, i32, i32, i32) +DEF_HELPER_3(neon_qdmulh_s32, i32, env, i32, i32) +DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32) +DEF_HELPER_4(neon_qrdmlah_s32, i32, env, s32, s32, s32) +DEF_HELPER_4(neon_qrdmlsh_s32, i32, env, s32, s32, s32) + +DEF_HELPER_1(neon_narrow_u8, i64, i64) +DEF_HELPER_1(neon_narrow_u16, i64, i64) +DEF_HELPER_2(neon_unarrow_sat8, i64, env, i64) +DEF_HELPER_2(neon_narrow_sat_u8, i64, env, i64) +DEF_HELPER_2(neon_narrow_sat_s8, i64, env, i64) +DEF_HELPER_2(neon_unarrow_sat16, i64, env, i64) +DEF_HELPER_2(neon_narrow_sat_u16, i64, env, i64) +DEF_HELPER_2(neon_narrow_sat_s16, i64, env, i64) +DEF_HELPER_2(neon_unarrow_sat32, i64, env, i64) +DEF_HELPER_2(neon_narrow_sat_u32, i64, env, i64) +DEF_HELPER_2(neon_narrow_sat_s32, i64, env, i64) +DEF_HELPER_1(neon_narrow_high_u8, i32, i64) +DEF_HELPER_1(neon_narrow_high_u16, i32, i64) +DEF_HELPER_1(neon_narrow_round_high_u8, i32, i64) +DEF_HELPER_1(neon_narrow_round_high_u16, i32, i64) +DEF_HELPER_1(neon_widen_u8, i64, i32) +DEF_HELPER_1(neon_widen_s8, i64, i32) +DEF_HELPER_1(neon_widen_u16, i64, i32) +DEF_HELPER_1(neon_widen_s16, i64, i32) + +DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64) +DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64) +DEF_HELPER_3(neon_addl_saturate_s32, i64, env, i64, i64) +DEF_HELPER_3(neon_addl_saturate_s64, i64, env, i64, i64) +DEF_HELPER_2(neon_abdl_u16, i64, i32, i32) +DEF_HELPER_2(neon_abdl_s16, i64, i32, i32) +DEF_HELPER_2(neon_abdl_u32, i64, i32, i32) +DEF_HELPER_2(neon_abdl_s32, i64, i32, i32) +DEF_HELPER_2(neon_abdl_u64, i64, i32, i32) +DEF_HELPER_2(neon_abdl_s64, i64, i32, i32) +DEF_HELPER_2(neon_mull_u8, i64, i32, i32) +DEF_HELPER_2(neon_mull_s8, i64, i32, i32) +DEF_HELPER_2(neon_mull_u16, i64, i32, i32) +DEF_HELPER_2(neon_mull_s16, i64, i32, i32) + +DEF_HELPER_1(neon_negl_u16, i64, i64) +DEF_HELPER_1(neon_negl_u32, i64, i64) + +DEF_HELPER_FLAGS_2(neon_qabs_s8, TCG_CALL_NO_RWG, i32, env, i32) +DEF_HELPER_FLAGS_2(neon_qabs_s16, TCG_CALL_NO_RWG, i32, env, i32) +DEF_HELPER_FLAGS_2(neon_qabs_s32, TCG_CALL_NO_RWG, i32, env, i32) +DEF_HELPER_FLAGS_2(neon_qabs_s64, TCG_CALL_NO_RWG, i64, env, i64) +DEF_HELPER_FLAGS_2(neon_qneg_s8, TCG_CALL_NO_RWG, i32, env, i32) +DEF_HELPER_FLAGS_2(neon_qneg_s16, TCG_CALL_NO_RWG, i32, env, i32) +DEF_HELPER_FLAGS_2(neon_qneg_s32, TCG_CALL_NO_RWG, i32, env, i32) +DEF_HELPER_FLAGS_2(neon_qneg_s64, TCG_CALL_NO_RWG, i64, env, i64) + +DEF_HELPER_3(neon_ceq_f32, i32, i32, i32, fpst) +DEF_HELPER_3(neon_cge_f32, i32, i32, i32, fpst) +DEF_HELPER_3(neon_cgt_f32, i32, i32, i32, fpst) +DEF_HELPER_3(neon_acge_f32, i32, i32, i32, fpst) +DEF_HELPER_3(neon_acgt_f32, i32, i32, i32, fpst) +DEF_HELPER_3(neon_acge_f64, i64, i64, i64, fpst) +DEF_HELPER_3(neon_acgt_f64, i64, i64, i64, fpst) + +/* iwmmxt_helper.c */ +DEF_HELPER_2(iwmmxt_maddsq, i64, i64, i64) +DEF_HELPER_2(iwmmxt_madduq, i64, i64, i64) +DEF_HELPER_2(iwmmxt_sadb, i64, i64, i64) +DEF_HELPER_2(iwmmxt_sadw, i64, i64, i64) +DEF_HELPER_2(iwmmxt_mulslw, i64, i64, i64) +DEF_HELPER_2(iwmmxt_mulshw, i64, i64, i64) +DEF_HELPER_2(iwmmxt_mululw, i64, i64, i64) +DEF_HELPER_2(iwmmxt_muluhw, i64, i64, i64) +DEF_HELPER_2(iwmmxt_macsw, i64, i64, i64) +DEF_HELPER_2(iwmmxt_macuw, i64, i64, i64) +DEF_HELPER_1(iwmmxt_setpsr_nz, i32, i64) + +#define DEF_IWMMXT_HELPER_SIZE_ENV(name) \ +DEF_HELPER_3(iwmmxt_##name##b, i64, env, i64, i64) \ +DEF_HELPER_3(iwmmxt_##name##w, i64, env, i64, i64) \ +DEF_HELPER_3(iwmmxt_##name##l, i64, env, i64, i64) \ + +DEF_IWMMXT_HELPER_SIZE_ENV(unpackl) +DEF_IWMMXT_HELPER_SIZE_ENV(unpackh) + +DEF_HELPER_2(iwmmxt_unpacklub, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackluw, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpacklul, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackhub, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackhuw, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackhul, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpacklsb, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpacklsw, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpacklsl, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackhsb, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackhsw, i64, env, i64) +DEF_HELPER_2(iwmmxt_unpackhsl, i64, env, i64) + +DEF_IWMMXT_HELPER_SIZE_ENV(cmpeq) +DEF_IWMMXT_HELPER_SIZE_ENV(cmpgtu) +DEF_IWMMXT_HELPER_SIZE_ENV(cmpgts) + +DEF_IWMMXT_HELPER_SIZE_ENV(mins) +DEF_IWMMXT_HELPER_SIZE_ENV(minu) +DEF_IWMMXT_HELPER_SIZE_ENV(maxs) +DEF_IWMMXT_HELPER_SIZE_ENV(maxu) + +DEF_IWMMXT_HELPER_SIZE_ENV(subn) +DEF_IWMMXT_HELPER_SIZE_ENV(addn) +DEF_IWMMXT_HELPER_SIZE_ENV(subu) +DEF_IWMMXT_HELPER_SIZE_ENV(addu) +DEF_IWMMXT_HELPER_SIZE_ENV(subs) +DEF_IWMMXT_HELPER_SIZE_ENV(adds) + +DEF_HELPER_3(iwmmxt_avgb0, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_avgb1, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_avgw0, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_avgw1, i64, env, i64, i64) + +DEF_HELPER_3(iwmmxt_align, i64, i64, i64, i32) +DEF_HELPER_4(iwmmxt_insr, i64, i64, i32, i32, i32) + +DEF_HELPER_1(iwmmxt_bcstb, i64, i32) +DEF_HELPER_1(iwmmxt_bcstw, i64, i32) +DEF_HELPER_1(iwmmxt_bcstl, i64, i32) + +DEF_HELPER_1(iwmmxt_addcb, i64, i64) +DEF_HELPER_1(iwmmxt_addcw, i64, i64) +DEF_HELPER_1(iwmmxt_addcl, i64, i64) + +DEF_HELPER_1(iwmmxt_msbb, i32, i64) +DEF_HELPER_1(iwmmxt_msbw, i32, i64) +DEF_HELPER_1(iwmmxt_msbl, i32, i64) + +DEF_HELPER_3(iwmmxt_srlw, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_srll, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_srlq, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_sllw, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_slll, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_sllq, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_sraw, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_sral, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_sraq, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_rorw, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_rorl, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_rorq, i64, env, i64, i32) +DEF_HELPER_3(iwmmxt_shufh, i64, env, i64, i32) + +DEF_HELPER_3(iwmmxt_packuw, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_packul, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_packuq, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_packsw, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_packsl, i64, env, i64, i64) +DEF_HELPER_3(iwmmxt_packsq, i64, env, i64, i64) + +DEF_HELPER_3(iwmmxt_muladdsl, i64, i64, i32, i32) +DEF_HELPER_3(iwmmxt_muladdsw, i64, i64, i32, i32) +DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32) + +DEF_HELPER_FLAGS_2(neon_unzip8, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_unzip16, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_qunzip8, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_qunzip16, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_qunzip32, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_zip8, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_zip16, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_qzip8, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_qzip16, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_2(neon_qzip32, TCG_CALL_NO_RWG, void, ptr, ptr) + +DEF_HELPER_FLAGS_4(crypto_aese, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_aesd, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(crypto_aesmc, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(crypto_aesimc, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(crypto_sha1su0, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha1c, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha1p, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha1m, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(crypto_sha1h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(crypto_sha1su1, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(crypto_sha256h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha256h2, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(crypto_sha256su0, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha256su1, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(crypto_sha512h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha512h2, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(crypto_sha512su0, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sha512su1, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(crypto_sm3tt1a, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sm3tt1b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sm3tt2a, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sm3tt2b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sm3partw1, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sm3partw2, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(crypto_sm4e, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(crypto_sm4ekey, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(crypto_rax1, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) +DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) + +DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve2_sqrdmlah_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlah_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlah_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlah_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_sdot_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_udot_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sdot_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_udot_4h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usdot_4b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_sdot_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_udot_2h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_sdot_idx_4b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_udot_idx_4b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sdot_idx_4h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_udot_idx_4h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sudot_idx_4b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usdot_idx_4b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_sdot_idx_2h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_udot_idx_2h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fcadds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fcaddd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_6(gvec_fcmlah, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fcmlah_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fcmlas, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fcmlas_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fcmlad, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_sstoh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_sitos, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_ustoh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_uitos, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_tosszh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_tosizs, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_touszh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_touizs, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_vcvt_sf, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_uf, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rz_fs, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rz_fu, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_vcvt_sh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_uh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rz_hs, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rz_hu, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_vcvt_sd, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_ud, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rz_ds, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rz_du, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_vcvt_rm_sd, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rm_ud, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rm_ss, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rm_us, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rm_sh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vcvt_rm_uh, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_vrint_rm_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vrint_rm_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_vrintx_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_vrintx_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_frecpe_rpres_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_frsqrte_rpres_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_fcgt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fcgt0_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fcgt0_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_fcge0_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fcge0_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fcge0_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_fceq0_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fceq0_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fceq0_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_fcle0_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fcle0_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fcle0_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_fclt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fclt0_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_4(gvec_fclt0_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fadd_b16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_bfadd, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fsub_b16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fsub_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fsub_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_bfsub, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmul_b16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_ah_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_fabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fceq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fceq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fceq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fcge_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fcge_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fcge_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fcgt_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fcgt_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fcgt_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_facge_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_facge_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_facge_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_facgt_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_facgt_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_facgt_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmax_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmax_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmax_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmin_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmin_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmin_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmaxnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxnum_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminnum_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_recps_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_recps_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_rsqrts_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmla_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmla_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmls_nf_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmls_nf_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_vfma_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_vfma_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_vfma_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_bfmla, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_bfmls, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_ah_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ah_bfmls, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmul_idx_b16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmla_nf_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmls_nf_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_bfmla_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_6(gvec_fmls_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fmls_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_fmls_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_bfmls_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_ah_bfmls_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_uqadd_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqadd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqadd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqadd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqadd_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqadd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqadd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqadd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqsub_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqsub_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqsub_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uqsub_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqsub_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqsub_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqsub_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sqsub_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usqadd_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usqadd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usqadd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usqadd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_suqadd_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_suqadd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_suqadd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_suqadd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fmlal_a32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_a64, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a64, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_2(frint32_s, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(frint64_s, TCG_CALL_NO_RWG, f32, f32, fpst) +DEF_HELPER_FLAGS_2(frint32_d, TCG_CALL_NO_RWG, f64, f64, fpst) +DEF_HELPER_FLAGS_2(frint64_d, TCG_CALL_NO_RWG, f64, f64, fpst) + +DEF_HELPER_FLAGS_3(gvec_ceq0_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ceq0_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_clt0_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_clt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_cle0_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_cle0_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_cgt0_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_cgt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_cge0_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_cge0_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_smulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_smulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_smulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_smulh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_umulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_umulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_umulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_umulh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_sshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_ssra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ssra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ssra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ssra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_usra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_usra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_usra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_usra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_srshr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_srshr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_srshr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_srshr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_urshr_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_urshr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_urshr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_urshr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_srsra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_srsra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_srsra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_srsra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_ursra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ursra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ursra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ursra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_sri_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sri_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sri_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sri_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_sli_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sli_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sli_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sli_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_sabd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_uabd_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_saba_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_saba_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_saba_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_saba_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_uaba_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uaba_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uaba_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uaba_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_mul_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul_idx_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul_idx_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_mla_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_mla_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_mla_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_mls_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_mls_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_mls_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(neon_sqdmulh_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(neon_sqdmulh_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(neon_sqrdmulh_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(neon_sqrdmulh_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(neon_sqdmulh_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(neon_sqdmulh_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(neon_sqrdmulh_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(neon_sqrdmulh_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(neon_sqrdmlah_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(neon_sqrdmlah_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(neon_sqrdmlsh_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(neon_sqrdmlsh_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2_sqdmulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqdmulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqdmulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqdmulh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2_sqdmulh_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqdmulh_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqdmulh_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve2_sqrdmulh_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve2_fmlal_zzzw_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_6(sve2_fmlal_zzxw_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(gvec_xar_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_smmla_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_ummla_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_usmmla_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(gvec_bfdot, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_6(gvec_bfdot_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_6(sme2_bfvdot_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_6(gvec_bfmmla, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_bfmlsl, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_bfmlsl_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_sclamp_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sclamp_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sclamp_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_sclamp_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_uclamp_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uclamp_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uclamp_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_uclamp_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_faddp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_faddp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_faddp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmaxp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fminp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fmaxnump_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxnump_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fmaxnump_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_5(gvec_fminnump_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminnump_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) +DEF_HELPER_FLAGS_5(gvec_fminnump_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32) + +DEF_HELPER_FLAGS_4(gvec_addp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_addp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_addp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_addp_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_smaxp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_smaxp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_smaxp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_sminp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sminp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sminp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_umaxp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_umaxp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_umaxp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_uminp_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uminp_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_uminp_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_urecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_ursqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sme2_luti2_1b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti2_1h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti2_1s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(sme2_luti2_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti2_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti2_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(sme2_luti2_4b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti2_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti2_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(sme2_luti4_1b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti4_1h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti4_1s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(sme2_luti4_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti4_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti4_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) + +DEF_HELPER_FLAGS_4(sme2_luti4_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) +DEF_HELPER_FLAGS_4(sme2_luti4_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32) diff --git a/target/arm/tcg/hflags.c b/target/arm/tcg/hflags.c index 8d79b8b..59ab526 100644 --- a/target/arm/tcg/hflags.c +++ b/target/arm/tcg/hflags.c @@ -9,9 +9,13 @@ #include "cpu.h" #include "internals.h" #include "cpu-features.h" -#include "exec/helper-proto.h" +#include "exec/translation-block.h" +#include "accel/tcg/cpu-ops.h" #include "cpregs.h" +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" + static inline bool fgt_svc(CPUARMState *env, int el) { /* @@ -210,6 +214,31 @@ static CPUARMTBFlags rebuild_hflags_a32(CPUARMState *env, int fp_el, return rebuild_hflags_common_32(env, fp_el, mmu_idx, flags); } +/* + * Return the exception level to which exceptions should be taken for ZT0. + * C.f. the ARM pseudocode function CheckSMEZT0Enabled, after the ZA check. + */ +static int zt0_exception_el(CPUARMState *env, int el) +{ +#ifndef CONFIG_USER_ONLY + if (el <= 1 + && !el_is_in_host(env, el) + && !FIELD_EX64(env->vfp.smcr_el[1], SMCR, EZT0)) { + return 1; + } + if (el <= 2 + && arm_is_el2_enabled(env) + && !FIELD_EX64(env->vfp.smcr_el[2], SMCR, EZT0)) { + return 2; + } + if (arm_feature(env, ARM_FEATURE_EL3) + && !FIELD_EX64(env->vfp.smcr_el[3], SMCR, EZT0)) { + return 3; + } +#endif + return 0; +} + static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, ARMMMUIdx mmu_idx) { @@ -265,7 +294,14 @@ static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, DP_TBFLAG_A64(flags, PSTATE_SM, 1); DP_TBFLAG_A64(flags, SME_TRAP_NONSTREAMING, !sme_fa64(env, el)); } - DP_TBFLAG_A64(flags, PSTATE_ZA, FIELD_EX64(env->svcr, SVCR, ZA)); + + if (FIELD_EX64(env->svcr, SVCR, ZA)) { + DP_TBFLAG_A64(flags, PSTATE_ZA, 1); + if (cpu_isar_feature(aa64_sme2, env_archcpu(env))) { + int zt0_el = zt0_exception_el(env, el); + DP_TBFLAG_A64(flags, ZT0EXC_EL, zt0_el); + } + } } sctlr = regime_sctlr(env, stage1); @@ -498,7 +534,7 @@ void HELPER(rebuild_hflags_a64)(CPUARMState *env, int el) env->hflags = rebuild_hflags_a64(env, el, fp_el, mmu_idx); } -void assert_hflags_rebuild_correctly(CPUARMState *env) +static void assert_hflags_rebuild_correctly(CPUARMState *env) { #ifdef CONFIG_DEBUG_TCG CPUARMTBFlags c = env->hflags; @@ -506,10 +542,123 @@ void assert_hflags_rebuild_correctly(CPUARMState *env) if (unlikely(c.flags != r.flags || c.flags2 != r.flags2)) { fprintf(stderr, "TCG hflags mismatch " - "(current:(0x%08x,0x" TARGET_FMT_lx ")" - " rebuilt:(0x%08x,0x" TARGET_FMT_lx ")\n", + "(current:(0x%08x,0x%016" PRIx64 ")" + " rebuilt:(0x%08x,0x%016" PRIx64 ")\n", c.flags, c.flags2, r.flags, r.flags2); abort(); } #endif } + +static bool mve_no_pred(CPUARMState *env) +{ + /* + * Return true if there is definitely no predication of MVE + * instructions by VPR or LTPSIZE. (Returning false even if there + * isn't any predication is OK; generated code will just be + * a little worse.) + * If the CPU does not implement MVE then this TB flag is always 0. + * + * NOTE: if you change this logic, the "recalculate s->mve_no_pred" + * logic in gen_update_fp_context() needs to be updated to match. + * + * We do not include the effect of the ECI bits here -- they are + * tracked in other TB flags. This simplifies the logic for + * "when did we emit code that changes the MVE_NO_PRED TB flag + * and thus need to end the TB?". + */ + if (cpu_isar_feature(aa32_mve, env_archcpu(env))) { + return false; + } + if (env->v7m.vpr) { + return false; + } + if (env->v7m.ltpsize < 4) { + return false; + } + return true; +} + +TCGTBCPUState arm_get_tb_cpu_state(CPUState *cs) +{ + CPUARMState *env = cpu_env(cs); + CPUARMTBFlags flags; + vaddr pc; + + assert_hflags_rebuild_correctly(env); + flags = env->hflags; + + if (EX_TBFLAG_ANY(flags, AARCH64_STATE)) { + pc = env->pc; + if (cpu_isar_feature(aa64_bti, env_archcpu(env))) { + DP_TBFLAG_A64(flags, BTYPE, env->btype); + } + } else { + pc = env->regs[15]; + + if (arm_feature(env, ARM_FEATURE_M)) { + if (arm_feature(env, ARM_FEATURE_M_SECURITY) && + FIELD_EX32(env->v7m.fpccr[M_REG_S], V7M_FPCCR, S) + != env->v7m.secure) { + DP_TBFLAG_M32(flags, FPCCR_S_WRONG, 1); + } + + if ((env->v7m.fpccr[env->v7m.secure] & R_V7M_FPCCR_ASPEN_MASK) && + (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) || + (env->v7m.secure && + !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)))) { + /* + * ASPEN is set, but FPCA/SFPA indicate that there is no + * active FP context; we must create a new FP context before + * executing any FP insn. + */ + DP_TBFLAG_M32(flags, NEW_FP_CTXT_NEEDED, 1); + } + + bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + if (env->v7m.fpccr[is_secure] & R_V7M_FPCCR_LSPACT_MASK) { + DP_TBFLAG_M32(flags, LSPACT, 1); + } + + if (mve_no_pred(env)) { + DP_TBFLAG_M32(flags, MVE_NO_PRED, 1); + } + } else { + /* + * Note that XSCALE_CPAR shares bits with VECSTRIDE. + * Note that VECLEN+VECSTRIDE are RES0 for M-profile. + */ + if (arm_feature(env, ARM_FEATURE_XSCALE)) { + DP_TBFLAG_A32(flags, XSCALE_CPAR, env->cp15.c15_cpar); + } else { + DP_TBFLAG_A32(flags, VECLEN, env->vfp.vec_len); + DP_TBFLAG_A32(flags, VECSTRIDE, env->vfp.vec_stride); + } + if (env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)) { + DP_TBFLAG_A32(flags, VFPEN, 1); + } + } + + DP_TBFLAG_AM32(flags, THUMB, env->thumb); + DP_TBFLAG_AM32(flags, CONDEXEC, env->condexec_bits); + } + + /* + * The SS_ACTIVE and PSTATE_SS bits correspond to the state machine + * states defined in the ARM ARM for software singlestep: + * SS_ACTIVE PSTATE.SS State + * 0 x Inactive (the TB flag for SS is always 0) + * 1 0 Active-pending + * 1 1 Active-not-pending + * SS_ACTIVE is set in hflags; PSTATE__SS is computed every TB. + */ + if (EX_TBFLAG_ANY(flags, SS_ACTIVE) && (env->pstate & PSTATE_SS)) { + DP_TBFLAG_ANY(flags, PSTATE__SS, 1); + } + + return (TCGTBCPUState){ + .pc = pc, + .flags = flags.flags, + .cs_base = flags.flags2, + }; +} diff --git a/target/arm/tcg/iwmmxt_helper.c b/target/arm/tcg/iwmmxt_helper.c index 610b1b2..ba054b6 100644 --- a/target/arm/tcg/iwmmxt_helper.c +++ b/target/arm/tcg/iwmmxt_helper.c @@ -22,7 +22,9 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "exec/helper-proto.h" + +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" /* iwMMXt macros extracted from GNU gdb. */ diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c index f7354f3..28307b5 100644 --- a/target/arm/tcg/m_helper.c +++ b/target/arm/tcg/m_helper.c @@ -15,10 +15,9 @@ #include "qemu/main-loop.h" #include "qemu/bitops.h" #include "qemu/log.h" -#include "exec/exec-all.h" #include "exec/page-protection.h" #ifdef CONFIG_TCG -#include "exec/cpu_ldst.h" +#include "accel/tcg/cpu-ldst.h" #include "semihosting/common-semi.h" #endif #if !defined(CONFIG_USER_ONLY) @@ -633,8 +632,11 @@ void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) } /* Note that these stores can throw exceptions on MPU faults */ - cpu_stl_data_ra(env, sp, nextinst, GETPC()); - cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC()); + ARMMMUIdx mmu_idx = arm_mmu_idx(env); + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, + arm_to_core_mmu_idx(mmu_idx)); + cpu_stl_mmu(env, sp, nextinst, oi, GETPC()); + cpu_stl_mmu(env, sp + 4, saved_psr, oi, GETPC()); env->regs[13] = sp; env->regs[14] = 0xfeffffff; @@ -1049,6 +1051,9 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK; uintptr_t ra = GETPC(); + ARMMMUIdx mmu_idx = arm_mmu_idx(env); + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, + arm_to_core_mmu_idx(mmu_idx)); assert(env->v7m.secure); @@ -1074,7 +1079,7 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) * Note that we do not use v7m_stack_write() here, because the * accesses should not set the FSR bits for stacking errors if they * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK - * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions + * or AccType_LAZYFP). Faults in cpu_stl_mmu() will throw exceptions * and longjmp out. */ if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { @@ -1090,12 +1095,12 @@ void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) if (i >= 16) { faddr += 8; /* skip the slot for the FPSCR */ } - cpu_stl_data_ra(env, faddr, slo, ra); - cpu_stl_data_ra(env, faddr + 4, shi, ra); + cpu_stl_mmu(env, faddr, slo, oi, ra); + cpu_stl_mmu(env, faddr + 4, shi, oi, ra); } - cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra); + cpu_stl_mmu(env, fptr + 0x40, vfp_get_fpscr(env), oi, ra); if (cpu_isar_feature(aa32_mve, cpu)) { - cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra); + cpu_stl_mmu(env, fptr + 0x44, env->v7m.vpr, oi, ra); } /* @@ -1122,6 +1127,9 @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) { ARMCPU *cpu = env_archcpu(env); uintptr_t ra = GETPC(); + ARMMMUIdx mmu_idx = arm_mmu_idx(env); + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, + arm_to_core_mmu_idx(mmu_idx)); /* fptr is the value of Rn, the frame pointer we load the FP regs from */ assert(env->v7m.secure); @@ -1156,16 +1164,16 @@ void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) faddr += 8; /* skip the slot for the FPSCR and VPR */ } - slo = cpu_ldl_data_ra(env, faddr, ra); - shi = cpu_ldl_data_ra(env, faddr + 4, ra); + slo = cpu_ldl_mmu(env, faddr, oi, ra); + shi = cpu_ldl_mmu(env, faddr + 4, oi, ra); dn = (uint64_t) shi << 32 | slo; *aa32_vfp_dreg(env, i / 2) = dn; } - fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra); + fpscr = cpu_ldl_mmu(env, fptr + 0x40, oi, ra); vfp_set_fpscr(env, fpscr); if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra); + env->v7m.vpr = cpu_ldl_mmu(env, fptr + 0x44, oi, ra); } } @@ -1938,7 +1946,7 @@ static bool do_v7m_function_return(ARMCPU *cpu) * do them as secure, so work out what MMU index that is. */ mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); - oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx)); + oi = make_memop_idx(MO_LEUL | MO_ALIGN, arm_to_core_mmu_idx(mmu_idx)); newpc = cpu_ldl_mmu(env, frameptr, oi, 0); newpsr = cpu_ldl_mmu(env, frameptr + 4, oi, 0); diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index dd12cce..895facd 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -30,18 +30,10 @@ arm_ss.add(files( 'translate-mve.c', 'translate-neon.c', 'translate-vfp.c', - 'crypto_helper.c', - 'hflags.c', - 'iwmmxt_helper.c', 'm_helper.c', 'mve_helper.c', - 'neon_helper.c', 'op_helper.c', - 'tlb_helper.c', 'vec_helper.c', - 'tlb-insns.c', - 'arith_helper.c', - 'vfp_helper.c', )) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( @@ -63,3 +55,27 @@ arm_system_ss.add(files( arm_system_ss.add(when: 'CONFIG_ARM_V7M', if_true: files('cpu-v7m.c')) arm_user_ss.add(when: 'TARGET_AARCH64', if_false: files('cpu-v7m.c')) + +arm_common_ss.add(zlib) + +arm_common_ss.add(files( + 'arith_helper.c', + 'crypto_helper.c', +)) + +arm_common_system_ss.add(files( + 'cpregs-at.c', + 'hflags.c', + 'iwmmxt_helper.c', + 'neon_helper.c', + 'tlb_helper.c', + 'tlb-insns.c', + 'vfp_helper.c', +)) +arm_user_ss.add(files( + 'hflags.c', + 'iwmmxt_helper.c', + 'neon_helper.c', + 'tlb_helper.c', + 'vfp_helper.c', +)) diff --git a/target/arm/tcg/mte_helper.c b/target/arm/tcg/mte_helper.c index 5d6d8a1..0efc18a 100644 --- a/target/arm/tcg/mte_helper.c +++ b/target/arm/tcg/mte_helper.c @@ -21,22 +21,22 @@ #include "qemu/log.h" #include "cpu.h" #include "internals.h" -#include "exec/exec-all.h" #include "exec/page-protection.h" #ifdef CONFIG_USER_ONLY #include "user/cpu_loop.h" #include "user/page-protection.h" #else -#include "exec/ram_addr.h" +#include "system/ram_addr.h" #endif -#include "exec/cpu_ldst.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/probe.h" #include "exec/helper-proto.h" +#include "exec/tlb-flags.h" #include "accel/tcg/cpu-ops.h" #include "qapi/error.h" #include "qemu/guest-random.h" #include "mte_helper.h" - static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude) { if (exclude == 0xffff) { @@ -62,6 +62,7 @@ uint8_t *allocation_tag_mem_probe(CPUARMState *env, int ptr_mmu_idx, bool probe, uintptr_t ra) { #ifdef CONFIG_USER_ONLY + const size_t page_data_size = TARGET_PAGE_SIZE >> (LOG2_TAG_GRANULE + 1); uint64_t clean_ptr = useronly_clean_ptr(ptr); int flags = page_get_flags(clean_ptr); uint8_t *tags; @@ -82,7 +83,7 @@ uint8_t *allocation_tag_mem_probe(CPUARMState *env, int ptr_mmu_idx, return NULL; } - tags = page_get_target_data(clean_ptr); + tags = page_get_target_data(clean_ptr, page_data_size); index = extract32(ptr, LOG2_TAG_GRANULE + 1, TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1); diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c index 274003e..63ddcf3 100644 --- a/target/arm/tcg/mve_helper.c +++ b/target/arm/tcg/mve_helper.c @@ -22,8 +22,7 @@ #include "internals.h" #include "vec_internal.h" #include "exec/helper-proto.h" -#include "exec/cpu_ldst.h" -#include "exec/exec-all.h" +#include "accel/tcg/cpu-ldst.h" #include "tcg/tcg.h" #include "fpu/softfloat.h" #include "crypto/clmul.h" @@ -149,13 +148,15 @@ static void mve_advance_vpt(CPUARMState *env) } /* For loads, predicated lanes are zeroed instead of keeping their old values */ -#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \ +#define DO_VLDR(OP, MFLAG, MSIZE, MTYPE, LDTYPE, ESIZE, TYPE) \ void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ { \ TYPE *d = vd; \ uint16_t mask = mve_element_mask(env); \ uint16_t eci_mask = mve_eci_mask(env); \ unsigned b, e; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \ /* \ * R_SXTM allows the dest reg to become UNKNOWN for abandoned \ * beats so we don't care if we update part of the dest and \ @@ -164,46 +165,48 @@ static void mve_advance_vpt(CPUARMState *env) for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ if (eci_mask & (1 << b)) { \ d[H##ESIZE(e)] = (mask & (1 << b)) ? \ - cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ + (MTYPE)cpu_##LDTYPE##_mmu(env, addr, oi, GETPC()) : 0;\ } \ addr += MSIZE; \ } \ mve_advance_vpt(env); \ } -#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ +#define DO_VSTR(OP, MFLAG, MSIZE, STTYPE, ESIZE, TYPE) \ void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ { \ TYPE *d = vd; \ uint16_t mask = mve_element_mask(env); \ unsigned b, e; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \ for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ if (mask & (1 << b)) { \ - cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + cpu_##STTYPE##_mmu(env, addr, d[H##ESIZE(e)], oi, GETPC()); \ } \ addr += MSIZE; \ } \ mve_advance_vpt(env); \ } -DO_VLDR(vldrb, 1, ldub, 1, uint8_t) -DO_VLDR(vldrh, 2, lduw, 2, uint16_t) -DO_VLDR(vldrw, 4, ldl, 4, uint32_t) +DO_VLDR(vldrb, MO_UB, 1, uint8_t, ldb, 1, uint8_t) +DO_VLDR(vldrh, MO_TEUW, 2, uint16_t, ldw, 2, uint16_t) +DO_VLDR(vldrw, MO_TEUL, 4, uint32_t, ldl, 4, uint32_t) -DO_VSTR(vstrb, 1, stb, 1, uint8_t) -DO_VSTR(vstrh, 2, stw, 2, uint16_t) -DO_VSTR(vstrw, 4, stl, 4, uint32_t) +DO_VSTR(vstrb, MO_UB, 1, stb, 1, uint8_t) +DO_VSTR(vstrh, MO_TEUW, 2, stw, 2, uint16_t) +DO_VSTR(vstrw, MO_TEUL, 4, stl, 4, uint32_t) -DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) -DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) -DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) -DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) -DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) -DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) +DO_VLDR(vldrb_sh, MO_SB, 1, int8_t, ldb, 2, int16_t) +DO_VLDR(vldrb_sw, MO_SB, 1, int8_t, ldb, 4, int32_t) +DO_VLDR(vldrb_uh, MO_UB, 1, uint8_t, ldb, 2, uint16_t) +DO_VLDR(vldrb_uw, MO_UB, 1, uint8_t, ldb, 4, uint32_t) +DO_VLDR(vldrh_sw, MO_TESW, 2, int16_t, ldw, 4, int32_t) +DO_VLDR(vldrh_uw, MO_TEUW, 2, uint16_t, ldw, 4, uint32_t) -DO_VSTR(vstrb_h, 1, stb, 2, int16_t) -DO_VSTR(vstrb_w, 1, stb, 4, int32_t) -DO_VSTR(vstrh_w, 2, stw, 4, int32_t) +DO_VSTR(vstrb_h, MO_UB, 1, stb, 2, int16_t) +DO_VSTR(vstrb_w, MO_UB, 1, stb, 4, int32_t) +DO_VSTR(vstrh_w, MO_TEUW, 2, stw, 4, int32_t) #undef DO_VLDR #undef DO_VSTR @@ -215,7 +218,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) * For loads, predicated lanes are zeroed instead of retaining * their previous values. */ -#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \ +#define DO_VLDR_SG(OP, MFLAG, MTYPE, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB)\ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ uint32_t base) \ { \ @@ -225,13 +228,15 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) uint16_t eci_mask = mve_eci_mask(env); \ unsigned e; \ uint32_t addr; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ if (!(eci_mask & 1)) { \ continue; \ } \ addr = ADDRFN(base, m[H##ESIZE(e)]); \ d[H##ESIZE(e)] = (mask & 1) ? \ - cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ + (MTYPE)cpu_##LDTYPE##_mmu(env, addr, oi, GETPC()) : 0; \ if (WB) { \ m[H##ESIZE(e)] = addr; \ } \ @@ -240,7 +245,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) } /* We know here TYPE is unsigned so always the same as the offset type */ -#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \ +#define DO_VSTR_SG(OP, MFLAG, STTYPE, ESIZE, TYPE, ADDRFN, WB) \ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ uint32_t base) \ { \ @@ -250,13 +255,15 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) uint16_t eci_mask = mve_eci_mask(env); \ unsigned e; \ uint32_t addr; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx); \ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ if (!(eci_mask & 1)) { \ continue; \ } \ addr = ADDRFN(base, m[H##ESIZE(e)]); \ if (mask & 1) { \ - cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + cpu_##STTYPE##_mmu(env, addr, d[H##ESIZE(e)], oi, GETPC()); \ } \ if (WB) { \ m[H##ESIZE(e)] = addr; \ @@ -283,13 +290,15 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) uint16_t eci_mask = mve_eci_mask(env); \ unsigned e; \ uint32_t addr; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ if (!(eci_mask & 1)) { \ continue; \ } \ addr = ADDRFN(base, m[H4(e & ~1)]); \ addr += 4 * (e & 1); \ - d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \ + d[H4(e)] = (mask & 1) ? cpu_ldl_mmu(env, addr, oi, GETPC()) : 0; \ if (WB && (e & 1)) { \ m[H4(e & ~1)] = addr - 4; \ } \ @@ -307,6 +316,8 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) uint16_t eci_mask = mve_eci_mask(env); \ unsigned e; \ uint32_t addr; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ if (!(eci_mask & 1)) { \ continue; \ @@ -314,7 +325,7 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) addr = ADDRFN(base, m[H4(e & ~1)]); \ addr += 4 * (e & 1); \ if (mask & 1) { \ - cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \ + cpu_stl_mmu(env, addr, d[H4(e)], oi, GETPC()); \ } \ if (WB && (e & 1)) { \ m[H4(e & ~1)] = addr - 4; \ @@ -328,40 +339,44 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t) #define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2)) #define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3)) -DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_sh, MO_SB, int8_t, ldb, 2, int16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_sw, MO_SB, int8_t, ldb, 4, int32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_sw, MO_TESW, int16_t, ldw, 4, int32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_ub, MO_UB, uint8_t, ldb, 1, uint8_t, uint8_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_uh, MO_UB, uint8_t, ldb, 2, uint16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_uw, MO_UB, uint8_t, ldb, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_uh, MO_TEUW, uint16_t, ldw, 2, uint16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_uw, MO_TEUW, uint16_t, ldw, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrw_sg_uw, MO_TEUL, uint32_t, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false) DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false) +DO_VLDR_SG(vldrh_sg_os_sw, MO_TESW, int16_t, ldw, 4, + int32_t, uint32_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrh_sg_os_uh, MO_TEUW, uint16_t, ldw, 2, + uint16_t, uint16_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrh_sg_os_uw, MO_TEUW, uint16_t, ldw, 4, + uint32_t, uint32_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrw_sg_os_uw, MO_TEUL, uint32_t, ldl, 4, + uint32_t, uint32_t, ADDR_ADD_OSW, false) DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false) -DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false) -DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false) -DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false) -DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_ub, MO_UB, stb, 1, uint8_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uh, MO_UB, stb, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uw, MO_UB, stb, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uh, MO_TEUW, stw, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uw, MO_TEUW, stw, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrw_sg_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD, false) DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false) -DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false) -DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false) +DO_VSTR_SG(vstrh_sg_os_uh, MO_TEUW, stw, 2, uint16_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrh_sg_os_uw, MO_TEUW, stw, 4, uint32_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrw_sg_os_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD_OSW, false) DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false) -DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) +DO_VLDR_SG(vldrw_sg_wb_uw, MO_TEUL, uint32_t, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true) -DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) +DO_VSTR_SG(vstrw_sg_wb_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD, true) DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) /* @@ -388,13 +403,15 @@ DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) uint16_t mask = mve_eci_mask(env); \ static const uint8_t off[4] = { O1, O2, O3, O4 }; \ uint32_t addr, data; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ continue; \ } \ addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + data = cpu_ldl_mmu(env, addr, oi, GETPC()); \ for (e = 0; e < 4; e++, data >>= 8) { \ uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ qd[H1(off[beat])] = data; \ @@ -412,13 +429,15 @@ DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) uint32_t addr, data; \ int y; /* y counts 0 2 0 2 */ \ uint16_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ continue; \ } \ addr = base + off[beat] * 8 + (beat & 1) * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + data = cpu_ldl_mmu(env, addr, oi, GETPC()); \ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ qd[H2(off[beat])] = data; \ data >>= 16; \ @@ -437,13 +456,15 @@ DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) uint32_t addr, data; \ uint32_t *qd; \ int y; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ continue; \ } \ addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + data = cpu_ldl_mmu(env, addr, oi, GETPC()); \ y = (beat + (O1 & 2)) & 3; \ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ qd[H4(off[beat] >> 2)] = data; \ @@ -474,13 +495,15 @@ DO_VLD4W(vld43w, 6, 7, 8, 9) static const uint8_t off[4] = { O1, O2, O3, O4 }; \ uint32_t addr, data; \ uint8_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ continue; \ } \ addr = base + off[beat] * 2; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + data = cpu_ldl_mmu(env, addr, oi, GETPC()); \ for (e = 0; e < 4; e++, data >>= 8) { \ qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ qd[H1(off[beat] + (e >> 1))] = data; \ @@ -498,13 +521,15 @@ DO_VLD4W(vld43w, 6, 7, 8, 9) uint32_t addr, data; \ int e; \ uint16_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ continue; \ } \ addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + data = cpu_ldl_mmu(env, addr, oi, GETPC()); \ for (e = 0; e < 2; e++, data >>= 16) { \ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ qd[H2(off[beat])] = data; \ @@ -521,13 +546,15 @@ DO_VLD4W(vld43w, 6, 7, 8, 9) static const uint8_t off[4] = { O1, O2, O3, O4 }; \ uint32_t addr, data; \ uint32_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ continue; \ } \ addr = base + off[beat]; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + data = cpu_ldl_mmu(env, addr, oi, GETPC()); \ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ qd[H4(off[beat] >> 3)] = data; \ } \ @@ -550,6 +577,8 @@ DO_VLD2W(vld21w, 8, 12, 16, 20) uint16_t mask = mve_eci_mask(env); \ static const uint8_t off[4] = { O1, O2, O3, O4 }; \ uint32_t addr, data; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ @@ -561,7 +590,7 @@ DO_VLD2W(vld21w, 8, 12, 16, 20) uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ data = (data << 8) | qd[H1(off[beat])]; \ } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + cpu_stl_mmu(env, addr, data, oi, GETPC()); \ } \ } @@ -575,6 +604,8 @@ DO_VLD2W(vld21w, 8, 12, 16, 20) uint32_t addr, data; \ int y; /* y counts 0 2 0 2 */ \ uint16_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ @@ -585,7 +616,7 @@ DO_VLD2W(vld21w, 8, 12, 16, 20) data = qd[H2(off[beat])]; \ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ data |= qd[H2(off[beat])] << 16; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + cpu_stl_mmu(env, addr, data, oi, GETPC()); \ } \ } @@ -599,6 +630,8 @@ DO_VLD2W(vld21w, 8, 12, 16, 20) uint32_t addr, data; \ uint32_t *qd; \ int y; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ @@ -608,7 +641,7 @@ DO_VLD2W(vld21w, 8, 12, 16, 20) y = (beat + (O1 & 2)) & 3; \ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ data = qd[H4(off[beat] >> 2)]; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + cpu_stl_mmu(env, addr, data, oi, GETPC()); \ } \ } @@ -636,6 +669,8 @@ DO_VST4W(vst43w, 6, 7, 8, 9) static const uint8_t off[4] = { O1, O2, O3, O4 }; \ uint32_t addr, data; \ uint8_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ @@ -647,7 +682,7 @@ DO_VST4W(vst43w, 6, 7, 8, 9) qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \ } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + cpu_stl_mmu(env, addr, data, oi, GETPC()); \ } \ } @@ -661,6 +696,8 @@ DO_VST4W(vst43w, 6, 7, 8, 9) uint32_t addr, data; \ int e; \ uint16_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ @@ -672,7 +709,7 @@ DO_VST4W(vst43w, 6, 7, 8, 9) qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ data = (data << 16) | qd[H2(off[beat])]; \ } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + cpu_stl_mmu(env, addr, data, oi, GETPC()); \ } \ } @@ -685,6 +722,8 @@ DO_VST4W(vst43w, 6, 7, 8, 9) static const uint8_t off[4] = { O1, O2, O3, O4 }; \ uint32_t addr, data; \ uint32_t *qd; \ + int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env)); \ + MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx); \ for (beat = 0; beat < 4; beat++, mask >>= 4) { \ if ((mask & 1) == 0) { \ /* ECI says skip this beat */ \ @@ -693,7 +732,7 @@ DO_VST4W(vst43w, 6, 7, 8, 9) addr = base + off[beat]; \ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ data = qd[H4(off[beat] >> 3)]; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + cpu_stl_mmu(env, addr, data, oi, GETPC()); \ } \ } @@ -2165,27 +2204,6 @@ DO_VSHLL_ALL(vshllt, true) DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \ DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN) -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. */ - return 0; - } -} - DO_VSHRN_ALL(vshrn, DO_SHR) DO_VSHRN_ALL(vrshrn, do_urshr) diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c index e2cc7cf..8d288f3 100644 --- a/target/arm/tcg/neon_helper.c +++ b/target/arm/tcg/neon_helper.c @@ -9,11 +9,13 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" #include "vec_internal.h" +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" + #define SIGNBIT (uint32_t)0x80000000 #define SIGNBIT64 ((uint64_t)1 << 63) @@ -227,15 +229,30 @@ NEON_GVEC_VOP2(gvec_srshl_h, int16_t) #undef NEON_FN #define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, src2, 16, true, NULL)) +NEON_GVEC_VOP2(sme2_srshl_h, int16_t) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL)) NEON_GVEC_VOP2(gvec_srshl_s, int32_t) #undef NEON_FN #define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, src2, 32, true, NULL)) +NEON_GVEC_VOP2(sme2_srshl_s, int32_t) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL)) NEON_GVEC_VOP2(gvec_srshl_d, int64_t) #undef NEON_FN +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_d(src1, src2, true, NULL)) +NEON_GVEC_VOP2(sme2_srshl_d, int64_t) +#undef NEON_FN + uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) { return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); @@ -259,15 +276,30 @@ NEON_GVEC_VOP2(gvec_urshl_h, uint16_t) #undef NEON_FN #define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int16_t)src2, 16, true, NULL)) +NEON_GVEC_VOP2(sme2_urshl_h, uint16_t) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL)) NEON_GVEC_VOP2(gvec_urshl_s, int32_t) #undef NEON_FN #define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, src2, 32, true, NULL)) +NEON_GVEC_VOP2(sme2_urshl_s, int32_t) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL)) NEON_GVEC_VOP2(gvec_urshl_d, int64_t) #undef NEON_FN +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_d(src1, src2, true, NULL)) +NEON_GVEC_VOP2(sme2_urshl_d, int64_t) +#undef NEON_FN + uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) { return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); diff --git a/target/arm/tcg/op_helper.c b/target/arm/tcg/op_helper.c index 30786fd..575e566 100644 --- a/target/arm/tcg/op_helper.c +++ b/target/arm/tcg/op_helper.c @@ -20,10 +20,11 @@ #include "qemu/main-loop.h" #include "cpu.h" #include "exec/helper-proto.h" +#include "exec/target_page.h" #include "internals.h" #include "cpu-features.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/probe.h" #include "cpregs.h" #define SIGNBIT (uint32_t)0x80000000 @@ -1221,7 +1222,7 @@ uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i) } } -void HELPER(probe_access)(CPUARMState *env, target_ulong ptr, +void HELPER(probe_access)(CPUARMState *env, vaddr ptr, uint32_t access_type, uint32_t mmu_idx, uint32_t size) { diff --git a/target/arm/tcg/pauth_helper.c b/target/arm/tcg/pauth_helper.c index c4b1430..c591c30 100644 --- a/target/arm/tcg/pauth_helper.c +++ b/target/arm/tcg/pauth_helper.c @@ -21,8 +21,7 @@ #include "cpu.h" #include "internals.h" #include "cpu-features.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" +#include "accel/tcg/cpu-ldst.h" #include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" #include "qemu/xxhash.h" diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode index 628804e..6bb9aa2 100644 --- a/target/arm/tcg/sme.decode +++ b/target/arm/tcg/sme.decode @@ -22,30 +22,139 @@ ### SME Misc ZERO 11000000 00 001 00000000000 imm:8 +ZERO_zt0 11000000 01 001 00000000000 00000001 ### SME Move into/from Array %mova_rs 13:2 !function=plus_12 -&mova esz rs pg zr za_imm v:bool to_vec:bool +%mova_rv 13:2 !function=plus_8 +&mova_a rv zr off +&mova_p esz rs pg zr za off v:bool +&mova_t esz rs zr za off v:bool -MOVA 11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4 \ - &mova to_vec=0 rs=%mova_rs -MOVA 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za_imm:4 \ - &mova to_vec=0 rs=%mova_rs esz=4 +MOVA_tz 11000000 00 00000 0 v:1 .. pg:3 zr:5 0 off:4 \ + &mova_p rs=%mova_rs esz=0 za=0 +MOVA_tz 11000000 01 00000 0 v:1 .. pg:3 zr:5 0 za:1 off:3 \ + &mova_p rs=%mova_rs esz=1 +MOVA_tz 11000000 10 00000 0 v:1 .. pg:3 zr:5 0 za:2 off:2 \ + &mova_p rs=%mova_rs esz=2 +MOVA_tz 11000000 11 00000 0 v:1 .. pg:3 zr:5 0 za:3 off:1 \ + &mova_p rs=%mova_rs esz=3 +MOVA_tz 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za:4 \ + &mova_p rs=%mova_rs esz=4 off=0 -MOVA 11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5 \ - &mova to_vec=1 rs=%mova_rs -MOVA 11000000 11 00001 1 v:1 .. pg:3 0 za_imm:4 zr:5 \ - &mova to_vec=1 rs=%mova_rs esz=4 +MOVA_zt 11000000 00 00001 0 v:1 .. pg:3 0 off:4 zr:5 \ + &mova_p rs=%mova_rs esz=0 za=0 +MOVA_zt 11000000 01 00001 0 v:1 .. pg:3 0 za:1 off:3 zr:5 \ + &mova_p rs=%mova_rs esz=1 +MOVA_zt 11000000 10 00001 0 v:1 .. pg:3 0 za:2 off:2 zr:5 \ + &mova_p rs=%mova_rs esz=2 +MOVA_zt 11000000 11 00001 0 v:1 .. pg:3 0 za:3 off:1 zr:5 \ + &mova_p rs=%mova_rs esz=3 +MOVA_zt 11000000 11 00001 1 v:1 .. pg:3 0 za:4 zr:5 \ + &mova_p rs=%mova_rs esz=4 off=0 + +MOVA_tz2 11000000 00 00010 0 v:1 .. 000 zr:4 0 00 off:3 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVA_tz2 11000000 01 00010 0 v:1 .. 000 zr:4 0 00 za:1 off:2 \ + &mova_t rs=%mova_rs esz=1 +MOVA_tz2 11000000 10 00010 0 v:1 .. 000 zr:4 0 00 za:2 off:1 \ + &mova_t rs=%mova_rs esz=2 +MOVA_tz2 11000000 11 00010 0 v:1 .. 000 zr:4 0 00 za:3 \ + &mova_t rs=%mova_rs esz=3 off=0 + +MOVA_zt2 11000000 00 00011 0 v:1 .. 000 00 off:3 zr:4 0 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVA_zt2 11000000 01 00011 0 v:1 .. 000 00 za:1 off:2 zr:4 0 \ + &mova_t rs=%mova_rs esz=1 +MOVA_zt2 11000000 10 00011 0 v:1 .. 000 00 za:2 off:1 zr:4 0 \ + &mova_t rs=%mova_rs esz=2 +MOVA_zt2 11000000 11 00011 0 v:1 .. 000 00 za:3 zr:4 0 \ + &mova_t rs=%mova_rs esz=3 off=0 + +MOVA_tz4 11000000 00 00010 0 v:1 .. 001 zr:3 00 000 off:2 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVA_tz4 11000000 01 00010 0 v:1 .. 001 zr:3 00 000 za:1 off:1 \ + &mova_t rs=%mova_rs esz=1 +MOVA_tz4 11000000 10 00010 0 v:1 .. 001 zr:3 00 000 za:2 \ + &mova_t rs=%mova_rs esz=2 off=0 +MOVA_tz4 11000000 11 00010 0 v:1 .. 001 zr:3 00 00 za:3 \ + &mova_t rs=%mova_rs esz=3 off=0 + +MOVA_zt4 11000000 00 00011 0 v:1 .. 001 000 off:2 zr:3 00 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVA_zt4 11000000 01 00011 0 v:1 .. 001 000 za:1 off:1 zr:3 00 \ + &mova_t rs=%mova_rs esz=1 +MOVA_zt4 11000000 10 00011 0 v:1 .. 001 000 za:2 zr:3 00 \ + &mova_t rs=%mova_rs esz=2 off=0 +MOVA_zt4 11000000 11 00011 0 v:1 .. 001 00 za:3 zr:3 00 \ + &mova_t rs=%mova_rs esz=3 off=0 + +MOVA_az2 11000000 00 00010 00 .. 010 zr:4 000 off:3 \ + &mova_a rv=%mova_rv +MOVA_az4 11000000 00 00010 00 .. 011 zr:3 0000 off:3 \ + &mova_a rv=%mova_rv + +MOVA_za2 11000000 00 00011 00 .. 010 00 off:3 zr:4 0 \ + &mova_a rv=%mova_rv +MOVA_za4 11000000 00 00011 00 .. 011 00 off:3 zr:3 00 \ + &mova_a rv=%mova_rv + +### SME Move and Zero + +MOVAZ_za2 11000000 00000110 0 .. 01010 off:3 zr:4 0 \ + &mova_a rv=%mova_rv +MOVAZ_za4 11000000 00000110 0 .. 01110 off:3 zr:3 00 \ + &mova_a rv=%mova_rv + +MOVAZ_zt 11000000 00 00001 0 v:1 .. 0001 off:4 zr:5 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVAZ_zt 11000000 01 00001 0 v:1 .. 0001 za:1 off:3 zr:5 \ + &mova_t rs=%mova_rs esz=1 +MOVAZ_zt 11000000 10 00001 0 v:1 .. 0001 za:2 off:2 zr:5 \ + &mova_t rs=%mova_rs esz=2 +MOVAZ_zt 11000000 11 00001 0 v:1 .. 0001 za:3 off:1 zr:5 \ + &mova_t rs=%mova_rs esz=3 +MOVAZ_zt 11000000 11 00001 1 v:1 .. 0001 za:4 zr:5 \ + &mova_t rs=%mova_rs esz=4 off=0 + +MOVAZ_zt2 11000000 00 00011 0 v:1 .. 00010 off:3 zr:4 0 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVAZ_zt2 11000000 01 00011 0 v:1 .. 00010 za:1 off:2 zr:4 0 \ + &mova_t rs=%mova_rs esz=1 +MOVAZ_zt2 11000000 10 00011 0 v:1 .. 00010 za:2 off:1 zr:4 0 \ + &mova_t rs=%mova_rs esz=2 +MOVAZ_zt2 11000000 11 00011 0 v:1 .. 00010 za:3 zr:4 0 \ + &mova_t rs=%mova_rs esz=3 off=0 + +MOVAZ_zt4 11000000 00 00011 0 v:1 .. 001100 off:2 zr:3 00 \ + &mova_t rs=%mova_rs esz=0 za=0 +MOVAZ_zt4 11000000 01 00011 0 v:1 .. 001100 za:1 off:1 zr:3 00 \ + &mova_t rs=%mova_rs esz=1 +MOVAZ_zt4 11000000 10 00011 0 v:1 .. 001100 za:2 zr:3 00 \ + &mova_t rs=%mova_rs esz=2 off=0 +MOVAZ_zt4 11000000 11 00011 0 v:1 .. 00110 za:3 zr:3 00 \ + &mova_t rs=%mova_rs esz=3 off=0 + +### SME Move into/from ZT0 + +MOVT_rzt 1100 0000 0100 1100 0 off:3 00 11111 rt:5 +MOVT_ztr 1100 0000 0100 1110 0 off:3 00 11111 rt:5 ### SME Memory -&ldst esz rs pg rn rm za_imm v:bool st:bool +&ldst esz rs pg rn rm za off v:bool st:bool -LDST1 1110000 0 esz:2 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ - &ldst rs=%mova_rs -LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ - &ldst esz=4 rs=%mova_rs +LDST1 1110000 0 00 st:1 rm:5 v:1 .. pg:3 rn:5 0 off:4 \ + &ldst rs=%mova_rs esz=0 za=0 +LDST1 1110000 0 01 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:1 off:3 \ + &ldst rs=%mova_rs esz=1 +LDST1 1110000 0 10 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:2 off:2 \ + &ldst rs=%mova_rs esz=2 +LDST1 1110000 0 11 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:3 off:1 \ + &ldst rs=%mova_rs esz=3 +LDST1 1110000 1 11 st:1 rm:5 v:1 .. pg:3 rn:5 0 za:4 \ + &ldst rs=%mova_rs esz=4 off=0 &ldstr rv rn imm @ldstr ....... ... . ...... .. ... rn:5 . imm:4 \ @@ -54,6 +163,12 @@ LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ LDR 1110000 100 0 000000 .. 000 ..... 0 .... @ldstr STR 1110000 100 1 000000 .. 000 ..... 0 .... @ldstr +&ldstzt0 rn +@ldstzt0 ....... ... . ...... .. ... rn:5 ..... &ldstzt0 + +LDR_zt0 1110000 100 0 111111 00 000 ..... 00000 @ldstzt0 +STR_zt0 1110000 100 1 111111 00 000 ..... 00000 @ldstzt0 + ### SME Add Vector to Array &adda zad zn pm pn @@ -68,14 +183,18 @@ ADDVA_d 11000000 11 01000 1 ... ... ..... 00 ... @adda_64 ### SME Outer Product &op zad zn zm pm pn sub:bool +@op_16 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 ... zad:1 &op @op_32 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 .. zad:2 &op @op_64 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 . zad:3 &op +FMOPA_h 10000001 100 ..... ... ... ..... . 100 . @op_16 FMOPA_s 10000000 100 ..... ... ... ..... . 00 .. @op_32 FMOPA_d 10000000 110 ..... ... ... ..... . 0 ... @op_64 -BFMOPA 10000001 100 ..... ... ... ..... . 00 .. @op_32 -FMOPA_h 10000001 101 ..... ... ... ..... . 00 .. @op_32 +BFMOPA 10000001 101 ..... ... ... ..... . 100 . @op_16 + +BFMOPA_w 10000001 100 ..... ... ... ..... . 00 .. @op_32 +FMOPA_w_h 10000001 101 ..... ... ... ..... . 00 .. @op_32 SMOPA_s 1010000 0 10 0 ..... ... ... ..... . 00 .. @op_32 SUMOPA_s 1010000 0 10 1 ..... ... ... ..... . 00 .. @op_32 @@ -86,3 +205,789 @@ SMOPA_d 1010000 0 11 0 ..... ... ... ..... . 0 ... @op_64 SUMOPA_d 1010000 0 11 1 ..... ... ... ..... . 0 ... @op_64 USMOPA_d 1010000 1 11 0 ..... ... ... ..... . 0 ... @op_64 UMOPA_d 1010000 1 11 1 ..... ... ... ..... . 0 ... @op_64 + +BMOPA 1000000 0 10 0 ..... ... ... ..... . 10 .. @op_32 +SMOPA2_s 1010000 0 10 0 ..... ... ... ..... . 10 .. @op_32 +UMOPA2_s 1010000 1 10 0 ..... ... ... ..... . 10 .. @op_32 + +### SME2 Multi-vector Multiple and Single SVE Destructive + +%zd_ax2 1:4 !function=times_2 +%zd_ax4 2:3 !function=times_4 + +&z2z_en zdn zm esz n +@z2z_2x1 ....... . esz:2 .. zm:4 ....0. ..... .... . \ + &z2z_en n=2 zdn=%zd_ax2 +@z2z_4x1 ....... . esz:2 .. zm:4 ....1. ..... ...0 . \ + &z2z_en n=4 zdn=%zd_ax4 + +SMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 0 @z2z_2x1 +SMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 0 @z2z_4x1 +UMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 1 @z2z_2x1 +UMAX_n1 1100000 1 .. 10 .... 1010.0 00000 .... 1 @z2z_4x1 +SMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 0 @z2z_2x1 +SMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 0 @z2z_4x1 +UMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 1 @z2z_2x1 +UMIN_n1 1100000 1 .. 10 .... 1010.0 00001 .... 1 @z2z_4x1 + +FMAX_n1 1100000 1 .. 10 .... 1010.0 01000 .... 0 @z2z_2x1 +FMAX_n1 1100000 1 .. 10 .... 1010.0 01000 .... 0 @z2z_4x1 +FMIN_n1 1100000 1 .. 10 .... 1010.0 01000 .... 1 @z2z_2x1 +FMIN_n1 1100000 1 .. 10 .... 1010.0 01000 .... 1 @z2z_4x1 +FMAXNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 0 @z2z_2x1 +FMAXNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 0 @z2z_4x1 +FMINNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 1 @z2z_2x1 +FMINNM_n1 1100000 1 .. 10 .... 1010.0 01001 .... 1 @z2z_4x1 + +SRSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 0 @z2z_2x1 +SRSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 0 @z2z_4x1 +URSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 1 @z2z_2x1 +URSHL_n1 1100000 1 .. 10 .... 1010.0 10001 .... 1 @z2z_4x1 + +ADD_n1 1100000 1 .. 10 .... 1010.0 11000 .... 0 @z2z_2x1 +ADD_n1 1100000 1 .. 10 .... 1010.0 11000 .... 0 @z2z_4x1 + +SQDMULH_n1 1100000 1 .. 10 .... 1010.1 00000 .... 0 @z2z_2x1 +SQDMULH_n1 1100000 1 .. 10 .... 1010.1 00000 .... 0 @z2z_4x1 + +### SME2 Multi-vector Multiple Vectors SVE Destructive + +%zm_ax2 17:4 !function=times_2 +%zm_ax4 18:3 !function=times_4 + +@z2z_2x2 ....... . esz:2 . ....0 ....0. ..... .... . \ + &z2z_en n=2 zdn=%zd_ax2 zm=%zm_ax2 +@z2z_4x4 ....... . esz:2 . ...00 ....1. ..... ...0 . \ + &z2z_en n=4 zdn=%zd_ax4 zm=%zm_ax4 + +SMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 0 @z2z_2x2 +SMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 0 @z2z_4x4 +UMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 1 @z2z_2x2 +UMAX_nn 1100000 1 .. 1 ..... 1011.0 00000 .... 1 @z2z_4x4 +SMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 0 @z2z_2x2 +SMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 0 @z2z_4x4 +UMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 1 @z2z_2x2 +UMIN_nn 1100000 1 .. 1 ..... 1011.0 00001 .... 1 @z2z_4x4 + +FMAX_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 0 @z2z_2x2 +FMAX_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 0 @z2z_4x4 +FMIN_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 1 @z2z_2x2 +FMIN_nn 1100000 1 .. 1 ..... 1011.0 01000 .... 1 @z2z_4x4 +FMAXNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 0 @z2z_2x2 +FMAXNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 0 @z2z_4x4 +FMINNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 1 @z2z_2x2 +FMINNM_nn 1100000 1 .. 1 ..... 1011.0 01001 .... 1 @z2z_4x4 + +SRSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 0 @z2z_2x2 +SRSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 0 @z2z_4x4 +URSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 1 @z2z_2x2 +URSHL_nn 1100000 1 .. 1 ..... 1011.0 10001 .... 1 @z2z_4x4 + +SQDMULH_nn 1100000 1 .. 1 ..... 1011.1 00000 .... 0 @z2z_2x2 +SQDMULH_nn 1100000 1 .. 1 ..... 1011.1 00000 .... 0 @z2z_4x4 + +### SME2 Multi-vector Multiple and Single Array Vectors + +&azz_n n off rv zn zm +@azz_nx1_o3 ........ .... zm:4 ...... zn:5 .. off:3 &azz_n rv=%mova_rv + +ADD_azz_n1_s 11000001 0010 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=2 +ADD_azz_n1_s 11000001 0011 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=4 +ADD_azz_n1_d 11000001 0110 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=2 +ADD_azz_n1_d 11000001 0111 .... 0 .. 110 ..... 10 ... @azz_nx1_o3 n=4 + +SUB_azz_n1_s 11000001 0010 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=2 +SUB_azz_n1_s 11000001 0011 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=4 +SUB_azz_n1_d 11000001 0110 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=2 +SUB_azz_n1_d 11000001 0111 .... 0 .. 110 ..... 11 ... @azz_nx1_o3 n=4 + +%off3_x2 0:3 !function=times_2 +%off2_x2 0:2 !function=times_2 + +@azz_nx1_o3x2 ........ ... . zm:4 . .. ... zn:5 .. ... \ + &azz_n off=%off3_x2 rv=%mova_rv +@azz_nx1_o2x2 ........ ... . zm:4 . .. ... zn:5 ... .. \ + &azz_n off=%off2_x2 rv=%mova_rv + +FMLAL_n1 11000001 001 0 .... 0 .. 011 ..... 00 ... @azz_nx1_o3x2 n=1 +FMLAL_n1 11000001 001 0 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=2 +FMLAL_n1 11000001 001 1 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=4 + +FMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 01 ... @azz_nx1_o3x2 n=1 +FMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=2 +FMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=4 + +BFMLAL_n1 11000001 001 0 .... 0 .. 011 ..... 10 ... @azz_nx1_o3x2 n=1 +BFMLAL_n1 11000001 001 0 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=2 +BFMLAL_n1 11000001 001 1 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=4 + +BFMLSL_n1 11000001 001 0 .... 0 .. 011 ..... 11 ... @azz_nx1_o3x2 n=1 +BFMLSL_n1 11000001 001 0 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=2 +BFMLSL_n1 11000001 001 1 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=4 + +FDOT_n1 11000001 001 0 .... 0 .. 100 ..... 00 ... @azz_nx1_o3 n=2 +FDOT_n1 11000001 001 1 .... 0 .. 100 ..... 00 ... @azz_nx1_o3 n=4 + +BFDOT_n1 11000001 001 0 .... 0 .. 100 ..... 10 ... @azz_nx1_o3 n=2 +BFDOT_n1 11000001 001 1 .... 0 .. 100 ..... 10 ... @azz_nx1_o3 n=4 + +USDOT_n1 11000001 001 0 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=2 +USDOT_n1 11000001 001 1 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=4 + +SUDOT_n1 11000001 001 0 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=2 +SUDOT_n1 11000001 001 1 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=4 + +SDOT_n1_4b 11000001 001 0 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=2 +SDOT_n1_4b 11000001 001 1 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=4 +SDOT_n1_4h 11000001 011 0 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=2 +SDOT_n1_4h 11000001 011 1 .... 0 .. 101 ..... 00 ... @azz_nx1_o3 n=4 +SDOT_n1_2h 11000001 011 0 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=2 +SDOT_n1_2h 11000001 011 1 .... 0 .. 101 ..... 01 ... @azz_nx1_o3 n=4 + +UDOT_n1_4b 11000001 001 0 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=2 +UDOT_n1_4b 11000001 001 1 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=4 +UDOT_n1_4h 11000001 011 0 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=2 +UDOT_n1_4h 11000001 011 1 .... 0 .. 101 ..... 10 ... @azz_nx1_o3 n=4 +UDOT_n1_2h 11000001 011 0 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=2 +UDOT_n1_2h 11000001 011 1 .... 0 .. 101 ..... 11 ... @azz_nx1_o3 n=4 + +SMLAL_n1 11000001 011 0 .... 0 .. 011 ..... 00 ... @azz_nx1_o3x2 n=1 +SMLAL_n1 11000001 011 0 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=2 +SMLAL_n1 11000001 011 1 .... 0 .. 010 ..... 000 .. @azz_nx1_o2x2 n=4 + +SMLSL_n1 11000001 011 0 .... 0 .. 011 ..... 01 ... @azz_nx1_o3x2 n=1 +SMLSL_n1 11000001 011 0 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=2 +SMLSL_n1 11000001 011 1 .... 0 .. 010 ..... 010 .. @azz_nx1_o2x2 n=4 + +UMLAL_n1 11000001 011 0 .... 0 .. 011 ..... 10 ... @azz_nx1_o3x2 n=1 +UMLAL_n1 11000001 011 0 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=2 +UMLAL_n1 11000001 011 1 .... 0 .. 010 ..... 100 .. @azz_nx1_o2x2 n=4 + +UMLSL_n1 11000001 011 0 .... 0 .. 011 ..... 11 ... @azz_nx1_o3x2 n=1 +UMLSL_n1 11000001 011 0 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=2 +UMLSL_n1 11000001 011 1 .... 0 .. 010 ..... 110 .. @azz_nx1_o2x2 n=4 + +%off2_x4 0:2 !function=times_4 +%off1_x4 0:1 !function=times_4 + +@azz_nx1_o2x4 ........ ... . zm:4 . .. ... zn:5 ... .. \ + &azz_n off=%off2_x4 rv=%mova_rv +@azz_nx1_o1x4 ........ ... . zm:4 . .. ... zn:5 .... . \ + &azz_n off=%off1_x4 rv=%mova_rv + +SMLALL_n1_s 11000001 001 0 .... 0 .. 001 ..... 000 .. @azz_nx1_o2x4 n=1 +SMLALL_n1_d 11000001 011 0 .... 0 .. 001 ..... 000 .. @azz_nx1_o2x4 n=1 +SMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=2 +SMLALL_n1_d 11000001 011 0 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=2 +SMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=4 +SMLALL_n1_d 11000001 011 1 .... 0 .. 000 ..... 0000 . @azz_nx1_o1x4 n=4 + +SMLSLL_n1_s 11000001 001 0 .... 0 .. 001 ..... 010 .. @azz_nx1_o2x4 n=1 +SMLSLL_n1_d 11000001 011 0 .... 0 .. 001 ..... 010 .. @azz_nx1_o2x4 n=1 +SMLSLL_n1_s 11000001 001 0 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=2 +SMLSLL_n1_d 11000001 011 0 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=2 +SMLSLL_n1_s 11000001 001 1 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=4 +SMLSLL_n1_d 11000001 011 1 .... 0 .. 000 ..... 0100 . @azz_nx1_o1x4 n=4 + +UMLALL_n1_s 11000001 001 0 .... 0 .. 001 ..... 100 .. @azz_nx1_o2x4 n=1 +UMLALL_n1_d 11000001 011 0 .... 0 .. 001 ..... 100 .. @azz_nx1_o2x4 n=1 +UMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=2 +UMLALL_n1_d 11000001 011 0 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=2 +UMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=4 +UMLALL_n1_d 11000001 011 1 .... 0 .. 000 ..... 1000 . @azz_nx1_o1x4 n=4 + +UMLSLL_n1_s 11000001 001 0 .... 0 .. 001 ..... 110 .. @azz_nx1_o2x4 n=1 +UMLSLL_n1_d 11000001 011 0 .... 0 .. 001 ..... 110 .. @azz_nx1_o2x4 n=1 +UMLSLL_n1_s 11000001 001 0 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=2 +UMLSLL_n1_d 11000001 011 0 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=2 +UMLSLL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=4 +UMLSLL_n1_d 11000001 011 1 .... 0 .. 000 ..... 1100 . @azz_nx1_o1x4 n=4 + +USMLALL_n1_s 11000001 001 0 .... 0 .. 001 ..... 001 .. @azz_nx1_o2x4 n=1 +USMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 0010 . @azz_nx1_o1x4 n=2 +USMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 0010 . @azz_nx1_o1x4 n=4 + +SUMLALL_n1_s 11000001 001 0 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=2 +SUMLALL_n1_s 11000001 001 1 .... 0 .. 000 ..... 1010 . @azz_nx1_o1x4 n=4 + +BFMLA_n1 11000001 011 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2 +FMLA_n1_h 11000001 001 0 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=2 +FMLA_n1_s 11000001 001 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2 +FMLA_n1_d 11000001 011 0 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=2 + +BFMLA_n1 11000001 011 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4 +FMLA_n1_h 11000001 001 1 .... 0 .. 111 ..... 00 ... @azz_nx1_o3 n=4 +FMLA_n1_s 11000001 001 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4 +FMLA_n1_d 11000001 011 1 .... 0 .. 110 ..... 00 ... @azz_nx1_o3 n=4 + +BFMLS_n1 11000001 011 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2 +FMLS_n1_h 11000001 001 0 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=2 +FMLS_n1_s 11000001 001 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2 +FMLS_n1_d 11000001 011 0 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=2 + +BFMLS_n1 11000001 011 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4 +FMLS_n1_h 11000001 001 1 .... 0 .. 111 ..... 01 ... @azz_nx1_o3 n=4 +FMLS_n1_s 11000001 001 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4 +FMLS_n1_d 11000001 011 1 .... 0 .. 110 ..... 01 ... @azz_nx1_o3 n=4 + +### SME2 Multi-vector Multiple Array Vectors + +%zn_ax2 6:4 !function=times_2 +%zn_ax4 7:3 !function=times_4 + +@azz_2x2_o3 ........ ... ..... . .. ... ..... .. off:3 \ + &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2 +@azz_4x4_o3 ........ ... ..... . .. ... ..... .. off:3 \ + &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4 + +ADD_azz_nn_s 11000001 101 ....0 0 .. 110 ....0 10 ... @azz_2x2_o3 +ADD_azz_nn_s 11000001 101 ...01 0 .. 110 ...00 10 ... @azz_4x4_o3 +ADD_azz_nn_d 11000001 111 ....0 0 .. 110 ....0 10 ... @azz_2x2_o3 +ADD_azz_nn_d 11000001 111 ...01 0 .. 110 ...00 10 ... @azz_4x4_o3 + +SUB_azz_nn_s 11000001 101 ....0 0 .. 110 ....0 11 ... @azz_2x2_o3 +SUB_azz_nn_s 11000001 101 ...01 0 .. 110 ...00 11 ... @azz_4x4_o3 +SUB_azz_nn_d 11000001 111 ....0 0 .. 110 ....0 11 ... @azz_2x2_o3 +SUB_azz_nn_d 11000001 111 ...01 0 .. 110 ...00 11 ... @azz_4x4_o3 + +@azz_2x2_o2x2 ........ ... ..... . .. ... ..... ... .. \ + &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2 off=%off2_x2 +@azz_4x4_o2x2 ........ ... ..... . .. ... ..... ... .. \ + &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4 off=%off2_x2 + +FMLAL_nn 11000001 101 ....0 0 .. 010 ....0 000 .. @azz_2x2_o2x2 +FMLAL_nn 11000001 101 ...01 0 .. 010 ...00 000 .. @azz_4x4_o2x2 + +FMLSL_nn 11000001 101 ....0 0 .. 010 ....0 010 .. @azz_2x2_o2x2 +FMLSL_nn 11000001 101 ...01 0 .. 010 ...00 010 .. @azz_4x4_o2x2 + +BFMLAL_nn 11000001 101 ....0 0 .. 010 ....0 100 .. @azz_2x2_o2x2 +BFMLAL_nn 11000001 101 ...01 0 .. 010 ...00 100 .. @azz_4x4_o2x2 + +BFMLSL_nn 11000001 101 ....0 0 .. 010 ....0 110 .. @azz_2x2_o2x2 +BFMLSL_nn 11000001 101 ...01 0 .. 010 ...00 110 .. @azz_4x4_o2x2 + +FDOT_nn 11000001 101 ....0 0 .. 100 ....0 00 ... @azz_2x2_o3 +FDOT_nn 11000001 101 ...01 0 .. 100 ...00 00 ... @azz_4x4_o3 + +BFDOT_nn 11000001 101 ....0 0 .. 100 ....0 10 ... @azz_2x2_o3 +BFDOT_nn 11000001 101 ...01 0 .. 100 ...00 10 ... @azz_4x4_o3 + +USDOT_nn 11000001 101 ....0 0 .. 101 ....0 01 ... @azz_2x2_o3 +USDOT_nn 11000001 101 ...01 0 .. 101 ...00 01 ... @azz_4x4_o3 + +SDOT_nn_4b 11000001 101 ....0 0 .. 101 ....0 00 ... @azz_2x2_o3 +SDOT_nn_4b 11000001 101 ...01 0 .. 101 ...00 00 ... @azz_4x4_o3 +SDOT_nn_4h 11000001 111 ....0 0 .. 101 ....0 00 ... @azz_2x2_o3 +SDOT_nn_4h 11000001 111 ...01 0 .. 101 ...00 00 ... @azz_4x4_o3 +SDOT_nn_2h 11000001 111 ....0 0 .. 101 ....0 01 ... @azz_2x2_o3 +SDOT_nn_2h 11000001 111 ...01 0 .. 101 ...00 01 ... @azz_4x4_o3 + +UDOT_nn_4b 11000001 101 ....0 0 .. 101 ....0 10 ... @azz_2x2_o3 +UDOT_nn_4b 11000001 101 ...01 0 .. 101 ...00 10 ... @azz_4x4_o3 +UDOT_nn_4h 11000001 111 ....0 0 .. 101 ....0 10 ... @azz_2x2_o3 +UDOT_nn_4h 11000001 111 ...01 0 .. 101 ...00 10 ... @azz_4x4_o3 +UDOT_nn_2h 11000001 111 ....0 0 .. 101 ....0 11 ... @azz_2x2_o3 +UDOT_nn_2h 11000001 111 ...01 0 .. 101 ...00 11 ... @azz_4x4_o3 + +SMLAL_nn 11000001 111 ....0 0 .. 010 ....0 000 .. @azz_2x2_o2x2 +SMLAL_nn 11000001 111 ...01 0 .. 010 ...00 000 .. @azz_4x4_o2x2 + +SMLSL_nn 11000001 111 ....0 0 .. 010 ....0 010 .. @azz_2x2_o2x2 +SMLSL_nn 11000001 111 ...01 0 .. 010 ...00 010 .. @azz_4x4_o2x2 + +UMLAL_nn 11000001 111 ....0 0 .. 010 ....0 100 .. @azz_2x2_o2x2 +UMLAL_nn 11000001 111 ...01 0 .. 010 ...00 100 .. @azz_4x4_o2x2 + +UMLSL_nn 11000001 111 ....0 0 .. 010 ....0 110 .. @azz_2x2_o2x2 +UMLSL_nn 11000001 111 ...01 0 .. 010 ...00 110 .. @azz_4x4_o2x2 + +@azz_2x2_o1x4 ........ ... ..... . .. ... ..... ... .. \ + &azz_n n=2 rv=%mova_rv zn=%zn_ax2 zm=%zm_ax2 off=%off1_x4 +@azz_4x4_o1x4 ........ ... ..... . .. ... ..... ... .. \ + &azz_n n=4 rv=%mova_rv zn=%zn_ax4 zm=%zm_ax4 off=%off1_x4 + +SMLALL_nn_s 11000001 101 ....0 0 .. 000 ....0 0000 . @azz_2x2_o1x4 +SMLALL_nn_d 11000001 111 ....0 0 .. 000 ....0 0000 . @azz_2x2_o1x4 +SMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 0000 . @azz_4x4_o1x4 +SMLALL_nn_d 11000001 111 ...01 0 .. 000 ...00 0000 . @azz_4x4_o1x4 + +SMLSLL_nn_s 11000001 101 ....0 0 .. 000 ....0 0100 . @azz_2x2_o1x4 +SMLSLL_nn_d 11000001 111 ....0 0 .. 000 ....0 0100 . @azz_2x2_o1x4 +SMLSLL_nn_s 11000001 101 ...01 0 .. 000 ...00 0100 . @azz_4x4_o1x4 +SMLSLL_nn_d 11000001 111 ...01 0 .. 000 ...00 0100 . @azz_4x4_o1x4 + +UMLALL_nn_s 11000001 101 ....0 0 .. 000 ....0 1000 . @azz_2x2_o1x4 +UMLALL_nn_d 11000001 111 ....0 0 .. 000 ....0 1000 . @azz_2x2_o1x4 +UMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 1000 . @azz_4x4_o1x4 +UMLALL_nn_d 11000001 111 ...01 0 .. 000 ...00 1000 . @azz_4x4_o1x4 + +UMLSLL_nn_s 11000001 101 ....0 0 .. 000 ....0 1100 . @azz_2x2_o1x4 +UMLSLL_nn_d 11000001 111 ....0 0 .. 000 ....0 1100 . @azz_2x2_o1x4 +UMLSLL_nn_s 11000001 101 ...01 0 .. 000 ...00 1100 . @azz_4x4_o1x4 +UMLSLL_nn_d 11000001 111 ...01 0 .. 000 ...00 1100 . @azz_4x4_o1x4 + +USMLALL_nn_s 11000001 101 ....0 0 .. 000 ....0 0010 . @azz_2x2_o1x4 +USMLALL_nn_s 11000001 101 ...01 0 .. 000 ...00 0010 . @azz_4x4_o1x4 + +BFMLA_nn 11000001 111 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3 +FMLA_nn_h 11000001 101 ....0 0 .. 100 ....0 01 ... @azz_2x2_o3 +FMLA_nn_s 11000001 101 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3 +FMLA_nn_d 11000001 111 ....0 0 .. 110 ....0 00 ... @azz_2x2_o3 + +BFMLA_nn 11000001 111 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3 +FMLA_nn_h 11000001 101 ...01 0 .. 100 ...00 01 ... @azz_4x4_o3 +FMLA_nn_s 11000001 101 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3 +FMLA_nn_d 11000001 111 ...01 0 .. 110 ...00 00 ... @azz_4x4_o3 + +BFMLS_nn 11000001 111 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3 +FMLS_nn_h 11000001 101 ....0 0 .. 100 ....0 11 ... @azz_2x2_o3 +FMLS_nn_s 11000001 101 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3 +FMLS_nn_d 11000001 111 ....0 0 .. 110 ....0 01 ... @azz_2x2_o3 + +BFMLS_nn 11000001 111 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3 +FMLS_nn_h 11000001 101 ...01 0 .. 100 ...00 11 ... @azz_4x4_o3 +FMLS_nn_s 11000001 101 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3 +FMLS_nn_d 11000001 111 ...01 0 .. 110 ...00 01 ... @azz_4x4_o3 + +&az_n n off rv zm +@az_2x2_o3 ........ ... ..... . .. ... ..... .. off:3 \ + &az_n n=2 rv=%mova_rv zm=%zn_ax2 +@az_4x4_o3 ........ ... ..... . .. ... ..... .. off:3 \ + &az_n n=4 rv=%mova_rv zm=%zn_ax4 + +FADD_nn_h 11000001 101 00100 0 .. 111 ....0 00 ... @az_2x2_o3 +FADD_nn_s 11000001 101 00000 0 .. 111 ....0 00 ... @az_2x2_o3 +FADD_nn_d 11000001 111 00000 0 .. 111 ....0 00 ... @az_2x2_o3 +FADD_nn_h 11000001 101 00101 0 .. 111 ...00 00 ... @az_4x4_o3 +FADD_nn_s 11000001 101 00001 0 .. 111 ...00 00 ... @az_4x4_o3 +FADD_nn_d 11000001 111 00001 0 .. 111 ...00 00 ... @az_4x4_o3 + +FSUB_nn_h 11000001 101 00100 0 .. 111 ....0 01 ... @az_2x2_o3 +FSUB_nn_s 11000001 101 00000 0 .. 111 ....0 01 ... @az_2x2_o3 +FSUB_nn_d 11000001 111 00000 0 .. 111 ....0 01 ... @az_2x2_o3 +FSUB_nn_h 11000001 101 00101 0 .. 111 ...00 01 ... @az_4x4_o3 +FSUB_nn_s 11000001 101 00001 0 .. 111 ...00 01 ... @az_4x4_o3 +FSUB_nn_d 11000001 111 00001 0 .. 111 ...00 01 ... @az_4x4_o3 + +BFADD_nn 11000001 111 00100 0 .. 111 ....0 00 ... @az_2x2_o3 +BFADD_nn 11000001 111 00101 0 .. 111 ...00 00 ... @az_4x4_o3 +BFSUB_nn 11000001 111 00100 0 .. 111 ....0 01 ... @az_2x2_o3 +BFSUB_nn 11000001 111 00101 0 .. 111 ...00 01 ... @az_4x4_o3 + +### SME2 Multi-vector Indexed + +&azx_n n off rv zn zm idx + +%idx3_15_10 15:1 10:2 +%idx2_10_2 10:2 2:1 + +@azx_1x1_o3x2 ........ .... zm:4 . .. . .. zn:5 .. ... \ + &azx_n n=1 rv=%mova_rv off=%off3_x2 idx=%idx3_15_10 +@azx_2x1_o2x2 ........ .... zm:4 . .. . .. ..... .. ... \ + &azx_n n=2 rv=%mova_rv off=%off2_x2 zn=%zn_ax2 idx=%idx2_10_2 +@azx_4x1_o2x2 ........ .... zm:4 . .. . .. ..... .. ... \ + &azx_n n=4 rv=%mova_rv off=%off2_x2 zn=%zn_ax4 idx=%idx2_10_2 + +FMLAL_nx 11000001 1000 .... . .. 1 .. ..... 00 ... @azx_1x1_o3x2 +FMLAL_nx 11000001 1001 .... 0 .. 1 .. ....0 00 ... @azx_2x1_o2x2 +FMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 00 ... @azx_4x1_o2x2 + +FMLSL_nx 11000001 1000 .... . .. 1 .. ..... 01 ... @azx_1x1_o3x2 +FMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 01 ... @azx_2x1_o2x2 +FMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 01 ... @azx_4x1_o2x2 + +BFMLAL_nx 11000001 1000 .... . .. 1 .. ..... 10 ... @azx_1x1_o3x2 +BFMLAL_nx 11000001 1001 .... 0 .. 1 .. ....0 10 ... @azx_2x1_o2x2 +BFMLAL_nx 11000001 1001 .... 1 .. 1 .. ...00 10 ... @azx_4x1_o2x2 + +BFMLSL_nx 11000001 1000 .... . .. 1 .. ..... 11 ... @azx_1x1_o3x2 +BFMLSL_nx 11000001 1001 .... 0 .. 1 .. ....0 11 ... @azx_2x1_o2x2 +BFMLSL_nx 11000001 1001 .... 1 .. 1 .. ...00 11 ... @azx_4x1_o2x2 + +@azx_2x1_i2_o3 ........ .... zm:4 . .. . idx:2 .... ... off:3 \ + &azx_n n=2 rv=%mova_rv zn=%zn_ax2 +@azx_4x1_i2_o3 ........ .... zm:4 . .. . idx:2 .... ... off:3 \ + &azx_n n=4 rv=%mova_rv zn=%zn_ax4 +@azx_2x1_i1_o3 ........ .... zm:4 . .. .. idx:1 .... ... off:3 \ + &azx_n n=2 rv=%mova_rv zn=%zn_ax2 +@azx_4x1_i1_o3 ........ .... zm:4 . .. .. idx:1 .... ... off:3 \ + &azx_n n=4 rv=%mova_rv zn=%zn_ax4 + +FDOT_nx 11000001 0101 .... 0 .. 1 .. ....0 01 ... @azx_2x1_i2_o3 +FDOT_nx 11000001 0101 .... 1 .. 1 .. ...00 01 ... @azx_4x1_i2_o3 + +BFDOT_nx 11000001 0101 .... 0 .. 1 .. ....0 11 ... @azx_2x1_i2_o3 +BFDOT_nx 11000001 0101 .... 1 .. 1 .. ...00 11 ... @azx_4x1_i2_o3 + +FVDOT 11000001 0101 .... 0 .. 0 .. ....0 01 ... @azx_2x1_i2_o3 +BFVDOT 11000001 0101 .... 0 .. 0 .. ....0 11 ... @azx_2x1_i2_o3 + +SDOT_nx_2h 11000001 0101 .... 0 .. 1 .. ....0 00 ... @azx_2x1_i2_o3 +SDOT_nx_2h 11000001 0101 .... 1 .. 1 .. ...00 00 ... @azx_4x1_i2_o3 +SDOT_nx_4b 11000001 0101 .... 0 .. 1 .. ....1 00 ... @azx_2x1_i2_o3 +SDOT_nx_4b 11000001 0101 .... 1 .. 1 .. ...01 00 ... @azx_4x1_i2_o3 +SDOT_nx_4h 11000001 1101 .... 0 .. 00 . ....0 01 ... @azx_2x1_i1_o3 +SDOT_nx_4h 11000001 1101 .... 1 .. 00 . ...00 01 ... @azx_4x1_i1_o3 + +UDOT_nx_2h 11000001 0101 .... 0 .. 1 .. ....0 10 ... @azx_2x1_i2_o3 +UDOT_nx_2h 11000001 0101 .... 1 .. 1 .. ...00 10 ... @azx_4x1_i2_o3 +UDOT_nx_4b 11000001 0101 .... 0 .. 1 .. ....1 10 ... @azx_2x1_i2_o3 +UDOT_nx_4b 11000001 0101 .... 1 .. 1 .. ...01 10 ... @azx_4x1_i2_o3 +UDOT_nx_4h 11000001 1101 .... 0 .. 00 . ....0 11 ... @azx_2x1_i1_o3 +UDOT_nx_4h 11000001 1101 .... 1 .. 00 . ...00 11 ... @azx_4x1_i1_o3 + +USDOT_nx 11000001 0101 .... 0 .. 1 .. ....1 01 ... @azx_2x1_i2_o3 +USDOT_nx 11000001 0101 .... 1 .. 1 .. ...01 01 ... @azx_4x1_i2_o3 + +SUDOT_nx 11000001 0101 .... 0 .. 1 .. ....1 11 ... @azx_2x1_i2_o3 +SUDOT_nx 11000001 0101 .... 1 .. 1 .. ...01 11 ... @azx_4x1_i2_o3 + +SVDOT_nx_2h 11000001 0101 .... 0 .. 0 .. ....1 00 ... @azx_2x1_i2_o3 +SVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 00 ... @azx_4x1_i2_o3 +SVDOT_nx_4h 11000001 1101 .... 1 .. 01 . ...00 01 ... @azx_4x1_i1_o3 + +UVDOT_nx_2h 11000001 0101 .... 0 .. 0 .. ....1 10 ... @azx_2x1_i2_o3 +UVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 10 ... @azx_4x1_i2_o3 +UVDOT_nx_4h 11000001 1101 .... 1 .. 01 . ...00 11 ... @azx_4x1_i1_o3 + +SUVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 11 ... @azx_4x1_i2_o3 +USVDOT_nx_4b 11000001 0101 .... 1 .. 0 .. ...01 01 ... @azx_4x1_i2_o3 + +SMLAL_nx 11000001 1100 .... . .. 1 .. ..... 00 ... @azx_1x1_o3x2 +SMLAL_nx 11000001 1101 .... 0 .. 1 .. ....0 00 ... @azx_2x1_o2x2 +SMLAL_nx 11000001 1101 .... 1 .. 1 .. ...00 00 ... @azx_4x1_o2x2 + +SMLSL_nx 11000001 1100 .... . .. 1 .. ..... 01 ... @azx_1x1_o3x2 +SMLSL_nx 11000001 1101 .... 0 .. 1 .. ....0 01 ... @azx_2x1_o2x2 +SMLSL_nx 11000001 1101 .... 1 .. 1 .. ...00 01 ... @azx_4x1_o2x2 + +UMLAL_nx 11000001 1100 .... . .. 1 .. ..... 10 ... @azx_1x1_o3x2 +UMLAL_nx 11000001 1101 .... 0 .. 1 .. ....0 10 ... @azx_2x1_o2x2 +UMLAL_nx 11000001 1101 .... 1 .. 1 .. ...00 10 ... @azx_4x1_o2x2 + +UMLSL_nx 11000001 1100 .... . .. 1 .. ..... 11 ... @azx_1x1_o3x2 +UMLSL_nx 11000001 1101 .... 0 .. 1 .. ....0 11 ... @azx_2x1_o2x2 +UMLSL_nx 11000001 1101 .... 1 .. 1 .. ...00 11 ... @azx_4x1_o2x2 + +%idx4_15_10 15:1 10:3 +%idx4_10_1 10:2 1:2 +%idx3_10_1 10:1 1:2 + +@azx_1x1_i4_o2 ........ .... zm:4 . .. ... zn:5 ... .. \ + &azx_n n=1 rv=%mova_rv off=%off2_x4 idx=%idx4_15_10 +@azx_1x1_i3_o2 ........ .... zm:4 . .. ... zn:5 ... .. \ + &azx_n n=1 rv=%mova_rv off=%off2_x4 idx=%idx3_15_10 +@azx_2x1_i4_o1 ........ .... zm:4 . .. ... ..... ... .. \ + &azx_n n=2 rv=%mova_rv off=%off1_x4 zn=%zn_ax2 idx=%idx4_10_1 +@azx_2x1_i3_o1 ........ .... zm:4 . .. ... ..... ... .. \ + &azx_n n=2 rv=%mova_rv off=%off1_x4 zn=%zn_ax2 idx=%idx3_10_1 +@azx_4x1_i4_o1 ........ .... zm:4 . .. ... ..... ... .. \ + &azx_n n=4 rv=%mova_rv off=%off1_x4 zn=%zn_ax4 idx=%idx4_10_1 +@azx_4x1_i3_o1 ........ .... zm:4 . .. ... ..... ... .. \ + &azx_n n=4 rv=%mova_rv off=%off1_x4 zn=%zn_ax4 idx=%idx3_10_1 + +SMLALL_nx_s 11000001 0000 .... . .. ... ..... 000 .. @azx_1x1_i4_o2 +SMLALL_nx_d 11000001 1000 .... . .. 0.. ..... 000 .. @azx_1x1_i3_o2 +SMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....0 00 ... @azx_2x1_i4_o1 +SMLALL_nx_d 11000001 1001 .... 0 .. 00. ....0 00 ... @azx_2x1_i3_o1 +SMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...00 00 ... @azx_4x1_i4_o1 +SMLALL_nx_d 11000001 1001 .... 1 .. 00. ...00 00 ... @azx_4x1_i3_o1 + +SMLSLL_nx_s 11000001 0000 .... . .. ... ..... 010 .. @azx_1x1_i4_o2 +SMLSLL_nx_d 11000001 1000 .... . .. 0.. ..... 010 .. @azx_1x1_i3_o2 +SMLSLL_nx_s 11000001 0001 .... 0 .. 0.. ....0 01 ... @azx_2x1_i4_o1 +SMLSLL_nx_d 11000001 1001 .... 0 .. 00. ....0 01 ... @azx_2x1_i3_o1 +SMLSLL_nx_s 11000001 0001 .... 1 .. 0.. ...00 01 ... @azx_4x1_i4_o1 +SMLSLL_nx_d 11000001 1001 .... 1 .. 00. ...00 01 ... @azx_4x1_i3_o1 + +UMLALL_nx_s 11000001 0000 .... . .. ... ..... 100 .. @azx_1x1_i4_o2 +UMLALL_nx_d 11000001 1000 .... . .. 0.. ..... 100 .. @azx_1x1_i3_o2 +UMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....0 10 ... @azx_2x1_i4_o1 +UMLALL_nx_d 11000001 1001 .... 0 .. 00. ....0 10 ... @azx_2x1_i3_o1 +UMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...00 10 ... @azx_4x1_i4_o1 +UMLALL_nx_d 11000001 1001 .... 1 .. 00. ...00 10 ... @azx_4x1_i3_o1 + +UMLSLL_nx_s 11000001 0000 .... . .. ... ..... 110 .. @azx_1x1_i4_o2 +UMLSLL_nx_d 11000001 1000 .... . .. 0.. ..... 110 .. @azx_1x1_i3_o2 +UMLSLL_nx_s 11000001 0001 .... 0 .. 0.. ....0 11 ... @azx_2x1_i4_o1 +UMLSLL_nx_d 11000001 1001 .... 0 .. 00. ....0 11 ... @azx_2x1_i3_o1 +UMLSLL_nx_s 11000001 0001 .... 1 .. 0.. ...00 11 ... @azx_4x1_i4_o1 +UMLSLL_nx_d 11000001 1001 .... 1 .. 00. ...00 11 ... @azx_4x1_i3_o1 + +USMLALL_nx_s 11000001 0000 .... . .. ... ..... 001 .. @azx_1x1_i4_o2 +USMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....1 00 ... @azx_2x1_i4_o1 +USMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...01 00 ... @azx_4x1_i4_o1 + +SUMLALL_nx_s 11000001 0000 .... . .. ... ..... 101 .. @azx_1x1_i4_o2 +SUMLALL_nx_s 11000001 0001 .... 0 .. 0.. ....1 10 ... @azx_2x1_i4_o1 +SUMLALL_nx_s 11000001 0001 .... 1 .. 0.. ...01 10 ... @azx_4x1_i4_o1 + +%idx3_10_3 10:2 3:1 +@azx_2x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \ + &azx_n n=2 rv=%mova_rv zn=%zn_ax2 idx=%idx3_10_3 +@azx_4x1_i3_o3 ........ .... zm:4 . .. ... ..... .. off:3 \ + &azx_n n=4 rv=%mova_rv zn=%zn_ax4 idx=%idx3_10_3 + +BFMLA_nx 11000001 0001 .... 0 .. 1.. ....1 0 .... @azx_2x1_i3_o3 +FMLA_nx_h 11000001 0001 .... 0 .. 1.. ....0 0 .... @azx_2x1_i3_o3 +FMLA_nx_s 11000001 0101 .... 0 .. 0.. ....0 00 ... @azx_2x1_i2_o3 +FMLA_nx_d 11000001 1101 .... 0 .. 00. ....0 00 ... @azx_2x1_i1_o3 + +BFMLA_nx 11000001 0001 .... 1 .. 1.. ...01 0 .... @azx_4x1_i3_o3 +FMLA_nx_h 11000001 0001 .... 1 .. 1.. ...00 0 .... @azx_4x1_i3_o3 +FMLA_nx_s 11000001 0101 .... 1 .. 0.. ...00 00 ... @azx_4x1_i2_o3 +FMLA_nx_d 11000001 1101 .... 1 .. 00. ...00 00 ... @azx_4x1_i1_o3 + +BFMLS_nx 11000001 0001 .... 0 .. 1.. ....1 1 .... @azx_2x1_i3_o3 +FMLS_nx_h 11000001 0001 .... 0 .. 1.. ....0 1 .... @azx_2x1_i3_o3 +FMLS_nx_s 11000001 0101 .... 0 .. 0.. ....0 10 ... @azx_2x1_i2_o3 +FMLS_nx_d 11000001 1101 .... 0 .. 00. ....0 10 ... @azx_2x1_i1_o3 + +BFMLS_nx 11000001 0001 .... 1 .. 1.. ...01 1 .... @azx_4x1_i3_o3 +FMLS_nx_h 11000001 0001 .... 1 .. 1.. ...00 1 .... @azx_4x1_i3_o3 +FMLS_nx_s 11000001 0101 .... 1 .. 0.. ...00 10 ... @azx_4x1_i2_o3 +FMLS_nx_d 11000001 1101 .... 1 .. 00. ...00 10 ... @azx_4x1_i1_o3 + +### SME2 Add / Sub array accumulators + +ADD_aaz_s 11000001 101 000000 .. 111 ....0 10 ... @az_2x2_o3 +ADD_aaz_s 11000001 101 000010 .. 111 ...00 10 ... @az_4x4_o3 +ADD_aaz_d 11000001 111 000000 .. 111 ....0 10 ... @az_2x2_o3 +ADD_aaz_d 11000001 111 000010 .. 111 ...00 10 ... @az_4x4_o3 + +SUB_aaz_s 11000001 101 000000 .. 111 ....0 11 ... @az_2x2_o3 +SUB_aaz_s 11000001 101 000010 .. 111 ...00 11 ... @az_4x4_o3 +SUB_aaz_d 11000001 111 000000 .. 111 ....0 11 ... @az_2x2_o3 +SUB_aaz_d 11000001 111 000010 .. 111 ...00 11 ... @az_4x4_o3 + +### SME2 Multi-vector SVE Constructive Unary + +&zz_e zd zn esz +&zz_n zd zn n +@zz_1x2 ........ ... ..... ...... ..... zd:5 \ + &zz_n n=1 zn=%zn_ax2 +@zz_1x4 ........ ... ..... ...... ..... zd:5 \ + &zz_n n=1 zn=%zn_ax4 +@zz_2x1 ........ ... ..... ...... zn:5 ..... \ + &zz_n n=1 zd=%zd_ax2 +@zz_2x2 ........ ... ..... ...... .... . ..... \ + &zz_n n=2 zd=%zd_ax2 zn=%zn_ax2 +@zz_4x4 ........ ... ..... ...... .... . ..... \ + &zz_n n=4 zd=%zd_ax4 zn=%zn_ax4 +@zz_4x2_n1 ........ ... ..... ...... .... . ..... \ + &zz_n n=1 zd=%zd_ax4 zn=%zn_ax2 + +BFCVT 11000001 011 00000 111000 ....0 ..... @zz_1x2 +BFCVTN 11000001 011 00000 111000 ....1 ..... @zz_1x2 + +FCVT_n 11000001 001 00000 111000 ....0 ..... @zz_1x2 +FCVTN 11000001 001 00000 111000 ....1 ..... @zz_1x2 + +FCVT_w 11000001 101 00000 111000 ..... ....0 @zz_2x1 +FCVTL 11000001 101 00000 111000 ..... ....1 @zz_2x1 + +FCVTZS 11000001 001 00001 111000 ....0 ....0 @zz_2x2 +FCVTZS 11000001 001 10001 111000 ...00 ...00 @zz_4x4 +FCVTZU 11000001 001 00001 111000 ....1 ....0 @zz_2x2 +FCVTZU 11000001 001 10001 111000 ...01 ...00 @zz_4x4 + +SCVTF 11000001 001 00010 111000 ....0 ....0 @zz_2x2 +SCVTF 11000001 001 10010 111000 ...00 ...00 @zz_4x4 +UCVTF 11000001 001 00010 111000 ....1 ....0 @zz_2x2 +UCVTF 11000001 001 10010 111000 ...01 ...00 @zz_4x4 + +FRINTN 11000001 101 01000 111000 ....0 ....0 @zz_2x2 +FRINTN 11000001 101 11000 111000 ...00 ...00 @zz_4x4 +FRINTP 11000001 101 01001 111000 ....0 ....0 @zz_2x2 +FRINTP 11000001 101 11001 111000 ...00 ...00 @zz_4x4 +FRINTM 11000001 101 01010 111000 ....0 ....0 @zz_2x2 +FRINTM 11000001 101 11010 111000 ...00 ...00 @zz_4x4 +FRINTA 11000001 101 01100 111000 ....0 ....0 @zz_2x2 +FRINTA 11000001 101 11100 111000 ...00 ...00 @zz_4x4 + +SQCVT_sh 11000001 001 00011 111000 ....0 ..... @zz_1x2 +UQCVT_sh 11000001 001 00011 111000 ....1 ..... @zz_1x2 +SQCVTU_sh 11000001 011 00011 111000 ....0 ..... @zz_1x2 + +SQCVT_sb 11000001 001 10011 111000 ...00 ..... @zz_1x4 +UQCVT_sb 11000001 001 10011 111000 ...01 ..... @zz_1x4 +SQCVTU_sb 11000001 011 10011 111000 ...00 ..... @zz_1x4 + +SQCVT_dh 11000001 101 10011 111000 ...00 ..... @zz_1x4 +UQCVT_dh 11000001 101 10011 111000 ...01 ..... @zz_1x4 +SQCVTU_dh 11000001 111 10011 111000 ...00 ..... @zz_1x4 + +SQCVTN_sb 11000001 001 10011 111000 ...10 ..... @zz_1x4 +UQCVTN_sb 11000001 001 10011 111000 ...11 ..... @zz_1x4 +SQCVTUN_sb 11000001 011 10011 111000 ...10 ..... @zz_1x4 + +SQCVTN_dh 11000001 101 10011 111000 ...10 ..... @zz_1x4 +UQCVTN_dh 11000001 101 10011 111000 ...11 ..... @zz_1x4 +SQCVTUN_dh 11000001 111 10011 111000 ...10 ..... @zz_1x4 + +SUNPK_2bh 11000001 011 00101 111000 ..... ....0 @zz_2x1 +SUNPK_2hs 11000001 101 00101 111000 ..... ....0 @zz_2x1 +SUNPK_2sd 11000001 111 00101 111000 ..... ....0 @zz_2x1 + +UUNPK_2bh 11000001 011 00101 111000 ..... ....1 @zz_2x1 +UUNPK_2hs 11000001 101 00101 111000 ..... ....1 @zz_2x1 +UUNPK_2sd 11000001 111 00101 111000 ..... ....1 @zz_2x1 + +SUNPK_4bh 11000001 011 10101 111000 ....0 ...00 @zz_4x2_n1 +SUNPK_4hs 11000001 101 10101 111000 ....0 ...00 @zz_4x2_n1 +SUNPK_4sd 11000001 111 10101 111000 ....0 ...00 @zz_4x2_n1 + +UUNPK_4bh 11000001 011 10101 111000 ....0 ...01 @zz_4x2_n1 +UUNPK_4hs 11000001 101 10101 111000 ....0 ...01 @zz_4x2_n1 +UUNPK_4sd 11000001 111 10101 111000 ....0 ...01 @zz_4x2_n1 + +ZIP_4 11000001 esz:2 1 10110 111000 ...00 ... 00 \ + &zz_e zd=%zd_ax4 zn=%zn_ax4 +ZIP_4 11000001 001 10111 111000 ...00 ... 00 \ + &zz_e esz=4 zd=%zd_ax4 zn=%zn_ax4 + +UZP_4 11000001 esz:2 1 10110 111000 ...00 ... 10 \ + &zz_e zd=%zd_ax4 zn=%zn_ax4 +UZP_4 11000001 001 10111 111000 ...00 ... 10 \ + &zz_e esz=4 zd=%zd_ax4 zn=%zn_ax4 + +### SME2 Multi-vector SVE Constructive Binary + +&rshr zd zn shift + +%rshr_sh_shift 16:4 !function=rsub_16 +%rshr_sb_shift 16:5 !function=rsub_32 +%rshr_dh_shift 22:1 16:5 !function=rsub_64 + +@rshr_sh ........ .... .... ...... ..... zd:5 \ + &rshr zn=%zn_ax2 shift=%rshr_sh_shift +@rshr_sb ........ ... ..... ...... ..... zd:5 \ + &rshr zn=%zn_ax4 shift=%rshr_sb_shift +@rshr_dh ........ ... ..... ...... ..... zd:5 \ + &rshr zn=%zn_ax4 shift=%rshr_dh_shift + +SQRSHR_sh 11000001 1110 .... 110101 ....0 ..... @rshr_sh +UQRSHR_sh 11000001 1110 .... 110101 ....1 ..... @rshr_sh +SQRSHRU_sh 11000001 1111 .... 110101 ....0 ..... @rshr_sh + +SQRSHR_sb 11000001 011 ..... 110110 ...00 ..... @rshr_sb +SQRSHR_dh 11000001 1.1 ..... 110110 ...00 ..... @rshr_dh +UQRSHR_sb 11000001 011 ..... 110110 ...01 ..... @rshr_sb +UQRSHR_dh 11000001 1.1 ..... 110110 ...01 ..... @rshr_dh +SQRSHRU_sb 11000001 011 ..... 110110 ...10 ..... @rshr_sb +SQRSHRU_dh 11000001 1.1 ..... 110110 ...10 ..... @rshr_dh + +SQRSHRN_sh 01000101 1011 .... 001010 ....0 ..... @rshr_sh +UQRSHRN_sh 01000101 1011 .... 001110 ....0 ..... @rshr_sh +SQRSHRUN_sh 01000101 1011 .... 000010 ....0 ..... @rshr_sh + +SQRSHRN_sb 11000001 011 ..... 110111 ...00 ..... @rshr_sb +SQRSHRN_dh 11000001 1.1 ..... 110111 ...00 ..... @rshr_dh +UQRSHRN_sb 11000001 011 ..... 110111 ...01 ..... @rshr_sb +UQRSHRN_dh 11000001 1.1 ..... 110111 ...01 ..... @rshr_dh +SQRSHRUN_sb 11000001 011 ..... 110111 ...10 ..... @rshr_sb +SQRSHRUN_dh 11000001 1.1 ..... 110111 ...10 ..... @rshr_dh + +&zzz_e zd zn zm esz + +ZIP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 0 \ + &zzz_e zd=%zd_ax2 +ZIP_2 11000001 00 1 zm:5 110101 zn:5 .... 0 \ + &zzz_e zd=%zd_ax2 esz=4 + +UZP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 1 \ + &zzz_e zd=%zd_ax2 +UZP_2 11000001 00 1 zm:5 110101 zn:5 .... 1 \ + &zzz_e zd=%zd_ax2 esz=4 + +&zzz_en zd zn zm esz n + +FCLAMP 11000001 esz:2 1 zm:5 110000 zn:5 .... 0 \ + &zzz_en zd=%zd_ax2 n=2 +FCLAMP 11000001 esz:2 1 zm:5 110010 zn:5 ...0 0 \ + &zzz_en zd=%zd_ax4 n=4 + +SCLAMP 11000001 esz:2 1 zm:5 110001 zn:5 .... 0 \ + &zzz_en zd=%zd_ax2 n=2 +SCLAMP 11000001 esz:2 1 zm:5 110011 zn:5 ...0 0 \ + &zzz_en zd=%zd_ax4 n=4 + +UCLAMP 11000001 esz:2 1 zm:5 110001 zn:5 .... 1 \ + &zzz_en zd=%zd_ax2 n=2 +UCLAMP 11000001 esz:2 1 zm:5 110011 zn:5 ...0 1 \ + &zzz_en zd=%zd_ax4 n=4 + +### SME2 Multi-vector SVE Select + +%sel_pg 10:3 !function=plus_8 + +SEL 11000001 esz:2 1 ....0 100 ... ....0 ....0 \ + n=2 zd=%zd_ax2 zn=%zn_ax2 zm=%zm_ax2 pg=%sel_pg +SEL 11000001 esz:2 1 ...01 100 ... ...00 ...00 \ + n=4 zd=%zd_ax4 zn=%zn_ax4 zm=%zm_ax4 pg=%sel_pg + +### SME Multiple Zero + +&zero_za rv off ngrp nvec + +ZERO_za 11000000 000011 000 .. 0000000000 off:3 \ + &zero_za ngrp=2 nvec=1 rv=%mova_rv +ZERO_za 11000000 000011 100 .. 0000000000 off:3 \ + &zero_za ngrp=4 nvec=1 rv=%mova_rv + +ZERO_za 11000000 000011 001 .. 0000000000 ... \ + &zero_za ngrp=1 nvec=2 rv=%mova_rv off=%off3_x2 +ZERO_za 11000000 000011 010 .. 0000000000 0.. \ + &zero_za ngrp=2 nvec=2 rv=%mova_rv off=%off2_x2 +ZERO_za 11000000 000011 011 .. 0000000000 0.. \ + &zero_za ngrp=4 nvec=2 rv=%mova_rv off=%off2_x2 + +ZERO_za 11000000 000011 101 .. 0000000000 0.. \ + &zero_za ngrp=1 nvec=4 rv=%mova_rv off=%off2_x4 +ZERO_za 11000000 000011 110 .. 0000000000 00. \ + &zero_za ngrp=2 nvec=4 rv=%mova_rv off=%off1_x4 +ZERO_za 11000000 000011 111 .. 0000000000 00. \ + &zero_za ngrp=4 nvec=4 rv=%mova_rv off=%off1_x4 + +### SME Lookup Table Read + +&lut zd zn idx + +# LUTI2, consecutive +LUTI2_c_1b 1100 0000 1100 11 idx:4 00 00 zn:5 zd:5 &lut +LUTI2_c_1h 1100 0000 1100 11 idx:4 01 00 zn:5 zd:5 &lut +LUTI2_c_1s 1100 0000 1100 11 idx:4 10 00 zn:5 zd:5 &lut + +LUTI2_c_2b 1100 0000 1000 11 idx:3 1 00 00 zn:5 .... 0 &lut zd=%zd_ax2 +LUTI2_c_2h 1100 0000 1000 11 idx:3 1 01 00 zn:5 .... 0 &lut zd=%zd_ax2 +LUTI2_c_2s 1100 0000 1000 11 idx:3 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2 + +LUTI2_c_4b 1100 0000 1000 11 idx:2 10 00 00 zn:5 ... 00 &lut zd=%zd_ax4 +LUTI2_c_4h 1100 0000 1000 11 idx:2 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4 +LUTI2_c_4s 1100 0000 1000 11 idx:2 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4 + +# LUTI2, strided (must check zd alignment) +LUTI2_s_2b 1100 0000 1001 11 idx:3 1 00 00 zn:5 zd:5 &lut +LUTI2_s_2h 1100 0000 1001 11 idx:3 1 01 00 zn:5 zd:5 &lut + +LUTI2_s_4b 1100 0000 1001 11 idx:2 10 00 00 zn:5 zd:5 &lut +LUTI2_s_4h 1100 0000 1001 11 idx:2 10 01 00 zn:5 zd:5 &lut + +# LUTI4, consecutive +LUTI4_c_1b 1100 0000 1100 101 idx:3 00 00 zn:5 zd:5 &lut +LUTI4_c_1h 1100 0000 1100 101 idx:3 01 00 zn:5 zd:5 &lut +LUTI4_c_1s 1100 0000 1100 101 idx:3 10 00 zn:5 zd:5 &lut + +LUTI4_c_2b 1100 0000 1000 101 idx:2 1 00 00 zn:5 .... 0 &lut zd=%zd_ax2 +LUTI4_c_2h 1100 0000 1000 101 idx:2 1 01 00 zn:5 .... 0 &lut zd=%zd_ax2 +LUTI4_c_2s 1100 0000 1000 101 idx:2 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2 + +LUTI4_c_4h 1100 0000 1000 101 idx:1 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4 +LUTI4_c_4s 1100 0000 1000 101 idx:1 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4 + +# LUTI4, strided (must check zd alignment) +LUTI4_s_2b 1100 0000 1001 101 idx:2 1 00 00 zn:5 zd:5 &lut +LUTI4_s_2h 1100 0000 1001 101 idx:2 1 01 00 zn:5 zd:5 &lut + +LUTI4_s_4h 1100 0000 1001 101 idx:1 10 01 00 zn:5 zd:5 &lut diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index dcc48e4..075360d 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -22,13 +22,20 @@ #include "internals.h" #include "tcg/tcg-gvec-desc.h" #include "exec/helper-proto.h" -#include "exec/cpu_ldst.h" -#include "exec/exec-all.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/helper-retaddr.h" #include "qemu/int128.h" #include "fpu/softfloat.h" #include "vec_internal.h" #include "sve_ldst_internal.h" + +static bool vectors_overlap(ARMVectorReg *x, unsigned nx, + ARMVectorReg *y, unsigned ny) +{ + return !(x + nx <= y || y + ny <= x); +} + void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask) { aarch64_set_svcr(env, val, mask); @@ -39,12 +46,12 @@ void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl) uint32_t i; /* - * Special case clearing the entire ZA space. + * Special case clearing the entire ZArray. * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any * parts of the ZA storage outside of SVL. */ if (imm == 0xff) { - memset(env->zarray, 0, sizeof(env->zarray)); + memset(env->za_state.za, 0, sizeof(env->za_state.za)); return; } @@ -54,7 +61,7 @@ void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl) */ for (i = 0; i < svl; i++) { if (imm & (1 << (i % 8))) { - memset(&env->zarray[i], 0, svl); + memset(&env->za_state.za[i], 0, svl); } } } @@ -206,6 +213,110 @@ void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc) #undef DO_MOVA_Z +void HELPER(sme2_mova_zc_b)(void *vdst, void *vsrc, uint32_t desc) +{ + const uint8_t *src = vsrc; + uint8_t *dst = vdst; + size_t i, n = simd_oprsz(desc); + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + } +} + +void HELPER(sme2_mova_zc_h)(void *vdst, void *vsrc, uint32_t desc) +{ + const uint16_t *src = vsrc; + uint16_t *dst = vdst; + size_t i, n = simd_oprsz(desc) / 2; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + } +} + +void HELPER(sme2_mova_zc_s)(void *vdst, void *vsrc, uint32_t desc) +{ + const uint32_t *src = vsrc; + uint32_t *dst = vdst; + size_t i, n = simd_oprsz(desc) / 4; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + } +} + +void HELPER(sme2_mova_zc_d)(void *vdst, void *vsrc, uint32_t desc) +{ + const uint64_t *src = vsrc; + uint64_t *dst = vdst; + size_t i, n = simd_oprsz(desc) / 8; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + } +} + +void HELPER(sme2p1_movaz_zc_b)(void *vdst, void *vsrc, uint32_t desc) +{ + uint8_t *src = vsrc; + uint8_t *dst = vdst; + size_t i, n = simd_oprsz(desc); + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + src[tile_vslice_index(i)] = 0; + } +} + +void HELPER(sme2p1_movaz_zc_h)(void *vdst, void *vsrc, uint32_t desc) +{ + uint16_t *src = vsrc; + uint16_t *dst = vdst; + size_t i, n = simd_oprsz(desc) / 2; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + src[tile_vslice_index(i)] = 0; + } +} + +void HELPER(sme2p1_movaz_zc_s)(void *vdst, void *vsrc, uint32_t desc) +{ + uint32_t *src = vsrc; + uint32_t *dst = vdst; + size_t i, n = simd_oprsz(desc) / 4; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + src[tile_vslice_index(i)] = 0; + } +} + +void HELPER(sme2p1_movaz_zc_d)(void *vdst, void *vsrc, uint32_t desc) +{ + uint64_t *src = vsrc; + uint64_t *dst = vdst; + size_t i, n = simd_oprsz(desc) / 8; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + src[tile_vslice_index(i)] = 0; + } +} + +void HELPER(sme2p1_movaz_zc_q)(void *vdst, void *vsrc, uint32_t desc) +{ + Int128 *src = vsrc; + Int128 *dst = vdst; + size_t i, n = simd_oprsz(desc) / 16; + + for (i = 0; i < n; ++i) { + dst[i] = src[tile_vslice_index(i)]; + memset(&src[tile_vslice_index(i)], 0, 16); + } +} + /* * Clear elements in a tile slice comprising len bytes. */ @@ -314,6 +425,26 @@ static void copy_vertical_q(void *vdst, const void *vsrc, size_t len) } } +void HELPER(sme2_mova_cz_b)(void *vdst, void *vsrc, uint32_t desc) +{ + copy_vertical_b(vdst, vsrc, simd_oprsz(desc)); +} + +void HELPER(sme2_mova_cz_h)(void *vdst, void *vsrc, uint32_t desc) +{ + copy_vertical_h(vdst, vsrc, simd_oprsz(desc)); +} + +void HELPER(sme2_mova_cz_s)(void *vdst, void *vsrc, uint32_t desc) +{ + copy_vertical_s(vdst, vsrc, simd_oprsz(desc)); +} + +void HELPER(sme2_mova_cz_d)(void *vdst, void *vsrc, uint32_t desc) +{ + copy_vertical_d(vdst, vsrc, simd_oprsz(desc)); +} + /* * Host and TLB primitives for vertical tile slice addressing. */ @@ -344,54 +475,22 @@ static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ TLB(env, useronly_clean_ptr(addr), val, ra); \ } -/* - * The ARMVectorReg elements are stored in host-endian 64-bit units. - * For 128-bit quantities, the sequence defined by the Elem[] pseudocode - * corresponds to storing the two 64-bit pieces in little-endian order. - */ -#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \ -static inline void HNAME##_host(void *za, intptr_t off, void *host) \ -{ \ - uint64_t val0 = HOST(host), val1 = HOST(host + 8); \ - uint64_t *ptr = za + off; \ - ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ -} \ +#define DO_LDQ(HNAME, VNAME) \ static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ { \ HNAME##_host(za, tile_vslice_offset(off), host); \ } \ -static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \ - uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \ - uint64_t *ptr = za + off; \ - ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ -} \ static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ target_ulong addr, uintptr_t ra) \ { \ HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ } -#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \ -static inline void HNAME##_host(void *za, intptr_t off, void *host) \ -{ \ - uint64_t *ptr = za + off; \ - HOST(host, ptr[BE]); \ - HOST(host + 8, ptr[!BE]); \ -} \ +#define DO_STQ(HNAME, VNAME) \ static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ { \ HNAME##_host(za, tile_vslice_offset(off), host); \ } \ -static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - uint64_t *ptr = za + off; \ - TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \ - TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \ -} \ static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ target_ulong addr, uintptr_t ra) \ { \ @@ -406,8 +505,8 @@ DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra) DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra) DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra) -DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra) -DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra) +DO_LDQ(sve_ld1qq_be, sme_ld1q_be) +DO_LDQ(sve_ld1qq_le, sme_ld1q_le) DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra) DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra) @@ -417,8 +516,8 @@ DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra) DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra) DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra) -DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra) -DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra) +DO_STQ(sve_st1qq_be, sme_st1q_be) +DO_STQ(sve_st1qq_le, sme_st1q_le) #undef DO_LD #undef DO_ST @@ -567,19 +666,16 @@ void sme_ld1(CPUARMState *env, void *za, uint64_t *vg, static inline QEMU_ALWAYS_INLINE void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, - target_ulong addr, uint32_t desc, uintptr_t ra, + target_ulong addr, uint64_t desc, uintptr_t ra, const int esz, bool vertical, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn, ClearFn *clr_fn, CopyFn *cpy_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; int bit55 = extract64(addr, 55, 1); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Perform gross MTE suppression early. */ if (!tbi_check(mtedesc, bit55) || tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { @@ -592,28 +688,28 @@ void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, #define DO_LD(L, END, ESZ) \ void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ clear_horizontal, copy_horizontal); \ } \ void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ clear_vertical_##L, copy_vertical_##L); \ } \ void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ clear_horizontal, copy_horizontal); \ } \ void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ @@ -755,16 +851,13 @@ void sme_st1(CPUARMState *env, void *za, uint64_t *vg, static inline QEMU_ALWAYS_INLINE void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, - uint32_t desc, uintptr_t ra, int esz, bool vertical, + uint64_t desc, uintptr_t ra, int esz, bool vertical, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; int bit55 = extract64(addr, 55, 1); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Perform gross MTE suppression early. */ if (!tbi_check(mtedesc, bit55) || tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { @@ -777,25 +870,25 @@ void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, #define DO_ST(L, END, ESZ) \ void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ } \ void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ } \ void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ } \ void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ @@ -903,28 +996,69 @@ void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn, } } -void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, float_status *fpst_in, uint32_t desc) +static void do_fmopa_h(void *vza, void *vzn, void *vzm, uint16_t *pn, + uint16_t *pm, float_status *fpst, uint32_t desc, + uint16_t negx, int negf) { intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) << 31; - uint16_t *pn = vpn, *pm = vpm; - float_status fpst; - /* - * Make a copy of float_status because this operation does not - * update the cumulative fp exception status. It also produces - * default nans. - */ - fpst = *fpst_in; - set_default_nan_mode(true, &fpst); + for (row = 0; row < oprsz; ) { + uint16_t pa = pn[H2(row >> 4)]; + do { + if (pa & 1) { + void *vza_row = vza + tile_vslice_offset(row); + uint16_t n = *(uint32_t *)(vzn + H1_2(row)) ^ negx; + + for (col = 0; col < oprsz; ) { + uint16_t pb = pm[H2(col >> 4)]; + do { + if (pb & 1) { + uint16_t *a = vza_row + H1_2(col); + uint16_t *m = vzm + H1_2(col); + *a = float16_muladd(n, *m, *a, negf, fpst); + } + col += 2; + pb >>= 2; + } while (col & 15); + } + } + row += 2; + pa >>= 2; + } while (row & 15); + } +} + +void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_h(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0); +} + +void HELPER(sme_fmops_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_h(vza, vzn, vzm, vpn, vpm, fpst, desc, 1u << 15, 0); +} + +void HELPER(sme_ah_fmops_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_h(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, + float_muladd_negate_product); +} + +static void do_fmopa_s(void *vza, void *vzn, void *vzm, uint16_t *pn, + uint16_t *pm, float_status *fpst, uint32_t desc, + uint32_t negx, int negf) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); for (row = 0; row < oprsz; ) { uint16_t pa = pn[H2(row >> 4)]; do { if (pa & 1) { void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg; + uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ negx; for (col = 0; col < oprsz; ) { uint16_t pb = pm[H2(col >> 4)]; @@ -932,7 +1066,7 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, if (pb & 1) { uint32_t *a = vza_row + H1_4(col); uint32_t *m = vzm + H1_4(col); - *a = float32_muladd(n, *m, *a, 0, &fpst); + *a = float32_muladd(n, *m, *a, negf, fpst); } col += 4; pb >>= 4; @@ -945,32 +1079,116 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, } } -void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, float_status *fpst_in, uint32_t desc) +void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) { - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint64_t neg = (uint64_t)simd_data(desc) << 63; - uint64_t *za = vza, *zn = vzn, *zm = vzm; - uint8_t *pn = vpn, *pm = vpm; - float_status fpst = *fpst_in; + do_fmopa_s(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0); +} - set_default_nan_mode(true, &fpst); +void HELPER(sme_fmops_s)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_s(vza, vzn, vzm, vpn, vpm, fpst, desc, 1u << 31, 0); +} + +void HELPER(sme_ah_fmops_s)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_s(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, + float_muladd_negate_product); +} + +static void do_fmopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm, uint8_t *pn, + uint8_t *pm, float_status *fpst, uint32_t desc, + uint64_t negx, int negf) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; for (row = 0; row < oprsz; ++row) { if (pn[H1(row)] & 1) { uint64_t *za_row = &za[tile_vslice_index(row)]; - uint64_t n = zn[row] ^ neg; + uint64_t n = zn[row] ^ negx; for (col = 0; col < oprsz; ++col) { if (pm[H1(col)] & 1) { uint64_t *a = &za_row[col]; - *a = float64_muladd(n, zm[col], *a, 0, &fpst); + *a = float64_muladd(n, zm[col], *a, negf, fpst); } } } } } +void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_d(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0); +} + +void HELPER(sme_fmops_d)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_d(vza, vzn, vzm, vpn, vpm, fpst, desc, 1ull << 63, 0); +} + +void HELPER(sme_ah_fmops_d)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_fmopa_d(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, + float_muladd_negate_product); +} + +static void do_bfmopa(void *vza, void *vzn, void *vzm, uint16_t *pn, + uint16_t *pm, float_status *fpst, uint32_t desc, + uint16_t negx, int negf) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + + for (row = 0; row < oprsz; ) { + uint16_t pa = pn[H2(row >> 4)]; + do { + if (pa & 1) { + void *vza_row = vza + tile_vslice_offset(row); + uint16_t n = *(uint32_t *)(vzn + H1_2(row)) ^ negx; + + for (col = 0; col < oprsz; ) { + uint16_t pb = pm[H2(col >> 4)]; + do { + if (pb & 1) { + uint16_t *a = vza_row + H1_2(col); + uint16_t *m = vzm + H1_2(col); + *a = bfloat16_muladd(n, *m, *a, negf, fpst); + } + col += 2; + pb >>= 2; + } while (col & 15); + } + } + row += 2; + pa >>= 2; + } while (row & 15); + } +} + +void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_bfmopa(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, 0); +} + +void HELPER(sme_bfmops)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_bfmopa(vza, vzn, vzm, vpn, vpm, fpst, desc, 1u << 15, 0); +} + +void HELPER(sme_ah_bfmops)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, float_status *fpst, uint32_t desc) +{ + do_bfmopa(vza, vzn, vzm, vpn, vpm, fpst, desc, 0, + float_muladd_negate_product); +} + /* * Alter PAIR as needed for controlling predicates being false, * and for NEG on an enabled row element. @@ -991,6 +1209,20 @@ static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg) return pair; } +static inline uint32_t f16mop_ah_neg_adj_pair(uint32_t pair, uint32_t pg) +{ + uint32_t l = pg & 1 ? float16_ah_chs(pair) : 0; + uint32_t h = pg & 4 ? float16_ah_chs(pair >> 16) : 0; + return l | (h << 16); +} + +static inline uint32_t bf16mop_ah_neg_adj_pair(uint32_t pair, uint32_t pg) +{ + uint32_t l = pg & 1 ? bfloat16_ah_chs(pair) : 0; + uint32_t h = pg & 4 ? bfloat16_ah_chs(pair >> 16) : 0; + return l | (h << 16); +} + static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *s_f16, float_status *s_std, float_status *s_odd) @@ -1005,49 +1237,67 @@ static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, * - we have pre-set-up copy of s_std which is set to round-to-odd, * for the multiply (see below) */ - float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16); - float64 e1c = float16_to_float64(e1 >> 16, true, s_f16); - float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16); - float64 e2c = float16_to_float64(e2 >> 16, true, s_f16); - float64 t64; + float16 h1r = e1 & 0xffff; + float16 h1c = e1 >> 16; + float16 h2r = e2 & 0xffff; + float16 h2c = e2 >> 16; float32 t32; - /* - * The ARM pseudocode function FPDot performs both multiplies - * and the add with a single rounding operation. Emulate this - * by performing the first multiply in round-to-odd, then doing - * the second multiply as fused multiply-add, and rounding to - * float32 all in one step. - */ - t64 = float64_mul(e1r, e2r, s_odd); - t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std); + /* C.f. FPProcessNaNs4 */ + if (float16_is_any_nan(h1r) || float16_is_any_nan(h1c) || + float16_is_any_nan(h2r) || float16_is_any_nan(h2c)) { + float16 t16; + + if (float16_is_signaling_nan(h1r, s_f16)) { + t16 = h1r; + } else if (float16_is_signaling_nan(h1c, s_f16)) { + t16 = h1c; + } else if (float16_is_signaling_nan(h2r, s_f16)) { + t16 = h2r; + } else if (float16_is_signaling_nan(h2c, s_f16)) { + t16 = h2c; + } else if (float16_is_any_nan(h1r)) { + t16 = h1r; + } else if (float16_is_any_nan(h1c)) { + t16 = h1c; + } else if (float16_is_any_nan(h2r)) { + t16 = h2r; + } else { + t16 = h2c; + } + t32 = float16_to_float32(t16, true, s_f16); + } else { + float64 e1r = float16_to_float64(h1r, true, s_f16); + float64 e1c = float16_to_float64(h1c, true, s_f16); + float64 e2r = float16_to_float64(h2r, true, s_f16); + float64 e2c = float16_to_float64(h2c, true, s_f16); + float64 t64; + + /* + * The ARM pseudocode function FPDot performs both multiplies + * and the add with a single rounding operation. Emulate this + * by performing the first multiply in round-to-odd, then doing + * the second multiply as fused multiply-add, and rounding to + * float32 all in one step. + */ + t64 = float64_mul(e1r, e2r, s_odd); + t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std); - /* This conversion is exact, because we've already rounded. */ - t32 = float64_to_float32(t64, s_std); + /* This conversion is exact, because we've already rounded. */ + t32 = float64_to_float32(t64, s_std); + } /* The final accumulation step is not fused. */ return float32_add(sum, t32, s_std); } -void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, CPUARMState *env, uint32_t desc) +static void do_fmopa_w_h(void *vza, void *vzn, void *vzm, uint16_t *pn, + uint16_t *pm, CPUARMState *env, uint32_t desc, + uint32_t negx, bool ah_neg) { intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) * 0x80008000u; - uint16_t *pn = vpn, *pm = vpm; - float_status fpst_odd, fpst_std, fpst_f16; + float_status fpst_odd = env->vfp.fp_status[FPST_ZA]; - /* - * Make copies of the fp status fields we use, because this operation - * does not update the cumulative fp exception status. It also - * produces default NaNs. We also need a second copy of fp_status with - * round-to-odd -- see above. - */ - fpst_f16 = env->vfp.fp_status[FPST_A64_F16]; - fpst_std = env->vfp.fp_status[FPST_A64]; - set_default_nan_mode(true, &fpst_std); - set_default_nan_mode(true, &fpst_f16); - fpst_odd = fpst_std; set_float_rounding_mode(float_round_to_odd, &fpst_odd); for (row = 0; row < oprsz; ) { @@ -1056,7 +1306,11 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, void *vza_row = vza + tile_vslice_offset(row); uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - n = f16mop_adj_pair(n, prow, neg); + if (ah_neg) { + n = f16mop_ah_neg_adj_pair(n, prow); + } else { + n = f16mop_adj_pair(n, prow, negx); + } for (col = 0; col < oprsz; ) { uint16_t pcol = pm[H2(col >> 4)]; @@ -1067,7 +1321,9 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, m = f16mop_adj_pair(m, pcol, 0); *a = f16_dotadd(*a, n, m, - &fpst_f16, &fpst_std, &fpst_odd); + &env->vfp.fp_status[FPST_ZA_F16], + &env->vfp.fp_status[FPST_ZA], + &fpst_odd); } col += 4; pcol >>= 4; @@ -1079,12 +1335,103 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, } } -void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, - void *vpn, void *vpm, CPUARMState *env, uint32_t desc) +void HELPER(sme_fmopa_w_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, CPUARMState *env, uint32_t desc) +{ + do_fmopa_w_h(vza, vzn, vzm, vpn, vpm, env, desc, 0, false); +} + +void HELPER(sme_fmops_w_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, CPUARMState *env, uint32_t desc) +{ + do_fmopa_w_h(vza, vzn, vzm, vpn, vpm, env, desc, 0x80008000u, false); +} + +void HELPER(sme_ah_fmops_w_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, CPUARMState *env, uint32_t desc) +{ + do_fmopa_w_h(vza, vzn, vzm, vpn, vpm, env, desc, 0, true); +} + +void HELPER(sme2_fdot_h)(void *vd, void *vn, void *vm, void *va, + CPUARMState *env, uint32_t desc) +{ + intptr_t i, oprsz = simd_maxsz(desc); + bool za = extract32(desc, SIMD_DATA_SHIFT, 1); + float_status *fpst_std = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64]; + float_status *fpst_f16 = &env->vfp.fp_status[za ? FPST_ZA_F16 : FPST_A64_F16]; + float_status fpst_odd = *fpst_std; + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + set_float_rounding_mode(float_round_to_odd, &fpst_odd); + + for (i = 0; i < oprsz / sizeof(float32); ++i) { + d[H4(i)] = f16_dotadd(a[H4(i)], n[H4(i)], m[H4(i)], + fpst_f16, fpst_std, &fpst_odd); + } +} + +void HELPER(sme2_fdot_idx_h)(void *vd, void *vn, void *vm, void *va, + CPUARMState *env, uint32_t desc) +{ + intptr_t i, j, oprsz = simd_maxsz(desc); + intptr_t elements = oprsz / sizeof(float32); + intptr_t eltspersegment = MIN(4, elements); + int idx = extract32(desc, SIMD_DATA_SHIFT, 2); + bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + float_status *fpst_std = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64]; + float_status *fpst_f16 = &env->vfp.fp_status[za ? FPST_ZA_F16 : FPST_A64_F16]; + float_status fpst_odd = *fpst_std; + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = (uint32_t *)vm + H4(idx); + + set_float_rounding_mode(float_round_to_odd, &fpst_odd); + + for (i = 0; i < elements; i += eltspersegment) { + uint32_t mm = m[i]; + for (j = 0; j < eltspersegment; ++j) { + d[H4(i + j)] = f16_dotadd(a[H4(i + j)], n[H4(i + j)], mm, + fpst_f16, fpst_std, &fpst_odd); + } + } +} + +void HELPER(sme2_fvdot_idx_h)(void *vd, void *vn, void *vm, void *va, + CPUARMState *env, uint32_t desc) +{ + intptr_t i, j, oprsz = simd_maxsz(desc); + intptr_t elements = oprsz / sizeof(float32); + intptr_t eltspersegment = MIN(4, elements); + int idx = extract32(desc, SIMD_DATA_SHIFT, 2); + int sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + float_status fpst_odd, *fpst_std, *fpst_f16; + float32 *d = vd, *a = va; + uint16_t *n0 = vn; + uint16_t *n1 = vn + sizeof(ARMVectorReg); + uint32_t *m = (uint32_t *)vm + H4(idx); + + fpst_std = &env->vfp.fp_status[FPST_ZA]; + fpst_f16 = &env->vfp.fp_status[FPST_ZA_F16]; + fpst_odd = *fpst_std; + set_float_rounding_mode(float_round_to_odd, &fpst_odd); + + for (i = 0; i < elements; i += eltspersegment) { + uint32_t mm = m[i]; + for (j = 0; j < eltspersegment; ++j) { + uint32_t nn = (n0[H2(2 * (i + j) + sel)]) + | (n1[H2(2 * (i + j) + sel)] << 16); + d[i + H4(j)] = f16_dotadd(a[i + H4(j)], nn, mm, + fpst_f16, fpst_std, &fpst_odd); + } + } +} + +static void do_bfmopa_w(void *vza, void *vzn, void *vzm, + uint16_t *pn, uint16_t *pm, CPUARMState *env, + uint32_t desc, uint32_t negx, bool ah_neg) { intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) * 0x80008000u; - uint16_t *pn = vpn, *pm = vpm; float_status fpst, fpst_odd; if (is_ebf(env, &fpst, &fpst_odd)) { @@ -1094,7 +1441,11 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vza_row = vza + tile_vslice_offset(row); uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - n = f16mop_adj_pair(n, prow, neg); + if (ah_neg) { + n = bf16mop_ah_neg_adj_pair(n, prow); + } else { + n = f16mop_adj_pair(n, prow, negx); + } for (col = 0; col < oprsz; ) { uint16_t pcol = pm[H2(col >> 4)]; @@ -1121,7 +1472,11 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vza_row = vza + tile_vslice_offset(row); uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - n = f16mop_adj_pair(n, prow, neg); + if (ah_neg) { + n = bf16mop_ah_neg_adj_pair(n, prow); + } else { + n = f16mop_adj_pair(n, prow, negx); + } for (col = 0; col < oprsz; ) { uint16_t pcol = pm[H2(col >> 4)]; @@ -1144,6 +1499,24 @@ void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, } } +void HELPER(sme_bfmopa_w)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, CPUARMState *env, uint32_t desc) +{ + do_bfmopa_w(vza, vzn, vzm, vpn, vpm, env, desc, 0, false); +} + +void HELPER(sme_bfmops_w)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, CPUARMState *env, uint32_t desc) +{ + do_bfmopa_w(vza, vzn, vzm, vpn, vpm, env, desc, 0x80008000u, false); +} + +void HELPER(sme_ah_bfmops_w)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, CPUARMState *env, uint32_t desc) +{ + do_bfmopa_w(vza, vzn, vzm, vpn, vpm, env, desc, 0, true); +} + typedef uint32_t IMOPFn32(uint32_t, uint32_t, uint32_t, uint8_t, bool); static inline void do_imopa_s(uint32_t *za, uint32_t *zn, uint32_t *zm, uint8_t *pn, uint8_t *pm, @@ -1188,7 +1561,7 @@ static inline void do_imopa_d(uint64_t *za, uint64_t *zn, uint64_t *zm, } } -#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \ +#define DEF_IMOP_8x4_32(NAME, NTYPE, MTYPE) \ static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \ { \ uint32_t sum = 0; \ @@ -1201,7 +1574,7 @@ static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \ return neg ? a - sum : a + sum; \ } -#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \ +#define DEF_IMOP_16x4_64(NAME, NTYPE, MTYPE) \ static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ { \ uint64_t sum = 0; \ @@ -1214,27 +1587,1070 @@ static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ return neg ? a - sum : a + sum; \ } -DEF_IMOP_32(smopa_s, int8_t, int8_t) -DEF_IMOP_32(umopa_s, uint8_t, uint8_t) -DEF_IMOP_32(sumopa_s, int8_t, uint8_t) -DEF_IMOP_32(usmopa_s, uint8_t, int8_t) +DEF_IMOP_8x4_32(smopa_s, int8_t, int8_t) +DEF_IMOP_8x4_32(umopa_s, uint8_t, uint8_t) +DEF_IMOP_8x4_32(sumopa_s, int8_t, uint8_t) +DEF_IMOP_8x4_32(usmopa_s, uint8_t, int8_t) -DEF_IMOP_64(smopa_d, int16_t, int16_t) -DEF_IMOP_64(umopa_d, uint16_t, uint16_t) -DEF_IMOP_64(sumopa_d, int16_t, uint16_t) -DEF_IMOP_64(usmopa_d, uint16_t, int16_t) +DEF_IMOP_16x4_64(smopa_d, int16_t, int16_t) +DEF_IMOP_16x4_64(umopa_d, uint16_t, uint16_t) +DEF_IMOP_16x4_64(sumopa_d, int16_t, uint16_t) +DEF_IMOP_16x4_64(usmopa_d, uint16_t, int16_t) -#define DEF_IMOPH(NAME, S) \ - void HELPER(sme_##NAME##_##S)(void *vza, void *vzn, void *vzm, \ +#define DEF_IMOPH(P, NAME, S) \ + void HELPER(P##_##NAME##_##S)(void *vza, void *vzn, void *vzm, \ void *vpn, void *vpm, uint32_t desc) \ { do_imopa_##S(vza, vzn, vzm, vpn, vpm, desc, NAME##_##S); } -DEF_IMOPH(smopa, s) -DEF_IMOPH(umopa, s) -DEF_IMOPH(sumopa, s) -DEF_IMOPH(usmopa, s) +DEF_IMOPH(sme, smopa, s) +DEF_IMOPH(sme, umopa, s) +DEF_IMOPH(sme, sumopa, s) +DEF_IMOPH(sme, usmopa, s) + +DEF_IMOPH(sme, smopa, d) +DEF_IMOPH(sme, umopa, d) +DEF_IMOPH(sme, sumopa, d) +DEF_IMOPH(sme, usmopa, d) + +static uint32_t bmopa_s(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) +{ + uint32_t sum = ctpop32(~(n ^ m)); + if (neg) { + sum = -sum; + } + if (!(p & 1)) { + sum = 0; + } + return a + sum; +} + +DEF_IMOPH(sme2, bmopa, s) + +#define DEF_IMOP_16x2_32(NAME, NTYPE, MTYPE) \ +static uint32_t NAME(uint32_t n, uint32_t m, uint32_t a, uint8_t p, bool neg) \ +{ \ + uint32_t sum = 0; \ + /* Apply P to N as a mask, making the inactive elements 0. */ \ + n &= expand_pred_h(p); \ + sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + return neg ? a - sum : a + sum; \ +} + +DEF_IMOP_16x2_32(smopa2_s, int16_t, int16_t) +DEF_IMOP_16x2_32(umopa2_s, uint16_t, uint16_t) + +DEF_IMOPH(sme2, smopa2, s) +DEF_IMOPH(sme2, umopa2, s) + +#define DO_VDOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD, HN) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t svl = simd_oprsz(desc); \ + intptr_t elements = svl / sizeof(TYPED); \ + intptr_t eltperseg = 16 / sizeof(TYPED); \ + intptr_t nreg = sizeof(TYPED) / sizeof(TYPEN); \ + intptr_t vstride = (svl / nreg) * sizeof(ARMVectorReg); \ + intptr_t zstride = sizeof(ARMVectorReg) / sizeof(TYPEN); \ + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2); \ + TYPEN *n = vn; \ + TYPEM *m = vm; \ + for (intptr_t r = 0; r < nreg; r++) { \ + TYPED *d = vd + r * vstride; \ + for (intptr_t seg = 0; seg < elements; seg += eltperseg) { \ + intptr_t s = seg + idx; \ + for (intptr_t e = seg; e < seg + eltperseg; e++) { \ + TYPED sum = d[HD(e)]; \ + for (intptr_t i = 0; i < nreg; i++) { \ + TYPED nn = n[i * zstride + HN(nreg * e + r)]; \ + TYPED mm = m[HN(nreg * s + i)]; \ + sum += nn * mm; \ + } \ + d[HD(e)] = sum; \ + } \ + } \ + } \ +} + +DO_VDOT_IDX(sme2_svdot_idx_4b, int32_t, int8_t, int8_t, H4, H1) +DO_VDOT_IDX(sme2_uvdot_idx_4b, uint32_t, uint8_t, uint8_t, H4, H1) +DO_VDOT_IDX(sme2_suvdot_idx_4b, int32_t, int8_t, uint8_t, H4, H1) +DO_VDOT_IDX(sme2_usvdot_idx_4b, int32_t, uint8_t, int8_t, H4, H1) + +DO_VDOT_IDX(sme2_svdot_idx_4h, int64_t, int16_t, int16_t, H8, H2) +DO_VDOT_IDX(sme2_uvdot_idx_4h, uint64_t, uint16_t, uint16_t, H8, H2) + +DO_VDOT_IDX(sme2_svdot_idx_2h, int32_t, int16_t, int16_t, H4, H2) +DO_VDOT_IDX(sme2_uvdot_idx_2h, uint32_t, uint16_t, uint16_t, H4, H2) + +#undef DO_VDOT_IDX + +#define DO_MLALL(NAME, TYPEW, TYPEN, TYPEM, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t elements = simd_oprsz(desc) / sizeof(TYPEW); \ + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 2); \ + TYPEW *d = vd, *a = va; TYPEN *n = vn; TYPEM *m = vm; \ + for (intptr_t i = 0; i < elements; ++i) { \ + TYPEW nn = n[HN(i * 4 + sel)]; \ + TYPEM mm = m[HN(i * 4 + sel)]; \ + d[HW(i)] = a[HW(i)] OP (nn * mm); \ + } \ +} + +DO_MLALL(sme2_smlall_s, int32_t, int8_t, int8_t, H4, H1, +) +DO_MLALL(sme2_smlall_d, int64_t, int16_t, int16_t, H8, H2, +) +DO_MLALL(sme2_smlsll_s, int32_t, int8_t, int8_t, H4, H1, -) +DO_MLALL(sme2_smlsll_d, int64_t, int16_t, int16_t, H8, H2, -) + +DO_MLALL(sme2_umlall_s, uint32_t, uint8_t, uint8_t, H4, H1, +) +DO_MLALL(sme2_umlall_d, uint64_t, uint16_t, uint16_t, H8, H2, +) +DO_MLALL(sme2_umlsll_s, uint32_t, uint8_t, uint8_t, H4, H1, -) +DO_MLALL(sme2_umlsll_d, uint64_t, uint16_t, uint16_t, H8, H2, -) + +DO_MLALL(sme2_usmlall_s, uint32_t, uint8_t, int8_t, H4, H1, +) + +#undef DO_MLALL + +#define DO_MLALL_IDX(NAME, TYPEW, TYPEN, TYPEM, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t elements = simd_oprsz(desc) / sizeof(TYPEW); \ + intptr_t eltspersegment = 16 / sizeof(TYPEW); \ + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 2); \ + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 4); \ + TYPEW *d = vd, *a = va; TYPEN *n = vn; TYPEM *m = vm; \ + for (intptr_t i = 0; i < elements; i += eltspersegment) { \ + TYPEW mm = m[HN(i * 4 + idx)]; \ + for (intptr_t j = 0; j < eltspersegment; ++j) { \ + TYPEN nn = n[HN((i + j) * 4 + sel)]; \ + d[HW(i + j)] = a[HW(i + j)] OP (nn * mm); \ + } \ + } \ +} + +DO_MLALL_IDX(sme2_smlall_idx_s, int32_t, int8_t, int8_t, H4, H1, +) +DO_MLALL_IDX(sme2_smlall_idx_d, int64_t, int16_t, int16_t, H8, H2, +) +DO_MLALL_IDX(sme2_smlsll_idx_s, int32_t, int8_t, int8_t, H4, H1, -) +DO_MLALL_IDX(sme2_smlsll_idx_d, int64_t, int16_t, int16_t, H8, H2, -) + +DO_MLALL_IDX(sme2_umlall_idx_s, uint32_t, uint8_t, uint8_t, H4, H1, +) +DO_MLALL_IDX(sme2_umlall_idx_d, uint64_t, uint16_t, uint16_t, H8, H2, +) +DO_MLALL_IDX(sme2_umlsll_idx_s, uint32_t, uint8_t, uint8_t, H4, H1, -) +DO_MLALL_IDX(sme2_umlsll_idx_d, uint64_t, uint16_t, uint16_t, H8, H2, -) + +DO_MLALL_IDX(sme2_usmlall_idx_s, uint32_t, uint8_t, int8_t, H4, H1, +) +DO_MLALL_IDX(sme2_sumlall_idx_s, uint32_t, int8_t, uint8_t, H4, H1, +) + +#undef DO_MLALL_IDX + +/* Convert and compress */ +void HELPER(sme2_bfcvt)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + ARMVectorReg scratch; + size_t oprsz = simd_oprsz(desc); + size_t i, n = oprsz / 4; + float32 *s0 = vs; + float32 *s1 = vs + sizeof(ARMVectorReg); + bfloat16 *d = vd; + + if (vd == s1) { + s1 = memcpy(&scratch, s1, oprsz); + } + + for (i = 0; i < n; ++i) { + d[H2(i)] = float32_to_bfloat16(s0[H4(i)], fpst); + } + for (i = 0; i < n; ++i) { + d[H2(i) + n] = float32_to_bfloat16(s1[H4(i)], fpst); + } +} -DEF_IMOPH(smopa, d) -DEF_IMOPH(umopa, d) -DEF_IMOPH(sumopa, d) -DEF_IMOPH(usmopa, d) +void HELPER(sme2_fcvt_n)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + ARMVectorReg scratch; + size_t oprsz = simd_oprsz(desc); + size_t i, n = oprsz / 4; + float32 *s0 = vs; + float32 *s1 = vs + sizeof(ARMVectorReg); + float16 *d = vd; + + if (vd == s1) { + s1 = memcpy(&scratch, s1, oprsz); + } + + for (i = 0; i < n; ++i) { + d[H2(i)] = sve_f32_to_f16(s0[H4(i)], fpst); + } + for (i = 0; i < n; ++i) { + d[H2(i) + n] = sve_f32_to_f16(s1[H4(i)], fpst); + } +} + +#define SQCVT2(NAME, TW, TN, HW, HN, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 2)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(i)] = SAT(s0[HW(i)]); \ + d[HN(i + n)] = SAT(s1[HW(i)]); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQCVT2(sme2_sqcvt_sh, int32_t, int16_t, H4, H2, do_ssat_h) +SQCVT2(sme2_uqcvt_sh, uint32_t, uint16_t, H4, H2, do_usat_h) +SQCVT2(sme2_sqcvtu_sh, int32_t, uint16_t, H4, H2, do_usat_h) + +#undef SQCVT2 + +#define SQCVT4(NAME, TW, TN, HW, HN, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TW *s2 = vs + 2 * sizeof(ARMVectorReg); \ + TW *s3 = vs + 3 * sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 4)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(i)] = SAT(s0[HW(i)]); \ + d[HN(i + n)] = SAT(s1[HW(i)]); \ + d[HN(i + 2 * n)] = SAT(s2[HW(i)]); \ + d[HN(i + 3 * n)] = SAT(s3[HW(i)]); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQCVT4(sme2_sqcvt_sb, int32_t, int8_t, H4, H2, do_ssat_b) +SQCVT4(sme2_uqcvt_sb, uint32_t, uint8_t, H4, H2, do_usat_b) +SQCVT4(sme2_sqcvtu_sb, int32_t, uint8_t, H4, H2, do_usat_b) + +SQCVT4(sme2_sqcvt_dh, int64_t, int16_t, H8, H2, do_ssat_h) +SQCVT4(sme2_uqcvt_dh, uint64_t, uint16_t, H8, H2, do_usat_h) +SQCVT4(sme2_sqcvtu_dh, int64_t, uint16_t, H8, H2, do_usat_h) + +#undef SQCVT4 + +#define SQRSHR2(NAME, TW, TN, HW, HN, RSHR, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + int shift = simd_data(desc); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 2)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(i)] = SAT(RSHR(s0[HW(i)], shift)); \ + d[HN(i + n)] = SAT(RSHR(s1[HW(i)], shift)); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQRSHR2(sme2_sqrshr_sh, int32_t, int16_t, H4, H2, do_srshr, do_ssat_h) +SQRSHR2(sme2_uqrshr_sh, uint32_t, uint16_t, H4, H2, do_urshr, do_usat_h) +SQRSHR2(sme2_sqrshru_sh, int32_t, uint16_t, H4, H2, do_srshr, do_usat_h) + +#undef SQRSHR2 + +#define SQRSHR4(NAME, TW, TN, HW, HN, RSHR, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + int shift = simd_data(desc); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TW *s2 = vs + 2 * sizeof(ARMVectorReg); \ + TW *s3 = vs + 3 * sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 4)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(i)] = SAT(RSHR(s0[HW(i)], shift)); \ + d[HN(i + n)] = SAT(RSHR(s1[HW(i)], shift)); \ + d[HN(i + 2 * n)] = SAT(RSHR(s2[HW(i)], shift)); \ + d[HN(i + 3 * n)] = SAT(RSHR(s3[HW(i)], shift)); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQRSHR4(sme2_sqrshr_sb, int32_t, int8_t, H4, H2, do_srshr, do_ssat_b) +SQRSHR4(sme2_uqrshr_sb, uint32_t, uint8_t, H4, H2, do_urshr, do_usat_b) +SQRSHR4(sme2_sqrshru_sb, int32_t, uint8_t, H4, H2, do_srshr, do_usat_b) + +SQRSHR4(sme2_sqrshr_dh, int64_t, int16_t, H8, H2, do_srshr, do_ssat_h) +SQRSHR4(sme2_uqrshr_dh, uint64_t, uint16_t, H8, H2, do_urshr, do_usat_h) +SQRSHR4(sme2_sqrshru_dh, int64_t, uint16_t, H8, H2, do_srshr, do_usat_h) + +#undef SQRSHR4 + +/* Convert and interleave */ +void HELPER(sme2_bfcvtn)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + size_t i, n = simd_oprsz(desc) / 4; + float32 *s0 = vs; + float32 *s1 = vs + sizeof(ARMVectorReg); + bfloat16 *d = vd; + + for (i = 0; i < n; ++i) { + bfloat16 d0 = float32_to_bfloat16(s0[H4(i)], fpst); + bfloat16 d1 = float32_to_bfloat16(s1[H4(i)], fpst); + d[H2(i * 2 + 0)] = d0; + d[H2(i * 2 + 1)] = d1; + } +} + +void HELPER(sme2_fcvtn)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + size_t i, n = simd_oprsz(desc) / 4; + float32 *s0 = vs; + float32 *s1 = vs + sizeof(ARMVectorReg); + bfloat16 *d = vd; + + for (i = 0; i < n; ++i) { + bfloat16 d0 = sve_f32_to_f16(s0[H4(i)], fpst); + bfloat16 d1 = sve_f32_to_f16(s1[H4(i)], fpst); + d[H2(i * 2 + 0)] = d0; + d[H2(i * 2 + 1)] = d1; + } +} + +#define SQCVTN2(NAME, TW, TN, HW, HN, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 2)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(2 * i + 0)] = SAT(s0[HW(i)]); \ + d[HN(2 * i + 1)] = SAT(s1[HW(i)]); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQCVTN2(sme2_sqcvtn_sh, int32_t, int16_t, H4, H2, do_ssat_h) +SQCVTN2(sme2_uqcvtn_sh, uint32_t, uint16_t, H4, H2, do_usat_h) +SQCVTN2(sme2_sqcvtun_sh, int32_t, uint16_t, H4, H2, do_usat_h) + +#undef SQCVTN2 + +#define SQCVTN4(NAME, TW, TN, HW, HN, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TW *s2 = vs + 2 * sizeof(ARMVectorReg); \ + TW *s3 = vs + 3 * sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 4)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(4 * i + 0)] = SAT(s0[HW(i)]); \ + d[HN(4 * i + 1)] = SAT(s1[HW(i)]); \ + d[HN(4 * i + 2)] = SAT(s2[HW(i)]); \ + d[HN(4 * i + 3)] = SAT(s3[HW(i)]); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQCVTN4(sme2_sqcvtn_sb, int32_t, int8_t, H4, H1, do_ssat_b) +SQCVTN4(sme2_uqcvtn_sb, uint32_t, uint8_t, H4, H1, do_usat_b) +SQCVTN4(sme2_sqcvtun_sb, int32_t, uint8_t, H4, H1, do_usat_b) + +SQCVTN4(sme2_sqcvtn_dh, int64_t, int16_t, H8, H2, do_ssat_h) +SQCVTN4(sme2_uqcvtn_dh, uint64_t, uint16_t, H8, H2, do_usat_h) +SQCVTN4(sme2_sqcvtun_dh, int64_t, uint16_t, H8, H2, do_usat_h) + +#undef SQCVTN4 + +#define SQRSHRN2(NAME, TW, TN, HW, HN, RSHR, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + int shift = simd_data(desc); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 2)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(2 * i + 0)] = SAT(RSHR(s0[HW(i)], shift)); \ + d[HN(2 * i + 1)] = SAT(RSHR(s1[HW(i)], shift)); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQRSHRN2(sme2_sqrshrn_sh, int32_t, int16_t, H4, H2, do_srshr, do_ssat_h) +SQRSHRN2(sme2_uqrshrn_sh, uint32_t, uint16_t, H4, H2, do_urshr, do_usat_h) +SQRSHRN2(sme2_sqrshrun_sh, int32_t, uint16_t, H4, H2, do_srshr, do_usat_h) + +#undef SQRSHRN2 + +#define SQRSHRN4(NAME, TW, TN, HW, HN, RSHR, SAT) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch; \ + size_t oprsz = simd_oprsz(desc), n = oprsz / sizeof(TW); \ + int shift = simd_data(desc); \ + TW *s0 = vs, *s1 = vs + sizeof(ARMVectorReg); \ + TW *s2 = vs + 2 * sizeof(ARMVectorReg); \ + TW *s3 = vs + 3 * sizeof(ARMVectorReg); \ + TN *d = vd; \ + if (vectors_overlap(vd, 1, vs, 4)) { \ + d = (TN *)&scratch; \ + } \ + for (size_t i = 0; i < n; ++i) { \ + d[HN(4 * i + 0)] = SAT(RSHR(s0[HW(i)], shift)); \ + d[HN(4 * i + 1)] = SAT(RSHR(s1[HW(i)], shift)); \ + d[HN(4 * i + 2)] = SAT(RSHR(s2[HW(i)], shift)); \ + d[HN(4 * i + 3)] = SAT(RSHR(s3[HW(i)], shift)); \ + } \ + if (d != vd) { \ + memcpy(vd, d, oprsz); \ + } \ +} + +SQRSHRN4(sme2_sqrshrn_sb, int32_t, int8_t, H4, H1, do_srshr, do_ssat_b) +SQRSHRN4(sme2_uqrshrn_sb, uint32_t, uint8_t, H4, H1, do_urshr, do_usat_b) +SQRSHRN4(sme2_sqrshrun_sb, int32_t, uint8_t, H4, H1, do_srshr, do_usat_b) + +SQRSHRN4(sme2_sqrshrn_dh, int64_t, int16_t, H8, H2, do_srshr, do_ssat_h) +SQRSHRN4(sme2_uqrshrn_dh, uint64_t, uint16_t, H8, H2, do_urshr, do_usat_h) +SQRSHRN4(sme2_sqrshrun_dh, int64_t, uint16_t, H8, H2, do_srshr, do_usat_h) + +#undef SQRSHRN4 + +/* Expand and convert */ +void HELPER(sme2_fcvt_w)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + ARMVectorReg scratch; + size_t oprsz = simd_oprsz(desc); + size_t i, n = oprsz / 4; + float16 *s = vs; + float32 *d0 = vd; + float32 *d1 = vd + sizeof(ARMVectorReg); + + if (vectors_overlap(vd, 1, vs, 2)) { + s = memcpy(&scratch, s, oprsz); + } + + for (i = 0; i < n; ++i) { + d0[H4(i)] = sve_f16_to_f32(s[H2(i)], fpst); + } + for (i = 0; i < n; ++i) { + d1[H4(i)] = sve_f16_to_f32(s[H2(n + i)], fpst); + } +} + +#define UNPK(NAME, SREG, TW, TN, HW, HN) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch[SREG]; \ + size_t oprsz = simd_oprsz(desc); \ + size_t n = oprsz / sizeof(TW); \ + if (vectors_overlap(vd, 2 * SREG, vs, SREG)) { \ + vs = memcpy(scratch, vs, sizeof(scratch)); \ + } \ + for (size_t r = 0; r < SREG; ++r) { \ + TN *s = vs + r * sizeof(ARMVectorReg); \ + for (size_t i = 0; i < 2; ++i) { \ + TW *d = vd + (2 * r + i) * sizeof(ARMVectorReg); \ + for (size_t e = 0; e < n; ++e) { \ + d[HW(e)] = s[HN(i * n + e)]; \ + } \ + } \ + } \ +} + +UNPK(sme2_sunpk2_bh, 1, int16_t, int8_t, H2, H1) +UNPK(sme2_sunpk2_hs, 1, int32_t, int16_t, H4, H2) +UNPK(sme2_sunpk2_sd, 1, int64_t, int32_t, H8, H4) + +UNPK(sme2_sunpk4_bh, 2, int16_t, int8_t, H2, H1) +UNPK(sme2_sunpk4_hs, 2, int32_t, int16_t, H4, H2) +UNPK(sme2_sunpk4_sd, 2, int64_t, int32_t, H8, H4) + +UNPK(sme2_uunpk2_bh, 1, uint16_t, uint8_t, H2, H1) +UNPK(sme2_uunpk2_hs, 1, uint32_t, uint16_t, H4, H2) +UNPK(sme2_uunpk2_sd, 1, uint64_t, uint32_t, H8, H4) + +UNPK(sme2_uunpk4_bh, 2, uint16_t, uint8_t, H2, H1) +UNPK(sme2_uunpk4_hs, 2, uint32_t, uint16_t, H4, H2) +UNPK(sme2_uunpk4_sd, 2, uint64_t, uint32_t, H8, H4) + +#undef UNPK + +/* Deinterleave and convert. */ +void HELPER(sme2_fcvtl)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + size_t i, n = simd_oprsz(desc) / 4; + float16 *s = vs; + float32 *d0 = vd; + float32 *d1 = vd + sizeof(ARMVectorReg); + + for (i = 0; i < n; ++i) { + float32 v0 = sve_f16_to_f32(s[H2(i * 2 + 0)], fpst); + float32 v1 = sve_f16_to_f32(s[H2(i * 2 + 1)], fpst); + d0[H4(i)] = v0; + d1[H4(i)] = v1; + } +} + +void HELPER(sme2_scvtf)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + size_t i, n = simd_oprsz(desc) / 4; + int32_t *d = vd; + float32 *s = vs; + + for (i = 0; i < n; ++i) { + d[i] = int32_to_float32(s[i], fpst); + } +} + +void HELPER(sme2_ucvtf)(void *vd, void *vs, float_status *fpst, uint32_t desc) +{ + size_t i, n = simd_oprsz(desc) / 4; + uint32_t *d = vd; + float32 *s = vs; + + for (i = 0; i < n; ++i) { + d[i] = uint32_to_float32(s[i], fpst); + } +} + +#define ZIP2(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + ARMVectorReg scratch[2]; \ + size_t oprsz = simd_oprsz(desc); \ + size_t pairs = oprsz / (sizeof(TYPE) * 2); \ + TYPE *n = vn, *m = vm; \ + if (vectors_overlap(vd, 2, vn, 1)) { \ + n = memcpy(&scratch[0], vn, oprsz); \ + } \ + if (vectors_overlap(vd, 2, vm, 1)) { \ + m = memcpy(&scratch[1], vm, oprsz); \ + } \ + for (size_t r = 0; r < 2; ++r) { \ + TYPE *d = vd + r * sizeof(ARMVectorReg); \ + size_t base = r * pairs; \ + for (size_t p = 0; p < pairs; ++p) { \ + d[H(2 * p + 0)] = n[base + H(p)]; \ + d[H(2 * p + 1)] = m[base + H(p)]; \ + } \ + } \ +} + +ZIP2(sme2_zip2_b, uint8_t, H1) +ZIP2(sme2_zip2_h, uint16_t, H2) +ZIP2(sme2_zip2_s, uint32_t, H4) +ZIP2(sme2_zip2_d, uint64_t, ) +ZIP2(sme2_zip2_q, Int128, ) + +#undef ZIP2 + +#define ZIP4(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch[4]; \ + size_t oprsz = simd_oprsz(desc); \ + size_t quads = oprsz / (sizeof(TYPE) * 4); \ + TYPE *s0, *s1, *s2, *s3; \ + if (vs == vd) { \ + vs = memcpy(scratch, vs, sizeof(scratch)); \ + } \ + s0 = vs; \ + s1 = vs + sizeof(ARMVectorReg); \ + s2 = vs + 2 * sizeof(ARMVectorReg); \ + s3 = vs + 3 * sizeof(ARMVectorReg); \ + for (size_t r = 0; r < 4; ++r) { \ + TYPE *d = vd + r * sizeof(ARMVectorReg); \ + size_t base = r * quads; \ + for (size_t q = 0; q < quads; ++q) { \ + d[H(4 * q + 0)] = s0[base + H(q)]; \ + d[H(4 * q + 1)] = s1[base + H(q)]; \ + d[H(4 * q + 2)] = s2[base + H(q)]; \ + d[H(4 * q + 3)] = s3[base + H(q)]; \ + } \ + } \ +} + +ZIP4(sme2_zip4_b, uint8_t, H1) +ZIP4(sme2_zip4_h, uint16_t, H2) +ZIP4(sme2_zip4_s, uint32_t, H4) +ZIP4(sme2_zip4_d, uint64_t, ) +ZIP4(sme2_zip4_q, Int128, ) + +#undef ZIP4 + +#define UZP2(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + ARMVectorReg scratch[2]; \ + size_t oprsz = simd_oprsz(desc); \ + size_t pairs = oprsz / (sizeof(TYPE) * 2); \ + TYPE *d0 = vd, *d1 = vd + sizeof(ARMVectorReg); \ + if (vectors_overlap(vd, 2, vn, 1)) { \ + vn = memcpy(&scratch[0], vn, oprsz); \ + } \ + if (vectors_overlap(vd, 2, vm, 1)) { \ + vm = memcpy(&scratch[1], vm, oprsz); \ + } \ + for (size_t r = 0; r < 2; ++r) { \ + TYPE *s = r ? vm : vn; \ + size_t base = r * pairs; \ + for (size_t p = 0; p < pairs; ++p) { \ + d0[base + H(p)] = s[H(2 * p + 0)]; \ + d1[base + H(p)] = s[H(2 * p + 1)]; \ + } \ + } \ +} + +UZP2(sme2_uzp2_b, uint8_t, H1) +UZP2(sme2_uzp2_h, uint16_t, H2) +UZP2(sme2_uzp2_s, uint32_t, H4) +UZP2(sme2_uzp2_d, uint64_t, ) +UZP2(sme2_uzp2_q, Int128, ) + +#undef UZP2 + +#define UZP4(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + ARMVectorReg scratch[4]; \ + size_t oprsz = simd_oprsz(desc); \ + size_t quads = oprsz / (sizeof(TYPE) * 4); \ + TYPE *d0, *d1, *d2, *d3; \ + if (vs == vd) { \ + vs = memcpy(scratch, vs, sizeof(scratch)); \ + } \ + d0 = vd; \ + d1 = vd + sizeof(ARMVectorReg); \ + d2 = vd + 2 * sizeof(ARMVectorReg); \ + d3 = vd + 3 * sizeof(ARMVectorReg); \ + for (size_t r = 0; r < 4; ++r) { \ + TYPE *s = vs + r * sizeof(ARMVectorReg); \ + size_t base = r * quads; \ + for (size_t q = 0; q < quads; ++q) { \ + d0[base + H(q)] = s[H(4 * q + 0)]; \ + d1[base + H(q)] = s[H(4 * q + 1)]; \ + d2[base + H(q)] = s[H(4 * q + 2)]; \ + d3[base + H(q)] = s[H(4 * q + 3)]; \ + } \ + } \ +} + +UZP4(sme2_uzp4_b, uint8_t, H1) +UZP4(sme2_uzp4_h, uint16_t, H2) +UZP4(sme2_uzp4_s, uint32_t, H4) +UZP4(sme2_uzp4_d, uint64_t, ) +UZP4(sme2_uzp4_q, Int128, ) + +#undef UZP4 + +#define ICLAMP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + size_t stride = sizeof(ARMVectorReg) / sizeof(TYPE); \ + size_t elements = simd_oprsz(desc) / sizeof(TYPE); \ + size_t nreg = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (size_t e = 0; e < elements; e++) { \ + TYPE nn = n[H(e)], mm = m[H(e)]; \ + for (size_t r = 0; r < nreg; r++) { \ + TYPE *dd = &d[r * stride + H(e)]; \ + *dd = MIN(MAX(*dd, nn), mm); \ + } \ + } \ +} + +ICLAMP(sme2_sclamp_b, int8_t, H1) +ICLAMP(sme2_sclamp_h, int16_t, H2) +ICLAMP(sme2_sclamp_s, int32_t, H4) +ICLAMP(sme2_sclamp_d, int64_t, H8) + +ICLAMP(sme2_uclamp_b, uint8_t, H1) +ICLAMP(sme2_uclamp_h, uint16_t, H2) +ICLAMP(sme2_uclamp_s, uint32_t, H4) +ICLAMP(sme2_uclamp_d, uint64_t, H8) + +#undef ICLAMP + +/* + * Note the argument ordering to minnum and maxnum must match + * the ARM pseudocode so that NaNs are propagated properly. + */ +#define FCLAMP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, \ + float_status *fpst, uint32_t desc) \ +{ \ + size_t stride = sizeof(ARMVectorReg) / sizeof(TYPE); \ + size_t elements = simd_oprsz(desc) / sizeof(TYPE); \ + size_t nreg = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (size_t e = 0; e < elements; e++) { \ + TYPE nn = n[H(e)], mm = m[H(e)]; \ + for (size_t r = 0; r < nreg; r++) { \ + TYPE *dd = &d[r * stride + H(e)]; \ + *dd = TYPE##_minnum(TYPE##_maxnum(nn, *dd, fpst), mm, fpst); \ + } \ + } \ +} + +FCLAMP(sme2_fclamp_h, float16, H2) +FCLAMP(sme2_fclamp_s, float32, H4) +FCLAMP(sme2_fclamp_d, float64, H8) +FCLAMP(sme2_bfclamp, bfloat16, H2) + +#undef FCLAMP + +void HELPER(sme2_sel_b)(void *vd, void *vn, void *vm, + uint32_t png, uint32_t desc) +{ + int vl = simd_oprsz(desc); + int nreg = simd_data(desc); + int elements = vl / sizeof(uint8_t); + DecodeCounter p = decode_counter(png, vl, MO_8); + + if (p.lg2_stride == 0) { + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint8_t *d = vd + r * sizeof(ARMVectorReg); + uint8_t *n = vn + r * sizeof(ARMVectorReg); + uint8_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, n, vl); /* all true */ + } else if (elements <= split) { + memcpy(d, m, vl); /* all false */ + } else { + for (int e = 0; e < split; e++) { + d[H1(e)] = m[H1(e)]; + } + for (int e = split; e < elements; e++) { + d[H1(e)] = n[H1(e)]; + } + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint8_t *d = vd + r * sizeof(ARMVectorReg); + uint8_t *n = vn + r * sizeof(ARMVectorReg); + uint8_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, m, vl); /* all false */ + } else if (elements <= split) { + memcpy(d, n, vl); /* all true */ + } else { + for (int e = 0; e < split; e++) { + d[H1(e)] = n[H1(e)]; + } + for (int e = split; e < elements; e++) { + d[H1(e)] = m[H1(e)]; + } + } + } + } + } else { + int estride = 1 << p.lg2_stride; + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint8_t *d = vd + r * sizeof(ARMVectorReg); + uint8_t *n = vn + r * sizeof(ARMVectorReg); + uint8_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + int e = 0; + + for (; e < MIN(split, elements); e++) { + d[H1(e)] = m[H1(e)]; + } + for (; e < elements; e += estride) { + d[H1(e)] = n[H1(e)]; + for (int i = 1; i < estride; i++) { + d[H1(e + i)] = m[H1(e + i)]; + } + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint8_t *d = vd + r * sizeof(ARMVectorReg); + uint8_t *n = vn + r * sizeof(ARMVectorReg); + uint8_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + int e = 0; + + for (; e < MIN(split, elements); e += estride) { + d[H1(e)] = n[H1(e)]; + for (int i = 1; i < estride; i++) { + d[H1(e + i)] = m[H1(e + i)]; + } + } + for (; e < elements; e++) { + d[H1(e)] = m[H1(e)]; + } + } + } + } +} + +void HELPER(sme2_sel_h)(void *vd, void *vn, void *vm, + uint32_t png, uint32_t desc) +{ + int vl = simd_oprsz(desc); + int nreg = simd_data(desc); + int elements = vl / sizeof(uint16_t); + DecodeCounter p = decode_counter(png, vl, MO_16); + + if (p.lg2_stride == 0) { + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint16_t *d = vd + r * sizeof(ARMVectorReg); + uint16_t *n = vn + r * sizeof(ARMVectorReg); + uint16_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, n, vl); /* all true */ + } else if (elements <= split) { + memcpy(d, m, vl); /* all false */ + } else { + for (int e = 0; e < split; e++) { + d[H2(e)] = m[H2(e)]; + } + for (int e = split; e < elements; e++) { + d[H2(e)] = n[H2(e)]; + } + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint16_t *d = vd + r * sizeof(ARMVectorReg); + uint16_t *n = vn + r * sizeof(ARMVectorReg); + uint16_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, m, vl); /* all false */ + } else if (elements <= split) { + memcpy(d, n, vl); /* all true */ + } else { + for (int e = 0; e < split; e++) { + d[H2(e)] = n[H2(e)]; + } + for (int e = split; e < elements; e++) { + d[H2(e)] = m[H2(e)]; + } + } + } + } + } else { + int estride = 1 << p.lg2_stride; + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint16_t *d = vd + r * sizeof(ARMVectorReg); + uint16_t *n = vn + r * sizeof(ARMVectorReg); + uint16_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + int e = 0; + + for (; e < MIN(split, elements); e++) { + d[H2(e)] = m[H2(e)]; + } + for (; e < elements; e += estride) { + d[H2(e)] = n[H2(e)]; + for (int i = 1; i < estride; i++) { + d[H2(e + i)] = m[H2(e + i)]; + } + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint16_t *d = vd + r * sizeof(ARMVectorReg); + uint16_t *n = vn + r * sizeof(ARMVectorReg); + uint16_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + int e = 0; + + for (; e < MIN(split, elements); e += estride) { + d[H2(e)] = n[H2(e)]; + for (int i = 1; i < estride; i++) { + d[H2(e + i)] = m[H2(e + i)]; + } + } + for (; e < elements; e++) { + d[H2(e)] = m[H2(e)]; + } + } + } + } +} + +void HELPER(sme2_sel_s)(void *vd, void *vn, void *vm, + uint32_t png, uint32_t desc) +{ + int vl = simd_oprsz(desc); + int nreg = simd_data(desc); + int elements = vl / sizeof(uint32_t); + DecodeCounter p = decode_counter(png, vl, MO_32); + + if (p.lg2_stride == 0) { + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint32_t *d = vd + r * sizeof(ARMVectorReg); + uint32_t *n = vn + r * sizeof(ARMVectorReg); + uint32_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, n, vl); /* all true */ + } else if (elements <= split) { + memcpy(d, m, vl); /* all false */ + } else { + for (int e = 0; e < split; e++) { + d[H4(e)] = m[H4(e)]; + } + for (int e = split; e < elements; e++) { + d[H4(e)] = n[H4(e)]; + } + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint32_t *d = vd + r * sizeof(ARMVectorReg); + uint32_t *n = vn + r * sizeof(ARMVectorReg); + uint32_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, m, vl); /* all false */ + } else if (elements <= split) { + memcpy(d, n, vl); /* all true */ + } else { + for (int e = 0; e < split; e++) { + d[H4(e)] = n[H4(e)]; + } + for (int e = split; e < elements; e++) { + d[H4(e)] = m[H4(e)]; + } + } + } + } + } else { + /* p.esz must be MO_64, so stride must be 2. */ + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint32_t *d = vd + r * sizeof(ARMVectorReg); + uint32_t *n = vn + r * sizeof(ARMVectorReg); + uint32_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + int e = 0; + + for (; e < MIN(split, elements); e++) { + d[H4(e)] = m[H4(e)]; + } + for (; e < elements; e += 2) { + d[H4(e)] = n[H4(e)]; + d[H4(e + 1)] = m[H4(e + 1)]; + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint32_t *d = vd + r * sizeof(ARMVectorReg); + uint32_t *n = vn + r * sizeof(ARMVectorReg); + uint32_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + int e = 0; + + for (; e < MIN(split, elements); e += 2) { + d[H4(e)] = n[H4(e)]; + d[H4(e + 1)] = m[H4(e + 1)]; + } + for (; e < elements; e++) { + d[H4(e)] = m[H4(e)]; + } + } + } + } +} + +void HELPER(sme2_sel_d)(void *vd, void *vn, void *vm, + uint32_t png, uint32_t desc) +{ + int vl = simd_oprsz(desc); + int nreg = simd_data(desc); + int elements = vl / sizeof(uint64_t); + DecodeCounter p = decode_counter(png, vl, MO_64); + + if (p.invert) { + for (int r = 0; r < nreg; r++) { + uint64_t *d = vd + r * sizeof(ARMVectorReg); + uint64_t *n = vn + r * sizeof(ARMVectorReg); + uint64_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, n, vl); /* all true */ + } else if (elements <= split) { + memcpy(d, m, vl); /* all false */ + } else { + memcpy(d, m, split * sizeof(uint64_t)); + memcpy(d + split, n + split, + (elements - split) * sizeof(uint64_t)); + } + } + } else { + for (int r = 0; r < nreg; r++) { + uint64_t *d = vd + r * sizeof(ARMVectorReg); + uint64_t *n = vn + r * sizeof(ARMVectorReg); + uint64_t *m = vm + r * sizeof(ARMVectorReg); + int split = p.count - r * elements; + + if (split <= 0) { + memcpy(d, m, vl); /* all false */ + } else if (elements <= split) { + memcpy(d, n, vl); /* all true */ + } else { + memcpy(d, n, split * sizeof(uint64_t)); + memcpy(d + split, m + split, + (elements - split) * sizeof(uint64_t)); + } + } + } +} diff --git a/target/arm/tcg/sve.decode b/target/arm/tcg/sve.decode index 04b6fcc..ab63cfa 100644 --- a/target/arm/tcg/sve.decode +++ b/target/arm/tcg/sve.decode @@ -30,6 +30,7 @@ %size_23 23:2 %dtype_23_13 23:2 13:2 %index3_22_19 22:1 19:2 +%index3_22_17 22:1 17:2 %index3_19_11 19:2 11:1 %index2_20_11 20:1 11:1 @@ -57,6 +58,11 @@ # as propagated via the MOVPRFX instruction. %reg_movprfx 0:5 +%rn_ax2 6:4 !function=times_2 + +%pnd 0:3 !function=plus_8 +%pnn 5:3 !function=plus_8 + ########################################################################### # Named attribute sets. These are used to make nice(er) names # when creating helpers common to those for the individual @@ -102,6 +108,7 @@ # Two operand @pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz @rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz +@rd_rnx2 ........ ... ..... ...... ..... rd:5 &rr_esz rn=%rn_ax2 # Two operand with governing predicate, flags setting @pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s @@ -131,11 +138,11 @@ @rda_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 \ &rrrr_esz ra=%reg_movprfx -# Four operand with unused vector element size -@rda_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 \ - &rrrr_esz esz=0 ra=%reg_movprfx -@rdn_ra_rm_e0 ........ ... rm:5 ... ... ra:5 rd:5 \ - &rrrr_esz esz=0 rn=%reg_movprfx +# Four operand with explicit vector element size +@rda_rn_rm_ex ........ ... rm:5 ... ... rn:5 rd:5 \ + &rrrr_esz ra=%reg_movprfx +@rdn_ra_rm_ex ........ ... rm:5 ... ... ra:5 rd:5 \ + &rrrr_esz rn=%reg_movprfx # Three operand with "memory" size, aka immediate left shift @rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri @@ -222,6 +229,9 @@ @rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load @rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load +@rprr_load ....... .... rm:5 ... pg:3 rn:5 rd:5 &rprr_load +@rpri_load ....... .... . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load + @rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \ &rprr_load dtype=%msz_dtype @rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \ @@ -245,7 +255,7 @@ # Stores; user must fill in ESZ, MSZ, NREG as needed. @rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store -@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store +@rpri_store ....... .. .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store @rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \ &rprr_store nreg=0 @rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \ @@ -320,6 +330,11 @@ ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn +# SVE2.1 bitwise logical reduction (quadwords) +ORQV 00000100 .. 011 100 001 ... ..... ..... @rd_pg_rn +EORQV 00000100 .. 011 101 001 ... ..... ..... @rd_pg_rn +ANDQV 00000100 .. 011 110 001 ... ..... ..... @rd_pg_rn + # SVE constructive prefix (predicated) MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn MOVPRFX_m 00000100 .. 010 001 001 ... ..... ..... @rd_pg_rn @@ -335,6 +350,13 @@ UMAXV 00000100 .. 001 001 001 ... ..... ..... @rd_pg_rn SMINV 00000100 .. 001 010 001 ... ..... ..... @rd_pg_rn UMINV 00000100 .. 001 011 001 ... ..... ..... @rd_pg_rn +# SVE2.1 segment reduction +ADDQV 00000100 .. 000 101 001 ... ..... ..... @rd_pg_rn +SMAXQV 00000100 .. 001 100 001 ... ..... ..... @rd_pg_rn +SMINQV 00000100 .. 001 110 001 ... ..... ..... @rd_pg_rn +UMAXQV 00000100 .. 001 101 001 ... ..... ..... @rd_pg_rn +UMINQV 00000100 .. 001 111 001 ... ..... ..... @rd_pg_rn + ### SVE Shift by Immediate - Predicated Group # SVE bitwise shift by immediate (predicated) @@ -428,12 +450,12 @@ XAR 00000100 .. 1 ..... 001 101 rm:5 rd:5 &rrri_esz \ rn=%reg_movprfx esz=%tszimm16_esz imm=%tszimm16_shr # SVE2 bitwise ternary operations -EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 -BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 -BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 -BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 -BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 -NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_ex esz=0 +BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0 +BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_ex esz=0 +BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0 +BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0 +NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_ex esz=0 ### SVE Index Generation Group @@ -559,6 +581,14 @@ DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \ &rri imm=%imm7_22_16 +# SVE Permute Vector - one source quadwords +DUPQ 00000101 001 imm:4 1 001001 rn:5 rd:5 &rri_esz esz=0 +DUPQ 00000101 001 imm:3 10 001001 rn:5 rd:5 &rri_esz esz=1 +DUPQ 00000101 001 imm:2 100 001001 rn:5 rd:5 &rri_esz esz=2 +DUPQ 00000101 001 imm:1 1000 001001 rn:5 rd:5 &rri_esz esz=3 + +EXTQ 00000101 0110 imm:4 001001 rn:5 rd:5 &rri + # SVE insert SIMD&FP scalar register INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm @@ -568,6 +598,22 @@ INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm # SVE reverse vector elements REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn +# SVE move predicate to/from vector + +PMOV_pv 00000101 00 101 01 0001110 rn:5 0 rd:4 \ + &rri_esz esz=0 imm=0 +PMOV_pv 00000101 00 101 1 imm:1 0001110 rn:5 0 rd:4 &rri_esz esz=1 +PMOV_pv 00000101 01 101 imm:2 0001110 rn:5 0 rd:4 &rri_esz esz=2 +PMOV_pv 00000101 1. 101 .. 0001110 rn:5 0 rd:4 \ + &rri_esz esz=3 imm=%index3_22_17 + +PMOV_vp 00000101 00 101 01 1001110 0 rn:4 rd:5 \ + &rri_esz esz=0 imm=0 +PMOV_vp 00000101 00 101 1 imm:1 1001110 0 rn:4 rd:5 &rri_esz esz=1 +PMOV_vp 00000101 01 101 imm:2 1001110 0 rn:4 rd:5 &rri_esz esz=2 +PMOV_vp 00000101 1. 101 .. 1001110 0 rn:4 rd:5 \ + &rri_esz esz=3 imm=%index3_22_17 + # SVE vector table lookup TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm @@ -614,6 +660,15 @@ UZP2_q 00000101 10 1 ..... 000 011 ..... ..... @rd_rn_rm_e0 TRN1_q 00000101 10 1 ..... 000 110 ..... ..... @rd_rn_rm_e0 TRN2_q 00000101 10 1 ..... 000 111 ..... ..... @rd_rn_rm_e0 +# SVE2.1 permute vector elements (quadwords) +ZIPQ1 01000100 .. 0 ..... 111 000 ..... ..... @rd_rn_rm +ZIPQ2 01000100 .. 0 ..... 111 001 ..... ..... @rd_rn_rm +UZPQ1 01000100 .. 0 ..... 111 010 ..... ..... @rd_rn_rm +UZPQ2 01000100 .. 0 ..... 111 011 ..... ..... @rd_rn_rm + +TBLQ 01000100 .. 0 ..... 111 110 ..... ..... @rd_rn_rm +TBXQ 00000101 .. 1 ..... 001 101 ..... ..... @rd_rn_rm + ### SVE Permute - Predicated Group # SVE compress active elements @@ -725,6 +780,7 @@ PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000 # SVE predicate initialize PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4 +PTRUE_cnt 00100101 esz:2 1000000111100000010 ... rd=%pnd # SVE initialize FFR SETFFR 00100101 0010 1100 1001 0000 0000 0000 @@ -765,7 +821,8 @@ BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s ### SVE Predicate Count Group # SVE predicate count -CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn +CNTP 00100101 .. 100 000 10 .... 0 .... ..... @rd_pg4_pn +CNTP_c 00100101 esz:2 100 000 10 000 vl:1 1 rn:4 rd:5 # SVE inc/dec register by predicate count INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1 @@ -786,11 +843,35 @@ SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000 # SVE integer compare scalar count and limit -WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 lt:1 rn:5 eq:1 rd:4 +&while esz rd rn rm sf u eq +WHILE_lt 00100101 esz:2 1 rm:5 000 sf:1 u:1 1 rn:5 eq:1 rd:4 &while +WHILE_gt 00100101 esz:2 1 rm:5 000 sf:1 u:1 0 rn:5 eq:1 rd:4 &while # SVE2 pointer conflict compare WHILE_ptr 00100101 esz:2 1 rm:5 001 100 rn:5 rw:1 rd:4 +# SVE2.1 predicate pair +%pd_pair 1:3 !function=times_2 +@while_pair ........ esz:2 . rm:5 .... u:1 . rn:5 . ... eq:1 \ + &while rd=%pd_pair sf=1 + +WHILE_lt_pair 00100101 .. 1 ..... 0101 . 1 ..... 1 ... . @while_pair +WHILE_gt_pair 00100101 .. 1 ..... 0101 . 0 ..... 1 ... . @while_pair + +# SVE2.1 predicate as count +@while_cnt ........ esz:2 . rm:5 .... u:1 . rn:5 . eq:1 ... \ + &while rd=%pnd sf=1 + +WHILE_lt_cnt2 00100101 .. 1 ..... 0100 . 1 ..... 1 . ... @while_cnt +WHILE_lt_cnt4 00100101 .. 1 ..... 0110 . 1 ..... 1 . ... @while_cnt +WHILE_gt_cnt2 00100101 .. 1 ..... 0100 . 0 ..... 1 . ... @while_cnt +WHILE_gt_cnt4 00100101 .. 1 ..... 0110 . 0 ..... 1 . ... @while_cnt + +# SVE2.1 extract mask predicate from predicate-as-counter +&pext rd rn esz imm +PEXT_1 00100101 esz:2 1 00000 0111 00 imm:2 ... 1 rd:4 &pext rn=%pnn +PEXT_2 00100101 esz:2 1 00000 0111 010 imm:1 ... 1 rd:4 &pext rn=%pnn + ### SVE Integer Wide Immediate - Unpredicated Group # SVE broadcast floating-point immediate (unpredicated) @@ -851,10 +932,13 @@ CDOT_zzzz 01000100 esz:2 0 rm:5 0001 rot:2 rn:5 rd:5 ra=%reg_movprfx #### SVE Multiply - Indexed # SVE integer dot product (indexed) -SDOT_zzxw_s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 -SDOT_zzxw_d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 -UDOT_zzxw_s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 -UDOT_zzxw_d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 +SDOT_zzxw_4s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 +SDOT_zzxw_4d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 +UDOT_zzxw_4s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 +UDOT_zzxw_4d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 + +SDOT_zzxw_2s 01000100 10 0 ..... 110010 ..... ..... @rrxr_2 esz=2 +UDOT_zzxw_2s 01000100 10 0 ..... 110011 ..... ..... @rrxr_2 esz=2 # SVE2 integer multiply-add (indexed) MLA_zzxz_h 01000100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=1 @@ -873,8 +957,8 @@ SQRDMLSH_zzxz_s 01000100 10 1 ..... 000101 ..... ..... @rrxr_2 esz=2 SQRDMLSH_zzxz_d 01000100 11 1 ..... 000101 ..... ..... @rrxr_1 esz=3 # SVE mixed sign dot product (indexed) -USDOT_zzxw_s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2 -SUDOT_zzxw_s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2 +USDOT_zzxw_4s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2 +SUDOT_zzxw_4s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2 # SVE2 saturating multiply-add (indexed) SQDMLALB_zzxw_s 01000100 10 1 ..... 0010.0 ..... ..... @rrxr_3a esz=2 @@ -968,9 +1052,11 @@ FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \ ### SVE FP Multiply-Add Indexed Group # SVE floating-point multiply-add (indexed) +FMLA_zzxz 01100100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=0 FMLA_zzxz 01100100 0. 1 ..... 000000 ..... ..... @rrxr_3 esz=1 FMLA_zzxz 01100100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 FMLA_zzxz 01100100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 +FMLS_zzxz 01100100 0. 1 ..... 000011 ..... ..... @rrxr_3 esz=0 FMLS_zzxz 01100100 0. 1 ..... 000001 ..... ..... @rrxr_3 esz=1 FMLS_zzxz 01100100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 FMLS_zzxz 01100100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 @@ -978,6 +1064,7 @@ FMLS_zzxz 01100100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 ### SVE FP Multiply Indexed Group # SVE floating-point multiply (indexed) +FMUL_zzx 01100100 0. 1 ..... 001010 ..... ..... @rrx_3 esz=0 FMUL_zzx 01100100 0. 1 ..... 001000 ..... ..... @rrx_3 esz=1 FMUL_zzx 01100100 10 1 ..... 001000 ..... ..... @rrx_2 esz=2 FMUL_zzx 01100100 11 1 ..... 001000 ..... ..... @rrx_1 esz=3 @@ -990,6 +1077,14 @@ FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn +### SVE FP recursive reduction (quadwords) + +FADDQV 01100100 .. 010 000 101 ... ..... ..... @rd_pg_rn +FMAXNMQV 01100100 .. 010 100 101 ... ..... ..... @rd_pg_rn +FMINNMQV 01100100 .. 010 101 101 ... ..... ..... @rd_pg_rn +FMAXQV 01100100 .. 010 110 101 ... ..... ..... @rd_pg_rn +FMINQV 01100100 .. 010 111 101 ... ..... ..... @rd_pg_rn + ## SVE Floating Point Unary Operations - Unpredicated Group FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn @@ -1151,12 +1246,24 @@ LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \ # SVE contiguous load (scalar plus scalar) LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 +# LD1W (128-bit element) +LD_zprr 1010010 1000 rm:5 100 pg:3 rn:5 rd:5 \ + &rprr_load dtype=16 nreg=0 +# LD1D (128-bit element) +LD_zprr 1010010 1100 rm:5 100 pg:3 rn:5 rd:5 \ + &rprr_load dtype=17 nreg=0 # SVE contiguous first-fault load (scalar plus scalar) LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 # SVE contiguous load (scalar plus immediate) LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 +# LD1W (128-bit element) +LD_zpri 1010010 1000 1 imm:s4 001 pg:3 rn:5 rd:5 \ + &rpri_load dtype=16 nreg=0 +# LD1D (128-bit element) +LD_zpri 1010010 1100 1 imm:s4 001 pg:3 rn:5 rd:5 \ + &rpri_load dtype=17 nreg=0 # SVE contiguous non-fault load (scalar plus immediate) LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0 @@ -1166,12 +1273,26 @@ LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0 # SVE load multiple structures (scalar plus scalar) # LD2B, LD2H, LD2W, LD2D; etc. LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz +# LD[234]Q +LD_zprr 1010010 01 01 ..... 100 ... ..... ..... \ + @rprr_load dtype=18 nreg=1 +LD_zprr 1010010 10 01 ..... 100 ... ..... ..... \ + @rprr_load dtype=18 nreg=2 +LD_zprr 1010010 11 01 ..... 100 ... ..... ..... \ + @rprr_load dtype=18 nreg=3 # SVE contiguous non-temporal load (scalar plus immediate) # LDNT1B, LDNT1H, LDNT1W, LDNT1D # SVE load multiple structures (scalar plus immediate) # LD2B, LD2H, LD2W, LD2D; etc. LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz +# LD[234]Q +LD_zpri 1010010 01 001 .... 111 ... ..... ..... \ + @rpri_load dtype=18 nreg=1 +LD_zpri 1010010 10 001 .... 111 ... ..... ..... \ + @rpri_load dtype=18 nreg=2 +LD_zpri 1010010 11 001 .... 111 ... ..... ..... \ + @rpri_load dtype=18 nreg=3 # SVE load and broadcast quadword (scalar plus scalar) LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \ @@ -1222,6 +1343,10 @@ LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \ LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \ @rprr_g_load_sc esz=3 msz=3 u=1 +# LD1Q. Note that this is subtly different from LD1_zprz because +# it is vector + scalar, not scalar + vector. +LD1Q 1100 0100 000 rm:5 101 pg:3 rn:5 rd:5 + # SVE 64-bit gather load (vector plus immediate) LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \ @rpri_g_load esz=3 @@ -1245,8 +1370,20 @@ STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9 # SVE contiguous store (scalar plus immediate) # ST1B, ST1H, ST1W, ST1D; require msz <= esz -ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \ - @rpri_store_msz nreg=0 +ST_zpri 1110010 00 esz:2 0.... 111 ... ..... ..... \ + @rpri_store msz=0 nreg=0 +ST_zpri 1110010 01 esz:2 0.... 111 ... ..... ..... \ + @rpri_store msz=1 nreg=0 +ST_zpri 1110010 10 10 0.... 111 ... ..... ..... \ + @rpri_store msz=2 esz=2 nreg=0 +ST_zpri 1110010 10 11 0.... 111 ... ..... ..... \ + @rpri_store msz=2 esz=3 nreg=0 +ST_zpri 1110010 11 11 0.... 111 ... ..... ..... \ + @rpri_store msz=3 esz=3 nreg=0 +ST_zpri 1110010 10 00 0.... 111 ... ..... ..... \ + @rpri_store msz=2 esz=4 nreg=0 +ST_zpri 1110010 11 10 0.... 111 ... ..... ..... \ + @rpri_store msz=3 esz=4 nreg=0 # SVE contiguous store (scalar plus scalar) # ST1B, ST1H, ST1W, ST1D; require msz <= esz @@ -1255,20 +1392,40 @@ ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \ @rprr_store_esz_n0 msz=0 ST_zprr 1110010 01 .. ..... 010 ... ..... ..... \ @rprr_store_esz_n0 msz=1 -ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \ - @rprr_store_esz_n0 msz=2 +ST_zprr 1110010 10 10 ..... 010 ... ..... ..... \ + @rprr_store msz=2 esz=2 nreg=0 +ST_zprr 1110010 10 11 ..... 010 ... ..... ..... \ + @rprr_store msz=2 esz=3 nreg=0 ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \ @rprr_store msz=3 esz=3 nreg=0 +ST_zprr 1110010 10 00 ..... 010 ... ..... ..... \ + @rprr_store msz=2 esz=4 nreg=0 +ST_zprr 1110010 11 10 ..... 010 ... ..... ..... \ + @rprr_store msz=3 esz=4 nreg=0 # SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0) # SVE store multiple structures (scalar plus immediate) (nreg != 0) ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \ - @rpri_store_msz esz=%size_23 + @rpri_store msz=%size_23 esz=%size_23 +# ST[234]Q +ST_zpri 11100100 01 00 .... 000 ... ..... ..... \ + @rpri_store msz=4 esz=4 nreg=1 +ST_zpri 11100100 10 00 .... 000 ... ..... ..... \ + @rpri_store msz=4 esz=4 nreg=2 +ST_zpri 11100100 11 00 .... 000 ... ..... ..... \ + @rpri_store msz=4 esz=4 nreg=3 # SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0) # SVE store multiple structures (scalar plus scalar) (nreg != 0) -ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \ - @rprr_store esz=%size_23 +ST_zprr 1110010 .. nreg:2 ..... 011 ... ..... ..... \ + @rprr_store msz=%size_23 esz=%size_23 +# ST[234]Q +ST_zprr 11100100 01 1 ..... 000 ... ..... ..... \ + @rprr_store msz=4 esz=4 nreg=1 +ST_zprr 11100100 10 1 ..... 000 ... ..... ..... \ + @rprr_store msz=4 esz=4 nreg=2 +ST_zprr 11100100 11 1 ..... 000 ... ..... ..... \ + @rprr_store msz=4 esz=4 nreg=3 # SVE 32-bit scatter store (scalar plus 32-bit scaled offsets) # Require msz > 0 && msz <= esz. @@ -1293,6 +1450,10 @@ ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \ ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \ @rprr_scatter_store xs=2 esz=3 scale=0 +# ST1Q. Note that this is subtly different from ST1_zprz because +# it is vector + scalar, not scalar + vector. +ST1Q 1110 0100 001 rm:5 001 pg:3 rn:5 rd:5 + # SVE 64-bit scatter store (vector plus immediate) ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \ @rpri_scatter_store esz=3 @@ -1450,9 +1611,9 @@ EORTB 01000101 .. 0 ..... 10010 1 ..... ..... @rd_rn_rm ## SVE integer matrix multiply accumulate -SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 -USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 -UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 +SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_ex esz=2 +USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_ex esz=2 +UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_ex esz=2 ## SVE2 bitwise permute @@ -1504,13 +1665,22 @@ UABA 01000101 .. 0 ..... 11111 1 ..... ..... @rd_rn_rm #### SVE2 Narrowing ## SVE2 saturating extract narrow - # Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0. -SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl + +{ + SQCVTN_sh 01000101 00 1 10001 010 000 ....0 ..... @rd_rnx2 esz=1 + SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl +} SQXTNT 01000101 .. 1 ..... 010 001 ..... ..... @rd_rn_tszimm_shl -UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl +{ + UQCVTN_sh 01000101 00 1 10001 010 010 ....0 ..... @rd_rnx2 esz=1 + UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl +} UQXTNT 01000101 .. 1 ..... 010 011 ..... ..... @rd_rn_tszimm_shl -SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl +{ + SQCVTUN_sh 01000101 00 1 10001 010 100 ....0 ..... @rd_rnx2 esz=1 + SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl +} SQXTUNT 01000101 .. 1 ..... 010 101 ..... ..... @rd_rn_tszimm_shl ## SVE2 bitwise shift right narrow @@ -1597,14 +1767,17 @@ UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx -## SVE mixed sign dot product +## SVE dot product + +SDOT_zzzz_2s 01000100 00 0 ..... 110 010 ..... ..... @rda_rn_rm_ex esz=2 +UDOT_zzzz_2s 01000100 00 0 ..... 110 011 ..... ..... @rda_rn_rm_ex esz=2 -USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm +USDOT_zzzz_4s 01000100 10 0 ..... 011 110 ..... ..... @rda_rn_rm_ex esz=2 ### SVE2 floating point matrix multiply accumulate -BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 -FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 -FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 +BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_ex esz=1 +FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_ex esz=2 +FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_ex esz=3 ### SVE2 Memory Gather Load Group @@ -1654,26 +1827,35 @@ FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz ### SVE2 floating-point multiply-add long (vectors) -FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 -FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 -FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0 -FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0 +FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2 +FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_ex esz=2 +FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_ex esz=2 +FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_ex esz=2 -BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 -BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 +BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2 +BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_ex esz=2 +BFMLSLB_zzzw 01100100 11 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_ex esz=2 +BFMLSLT_zzzw 01100100 11 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_ex esz=2 -### SVE2 floating-point bfloat16 dot-product -BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 +### SVE2 floating-point dot-product +FDOT_zzzz 01100100 00 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2 +BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_ex esz=2 ### SVE2 floating-point multiply-add long (indexed) + FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2 FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2 FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2 + BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2 +BFMLSLB_zzxw 01100100 11 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2 +BFMLSLT_zzxw 01100100 11 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2 -### SVE2 floating-point bfloat16 dot-product (indexed) +### SVE2 floating-point dot-product (indexed) + +FDOT_zzxz 01100100 00 1 ..... 010000 ..... ..... @rrxr_2 esz=2 BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2 ### SVE broadcast predicate element @@ -1700,3 +1882,55 @@ PSEL 00100101 .1 1 000 .. 01 .... 0 .... 0 .... \ SCLAMP 01000100 .. 0 ..... 110000 ..... ..... @rda_rn_rm UCLAMP 01000100 .. 0 ..... 110001 ..... ..... @rda_rn_rm + +FCLAMP 01100100 .. 1 ..... 001001 ..... ..... @rda_rn_rm + +### SVE2p1 multi-vec contiguous load + +&zcrr_ldst rd png rn rm esz nreg +&zcri_ldst rd png rn imm esz nreg +%png 10:3 !function=plus_8 +%zd_ax2 1:4 !function=times_2 +%zd_ax4 2:3 !function=times_4 + +LD1_zcrr 10100000000 rm:5 0 esz:2 ... rn:5 .... - \ + &zcrr_ldst %png rd=%zd_ax2 nreg=2 +LD1_zcrr 10100000000 rm:5 1 esz:2 ... rn:5 ... 0- \ + &zcrr_ldst %png rd=%zd_ax4 nreg=4 + +ST1_zcrr 10100000001 rm:5 0 esz:2 ... rn:5 .... - \ + &zcrr_ldst %png rd=%zd_ax2 nreg=2 +ST1_zcrr 10100000001 rm:5 1 esz:2 ... rn:5 ... 0- \ + &zcrr_ldst %png rd=%zd_ax4 nreg=4 + +LD1_zcri 101000000100 imm:s4 0 esz:2 ... rn:5 .... - \ + &zcri_ldst %png rd=%zd_ax2 nreg=2 +LD1_zcri 101000000100 imm:s4 1 esz:2 ... rn:5 ... 0- \ + &zcri_ldst %png rd=%zd_ax4 nreg=4 + +ST1_zcri 101000000110 imm:s4 0 esz:2 ... rn:5 .... - \ + &zcri_ldst %png rd=%zd_ax2 nreg=2 +ST1_zcri 101000000110 imm:s4 1 esz:2 ... rn:5 ... 0- \ + &zcri_ldst %png rd=%zd_ax4 nreg=4 + +# Note: N bit and 0 bit (for nreg4) still mashed in rd. +# This is handled within gen_ldst_c(). +LD1_zcrr_stride 10100001000 rm:5 0 esz:2 ... rn:5 rd:5 \ + &zcrr_ldst %png nreg=2 +LD1_zcrr_stride 10100001000 rm:5 1 esz:2 ... rn:5 rd:5 \ + &zcrr_ldst %png nreg=4 + +ST1_zcrr_stride 10100001001 rm:5 0 esz:2 ... rn:5 rd:5 \ + &zcrr_ldst %png nreg=2 +ST1_zcrr_stride 10100001001 rm:5 1 esz:2 ... rn:5 rd:5 \ + &zcrr_ldst %png nreg=4 + +LD1_zcri_stride 101000010100 imm:s4 0 esz:2 ... rn:5 rd:5 \ + &zcri_ldst %png nreg=2 +LD1_zcri_stride 101000010100 imm:s4 1 esz:2 ... rn:5 rd:5 \ + &zcri_ldst %png nreg=4 + +ST1_zcri_stride 101000010110 imm:s4 0 esz:2 ... rn:5 rd:5 \ + &zcri_ldst %png nreg=2 +ST1_zcri_stride 101000010110 imm:s4 1 esz:2 ... rn:5 rd:5 \ + &zcri_ldst %png nreg=4 diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index d786b4b..c442fcb 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -20,15 +20,19 @@ #include "qemu/osdep.h" #include "cpu.h" #include "internals.h" -#include "exec/exec-all.h" #include "exec/page-protection.h" #include "exec/helper-proto.h" +#include "exec/target_page.h" +#include "exec/tlb-flags.h" #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" #include "tcg/tcg.h" #include "vec_internal.h" #include "sve_ldst_internal.h" +#include "accel/tcg/cpu-ldst.h" +#include "accel/tcg/helper-retaddr.h" #include "accel/tcg/cpu-ops.h" +#include "accel/tcg/probe.h" #ifdef CONFIG_USER_ONLY #include "user/page-protection.h" #endif @@ -119,6 +123,11 @@ static inline uint64_t expand_pred_s(uint8_t byte) return word[byte & 0x11]; } +static inline uint64_t expand_pred_d(uint8_t byte) +{ + return -(uint64_t)(byte & 1); +} + #define LOGICAL_PPPP(NAME, FUNC) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ { \ @@ -202,6 +211,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ #define DO_EOR(N, M) (N ^ M) #define DO_ORR(N, M) (N | M) #define DO_BIC(N, M) (N & ~M) +#define DO_ORC(N, M) (N | ~M) #define DO_ADD(N, M) (N + M) #define DO_SUB(N, M) (N - M) #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) @@ -523,14 +533,9 @@ DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) -static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) -{ - return val >= max ? max : val <= min ? min : val; -} - -#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) +#define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m) +#define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m) +#define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m) static inline int64_t do_sqadd_d(int64_t n, int64_t m) { @@ -547,9 +552,9 @@ DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) -#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) -#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) -#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) +#define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m) +#define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m) +#define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m) static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) { @@ -562,9 +567,9 @@ DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) -#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) -#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) -#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) +#define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m) +#define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m) +#define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m) static inline int64_t do_sqsub_d(int64_t n, int64_t m) { @@ -581,9 +586,9 @@ DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) -#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) -#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) -#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) +#define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m) +#define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m) +#define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m) static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) { @@ -595,12 +600,9 @@ DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) -#define DO_SUQADD_B(n, m) \ - do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SUQADD_H(n, m) \ - do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SUQADD_S(n, m) \ - do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) +#define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m) +#define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m) +#define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m) static inline int64_t do_suqadd_d(int64_t n, uint64_t m) { @@ -630,12 +632,9 @@ DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) -#define DO_USQADD_B(n, m) \ - do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) -#define DO_USQADD_H(n, m) \ - do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) -#define DO_USQADD_S(n, m) \ - do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) +#define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m) +#define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m) +#define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m) static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) { @@ -1222,37 +1221,29 @@ void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ } \ } -#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) -#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) -#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) - -DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) -DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) -DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) - -DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) -DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) -DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) +DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b) +DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h) +DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s) -#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) -#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) -#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) +DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b) +DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h) +DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s) -DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) -DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) -DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) +DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b) +DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h) +DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s) -DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) +DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b) +DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h) +DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s) -DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) -DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) -DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) +DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b) +DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h) +DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s) -DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) +DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b) +DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h) +DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s) #undef DO_XTNB #undef DO_XTNT @@ -1829,6 +1820,52 @@ DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) #undef DO_VPZ #undef DO_VPZ_D +#define DO_VPQ(NAME, TYPE, H, INIT, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \ + TYPE *n = vn; uint16_t *g = vg; \ + uintptr_t oprsz = simd_oprsz(desc); \ + uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \ + for (uintptr_t s = 0; s < nseg; s++) { \ + uint16_t pg = g[H2(s)]; \ + for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \ + if (pg & 1) { \ + tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]); \ + } \ + } \ + } \ + memcpy(vd, tmp, 16); \ + clear_tail(vd, 16, simd_maxsz(desc)); \ +} + +DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD) +DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD) +DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD) +DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD) + +DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX) +DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX) +DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX) +DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX) + +DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN) +DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN) +DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN) +DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN) + +DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX) +DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX) +DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX) +DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX) + +DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN) +DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN) +DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN) +DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN) + +#undef DO_VPQ + /* Two vector operand, one scalar operand, unpredicated. */ #define DO_ZZI(NAME, TYPE, OP) \ void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ @@ -1869,10 +1906,46 @@ DO_ZZI(sve_umini_d, uint64_t, DO_MIN) #undef DO_ZZI +#define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \ +void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + unsigned seg = simd_oprsz(desc) / 16; \ + uint64_t r0 = INIT, r1 = INIT; \ + for (unsigned s = 0; s < seg; s++) { \ + uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \ + uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \ + uint64_t v0 = *(uint64_t *)(vn + s * 16); \ + uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \ + v0 = POP(v0, p0), v1 = POP(v1, p1); \ + r0 = VOP(r0, v0), r1 = VOP(r1, v1); \ + } \ + *(uint64_t *)(vd + 0) = r0; \ + *(uint64_t *)(vd + 8) = r1; \ + clear_tail(vd, 16, simd_maxsz(desc)); \ +} + +DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND) +DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND) +DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND) +DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND) + +DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND) +DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND) +DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND) +DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND) + +DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC) +DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC) +DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC) +DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC) + +#undef DO_LOGIC_QV + #undef DO_AND #undef DO_ORR #undef DO_EOR #undef DO_BIC +#undef DO_ORC #undef DO_ADD #undef DO_SUB #undef DO_MAX @@ -2065,27 +2138,6 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ when N is negative, add 2**M-1. */ #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. */ - return 0; - } -} - DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) @@ -2183,10 +2235,9 @@ DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) -#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) -#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) -#define DO_SQSHRUN_D(x, sh) \ - do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX) +#define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh) +#define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh) +#define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63)) DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) @@ -2196,9 +2247,9 @@ DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) -#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) -#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) -#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) +#define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh)) +#define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh)) +#define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh)) DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) @@ -2208,9 +2259,9 @@ DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) -#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) -#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) -#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) +#define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh) +#define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh) +#define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh) DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) @@ -2220,9 +2271,9 @@ DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) -#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) -#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) -#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) +#define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh)) +#define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh)) +#define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh)) DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) @@ -2984,6 +3035,56 @@ void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) } } +/* + * TODO: This could use half_shuffle64 and similar bit tricks to + * expand blocks of bits at once. + */ +#define DO_PMOV_PV(NAME, ESIZE) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + unsigned vl = simd_oprsz(desc); \ + unsigned idx = simd_data(desc); \ + unsigned elements = vl / ESIZE; \ + ARMPredicateReg *d = vd; \ + ARMVectorReg *s = vs; \ + memset(d, 0, sizeof(*d)); \ + for (unsigned e = 0; e < elements; ++e) { \ + depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \ + } \ +} + +DO_PMOV_PV(pmov_pv_h, 2) +DO_PMOV_PV(pmov_pv_s, 4) +DO_PMOV_PV(pmov_pv_d, 8) + +#undef DO_PMOV_PV + +/* + * TODO: This could use half_unshuffle64 and similar bit tricks to + * compress blocks of bits at once. + */ +#define DO_PMOV_VP(NAME, ESIZE) \ +void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ +{ \ + unsigned vl = simd_oprsz(desc); \ + unsigned idx = simd_data(desc); \ + unsigned elements = vl / ESIZE; \ + ARMVectorReg *d = vd; \ + ARMPredicateReg *s = vs; \ + if (idx == 0) { \ + memset(d, 0, vl); \ + } \ + for (unsigned e = 0; e < elements; ++e) { \ + depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \ + } \ +} + +DO_PMOV_VP(pmov_vp_h, 2) +DO_PMOV_VP(pmov_vp_s, 4) +DO_PMOV_VP(pmov_vp_d, 8) + +#undef DO_PMOV_VP + typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, @@ -3449,6 +3550,45 @@ DO_UZP(sve_uzp_s, uint32_t, H1_4) DO_UZP(sve_uzp_d, uint64_t, H1_8) DO_UZP(sve2_uzp_q, Int128, ) +typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc); + +static void do_perseg_zzz(void *vd, void *vn, void *vm, + uint32_t desc, perseg_zzz_fn *fn) +{ + intptr_t oprsz = simd_oprsz(desc); + + desc = simd_desc(16, 16, simd_data(desc)); + for (intptr_t i = 0; i < oprsz; i += 16) { + fn(vd + i, vn + i, vm + i, desc); + } +} + +#define DO_PERSEG_ZZZ(NAME, FUNC) \ + void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ + { do_perseg_zzz(vd, vn, vm, desc, FUNC); } + +DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b) +DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h) +DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s) +DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d) + +DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b) +DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h) +DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s) +DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d) + +DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b) +DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h) +DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s) +DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d) + +DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b) +DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h) +DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s) +DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d) + +#undef DO_PERSEG_ZZZ + #define DO_TRN(NAME, TYPE, H) \ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ { \ @@ -3989,15 +4129,6 @@ static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, return flags; } -static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) -{ - /* It is quicker to zero the whole predicate than loop on OPRSZ. - * The compiler should turn this into 4 64-bit integer stores. - */ - memset(d, 0, sizeof(ARMPredicateReg)); - return PREDTEST_INIT; -} - void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, uint32_t pred_desc) { @@ -4005,7 +4136,7 @@ void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { compute_brk_z(vd, vm, vg, oprsz, true); } else { - do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); } } @@ -4016,7 +4147,8 @@ uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { return compute_brks_z(vd, vm, vg, oprsz, true); } else { - return do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; } } @@ -4027,7 +4159,7 @@ void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { compute_brk_z(vd, vm, vg, oprsz, false); } else { - do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); } } @@ -4038,7 +4170,8 @@ uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, if (last_active_pred(vn, vg, oprsz)) { return compute_brks_z(vd, vm, vg, oprsz, false); } else { - return do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; } } @@ -4094,35 +4227,30 @@ void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) { intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); if (!last_active_pred(vn, vg, oprsz)) { - do_zero(vd, oprsz); + memset(vd, 0, sizeof(ARMPredicateReg)); } } -/* As if PredTest(Ones(PL), D, esz). */ -static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, - uint64_t esz_mask) -{ - uint32_t flags = PREDTEST_INIT; - intptr_t i; - - for (i = 0; i < oprsz / 8; i++) { - flags = iter_predtest_fwd(d->p[i], esz_mask, flags); - } - if (oprsz & 7) { - uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); - flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); - } - return flags; -} - uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) { intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); if (last_active_pred(vn, vg, oprsz)) { - return predtest_ones(vd, oprsz, -1); - } else { - return do_zero(vd, oprsz); + ARMPredicateReg *d = vd; + uint32_t flags = PREDTEST_INIT; + intptr_t i; + + /* As if PredTest(Ones(PL), D, MO_8). */ + for (i = 0; i < oprsz / 8; i++) { + flags = iter_predtest_fwd(d->p[i], -1, flags); + } + if (oprsz & 7) { + uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); + flags = iter_predtest_fwd(d->p[i], mask, flags); + } + return flags; } + memset(vd, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; } uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) @@ -4139,66 +4267,200 @@ uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) return sum; } -uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) +uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc) { - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t esz_mask = pred_esz_masks[esz]; - ARMPredicateReg *d = vd; - uint32_t flags; - intptr_t i; + int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); + int vl = pl * 8; + unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); + int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1; + DecodeCounter p = decode_counter(png, vl, v_esz); + unsigned maxelem = (vl << lg2_width) >> v_esz; + unsigned count = p.count; + + if (p.invert) { + if (count >= maxelem) { + return 0; + } + count = maxelem - count; + } else { + count = MIN(count, maxelem); + } + return count >> p.lg2_stride; +} + +/* C.f. Arm pseudocode EncodePredCount */ +static uint64_t encode_pred_count(uint32_t elements, uint32_t count, + uint32_t esz, bool invert) +{ + uint32_t pred; - /* Begin with a zero predicate register. */ - flags = do_zero(d, oprsz); if (count == 0) { - return flags; + return 0; + } + if (invert) { + count = elements - count; + } else if (count == elements) { + count = 0; + invert = true; } - /* Set all of the requested bits. */ - for (i = 0; i < count / 64; ++i) { - d->p[i] = esz_mask; + pred = (count << 1) | 1; + pred <<= esz; + pred |= invert << 15; + + return pred; +} + +/* C.f. Arm pseudocode PredCountTest */ +static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert) +{ + uint32_t flags; + + if (count == 0) { + flags = 1; /* !N, Z, C */ + } else if (!invert) { + flags = (1u << 31) | 2; /* N, !Z */ + flags |= count != elements; /* C */ + } else { + flags = 2; /* !Z, !C */ + flags |= (count == elements) << 31; /* N */ } - if (count & 63) { - d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + return flags; +} + +/* D must be cleared on entry. */ +static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask, + uint32_t count, uint32_t oprbits) +{ + tcg_debug_assert(count <= oprbits); + if (count) { + uint32_t i; + + /* Set all of the requested bits. */ + for (i = 0; i < count / 64; ++i) { + d->p[i] = esz_mask; + } + if (count & 63) { + d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + } } +} + +uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; - return predtest_ones(d, oprsz, esz_mask); + count <<= esz; + memset(d, 0, sizeof(*d)); + do_whilel(d, esz_mask, count, oprbits); + return pred_count_test(oprbits, count, false); } -uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) +uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc) { - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; uint64_t esz_mask = pred_esz_masks[esz]; ARMPredicateReg *d = vd; - intptr_t i, invcount, oprbits; - uint64_t bits; - if (count == 0) { - return do_zero(d, oprsz); + count <<= esz; + memset(d, 0, 2 * sizeof(*d)); + if (count <= oprbits) { + do_whilel(&d[0], esz_mask, count, oprbits); + } else { + do_whilel(&d[0], esz_mask, oprbits, oprbits); + do_whilel(&d[1], esz_mask, count - oprbits, oprbits); } - oprbits = oprsz * 8; + return pred_count_test(2 * oprbits, count, false); +} + +uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint32_t vl = pl * 8; + uint32_t elements = (vl >> esz) << scale; + ARMPredicateReg *d = vd; + + *d = (ARMPredicateReg) { + .p[0] = encode_pred_count(elements, count, esz, false) + }; + return pred_count_test(elements, count, false); +} + +/* D must be cleared on entry. */ +static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask, + uint32_t count, uint32_t oprbits) +{ tcg_debug_assert(count <= oprbits); + if (count) { + uint32_t i, invcount = oprbits - count; + uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64); - bits = esz_mask; - if (oprbits & 63) { - bits &= MAKE_64BIT_MASK(0, oprbits & 63); + for (i = invcount / 64; i < oprbits / 64; ++i) { + d->p[i] = bits; + bits = esz_mask; + } + if (oprbits & 63) { + d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63); + } } +} - invcount = oprbits - count; - for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { - d->p[i] = bits; - bits = esz_mask; - } +uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; - d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); + count <<= esz; + memset(d, 0, sizeof(*d)); + do_whileg(d, esz_mask, count, oprbits); + return pred_count_test(oprbits, count, true); +} - while (--i >= 0) { - d->p[i] = 0; +uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t oprbits = oprsz * 8; + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + + count <<= esz; + memset(d, 0, 2 * sizeof(*d)); + if (count <= oprbits) { + do_whileg(&d[1], esz_mask, count, oprbits); + } else { + do_whilel(&d[1], esz_mask, oprbits, oprbits); + do_whileg(&d[0], esz_mask, count - oprbits, oprbits); } - return predtest_ones(d, oprsz, esz_mask); + return pred_count_test(2 * oprbits, count, true); +} + +uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc) +{ + uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint32_t vl = pl * 8; + uint32_t elements = (vl >> esz) << scale; + ARMPredicateReg *d = vd; + + *d = (ARMPredicateReg) { + .p[0] = encode_pred_count(elements, count, esz, true) + }; + return pred_count_test(elements, count, true); } /* Recursive reduction on a function; @@ -4209,66 +4471,87 @@ uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) * The recursion is bounded to depth 7 (128 fp16 elements), so there's * little to gain with a more complex non-recursive form. */ -#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ -static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ +#define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \ +static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \ { \ if (n == 1) { \ return *data; \ } else { \ uintptr_t half = n / 2; \ - TYPE lo = NAME##_reduce(data, status, half); \ - TYPE hi = NAME##_reduce(data + half, status, half); \ + TYPE lo = FUNC##_reduce(data, status, half); \ + TYPE hi = FUNC##_reduce(data + half, status, half); \ return FUNC(lo, hi, status); \ } \ } \ -uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ +uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \ + float_status *status, uint32_t desc) \ { \ uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ + TYPE ident = IDENT; \ for (i = 0; i < oprsz; ) { \ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ do { \ TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ + *(TYPE *)((void *)data + i) = (pg & 1 ? nn : ident); \ i += sizeof(TYPE), pg >>= sizeof(TYPE); \ } while (i & 15); \ } \ for (; i < maxsz; i += sizeof(TYPE)) { \ - *(TYPE *)((void *)data + i) = IDENT; \ + *(TYPE *)((void *)data + i) = ident; \ } \ - return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ + return FUNC##_reduce(data, status, maxsz / sizeof(TYPE)); \ +} \ +void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \ + float_status *status, uint32_t desc) \ +{ \ + unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \ + TYPE ident = IDENT; \ + for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \ + TYPE data[ARM_MAX_VQ]; \ + for (unsigned s = 0; s < segments; s++) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \ + TYPE nn = *(TYPE *)(vn + (s * 16 + H(e))); \ + data[s] = (pg >> e) & 1 ? nn : ident; \ + } \ + *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \ + } \ + clear_tail(vd, 16, simd_maxsz(desc)); \ } -DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero) -DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero) -DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero) +DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero) +DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero) +DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero) -/* Identity is floatN_default_nan, without the function call. */ -DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) -DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) -DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) +/* + * We can't avoid the function call for the default NaN value, because + * it changes when FPCR.AH is set. + */ +DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, float16_default_nan(status)) +DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, float32_default_nan(status)) +DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, float64_default_nan(status)) -DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) -DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) -DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) +DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, float16_default_nan(status)) +DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, float32_default_nan(status)) +DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, float64_default_nan(status)) -DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) -DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) -DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) +DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity) +DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity) +DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity) -DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) -DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) -DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) +DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity)) +DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity)) +DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity)) -DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) -DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) -DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) +DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) +DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) +DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) -DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, +DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh, float16_chs(float16_infinity)) -DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, +DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs, float32_chs(float32_infinity)) -DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, +DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd, float64_chs(float64_infinity)) #undef DO_REDUCE @@ -4351,14 +4634,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ } while (i != 0); \ } +DO_ZPZZ_FP(sve_fadd_b16, uint16_t, H1_2, bfloat16_add) DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) +DO_ZPZZ_FP(sve_fsub_b16, uint16_t, H1_2, bfloat16_sub) DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) +DO_ZPZZ_FP(sve_fmul_b16, uint16_t, H1_2, bfloat16_mul) DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) @@ -4367,26 +4653,32 @@ DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) +DO_ZPZZ_FP(sve_fmin_b16, uint16_t, H1_2, bfloat16_min) DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) +DO_ZPZZ_FP(sve_fmax_b16, uint16_t, H1_2, bfloat16_max) DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) +DO_ZPZZ_FP(sve_ah_fmin_b16, uint16_t, H1_2, helper_sme2_ah_fmin_b16) DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) +DO_ZPZZ_FP(sve_ah_fmax_b16, uint16_t, H1_2, helper_sme2_ah_fmax_b16) DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) +DO_ZPZZ_FP(sve_fminnum_b16, uint16_t, H1_2, bfloat16_minnum) DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) +DO_ZPZZ_FP(sve_fmaxnum_b16, uint16_t, H1_2, bfloat16_maxnum) DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) @@ -4550,7 +4842,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, \ * FZ16. When converting from fp16, this affects flushing input denormals; * when converting to fp16, this affects flushing output denormals. */ -static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) +float32 sve_f16_to_f32(float16 f, float_status *fpst) { bool save = get_flush_inputs_to_zero(fpst); float32 ret; @@ -4572,7 +4864,7 @@ static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) return ret; } -static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) +float16 sve_f32_to_f16(float32 f, float_status *fpst) { bool save = get_flush_to_zero(fpst); float16 ret; @@ -4812,6 +5104,75 @@ DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) #undef DO_ZPZ_FP +static void do_fmla_zpzzz_b16(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint16_t neg1, uint16_t neg3, int flags) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 2; + if (likely((pg >> (i & 63)) & 1)) { + float16 e1, e2, e3, r; + + e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; + e2 = *(uint16_t *)(vm + H1_2(i)); + e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; + r = bfloat16_muladd(e1, e2, e3, flags, status); + *(uint16_t *)(vd + H1_2(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); +} + +void HELPER(sve_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); +} + +void HELPER(sve_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); +} + +void HELPER(sve_ah_fmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product); +} + +void HELPER(sve_ah_fnmla_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product | float_muladd_negate_c); +} + +void HELPER(sve_ah_fnmls_zpzzz_b16)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_b16(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_c); +} + static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc, uint16_t neg1, uint16_t neg3, int flags) @@ -6001,17 +6362,14 @@ void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, static inline QEMU_ALWAYS_INLINE void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t ra, + uint64_t desc, const uintptr_t ra, const int esz, const int msz, const int N, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; int bit55 = extract64(addr, 55, 1); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Perform gross MTE suppression early. */ if (!tbi_check(mtedesc, bit55) || tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { @@ -6023,13 +6381,13 @@ void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, #define DO_LD1_1(NAME, ESZ) \ void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ sve_##NAME##_host, sve_##NAME##_tlb); \ } \ void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ sve_##NAME##_host, sve_##NAME##_tlb); \ @@ -6037,25 +6395,25 @@ void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ #define DO_LD1_2(NAME, ESZ, MSZ) \ void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ } \ void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ } \ void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ } \ void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ @@ -6081,18 +6439,21 @@ DO_LD1_2(ld1sds, MO_64, MO_32) DO_LD1_2(ld1dd, MO_64, MO_64) +DO_LD1_2(ld1squ, MO_128, MO_32) +DO_LD1_2(ld1dqu, MO_128, MO_64) + #undef DO_LD1_1 #undef DO_LD1_2 #define DO_LDN_1(N) \ void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ sve_ld1bb_host, sve_ld1bb_tlb); \ } \ void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ sve_ld1bb_host, sve_ld1bb_tlb); \ @@ -6100,25 +6461,25 @@ void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ #define DO_LDN_2(N, SUFF, ESZ) \ void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ } \ void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ } \ void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ } \ void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ @@ -6140,6 +6501,10 @@ DO_LDN_2(2, dd, MO_64) DO_LDN_2(3, dd, MO_64) DO_LDN_2(4, dd, MO_64) +DO_LDN_2(2, qq, MO_128) +DO_LDN_2(3, qq, MO_128) +DO_LDN_2(4, qq, MO_128) + #undef DO_LDN_1 #undef DO_LDN_2 @@ -6359,17 +6724,14 @@ void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, static inline QEMU_ALWAYS_INLINE void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, - uint32_t desc, const uintptr_t retaddr, + uint64_t desc, const uintptr_t retaddr, const int esz, const int msz, const SVEContFault fault, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; int bit55 = extract64(addr, 55, 1); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Perform gross MTE suppression early. */ if (!tbi_check(mtedesc, bit55) || tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { @@ -6382,25 +6744,25 @@ void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, #define DO_LDFF1_LDNF1_1(PART, ESZ) \ void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ } \ void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ } \ void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ } \ void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ @@ -6408,49 +6770,49 @@ void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ } \ void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ } \ void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ } \ void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ } \ void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ } \ void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ } \ void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ } \ void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ @@ -6617,17 +6979,14 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, static inline QEMU_ALWAYS_INLINE void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t ra, + uint64_t desc, const uintptr_t ra, const int esz, const int msz, const int N, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; int bit55 = extract64(addr, 55, 1); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Perform gross MTE suppression early. */ if (!tbi_check(mtedesc, bit55) || tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { @@ -6639,13 +6998,13 @@ void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, #define DO_STN_1(N, NAME, ESZ) \ void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ } \ void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ @@ -6653,25 +7012,25 @@ void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ #define DO_STN_2(N, NAME, ESZ, MSZ) \ void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ } \ void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ } \ void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ } \ void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ + target_ulong addr, uint64_t desc) \ { \ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ @@ -6703,6 +7062,13 @@ DO_STN_2(2, dd, MO_64, MO_64) DO_STN_2(3, dd, MO_64, MO_64) DO_STN_2(4, dd, MO_64, MO_64) +DO_STN_2(1, sq, MO_128, MO_32) +DO_STN_2(1, dq, MO_128, MO_64) + +DO_STN_2(2, qq, MO_128, MO_128) +DO_STN_2(3, qq, MO_128, MO_128) +DO_STN_2(4, qq, MO_128, MO_128) + #undef DO_STN_1 #undef DO_STN_2 @@ -6808,14 +7174,12 @@ void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, static inline QEMU_ALWAYS_INLINE void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, + target_ulong base, uint64_t desc, uintptr_t retaddr, int esize, int msize, zreg_off_fn *off_fn, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; /* * ??? TODO: For the 32-bit offset extractions, base + ofs cannot @@ -6829,13 +7193,13 @@ void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } \ void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ @@ -6843,18 +7207,32 @@ void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } \ void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } +#define DO_LD1_ZPZ_Q(MEM, OFS, MSZ) \ +void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint64_t desc) \ +{ \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint64_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + DO_LD1_ZPZ_S(bsu, zsu, MO_8) DO_LD1_ZPZ_S(bsu, zss, MO_8) DO_LD1_ZPZ_D(bdu, zsu, MO_8) @@ -6919,6 +7297,9 @@ DO_LD1_ZPZ_D(dd_be, zsu, MO_64) DO_LD1_ZPZ_D(dd_be, zss, MO_64) DO_LD1_ZPZ_D(dd_be, zd, MO_64) +DO_LD1_ZPZ_Q(qq_le, zd, MO_128) +DO_LD1_ZPZ_Q(qq_be, zd, MO_128) + #undef DO_LD1_ZPZ_S #undef DO_LD1_ZPZ_D @@ -7017,15 +7398,13 @@ void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, static inline QEMU_ALWAYS_INLINE void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, + target_ulong base, uint64_t desc, uintptr_t retaddr, const int esz, const int msz, zreg_off_fn *off_fn, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; /* * ??? TODO: For the 32-bit offset extractions, base + ofs cannot @@ -7040,14 +7419,14 @@ void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ void HELPER(sve_ldff##MEM##_##OFS) \ (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } \ void HELPER(sve_ldff##MEM##_##OFS##_mte) \ (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ @@ -7056,14 +7435,14 @@ void HELPER(sve_ldff##MEM##_##OFS##_mte) \ #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ void HELPER(sve_ldff##MEM##_##OFS) \ (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } \ void HELPER(sve_ldff##MEM##_##OFS##_mte) \ (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ @@ -7222,14 +7601,12 @@ void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, static inline QEMU_ALWAYS_INLINE void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, + target_ulong base, uint64_t desc, uintptr_t retaddr, int esize, int msize, zreg_off_fn *off_fn, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + uint32_t mtedesc = desc >> 32; /* * ??? TODO: For the 32-bit offset extractions, base + ofs cannot @@ -7243,13 +7620,13 @@ void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ } \ void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ @@ -7257,18 +7634,32 @@ void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ } \ void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ + void *vm, target_ulong base, uint64_t desc) \ { \ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ } +#define DO_ST1_ZPZ_Q(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint64_t desc) \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 16, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint64_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 16, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} + DO_ST1_ZPZ_S(bs, zsu, MO_8) DO_ST1_ZPZ_S(hs_le, zsu, MO_16) DO_ST1_ZPZ_S(hs_be, zsu, MO_16) @@ -7305,9 +7696,507 @@ DO_ST1_ZPZ_D(sd_be, zd, MO_32) DO_ST1_ZPZ_D(dd_le, zd, MO_64) DO_ST1_ZPZ_D(dd_be, zd, MO_64) +DO_ST1_ZPZ_Q(qq_le, zd, MO_128) +DO_ST1_ZPZ_Q(qq_be, zd, MO_128) + #undef DO_ST1_ZPZ_S #undef DO_ST1_ZPZ_D +/* + * SVE2.1 consecutive register load/store + */ + +static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr, + uint32_t png, intptr_t reg_max, + int N, int v_esz) +{ + const int esize = 1 << v_esz; + intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; + DecodeCounter p = decode_counter(png, reg_max, v_esz); + unsigned b_count = p.count << v_esz; + unsigned b_stride = 1 << (v_esz + p.lg2_stride); + intptr_t page_split; + + /* Set all of the element indices to -1, and the TLB data to 0. */ + memset(info, -1, offsetof(SVEContLdSt, page)); + memset(info->page, 0, sizeof(info->page)); + + if (p.invert) { + if (b_count >= reg_max * N) { + return 0; + } + reg_off_first = b_count; + reg_off_last = reg_max * N - b_stride; + } else { + if (b_count == 0) { + return 0; + } + reg_off_first = 0; + reg_off_last = MIN(b_count - esize, reg_max * N - b_stride); + } + + info->reg_off_first[0] = reg_off_first; + info->mem_off_first[0] = reg_off_first; + + page_split = -(addr | TARGET_PAGE_MASK); + if (reg_off_last + esize <= page_split || reg_off_first >= page_split) { + /* The entire operation fits within a single page. */ + info->reg_off_last[0] = reg_off_last; + return b_stride; + } + + info->page_split = page_split; + reg_off_split = ROUND_DOWN(page_split, esize); + + /* + * This is the last full element on the first page, but it is not + * necessarily active. If there is no full element, i.e. the first + * active element is the one that's split, this value remains -1. + * It is useful as iteration bounds. + */ + if (reg_off_split != 0) { + info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride); + } + + /* Determine if an unaligned element spans the pages. */ + if (page_split & (esize - 1)) { + /* It is helpful to know if the split element is active. */ + if ((reg_off_split & (b_stride - 1)) == 0) { + info->reg_off_split = reg_off_split; + info->mem_off_split = reg_off_split; + } + reg_off_split += esize; + } + + /* + * We do want the first active element on the second page, because + * this may affect the address reported in an exception. + */ + reg_off_split = ROUND_UP(reg_off_split, b_stride); + if (reg_off_split <= reg_off_last) { + info->reg_off_first[1] = reg_off_split; + info->mem_off_first[1] = reg_off_split; + info->reg_off_last[1] = reg_off_last; + } + return b_stride; +} + +static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, + target_ulong addr, unsigned estride, + int esize, int wp_access, uintptr_t ra) +{ +#ifndef CONFIG_USER_ONLY + intptr_t count_off, count_last; + int flags0 = info->page[0].flags; + int flags1 = info->page[1].flags; + + if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { + return; + } + + /* Indicate that watchpoints are handled. */ + info->page[0].flags = flags0 & ~TLB_WATCHPOINT; + info->page[1].flags = flags1 & ~TLB_WATCHPOINT; + + if (flags0 & TLB_WATCHPOINT) { + count_off = info->reg_off_first[0]; + count_last = info->reg_off_split; + if (count_last < 0) { + count_last = info->reg_off_last[0]; + } + do { + cpu_check_watchpoint(env_cpu(env), addr + count_off, + esize, info->page[0].attrs, wp_access, ra); + count_off += estride; + } while (count_off <= count_last); + } + + count_off = info->reg_off_first[1]; + if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) { + count_last = info->reg_off_last[1]; + do { + cpu_check_watchpoint(env_cpu(env), addr + count_off, + esize, info->page[1].attrs, + wp_access, ra); + count_off += estride; + } while (count_off <= count_last); + } +#endif +} + +static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, + target_ulong addr, unsigned estride, + int esize, uint32_t mtedesc, + uintptr_t ra) +{ + intptr_t count_off, count_last; + + /* + * TODO: estride is always a small power of two, <= 8. + * Manipulate the stride within the loops such that + * - first iteration hits addr + off, as required, + * - second iteration hits ALIGN_UP(addr, 16), + * - other iterations advance addr by 16. + * This will minimize the probing to once per MTE granule. + */ + + /* Process the page only if MemAttr == Tagged. */ + if (info->page[0].tagged) { + count_off = info->reg_off_first[0]; + count_last = info->reg_off_split; + if (count_last < 0) { + count_last = info->reg_off_last[0]; + } + + do { + mte_check(env, mtedesc, addr + count_off, ra); + count_off += estride; + } while (count_off <= count_last); + } + + count_off = info->reg_off_first[1]; + if (count_off >= 0 && info->page[1].tagged) { + count_last = info->reg_off_last[1]; + do { + mte_check(env, mtedesc, addr + count_off, ra); + count_off += estride; + } while (count_off <= count_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, + uint32_t png, uint64_t desc64, + const uintptr_t ra, const MemOp esz, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc64 >> 32; + uint32_t desc = desc64; + const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; + const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); + const intptr_t reg_max = simd_oprsz(desc); + const unsigned esize = 1 << esz; + intptr_t count_off, count_last; + intptr_t reg_off, reg_last, reg_n; + SVEContLdSt info; + unsigned estride, flags; + void *host; + + estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); + if (estride == 0) { + /* The entire predicate was false; no load occurs. */ + for (unsigned n = 0; n < N; n++) { + memset(zd + n * rstride, 0, reg_max); + } + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); + + /* Handle watchpoints for all active elements. */ + sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, + esize, BP_MEM_READ, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve2p1_cont_ldst_mte_check(&info, env, estride, addr, + esize, mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + ARMVectorReg scratch[4] = { }; + + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[1]; + if (count_last < 0) { + count_last = info.reg_off_split; + if (count_last < 0) { + count_last = info.reg_off_last[0]; + } + } + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + + do { + reg_last = MIN(count_last - count_off, reg_max - esize); + do { + tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + for (unsigned n = 0; n < N; ++n) { + memcpy(&zd[n * rstride], &scratch[n], reg_max); + } + return; + } + + /* The entire operation is in RAM, on valid pages. */ + + for (unsigned n = 0; n < N; ++n) { + memset(&zd[n * rstride], 0, reg_max); + } + + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[0]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[0].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + count_off = info.reg_off_split; + if (unlikely(count_off >= 0)) { + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); + } + + count_off = info.reg_off_first[1]; + if (unlikely(count_off >= 0)) { + count_last = info.reg_off_last[1]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[1].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + } +} + +void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr, + uint32_t png, uint64_t desc) +{ + sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8, + sve_ld1bb_host, sve_ld1bb_tlb); +} + +#define DO_LD1_2(NAME, ESZ) \ +void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint64_t desc) \ +{ \ + sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint64_t desc) \ +{ \ + sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} + +DO_LD1_2(ld1hh, MO_16) +DO_LD1_2(ld1ss, MO_32) +DO_LD1_2(ld1dd, MO_64) + +#undef DO_LD1_2 + +static inline QEMU_ALWAYS_INLINE +void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, + uint32_t png, uint64_t desc64, + const uintptr_t ra, const int esz, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc64 >> 32; + uint32_t desc = desc64; + const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; + const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); + const intptr_t reg_max = simd_oprsz(desc); + const unsigned esize = 1 << esz; + intptr_t count_off, count_last; + intptr_t reg_off, reg_last, reg_n; + SVEContLdSt info; + unsigned estride, flags; + void *host; + + estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); + if (estride == 0) { + /* The entire predicate was false; no store occurs. */ + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); + + /* Handle watchpoints for all active elements. */ + sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, + esize, BP_MEM_WRITE, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve2p1_cont_ldst_mte_check(&info, env, estride, addr, + esize, mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[1]; + if (count_last < 0) { + count_last = info.reg_off_split; + if (count_last < 0) { + count_last = info.reg_off_last[0]; + } + } + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + + do { + reg_last = MIN(count_last - count_off, reg_max - esize); + do { + tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + return; + } + + /* The entire operation is in RAM, on valid pages. */ + + count_off = info.reg_off_first[0]; + count_last = info.reg_off_last[0]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[0].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + count_off = info.reg_off_split; + if (unlikely(count_off >= 0)) { + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); + } + + count_off = info.reg_off_first[1]; + if (unlikely(count_off >= 0)) { + count_last = info.reg_off_last[1]; + reg_off = count_off % reg_max; + reg_n = count_off / reg_max; + host = info.page[1].host; + + set_helper_retaddr(ra); + + do { + reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); + do { + host_fn(&zd[reg_n * rstride], reg_off, host + count_off); + reg_off += estride; + count_off += estride; + } while (reg_off <= reg_last); + reg_off = 0; + reg_n++; + } while (count_off <= count_last); + + clear_helper_retaddr(); + } +} + +void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr, + uint32_t png, uint64_t desc) +{ + sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8, + sve_st1bb_host, sve_st1bb_tlb); +} + +#define DO_ST1_2(NAME, ESZ) \ +void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint64_t desc) \ +{ \ + sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ + target_ulong addr, uint32_t png, \ + uint64_t desc) \ +{ \ + sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} + +DO_ST1_2(st1hh, MO_16) +DO_ST1_2(st1ss, MO_32) +DO_ST1_2(st1dd, MO_64) + +#undef DO_ST1_2 + void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) { intptr_t i, opr_sz = simd_oprsz(desc) / 8; @@ -7711,3 +8600,31 @@ DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) #undef DO_FCVTLT #undef DO_FCVTNT + +void HELPER(pext)(void *vd, uint32_t png, uint32_t desc) +{ + int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); + int vl = pl * 8; + unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); + int part = FIELD_EX32(desc, PREDDESC, DATA); + DecodeCounter p = decode_counter(png, vl, v_esz); + uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride]; + ARMPredicateReg *d = vd; + + /* + * Convert from element count to byte count and adjust + * for the portion of the 4*VL counter to be extracted. + */ + int b_count = (p.count << v_esz) - vl * part; + + memset(d, 0, sizeof(*d)); + if (p.invert) { + if (b_count <= 0) { + do_whilel(vd, mask, vl, vl); + } else if (b_count < vl) { + do_whileg(vd, mask, vl - b_count, vl); + } + } else if (b_count > 0) { + do_whilel(vd, mask, MIN(b_count, vl), vl); + } +} diff --git a/target/arm/tcg/sve_ldst_internal.h b/target/arm/tcg/sve_ldst_internal.h index 4f159ec..c67cda9 100644 --- a/target/arm/tcg/sve_ldst_internal.h +++ b/target/arm/tcg/sve_ldst_internal.h @@ -20,7 +20,7 @@ #ifndef TARGET_ARM_SVE_LDST_INTERNAL_H #define TARGET_ARM_SVE_LDST_INTERNAL_H -#include "exec/cpu_ldst.h" +#include "accel/tcg/cpu-ldst.h" /* * Load one element into @vd + @reg_off from @host. @@ -116,6 +116,94 @@ DO_ST_PRIM_2(sd, H1_8, uint64_t, uint32_t, stl) DO_LD_PRIM_2(dd, H1_8, uint64_t, uint64_t, ldq) DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq) +#define DO_LD_PRIM_3(NAME, FUNC) \ + static inline void sve_##NAME##_host(void *vd, \ + intptr_t reg_off, void *host) \ + { sve_##FUNC##_host(vd, reg_off, host); \ + *(uint64_t *)(vd + reg_off + 8) = 0; } \ + static inline void sve_##NAME##_tlb(CPUARMState *env, void *vd, \ + intptr_t reg_off, target_ulong addr, uintptr_t ra) \ + { sve_##FUNC##_tlb(env, vd, reg_off, addr, ra); \ + *(uint64_t *)(vd + reg_off + 8) = 0; } + +DO_LD_PRIM_3(ld1squ_be, ld1sdu_be) +DO_LD_PRIM_3(ld1squ_le, ld1sdu_le) +DO_LD_PRIM_3(ld1dqu_be, ld1dd_be) +DO_LD_PRIM_3(ld1dqu_le, ld1dd_le) + +#define sve_st1sq_be_host sve_st1sd_be_host +#define sve_st1sq_le_host sve_st1sd_le_host +#define sve_st1sq_be_tlb sve_st1sd_be_tlb +#define sve_st1sq_le_tlb sve_st1sd_le_tlb + +#define sve_st1dq_be_host sve_st1dd_be_host +#define sve_st1dq_le_host sve_st1dd_le_host +#define sve_st1dq_be_tlb sve_st1dd_be_tlb +#define sve_st1dq_le_tlb sve_st1dd_le_tlb + +/* + * The ARMVectorReg elements are stored in host-endian 64-bit units. + * For 128-bit quantities, the sequence defined by the Elem[] pseudocode + * corresponds to storing the two 64-bit pieces in little-endian order. + */ +/* FIXME: Nothing in this file makes any effort at atomicity. */ + +static inline void sve_ld1qq_be_host(void *vd, intptr_t reg_off, void *host) +{ + sve_ld1dd_be_host(vd, reg_off + 8, host); + sve_ld1dd_be_host(vd, reg_off, host + 8); +} + +static inline void sve_ld1qq_le_host(void *vd, intptr_t reg_off, void *host) +{ + sve_ld1dd_le_host(vd, reg_off, host); + sve_ld1dd_le_host(vd, reg_off + 8, host + 8); +} + +static inline void +sve_ld1qq_be_tlb(CPUARMState *env, void *vd, intptr_t reg_off, + target_ulong addr, uintptr_t ra) +{ + sve_ld1dd_be_tlb(env, vd, reg_off + 8, addr, ra); + sve_ld1dd_be_tlb(env, vd, reg_off, addr + 8, ra); +} + +static inline void +sve_ld1qq_le_tlb(CPUARMState *env, void *vd, intptr_t reg_off, + target_ulong addr, uintptr_t ra) +{ + sve_ld1dd_le_tlb(env, vd, reg_off, addr, ra); + sve_ld1dd_le_tlb(env, vd, reg_off + 8, addr + 8, ra); +} + +static inline void sve_st1qq_be_host(void *vd, intptr_t reg_off, void *host) +{ + sve_st1dd_be_host(vd, reg_off + 8, host); + sve_st1dd_be_host(vd, reg_off, host + 8); +} + +static inline void sve_st1qq_le_host(void *vd, intptr_t reg_off, void *host) +{ + sve_st1dd_le_host(vd, reg_off, host); + sve_st1dd_le_host(vd, reg_off + 8, host + 8); +} + +static inline void +sve_st1qq_be_tlb(CPUARMState *env, void *vd, intptr_t reg_off, + target_ulong addr, uintptr_t ra) +{ + sve_st1dd_be_tlb(env, vd, reg_off + 8, addr, ra); + sve_st1dd_be_tlb(env, vd, reg_off, addr + 8, ra); +} + +static inline void +sve_st1qq_le_tlb(CPUARMState *env, void *vd, intptr_t reg_off, + target_ulong addr, uintptr_t ra) +{ + sve_st1dd_le_tlb(env, vd, reg_off, addr, ra); + sve_st1dd_le_tlb(env, vd, reg_off + 8, addr + 8, ra); +} + #undef DO_LD_TLB #undef DO_ST_TLB #undef DO_LD_HOST @@ -123,6 +211,7 @@ DO_ST_PRIM_2(dd, H1_8, uint64_t, uint64_t, stq) #undef DO_ST_PRIM_1 #undef DO_LD_PRIM_2 #undef DO_ST_PRIM_2 +#undef DO_LD_PRIM_3 /* * Resolve the guest virtual address to info->host and info->flags. diff --git a/target/arm/tcg/tlb-insns.c b/target/arm/tcg/tlb-insns.c index 630a481..95c26c6 100644 --- a/target/arm/tcg/tlb-insns.c +++ b/target/arm/tcg/tlb-insns.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "exec/cputlb.h" +#include "exec/target_page.h" #include "cpu.h" #include "internals.h" #include "cpu-features.h" @@ -34,7 +35,6 @@ static CPAccessResult access_ttlbis(CPUARMState *env, const ARMCPRegInfo *ri, return CP_ACCESS_OK; } -#ifdef TARGET_AARCH64 /* Check for traps from EL1 due to HCR_EL2.TTLB or TTLBOS. */ static CPAccessResult access_ttlbos(CPUARMState *env, const ARMCPRegInfo *ri, bool isread) @@ -45,7 +45,6 @@ static CPAccessResult access_ttlbos(CPUARMState *env, const ARMCPRegInfo *ri, } return CP_ACCESS_OK; } -#endif /* IS variants of TLB operations must affect all cores */ static void tlbiall_is_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -801,7 +800,6 @@ static const ARMCPRegInfo tlbi_el3_cp_reginfo[] = { .writefn = tlbi_aa64_vae3_write }, }; -#ifdef TARGET_AARCH64 typedef struct { uint64_t base; uint64_t length; @@ -1269,8 +1267,6 @@ static const ARMCPRegInfo tlbi_rme_reginfo[] = { .writefn = tlbi_aa64_paallos_write }, }; -#endif - void define_tlb_insn_regs(ARMCPU *cpu) { CPUARMState *env = &cpu->env; @@ -1298,7 +1294,6 @@ void define_tlb_insn_regs(ARMCPU *cpu) if (arm_feature(env, ARM_FEATURE_EL3)) { define_arm_cp_regs(cpu, tlbi_el3_cp_reginfo); } -#ifdef TARGET_AARCH64 if (cpu_isar_feature(aa64_tlbirange, cpu)) { define_arm_cp_regs(cpu, tlbirange_reginfo); } @@ -1308,5 +1303,4 @@ void define_tlb_insn_regs(ARMCPU *cpu) if (cpu_isar_feature(aa64_rme, cpu)) { define_arm_cp_regs(cpu, tlbi_rme_reginfo); } -#endif } diff --git a/target/arm/tcg/tlb_helper.c b/target/arm/tcg/tlb_helper.c index 8841f03..23c72a9 100644 --- a/target/arm/tcg/tlb_helper.c +++ b/target/arm/tcg/tlb_helper.c @@ -9,9 +9,9 @@ #include "cpu.h" #include "internals.h" #include "cpu-features.h" -#include "exec/exec-all.h" -#include "exec/helper-proto.h" +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" /* * Returns true if the stage 1 translation regime is using LPAE format page @@ -277,7 +277,7 @@ void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi); } -void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc) +void helper_exception_pc_alignment(CPUARMState *env, vaddr pc) { ARMMMUFaultInfo fi = { .type = ARMFault_Alignment }; int target_el = exception_target_el(env); diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index 3901432..dbf4759 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -17,8 +17,7 @@ * License along with this library; if not, see <http://www.gnu.org/licenses/>. */ #include "qemu/osdep.h" - -#include "exec/exec-all.h" +#include "exec/target_page.h" #include "translate.h" #include "translate-a64.h" #include "qemu/log.h" @@ -434,12 +433,6 @@ static void gen_rebuild_hflags(DisasContext *s) gen_helper_rebuild_hflags_a64(tcg_env, tcg_constant_i32(s->current_el)); } -static void gen_exception_internal(int excp) -{ - assert(excp_is_internal(excp)); - gen_helper_exception_internal(tcg_env, tcg_constant_i32(excp)); -} - static void gen_exception_internal_insn(DisasContext *s, int excp) { gen_a64_update_pc(s, 0); @@ -1076,11 +1069,9 @@ static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) TCGv_i64 cf_64 = tcg_temp_new_i64(); TCGv_i64 vf_64 = tcg_temp_new_i64(); TCGv_i64 tmp = tcg_temp_new_i64(); - TCGv_i64 zero = tcg_constant_i64(0); tcg_gen_extu_i32_i64(cf_64, cpu_CF); - tcg_gen_add2_i64(result, cf_64, t0, zero, cf_64, zero); - tcg_gen_add2_i64(result, cf_64, result, cf_64, t1, zero); + tcg_gen_addcio_i64(result, cf_64, t0, t1, cf_64); tcg_gen_extrl_i64_i32(cpu_CF, cf_64); gen_set_NZ64(result); @@ -1094,12 +1085,10 @@ static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) TCGv_i32 t0_32 = tcg_temp_new_i32(); TCGv_i32 t1_32 = tcg_temp_new_i32(); TCGv_i32 tmp = tcg_temp_new_i32(); - TCGv_i32 zero = tcg_constant_i32(0); tcg_gen_extrl_i64_i32(t0_32, t0); tcg_gen_extrl_i64_i32(t1_32, t1); - tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, zero, cpu_CF, zero); - tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1_32, zero); + tcg_gen_addcio_i32(cpu_NF, cpu_CF, t0_32, t1_32, cpu_CF); tcg_gen_mov_i32(cpu_ZF, cpu_NF); tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); @@ -1392,11 +1381,8 @@ static bool fp_access_check_only(DisasContext *s) return true; } -static bool fp_access_check(DisasContext *s) +static bool nonstreaming_check(DisasContext *s) { - if (!fp_access_check_only(s)) { - return false; - } if (s->sme_trap_nonstreaming && s->is_nonstreaming) { gen_exception_insn(s, 0, EXCP_UDEF, syn_smetrap(SME_ET_Streaming, false)); @@ -1405,6 +1391,11 @@ static bool fp_access_check(DisasContext *s) return true; } +static bool fp_access_check(DisasContext *s) +{ + return fp_access_check_only(s) && nonstreaming_check(s); +} + /* * Return <0 for non-supported element sizes, with MO_16 controlled by * FEAT_FP16; return 0 for fp disabled; otherwise return >0 for success. @@ -1455,14 +1446,24 @@ static int fp_access_check_vector_hsd(DisasContext *s, bool is_q, MemOp esz) */ bool sve_access_check(DisasContext *s) { - if (s->pstate_sm || !dc_isar_feature(aa64_sve, s)) { + if (dc_isar_feature(aa64_sme, s)) { bool ret; - assert(dc_isar_feature(aa64_sme, s)); - ret = sme_sm_enabled_check(s); + if (s->pstate_sm) { + ret = sme_enabled_check(s); + } else if (dc_isar_feature(aa64_sve, s)) { + goto continue_sve; + } else { + ret = sme_sm_enabled_check(s); + } + if (ret) { + ret = nonstreaming_check(s); + } s->sve_access_checked = (ret ? 1 : -1); return ret; } + + continue_sve: if (s->sve_excp_el) { /* Assert that we only raise one exception per instruction. */ assert(!s->sve_access_checked); @@ -1499,7 +1500,8 @@ bool sme_enabled_check(DisasContext *s) * to be zero when fp_excp_el has priority. This is because we need * sme_excp_el by itself for cpregs access checks. */ - if (!s->fp_excp_el || s->sme_excp_el < s->fp_excp_el) { + if (s->sme_excp_el + && (!s->fp_excp_el || s->sme_excp_el <= s->fp_excp_el)) { bool ret = sme_access_check(s); s->fp_access_checked = (ret ? 1 : -1); return ret; @@ -1821,6 +1823,10 @@ static bool trans_RETA(DisasContext *s, arg_reta *a) { TCGv_i64 dst; + if (!dc_isar_feature(aa64_pauth, s)) { + return false; + } + dst = auth_branch_target(s, cpu_reg(s, 30), cpu_X[31], !a->m); gen_a64_set_pc(s, dst); s->base.is_jmp = DISAS_JUMP; @@ -6108,9 +6114,9 @@ static bool do_dot_vector_env(DisasContext *s, arg_qrrr_e *a, return true; } -TRANS_FEAT(SDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_sdot_b) -TRANS_FEAT(UDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_udot_b) -TRANS_FEAT(USDOT_v, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_usdot_b) +TRANS_FEAT(SDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_sdot_4b) +TRANS_FEAT(UDOT_v, aa64_dp, do_dot_vector, a, gen_helper_gvec_udot_4b) +TRANS_FEAT(USDOT_v, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_usdot_4b) TRANS_FEAT(BFDOT_v, aa64_bf16, do_dot_vector_env, a, gen_helper_gvec_bfdot) TRANS_FEAT(BFMMLA, aa64_bf16, do_dot_vector_env, a, gen_helper_gvec_bfmmla) TRANS_FEAT(SMMLA, aa64_i8mm, do_dot_vector, a, gen_helper_gvec_smmla_b) @@ -6870,12 +6876,12 @@ static bool do_dot_vector_idx_env(DisasContext *s, arg_qrrx_e *a, return true; } -TRANS_FEAT(SDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_sdot_idx_b) -TRANS_FEAT(UDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_udot_idx_b) +TRANS_FEAT(SDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_sdot_idx_4b) +TRANS_FEAT(UDOT_vi, aa64_dp, do_dot_vector_idx, a, gen_helper_gvec_udot_idx_4b) TRANS_FEAT(SUDOT_vi, aa64_i8mm, do_dot_vector_idx, a, - gen_helper_gvec_sudot_idx_b) + gen_helper_gvec_sudot_idx_4b) TRANS_FEAT(USDOT_vi, aa64_i8mm, do_dot_vector_idx, a, - gen_helper_gvec_usdot_idx_b) + gen_helper_gvec_usdot_idx_4b) TRANS_FEAT(BFDOT_vi, aa64_bf16, do_dot_vector_idx_env, a, gen_helper_gvec_bfdot_idx) @@ -8600,7 +8606,7 @@ static bool trans_CCMP(DisasContext *s, arg_CCMP *a) tcg_gen_subi_i32(tcg_t2, tcg_t0, 1); nzcv = a->nzcv; - has_andc = tcg_op_supported(INDEX_op_andc_i32, TCG_TYPE_I32, 0); + has_andc = tcg_op_supported(INDEX_op_andc, TCG_TYPE_I32, 0); if (nzcv & 8) { /* N */ tcg_gen_or_i32(cpu_NF, cpu_NF, tcg_t1); } else { @@ -10133,8 +10139,10 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->trap_eret = EX_TBFLAG_A64(tb_flags, TRAP_ERET); dc->sve_excp_el = EX_TBFLAG_A64(tb_flags, SVEEXC_EL); dc->sme_excp_el = EX_TBFLAG_A64(tb_flags, SMEEXC_EL); + dc->zt0_excp_el = EX_TBFLAG_A64(tb_flags, ZT0EXC_EL); dc->vl = (EX_TBFLAG_A64(tb_flags, VL) + 1) * 16; dc->svl = (EX_TBFLAG_A64(tb_flags, SVL) + 1) * 16; + dc->max_svl = arm_cpu->sme_max_vq * 16; dc->pauth_active = EX_TBFLAG_A64(tb_flags, PAUTH_ACTIVE); dc->bt = EX_TBFLAG_A64(tb_flags, BT); dc->btype = EX_TBFLAG_A64(tb_flags, BTYPE); @@ -10247,7 +10255,7 @@ static void aarch64_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) * start of the TB. */ assert(s->base.num_insns == 1); - gen_helper_exception_pc_alignment(tcg_env, tcg_constant_tl(pc)); + gen_helper_exception_pc_alignment(tcg_env, tcg_constant_vaddr(pc)); s->base.is_jmp = DISAS_NORETURN; s->base.pc_next = QEMU_ALIGN_UP(pc, 4); return; diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h index b2420f5..9c45f89 100644 --- a/target/arm/tcg/translate-a64.h +++ b/target/arm/tcg/translate-a64.h @@ -28,7 +28,7 @@ bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, bool sve_access_check(DisasContext *s); bool sme_enabled_check(DisasContext *s); bool sme_enabled_check_with_svcr(DisasContext *s, unsigned); -uint32_t make_svemte_desc(DisasContext *s, unsigned vsz, uint32_t nregs, +uint64_t make_svemte_desc(DisasContext *s, unsigned vsz, uint32_t nregs, uint32_t msz, bool is_write, uint32_t data); /* This function corresponds to CheckStreamingSVEEnabled. */ @@ -225,7 +225,13 @@ void gen_gvec_usqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); -void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); +void gen_gvec_sve2_sqdmulh(unsigned vece, uint32_t rd_ofs, + uint32_t rn_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); + +void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, + int len, int rn, int imm, MemOp align); +void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, + int len, int rn, int imm, MemOp align); #endif /* TARGET_ARM_TRANSLATE_A64_H */ diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c index c4fecb8..844d2e2 100644 --- a/target/arm/tcg/translate-neon.c +++ b/target/arm/tcg/translate-neon.c @@ -271,7 +271,7 @@ static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a) return false; } return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_sdot_b); + gen_helper_gvec_sdot_4b); } static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a) @@ -280,7 +280,7 @@ static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a) return false; } return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_udot_b); + gen_helper_gvec_udot_4b); } static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a) @@ -289,7 +289,7 @@ static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a) return false; } return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_usdot_b); + gen_helper_gvec_usdot_4b); } static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a) @@ -356,7 +356,7 @@ static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a) return false; } return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_sdot_idx_b); + gen_helper_gvec_sdot_idx_4b); } static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a) @@ -365,7 +365,7 @@ static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a) return false; } return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_udot_idx_b); + gen_helper_gvec_udot_idx_4b); } static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a) @@ -374,7 +374,7 @@ static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a) return false; } return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_usdot_idx_b); + gen_helper_gvec_usdot_idx_4b); } static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a) @@ -383,7 +383,7 @@ static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a) return false; } return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_sudot_idx_b); + gen_helper_gvec_sudot_idx_4b); } static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a) @@ -1010,8 +1010,8 @@ DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h) DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h) DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h) DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h) -DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h) -DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h) +DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_nf_s, gen_helper_gvec_fmla_nf_h) +DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_nf_s, gen_helper_gvec_fmls_nf_h) DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h) DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h) DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h) diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c index fcbb350..091c56d 100644 --- a/target/arm/tcg/translate-sme.c +++ b/target/arm/tcg/translate-sme.c @@ -27,16 +27,25 @@ #include "decode-sme.c.inc" +static bool sme2_zt0_enabled_check(DisasContext *s) +{ + if (!sme_za_enabled_check(s)) { + return false; + } + if (s->zt0_excp_el) { + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_InaccessibleZT0, false), + s->zt0_excp_el); + return false; + } + return true; +} -/* - * Resolve tile.size[index] to a host pointer, where tile and index - * are always decoded together, dependent on the element size. - */ +/* Resolve tile.size[rs+imm] to a host pointer. */ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, - int tile_index, bool vertical) + int tile, int imm, int div_len, + int vec_mod, bool vertical) { - int tile = tile_index >> (4 - esz); - int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz); int pos, len, offset; TCGv_i32 tmp; TCGv_ptr addr; @@ -44,10 +53,23 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, /* Compute the final index, which is Rs+imm. */ tmp = tcg_temp_new_i32(); tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs)); - tcg_gen_addi_i32(tmp, tmp, index); + /* + * Round the vector index down to a multiple of vec_mod if necessary. + * We do this before adding the offset, to handle cases like + * MOVA (tile to vector, 2 registers) where we want to call this + * several times in a loop with an increasing offset. We rely on + * the instruction encodings always forcing the initial offset in + * [rs + offset] to be a multiple of vec_mod. The pseudocode usually + * does the round-down after adding the offset rather than before, + * but MOVA is an exception. + */ + if (vec_mod > 1) { + tcg_gen_andc_i32(tmp, tmp, tcg_constant_i32(vec_mod - 1)); + } + tcg_gen_addi_i32(tmp, tmp, imm); /* Prepare a power-of-two modulo via extraction of @len bits. */ - len = ctz32(streaming_vec_reg_size(s)) - esz; + len = ctz32(streaming_vec_reg_size(s) / div_len) - esz; if (!len) { /* @@ -92,7 +114,7 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, offset = tile * sizeof(ARMVectorReg); /* Include the byte offset of zarray to make this relative to env. */ - offset += offsetof(CPUARMState, zarray); + offset += offsetof(CPUARMState, za_state.za); tcg_gen_addi_i32(tmp, tmp, offset); /* Add the byte offset to env to produce the final pointer. */ @@ -103,6 +125,14 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, return addr; } +/* Resolve ZArray[rs+imm] to a host pointer. */ +static TCGv_ptr get_zarray(DisasContext *s, int rs, int imm, + int div_len, int vec_mod) +{ + /* ZA[n] equates to ZA0H.B[n]. */ + return get_tile_rowcol(s, MO_8, rs, 0, imm, div_len, vec_mod, false); +} + /* * Resolve tile.size[0] to a host pointer. * Used by e.g. outer product insns where we require the entire tile. @@ -112,7 +142,7 @@ static TCGv_ptr get_tile(DisasContext *s, int esz, int tile) TCGv_ptr addr = tcg_temp_new_ptr(); int offset; - offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, zarray); + offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, za_state.za); tcg_gen_addi_ptr(addr, tcg_env, offset); return addr; @@ -130,7 +160,40 @@ static bool trans_ZERO(DisasContext *s, arg_ZERO *a) return true; } -static bool trans_MOVA(DisasContext *s, arg_MOVA *a) +static bool trans_ZERO_zt0(DisasContext *s, arg_ZERO_zt0 *a) +{ + if (!dc_isar_feature(aa64_sme2, s)) { + return false; + } + if (sme_enabled_check(s) && sme2_zt0_enabled_check(s)) { + tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUARMState, za_state.zt0), + sizeof_field(CPUARMState, za_state.zt0), + sizeof_field(CPUARMState, za_state.zt0), 0); + } + return true; +} + +static bool trans_ZERO_za(DisasContext *s, arg_ZERO_za *a) +{ + if (!dc_isar_feature(aa64_sme2p1, s)) { + return false; + } + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int vstride = svl / a->ngrp; + TCGv_ptr t_za = get_zarray(s, a->rv, a->off, a->ngrp, a->nvec); + + for (int r = 0; r < a->ngrp; ++r) { + for (int i = 0; i < a->nvec; ++i) { + int o_za = (r * vstride + i) * sizeof(ARMVectorReg); + tcg_gen_gvec_dup_imm_var(MO_64, t_za, o_za, svl, svl, 0); + } + } + } + return true; +} + +static bool do_mova_tile(DisasContext *s, arg_mova_p *a, bool to_vec) { static gen_helper_gvec_4 * const h_fns[5] = { gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, @@ -152,14 +215,11 @@ static bool trans_MOVA(DisasContext *s, arg_MOVA *a) TCGv_i32 t_desc; int svl; - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } if (!sme_smza_enabled_check(s)) { return true; } - t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za, a->off, 1, 0, a->v); t_zr = vec_full_reg_ptr(s, a->zr); t_pg = pred_full_reg_ptr(s, a->pg); @@ -168,14 +228,14 @@ static bool trans_MOVA(DisasContext *s, arg_MOVA *a) if (a->v) { /* Vertical slice -- use sme mova helpers. */ - if (a->to_vec) { + if (to_vec) { zc_fns[a->esz](t_zr, t_za, t_pg, t_desc); } else { cz_fns[a->esz](t_za, t_zr, t_pg, t_desc); } } else { /* Horizontal slice -- reuse sve sel helpers. */ - if (a->to_vec) { + if (to_vec) { h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc); } else { h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc); @@ -184,9 +244,150 @@ static bool trans_MOVA(DisasContext *s, arg_MOVA *a) return true; } +TRANS_FEAT(MOVA_tz, aa64_sme, do_mova_tile, a, false) +TRANS_FEAT(MOVA_zt, aa64_sme, do_mova_tile, a, true) + +static bool do_mova_tile_n(DisasContext *s, arg_mova_t *a, int n, + bool to_vec, bool zero) +{ + static gen_helper_gvec_2 * const cz_fns[] = { + gen_helper_sme2_mova_cz_b, gen_helper_sme2_mova_cz_h, + gen_helper_sme2_mova_cz_s, gen_helper_sme2_mova_cz_d, + }; + static gen_helper_gvec_2 * const zc_fns[] = { + gen_helper_sme2_mova_zc_b, gen_helper_sme2_mova_zc_h, + gen_helper_sme2_mova_zc_s, gen_helper_sme2_mova_zc_d, + }; + static gen_helper_gvec_2 * const zc_z_fns[] = { + gen_helper_sme2p1_movaz_zc_b, gen_helper_sme2p1_movaz_zc_h, + gen_helper_sme2p1_movaz_zc_s, gen_helper_sme2p1_movaz_zc_d, + gen_helper_sme2p1_movaz_zc_q, + }; + TCGv_ptr t_za; + int svl, bytes_per_op = n << a->esz; + + /* + * The MaxImplementedSVL check happens in the decode pseudocode, + * before the SM+ZA enabled check in the operation pseudocode. + * This will (currently) only fail for NREG=4, ESZ=MO_64. + */ + if (s->max_svl < bytes_per_op) { + unallocated_encoding(s); + return true; + } + + assert(a->esz <= MO_64 + zero); + + if (!sme_smza_enabled_check(s)) { + return true; + } + + svl = streaming_vec_reg_size(s); + + /* + * The CurrentVL check happens in the operation pseudocode, + * after the SM+ZA enabled check. + */ + if (svl < bytes_per_op) { + unallocated_encoding(s); + return true; + } + + if (a->v) { + TCGv_i32 t_desc = tcg_constant_i32(simd_desc(svl, svl, 0)); + + for (int i = 0; i < n; ++i) { + TCGv_ptr t_zr = vec_full_reg_ptr(s, a->zr * n + i); + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za, + a->off * n + i, 1, n, a->v); + if (zero) { + zc_z_fns[a->esz](t_zr, t_za, t_desc); + } else if (to_vec) { + zc_fns[a->esz](t_zr, t_za, t_desc); + } else { + cz_fns[a->esz](t_za, t_zr, t_desc); + } + } + } else { + for (int i = 0; i < n; ++i) { + int o_zr = vec_full_reg_offset(s, a->zr * n + i); + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za, + a->off * n + i, 1, n, a->v); + if (to_vec) { + tcg_gen_gvec_mov_var(MO_8, tcg_env, o_zr, t_za, 0, svl, svl); + if (zero) { + tcg_gen_gvec_dup_imm_var(MO_8, t_za, 0, svl, svl, 0); + } + } else { + tcg_gen_gvec_mov_var(MO_8, t_za, 0, tcg_env, o_zr, svl, svl); + } + } + } + return true; +} + +TRANS_FEAT(MOVA_tz2, aa64_sme2, do_mova_tile_n, a, 2, false, false) +TRANS_FEAT(MOVA_tz4, aa64_sme2, do_mova_tile_n, a, 4, false, false) +TRANS_FEAT(MOVA_zt2, aa64_sme2, do_mova_tile_n, a, 2, true, false) +TRANS_FEAT(MOVA_zt4, aa64_sme2, do_mova_tile_n, a, 4, true, false) + +TRANS_FEAT(MOVAZ_zt, aa64_sme2p1, do_mova_tile_n, a, 1, true, true) +TRANS_FEAT(MOVAZ_zt2, aa64_sme2p1, do_mova_tile_n, a, 2, true, true) +TRANS_FEAT(MOVAZ_zt4, aa64_sme2p1, do_mova_tile_n, a, 4, true, true) + +static bool do_mova_array_n(DisasContext *s, arg_mova_a *a, int n, + bool to_vec, bool zero) +{ + TCGv_ptr t_za; + int svl; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + svl = streaming_vec_reg_size(s); + t_za = get_zarray(s, a->rv, a->off, n, 0); + + for (int i = 0; i < n; ++i) { + int o_za = (svl / n * sizeof(ARMVectorReg)) * i; + int o_zr = vec_full_reg_offset(s, a->zr * n + i); + + if (to_vec) { + tcg_gen_gvec_mov_var(MO_8, tcg_env, o_zr, t_za, o_za, svl, svl); + if (zero) { + tcg_gen_gvec_dup_imm_var(MO_8, t_za, o_za, svl, svl, 0); + } + } else { + tcg_gen_gvec_mov_var(MO_8, t_za, o_za, tcg_env, o_zr, svl, svl); + } + } + return true; +} + +TRANS_FEAT(MOVA_az2, aa64_sme2, do_mova_array_n, a, 2, false, false) +TRANS_FEAT(MOVA_az4, aa64_sme2, do_mova_array_n, a, 4, false, false) +TRANS_FEAT(MOVA_za2, aa64_sme2, do_mova_array_n, a, 2, true, false) +TRANS_FEAT(MOVA_za4, aa64_sme2, do_mova_array_n, a, 4, true, false) + +TRANS_FEAT(MOVAZ_za2, aa64_sme2p1, do_mova_array_n, a, 2, true, true) +TRANS_FEAT(MOVAZ_za4, aa64_sme2p1, do_mova_array_n, a, 4, true, true) + +static bool do_movt(DisasContext *s, arg_MOVT_rzt *a, + void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long)) +{ + if (sme2_zt0_enabled_check(s)) { + func(cpu_reg(s, a->rt), tcg_env, + offsetof(CPUARMState, za_state.zt0) + a->off * 8); + } + return true; +} + +TRANS_FEAT(MOVT_rzt, aa64_sme2, do_movt, a, tcg_gen_ld_i64) +TRANS_FEAT(MOVT_ztr, aa64_sme2, do_movt, a, tcg_gen_st_i64) + static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) { - typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32); + typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i64); /* * Indexed by [esz][be][v][mte][st], which is (except for load/store) @@ -214,7 +415,7 @@ static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) TCGv_ptr t_za, t_pg; TCGv_i64 addr; - uint32_t desc; + uint64_t desc; bool be = s->be_data == MO_BE; bool mte = s->mte_active[0]; @@ -225,7 +426,7 @@ static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) return true; } - t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za, a->off, 1, 0, a->v); t_pg = pred_full_reg_ptr(s, a->pg); addr = tcg_temp_new_i64(); @@ -239,32 +440,41 @@ static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) desc = make_svemte_desc(s, streaming_vec_reg_size(s), 1, a->esz, a->st, 0); fns[a->esz][be][a->v][mte][a->st](tcg_env, t_za, t_pg, addr, - tcg_constant_i32(desc)); + tcg_constant_i64(desc)); return true; } -typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int); +typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int, MemOp); static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn) { - int svl = streaming_vec_reg_size(s); - int imm = a->imm; - TCGv_ptr base; + if (sme_za_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int imm = a->imm; + TCGv_ptr base = get_zarray(s, a->rv, imm, 1, 0); - if (!sme_za_enabled_check(s)) { - return true; + fn(s, base, 0, svl, a->rn, imm * svl, + s->align_mem ? MO_ALIGN_16 : MO_UNALN); } - - /* ZA[n] equates to ZA0H.B[n]. */ - base = get_tile_rowcol(s, MO_8, a->rv, imm, false); - - fn(s, base, 0, svl, a->rn, imm * svl); return true; } TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr) TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str) +static bool do_ldst_zt0(DisasContext *s, arg_ldstzt0 *a, GenLdStR *fn) +{ + if (sme2_zt0_enabled_check(s)) { + fn(s, tcg_env, offsetof(CPUARMState, za_state.zt0), + sizeof_field(CPUARMState, za_state.zt0), a->rn, 0, + s->align_mem ? MO_ALIGN_16 : MO_UNALN); + } + return true; +} + +TRANS_FEAT(LDR_zt0, aa64_sme2, do_ldst_zt0, a, gen_sve_ldr) +TRANS_FEAT(STR_zt0, aa64_sme2, do_ldst_zt0, a, gen_sve_str) + static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz, gen_helper_gvec_4 *fn) { @@ -316,7 +526,7 @@ static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz, gen_helper_gvec_5_ptr *fn) { int svl = streaming_vec_reg_size(s); - uint32_t desc = simd_desc(svl, svl, a->sub); + uint32_t desc = simd_desc(svl, svl, 0); TCGv_ptr za, zn, zm, pn, pm, fpst; if (!sme_smza_enabled_check(s)) { @@ -338,7 +548,7 @@ static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz, gen_helper_gvec_5_ptr *fn) { int svl = streaming_vec_reg_size(s); - uint32_t desc = simd_desc(svl, svl, a->sub); + uint32_t desc = simd_desc(svl, svl, 0); TCGv_ptr za, zn, zm, pn, pm; if (!sme_smza_enabled_check(s)) { @@ -355,14 +565,32 @@ static bool do_outprod_env(DisasContext *s, arg_op *a, MemOp esz, return true; } -TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_env, a, - MO_32, gen_helper_sme_fmopa_h) -TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, - MO_32, FPST_A64, gen_helper_sme_fmopa_s) -TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, - MO_64, FPST_A64, gen_helper_sme_fmopa_d) +TRANS_FEAT(FMOPA_w_h, aa64_sme, do_outprod_env, a, MO_32, + !a->sub ? gen_helper_sme_fmopa_w_h + : !s->fpcr_ah ? gen_helper_sme_fmops_w_h + : gen_helper_sme_ah_fmops_w_h) +TRANS_FEAT(FMOPA_h, aa64_sme_f16f16, do_outprod_fpst, a, MO_16, FPST_ZA_F16, + !a->sub ? gen_helper_sme_fmopa_h + : !s->fpcr_ah ? gen_helper_sme_fmops_h + : gen_helper_sme_ah_fmops_h) +TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, FPST_ZA, + !a->sub ? gen_helper_sme_fmopa_s + : !s->fpcr_ah ? gen_helper_sme_fmops_s + : gen_helper_sme_ah_fmops_s) +TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, FPST_ZA, + !a->sub ? gen_helper_sme_fmopa_d + : !s->fpcr_ah ? gen_helper_sme_fmops_d + : gen_helper_sme_ah_fmops_d) + +TRANS_FEAT(BFMOPA, aa64_sme_b16b16, do_outprod_fpst, a, MO_16, FPST_ZA, + !a->sub ? gen_helper_sme_bfmopa + : !s->fpcr_ah ? gen_helper_sme_bfmops + : gen_helper_sme_ah_bfmops) -TRANS_FEAT(BFMOPA, aa64_sme, do_outprod_env, a, MO_32, gen_helper_sme_bfmopa) +TRANS_FEAT(BFMOPA_w, aa64_sme, do_outprod_env, a, MO_32, + !a->sub ? gen_helper_sme_bfmopa_w + : !s->fpcr_ah ? gen_helper_sme_bfmops_w + : gen_helper_sme_ah_bfmops_w) TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s) TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s) @@ -373,3 +601,1173 @@ TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_ TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d) TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d) TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d) + +TRANS_FEAT(BMOPA, aa64_sme2, do_outprod, a, MO_32, gen_helper_sme2_bmopa_s) +TRANS_FEAT(SMOPA2_s, aa64_sme2, do_outprod, a, MO_32, gen_helper_sme2_smopa2_s) +TRANS_FEAT(UMOPA2_s, aa64_sme2, do_outprod, a, MO_32, gen_helper_sme2_umopa2_s) + +static bool do_z2z_n1(DisasContext *s, arg_z2z_en *a, GVecGen3Fn *fn) +{ + int esz, dn, vsz, mofs, n; + bool overlap = false; + + if (!sme_sm_enabled_check(s)) { + return true; + } + + esz = a->esz; + n = a->n; + dn = a->zdn; + mofs = vec_full_reg_offset(s, a->zm); + vsz = streaming_vec_reg_size(s); + + for (int i = 0; i < n; i++) { + int dofs = vec_full_reg_offset(s, dn + i); + if (dofs == mofs) { + overlap = true; + } else { + fn(esz, dofs, dofs, mofs, vsz, vsz); + } + } + if (overlap) { + fn(esz, mofs, mofs, mofs, vsz, vsz); + } + return true; +} + +static void gen_sme2_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3 * const fns[] = { + gen_helper_gvec_srshl_b, gen_helper_sme2_srshl_h, + gen_helper_sme2_srshl_s, gen_helper_sme2_srshl_d, + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]); +} + +static void gen_sme2_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3 * const fns[] = { + gen_helper_gvec_urshl_b, gen_helper_sme2_urshl_h, + gen_helper_sme2_urshl_s, gen_helper_sme2_urshl_d, + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]); +} + +TRANS_FEAT(ADD_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_add) +TRANS_FEAT(SMAX_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_smax) +TRANS_FEAT(SMIN_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_smin) +TRANS_FEAT(UMAX_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_umax) +TRANS_FEAT(UMIN_n1, aa64_sme2, do_z2z_n1, a, tcg_gen_gvec_umin) +TRANS_FEAT(SRSHL_n1, aa64_sme2, do_z2z_n1, a, gen_sme2_srshl) +TRANS_FEAT(URSHL_n1, aa64_sme2, do_z2z_n1, a, gen_sme2_urshl) +TRANS_FEAT(SQDMULH_n1, aa64_sme2, do_z2z_n1, a, gen_gvec_sve2_sqdmulh) + +static bool do_z2z_nn(DisasContext *s, arg_z2z_en *a, GVecGen3Fn *fn) +{ + int esz, dn, dm, vsz, n; + + if (!sme_sm_enabled_check(s)) { + return true; + } + + esz = a->esz; + n = a->n; + dn = a->zdn; + dm = a->zm; + vsz = streaming_vec_reg_size(s); + + for (int i = 0; i < n; i++) { + int dofs = vec_full_reg_offset(s, dn + i); + int mofs = vec_full_reg_offset(s, dm + i); + + fn(esz, dofs, dofs, mofs, vsz, vsz); + } + return true; +} + +TRANS_FEAT(SMAX_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_smax) +TRANS_FEAT(SMIN_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_smin) +TRANS_FEAT(UMAX_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_umax) +TRANS_FEAT(UMIN_nn, aa64_sme2, do_z2z_nn, a, tcg_gen_gvec_umin) +TRANS_FEAT(SRSHL_nn, aa64_sme2, do_z2z_nn, a, gen_sme2_srshl) +TRANS_FEAT(URSHL_nn, aa64_sme2, do_z2z_nn, a, gen_sme2_urshl) +TRANS_FEAT(SQDMULH_nn, aa64_sme2, do_z2z_nn, a, gen_gvec_sve2_sqdmulh) + +static bool do_z2z_n1_fpst(DisasContext *s, arg_z2z_en *a, + gen_helper_gvec_3_ptr * const fns[4]) +{ + int esz = a->esz, n, dn, vsz, mofs; + bool overlap = false; + gen_helper_gvec_3_ptr *fn; + TCGv_ptr fpst; + + /* These insns use MO_8 to encode BFloat16. */ + if (esz == MO_8 && !dc_isar_feature(aa64_sme_b16b16, s)) { + return false; + } + if (!sme_sm_enabled_check(s)) { + return true; + } + + fpst = fpstatus_ptr(esz == MO_16 ? FPST_A64_F16 : FPST_A64); + fn = fns[esz]; + n = a->n; + dn = a->zdn; + mofs = vec_full_reg_offset(s, a->zm); + vsz = streaming_vec_reg_size(s); + + for (int i = 0; i < n; i++) { + int dofs = vec_full_reg_offset(s, dn + i); + if (dofs == mofs) { + overlap = true; + } else { + tcg_gen_gvec_3_ptr(dofs, dofs, mofs, fpst, vsz, vsz, 0, fn); + } + } + if (overlap) { + tcg_gen_gvec_3_ptr(mofs, mofs, mofs, fpst, vsz, vsz, 0, fn); + } + return true; +} + +static bool do_z2z_nn_fpst(DisasContext *s, arg_z2z_en *a, + gen_helper_gvec_3_ptr * const fns[4]) +{ + int esz = a->esz, n, dn, dm, vsz; + gen_helper_gvec_3_ptr *fn; + TCGv_ptr fpst; + + if (esz == MO_8 && !dc_isar_feature(aa64_sme_b16b16, s)) { + return false; + } + if (!sme_sm_enabled_check(s)) { + return true; + } + + fpst = fpstatus_ptr(esz == MO_16 ? FPST_A64_F16 : FPST_A64); + fn = fns[esz]; + n = a->n; + dn = a->zdn; + dm = a->zm; + vsz = streaming_vec_reg_size(s); + + for (int i = 0; i < n; i++) { + int dofs = vec_full_reg_offset(s, dn + i); + int mofs = vec_full_reg_offset(s, dm + i); + + tcg_gen_gvec_3_ptr(dofs, dofs, mofs, fpst, vsz, vsz, 0, fn); + } + return true; +} + +static gen_helper_gvec_3_ptr * const f_vector_fmax[2][4] = { + { gen_helper_gvec_fmax_b16, + gen_helper_gvec_fmax_h, + gen_helper_gvec_fmax_s, + gen_helper_gvec_fmax_d }, + { gen_helper_gvec_ah_fmax_b16, + gen_helper_gvec_ah_fmax_h, + gen_helper_gvec_ah_fmax_s, + gen_helper_gvec_ah_fmax_d }, +}; +TRANS_FEAT(FMAX_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fmax[s->fpcr_ah]) +TRANS_FEAT(FMAX_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fmax[s->fpcr_ah]) + +static gen_helper_gvec_3_ptr * const f_vector_fmin[2][4] = { + { gen_helper_gvec_fmin_b16, + gen_helper_gvec_fmin_h, + gen_helper_gvec_fmin_s, + gen_helper_gvec_fmin_d }, + { gen_helper_gvec_ah_fmin_b16, + gen_helper_gvec_ah_fmin_h, + gen_helper_gvec_ah_fmin_s, + gen_helper_gvec_ah_fmin_d }, +}; +TRANS_FEAT(FMIN_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fmin[s->fpcr_ah]) +TRANS_FEAT(FMIN_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fmin[s->fpcr_ah]) + +static gen_helper_gvec_3_ptr * const f_vector_fmaxnm[4] = { + gen_helper_gvec_fmaxnum_b16, + gen_helper_gvec_fmaxnum_h, + gen_helper_gvec_fmaxnum_s, + gen_helper_gvec_fmaxnum_d, +}; +TRANS_FEAT(FMAXNM_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fmaxnm) +TRANS_FEAT(FMAXNM_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fmaxnm) + +static gen_helper_gvec_3_ptr * const f_vector_fminnm[4] = { + gen_helper_gvec_fminnum_b16, + gen_helper_gvec_fminnum_h, + gen_helper_gvec_fminnum_s, + gen_helper_gvec_fminnum_d, +}; +TRANS_FEAT(FMINNM_n1, aa64_sme2, do_z2z_n1_fpst, a, f_vector_fminnm) +TRANS_FEAT(FMINNM_nn, aa64_sme2, do_z2z_nn_fpst, a, f_vector_fminnm) + +/* Add/Sub vector Z[m] to each Z[n*N] with result in ZA[d*N]. */ +static bool do_azz_n1(DisasContext *s, arg_azz_n *a, int esz, + GVecGen3FnVar *fn) +{ + TCGv_ptr t_za; + int svl, n, o_zm; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + n = a->n; + t_za = get_zarray(s, a->rv, a->off, n, 0); + o_zm = vec_full_reg_offset(s, a->zm); + svl = streaming_vec_reg_size(s); + + for (int i = 0; i < n; ++i) { + int o_za = (svl / n * sizeof(ARMVectorReg)) * i; + int o_zn = vec_full_reg_offset(s, (a->zn + i) % 32); + + fn(esz, t_za, o_za, tcg_env, o_zn, tcg_env, o_zm, svl, svl); + } + return true; +} + +TRANS_FEAT(ADD_azz_n1_s, aa64_sme2, do_azz_n1, a, MO_32, tcg_gen_gvec_add_var) +TRANS_FEAT(SUB_azz_n1_s, aa64_sme2, do_azz_n1, a, MO_32, tcg_gen_gvec_sub_var) +TRANS_FEAT(ADD_azz_n1_d, aa64_sme2_i16i64, do_azz_n1, a, MO_64, tcg_gen_gvec_add_var) +TRANS_FEAT(SUB_azz_n1_d, aa64_sme2_i16i64, do_azz_n1, a, MO_64, tcg_gen_gvec_sub_var) + +/* Add/Sub each vector Z[m*N] to each Z[n*N] with result in ZA[d*N]. */ +static bool do_azz_nn(DisasContext *s, arg_azz_n *a, int esz, + GVecGen3FnVar *fn) +{ + TCGv_ptr t_za; + int svl, n; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + n = a->n; + t_za = get_zarray(s, a->rv, a->off, n, 1); + svl = streaming_vec_reg_size(s); + + for (int i = 0; i < n; ++i) { + int o_za = (svl / n * sizeof(ARMVectorReg)) * i; + int o_zn = vec_full_reg_offset(s, a->zn + i); + int o_zm = vec_full_reg_offset(s, a->zm + i); + + fn(esz, t_za, o_za, tcg_env, o_zn, tcg_env, o_zm, svl, svl); + } + return true; +} + +TRANS_FEAT(ADD_azz_nn_s, aa64_sme2, do_azz_nn, a, MO_32, tcg_gen_gvec_add_var) +TRANS_FEAT(SUB_azz_nn_s, aa64_sme2, do_azz_nn, a, MO_32, tcg_gen_gvec_sub_var) +TRANS_FEAT(ADD_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_add_var) +TRANS_FEAT(SUB_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_sub_var) + +/* Add/Sub each ZA[d*N] += Z[m*N] */ +static bool do_aaz(DisasContext *s, arg_az_n *a, int esz, GVecGen3FnVar *fn) +{ + TCGv_ptr t_za; + int svl, n; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + n = a->n; + t_za = get_zarray(s, a->rv, a->off, n, 0); + svl = streaming_vec_reg_size(s); + + for (int i = 0; i < n; ++i) { + int o_za = (svl / n * sizeof(ARMVectorReg)) * i; + int o_zm = vec_full_reg_offset(s, a->zm + i); + + fn(esz, t_za, o_za, t_za, o_za, tcg_env, o_zm, svl, svl); + } + return true; +} + +TRANS_FEAT(ADD_aaz_s, aa64_sme2, do_aaz, a, MO_32, tcg_gen_gvec_add_var) +TRANS_FEAT(SUB_aaz_s, aa64_sme2, do_aaz, a, MO_32, tcg_gen_gvec_sub_var) +TRANS_FEAT(ADD_aaz_d, aa64_sme2_i16i64, do_aaz, a, MO_64, tcg_gen_gvec_add_var) +TRANS_FEAT(SUB_aaz_d, aa64_sme2_i16i64, do_aaz, a, MO_64, tcg_gen_gvec_sub_var) + +/* + * Expand array multi-vector single (n1), array multi-vector (nn), + * and array multi-vector indexed (nx), for floating-point accumulate. + * multi: true for nn, false for n1. + * fpst: >= 0 to set ptr argument for FPST_*, < 0 for ENV. + * data: stuff for simd_data, including any index. + */ +#define FPST_ENV -1 + +static bool do_azz_fp(DisasContext *s, int nreg, int nsel, + int rv, int off, int zn, int zm, + int data, int shsel, bool multi, int fpst, + gen_helper_gvec_3_ptr *fn) +{ + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int vstride = svl / nreg; + TCGv_ptr t_za = get_zarray(s, rv, off, nreg, nsel); + TCGv_ptr t, ptr; + + if (fpst >= 0) { + ptr = fpstatus_ptr(fpst); + } else { + ptr = tcg_env; + } + t = tcg_temp_new_ptr(); + + for (int r = 0; r < nreg; ++r) { + TCGv_ptr t_zn = vec_full_reg_ptr(s, zn); + TCGv_ptr t_zm = vec_full_reg_ptr(s, zm); + + for (int i = 0; i < nsel; ++i) { + int o_za = (r * vstride + i) * sizeof(ARMVectorReg); + int desc = simd_desc(svl, svl, data | (i << shsel)); + + tcg_gen_addi_ptr(t, t_za, o_za); + fn(t, t_zn, t_zm, ptr, tcg_constant_i32(desc)); + } + + /* + * For multiple-and-single vectors, Zn may wrap. + * For multiple vectors, both Zn and Zm are aligned. + */ + zn = (zn + 1) % 32; + zm += multi; + } + } + return true; +} + +static bool do_azz_acc_fp(DisasContext *s, int nreg, int nsel, + int rv, int off, int zn, int zm, + int data, int shsel, bool multi, int fpst, + gen_helper_gvec_4_ptr *fn) +{ + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int vstride = svl / nreg; + TCGv_ptr t_za = get_zarray(s, rv, off, nreg, nsel); + TCGv_ptr t, ptr; + + if (fpst >= 0) { + ptr = fpstatus_ptr(fpst); + } else { + ptr = tcg_env; + } + t = tcg_temp_new_ptr(); + + for (int r = 0; r < nreg; ++r) { + TCGv_ptr t_zn = vec_full_reg_ptr(s, zn); + TCGv_ptr t_zm = vec_full_reg_ptr(s, zm); + + for (int i = 0; i < nsel; ++i) { + int o_za = (r * vstride + i) * sizeof(ARMVectorReg); + int desc = simd_desc(svl, svl, data | (i << shsel)); + + tcg_gen_addi_ptr(t, t_za, o_za); + fn(t, t_zn, t_zm, t, ptr, tcg_constant_i32(desc)); + } + + /* + * For multiple-and-single vectors, Zn may wrap. + * For multiple vectors, both Zn and Zm are aligned. + */ + zn = (zn + 1) % 32; + zm += multi; + } + } + return true; +} + +static bool do_fmlal(DisasContext *s, arg_azz_n *a, bool sub, bool multi) +{ + return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm, + (1 << 2) | sub, 1, + multi, FPST_ENV, gen_helper_sve2_fmlal_zzzw_s); +} + +TRANS_FEAT(FMLAL_n1, aa64_sme2, do_fmlal, a, false, false) +TRANS_FEAT(FMLSL_n1, aa64_sme2, do_fmlal, a, true, false) +TRANS_FEAT(FMLAL_nn, aa64_sme2, do_fmlal, a, false, true) +TRANS_FEAT(FMLSL_nn, aa64_sme2, do_fmlal, a, true, true) + +static bool do_fmlal_nx(DisasContext *s, arg_azx_n *a, bool sub) +{ + return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm, + (a->idx << 3) | (1 << 2) | sub, 1, + false, FPST_ENV, gen_helper_sve2_fmlal_zzxw_s); +} + +TRANS_FEAT(FMLAL_nx, aa64_sme2, do_fmlal_nx, a, false) +TRANS_FEAT(FMLSL_nx, aa64_sme2, do_fmlal_nx, a, true) + +static bool do_bfmlal(DisasContext *s, arg_azz_n *a, bool sub, bool multi) +{ + return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm, + 0, 0, multi, FPST_ZA, + (!sub ? gen_helper_gvec_bfmlal + : s->fpcr_ah ? gen_helper_gvec_ah_bfmlsl + : gen_helper_gvec_bfmlsl)); +} + +TRANS_FEAT(BFMLAL_n1, aa64_sme2, do_bfmlal, a, false, false) +TRANS_FEAT(BFMLSL_n1, aa64_sme2, do_bfmlal, a, true, false) +TRANS_FEAT(BFMLAL_nn, aa64_sme2, do_bfmlal, a, false, true) +TRANS_FEAT(BFMLSL_nn, aa64_sme2, do_bfmlal, a, true, true) + +static bool do_bfmlal_nx(DisasContext *s, arg_azx_n *a, bool sub) +{ + return do_azz_acc_fp(s, a->n, 2, a->rv, a->off, a->zn, a->zm, + a->idx << 1, 0, false, FPST_ZA, + !sub ? gen_helper_gvec_bfmlal_idx + : s->fpcr_ah ? gen_helper_gvec_ah_bfmlsl_idx + : gen_helper_gvec_bfmlsl_idx); +} + +TRANS_FEAT(BFMLAL_nx, aa64_sme2, do_bfmlal_nx, a, false) +TRANS_FEAT(BFMLSL_nx, aa64_sme2, do_bfmlal_nx, a, true) + +static bool do_fdot(DisasContext *s, arg_azz_n *a, bool multi) +{ + return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, 1, 0, + multi, FPST_ENV, gen_helper_sme2_fdot_h); +} + +TRANS_FEAT(FDOT_n1, aa64_sme2, do_fdot, a, false) +TRANS_FEAT(FDOT_nn, aa64_sme2, do_fdot, a, true) + +static bool do_fdot_nx(DisasContext *s, arg_azx_n *a) +{ + return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, + a->idx | (1 << 2), 0, false, FPST_ENV, + gen_helper_sme2_fdot_idx_h); +} + +TRANS_FEAT(FDOT_nx, aa64_sme2, do_fdot_nx, a) + +static bool do_bfdot(DisasContext *s, arg_azz_n *a, bool multi) +{ + return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, 0, 0, + multi, FPST_ENV, gen_helper_gvec_bfdot); +} + +TRANS_FEAT(BFDOT_n1, aa64_sme2, do_bfdot, a, false) +TRANS_FEAT(BFDOT_nn, aa64_sme2, do_bfdot, a, true) + +static bool do_bfdot_nx(DisasContext *s, arg_azx_n *a) +{ + return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, a->idx, 0, + false, FPST_ENV, gen_helper_gvec_bfdot_idx); +} + +TRANS_FEAT(BFDOT_nx, aa64_sme2, do_bfdot_nx, a) + +static bool do_vdot(DisasContext *s, arg_azx_n *a, gen_helper_gvec_4_ptr *fn) +{ + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int vstride = svl / 2; + TCGv_ptr t_za = get_zarray(s, a->rv, a->off, 2, 1); + TCGv_ptr t_zn = vec_full_reg_ptr(s, a->zn); + TCGv_ptr t_zm = vec_full_reg_ptr(s, a->zm); + TCGv_ptr t = tcg_temp_new_ptr(); + + for (int i = 0; i < 2; ++i) { + int o_za = i * vstride * sizeof(ARMVectorReg); + int desc = simd_desc(svl, svl, a->idx | (i << 2)); + + tcg_gen_addi_ptr(t, t_za, o_za); + fn(t, t_zn, t_zm, t, tcg_env, tcg_constant_i32(desc)); + } + } + return true; +} + +TRANS_FEAT(FVDOT, aa64_sme, do_vdot, a, gen_helper_sme2_fvdot_idx_h) +TRANS_FEAT(BFVDOT, aa64_sme, do_vdot, a, gen_helper_sme2_bfvdot_idx) + +static bool do_fmla(DisasContext *s, arg_azz_n *a, bool multi, + ARMFPStatusFlavour fpst, gen_helper_gvec_3_ptr *fn) +{ + return do_azz_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, + 0, 0, multi, fpst, fn); +} + +TRANS_FEAT(FMLA_n1_h, aa64_sme_f16f16, do_fmla, a, false, FPST_ZA_F16, + gen_helper_gvec_vfma_h) +TRANS_FEAT(FMLS_n1_h, aa64_sme_f16f16, do_fmla, a, false, FPST_ZA_F16, + s->fpcr_ah ? gen_helper_gvec_ah_vfms_h : gen_helper_gvec_vfms_h) +TRANS_FEAT(FMLA_nn_h, aa64_sme_f16f16, do_fmla, a, true, FPST_ZA_F16, + gen_helper_gvec_vfma_h) +TRANS_FEAT(FMLS_nn_h, aa64_sme_f16f16, do_fmla, a, true, FPST_ZA_F16, + s->fpcr_ah ? gen_helper_gvec_ah_vfms_h : gen_helper_gvec_vfms_h) + +TRANS_FEAT(FMLA_n1_s, aa64_sme2, do_fmla, a, false, FPST_ZA, + gen_helper_gvec_vfma_s) +TRANS_FEAT(FMLS_n1_s, aa64_sme2, do_fmla, a, false, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_vfms_s : gen_helper_gvec_vfms_s) +TRANS_FEAT(FMLA_nn_s, aa64_sme2, do_fmla, a, true, FPST_ZA, + gen_helper_gvec_vfma_s) +TRANS_FEAT(FMLS_nn_s, aa64_sme2, do_fmla, a, true, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_vfms_s : gen_helper_gvec_vfms_s) + +TRANS_FEAT(FMLA_n1_d, aa64_sme2_f64f64, do_fmla, a, false, FPST_ZA, + gen_helper_gvec_vfma_d) +TRANS_FEAT(FMLS_n1_d, aa64_sme2_f64f64, do_fmla, a, false, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d) +TRANS_FEAT(FMLA_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA, + gen_helper_gvec_vfma_d) +TRANS_FEAT(FMLS_nn_d, aa64_sme2_f64f64, do_fmla, a, true, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_vfms_d : gen_helper_gvec_vfms_d) + +TRANS_FEAT(BFMLA_n1, aa64_sme_b16b16, do_fmla, a, false, FPST_ZA, + gen_helper_gvec_bfmla) +TRANS_FEAT(BFMLS_n1, aa64_sme_b16b16, do_fmla, a, false, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_bfmls : gen_helper_gvec_bfmls) +TRANS_FEAT(BFMLA_nn, aa64_sme_b16b16, do_fmla, a, true, FPST_ZA, + gen_helper_gvec_bfmla) +TRANS_FEAT(BFMLS_nn, aa64_sme_b16b16, do_fmla, a, true, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_bfmls : gen_helper_gvec_bfmls) + +static bool do_fmla_nx(DisasContext *s, arg_azx_n *a, + ARMFPStatusFlavour fpst, gen_helper_gvec_4_ptr *fn) +{ + return do_azz_acc_fp(s, a->n, 1, a->rv, a->off, a->zn, a->zm, + a->idx, 0, false, fpst, fn); +} + +TRANS_FEAT(FMLA_nx_h, aa64_sme_f16f16, do_fmla_nx, a, FPST_ZA_F16, + gen_helper_gvec_fmla_idx_h) +TRANS_FEAT(FMLS_nx_h, aa64_sme_f16f16, do_fmla_nx, a, FPST_ZA_F16, + s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_h : gen_helper_gvec_fmls_idx_h) +TRANS_FEAT(FMLA_nx_s, aa64_sme2, do_fmla_nx, a, FPST_ZA, + gen_helper_gvec_fmla_idx_s) +TRANS_FEAT(FMLS_nx_s, aa64_sme2, do_fmla_nx, a, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_s : gen_helper_gvec_fmls_idx_s) +TRANS_FEAT(FMLA_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA, + gen_helper_gvec_fmla_idx_d) +TRANS_FEAT(FMLS_nx_d, aa64_sme2_f64f64, do_fmla_nx, a, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_fmls_idx_d : gen_helper_gvec_fmls_idx_d) + +TRANS_FEAT(BFMLA_nx, aa64_sme_b16b16, do_fmla_nx, a, FPST_ZA, + gen_helper_gvec_bfmla_idx) +TRANS_FEAT(BFMLS_nx, aa64_sme_b16b16, do_fmla_nx, a, FPST_ZA, + s->fpcr_ah ? gen_helper_gvec_ah_bfmls_idx : gen_helper_gvec_bfmls_idx) + +static bool do_faddsub(DisasContext *s, arg_az_n *a, ARMFPStatusFlavour fpst, + gen_helper_gvec_3_ptr *fn) +{ + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int n = a->n; + int zm = a->zm; + int vstride = svl / n; + TCGv_ptr t_za = get_zarray(s, a->rv, a->off, n, 0); + TCGv_ptr ptr = fpstatus_ptr(fpst); + TCGv_ptr t = tcg_temp_new_ptr(); + + for (int r = 0; r < n; ++r) { + TCGv_ptr t_zm = vec_full_reg_ptr(s, zm + r); + int o_za = r * vstride * sizeof(ARMVectorReg); + int desc = simd_desc(svl, svl, 0); + + tcg_gen_addi_ptr(t, t_za, o_za); + fn(t, t, t_zm, ptr, tcg_constant_i32(desc)); + } + } + return true; +} + +TRANS_FEAT(FADD_nn_h, aa64_sme_f16f16, do_faddsub, a, + FPST_ZA_F16, gen_helper_gvec_fadd_h) +TRANS_FEAT(FSUB_nn_h, aa64_sme_f16f16, do_faddsub, a, + FPST_ZA_F16, gen_helper_gvec_fsub_h) + +TRANS_FEAT(FADD_nn_s, aa64_sme2, do_faddsub, a, + FPST_ZA, gen_helper_gvec_fadd_s) +TRANS_FEAT(FSUB_nn_s, aa64_sme2, do_faddsub, a, + FPST_ZA, gen_helper_gvec_fsub_s) + +TRANS_FEAT(FADD_nn_d, aa64_sme2_f64f64, do_faddsub, a, + FPST_ZA, gen_helper_gvec_fadd_d) +TRANS_FEAT(FSUB_nn_d, aa64_sme2_f64f64, do_faddsub, a, + FPST_ZA, gen_helper_gvec_fsub_d) + +TRANS_FEAT(BFADD_nn, aa64_sme_b16b16, do_faddsub, a, + FPST_ZA, gen_helper_gvec_bfadd) +TRANS_FEAT(BFSUB_nn, aa64_sme_b16b16, do_faddsub, a, + FPST_ZA, gen_helper_gvec_bfsub) + +/* + * Expand array multi-vector single (n1), array multi-vector (nn), + * and array multi-vector indexed (nx), for integer accumulate. + * multi: true for nn, false for n1. + * data: stuff for simd_data, including any index. + */ +static bool do_azz_acc(DisasContext *s, int nreg, int nsel, + int rv, int off, int zn, int zm, + int data, int shsel, bool multi, + gen_helper_gvec_4 *fn) +{ + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + int vstride = svl / nreg; + TCGv_ptr t_za = get_zarray(s, rv, off, nreg, nsel); + TCGv_ptr t = tcg_temp_new_ptr(); + + for (int r = 0; r < nreg; ++r) { + TCGv_ptr t_zn = vec_full_reg_ptr(s, zn); + TCGv_ptr t_zm = vec_full_reg_ptr(s, zm); + + for (int i = 0; i < nsel; ++i) { + int o_za = (r * vstride + i) * sizeof(ARMVectorReg); + int desc = simd_desc(svl, svl, data | (i << shsel)); + + tcg_gen_addi_ptr(t, t_za, o_za); + fn(t, t_zn, t_zm, t, tcg_constant_i32(desc)); + } + + /* + * For multiple-and-single vectors, Zn may wrap. + * For multiple vectors, both Zn and Zm are aligned. + */ + zn = (zn + 1) % 32; + zm += multi; + } + } + return true; +} + +static bool do_dot(DisasContext *s, arg_azz_n *a, bool multi, + gen_helper_gvec_4 *fn) +{ + return do_azz_acc(s, a->n, 1, a->rv, a->off, a->zn, a->zm, + 0, 0, multi, fn); +} + +static void gen_helper_gvec_sudot_4b(TCGv_ptr d, TCGv_ptr n, TCGv_ptr m, + TCGv_ptr a, TCGv_i32 desc) +{ + gen_helper_gvec_usdot_4b(d, m, n, a, desc); +} + +TRANS_FEAT(USDOT_n1, aa64_sme2, do_dot, a, false, gen_helper_gvec_usdot_4b) +TRANS_FEAT(SUDOT_n1, aa64_sme2, do_dot, a, false, gen_helper_gvec_sudot_4b) +TRANS_FEAT(SDOT_n1_2h, aa64_sme2, do_dot, a, false, gen_helper_gvec_sdot_2h) +TRANS_FEAT(UDOT_n1_2h, aa64_sme2, do_dot, a, false, gen_helper_gvec_udot_2h) +TRANS_FEAT(SDOT_n1_4b, aa64_sme2, do_dot, a, false, gen_helper_gvec_sdot_4b) +TRANS_FEAT(UDOT_n1_4b, aa64_sme2, do_dot, a, false, gen_helper_gvec_udot_4b) +TRANS_FEAT(SDOT_n1_4h, aa64_sme2_i16i64, do_dot, a, false, gen_helper_gvec_sdot_4h) +TRANS_FEAT(UDOT_n1_4h, aa64_sme2_i16i64, do_dot, a, false, gen_helper_gvec_udot_4h) + +TRANS_FEAT(USDOT_nn, aa64_sme2, do_dot, a, true, gen_helper_gvec_usdot_4b) +TRANS_FEAT(SDOT_nn_2h, aa64_sme2, do_dot, a, true, gen_helper_gvec_sdot_2h) +TRANS_FEAT(UDOT_nn_2h, aa64_sme2, do_dot, a, true, gen_helper_gvec_udot_2h) +TRANS_FEAT(SDOT_nn_4b, aa64_sme2, do_dot, a, true, gen_helper_gvec_sdot_4b) +TRANS_FEAT(UDOT_nn_4b, aa64_sme2, do_dot, a, true, gen_helper_gvec_udot_4b) +TRANS_FEAT(SDOT_nn_4h, aa64_sme2_i16i64, do_dot, a, true, gen_helper_gvec_sdot_4h) +TRANS_FEAT(UDOT_nn_4h, aa64_sme2_i16i64, do_dot, a, true, gen_helper_gvec_udot_4h) + +static bool do_dot_nx(DisasContext *s, arg_azx_n *a, gen_helper_gvec_4 *fn) +{ + return do_azz_acc(s, a->n, 1, a->rv, a->off, a->zn, a->zm, + a->idx, 0, false, fn); +} + +TRANS_FEAT(USDOT_nx, aa64_sme2, do_dot_nx, a, gen_helper_gvec_usdot_idx_4b) +TRANS_FEAT(SUDOT_nx, aa64_sme2, do_dot_nx, a, gen_helper_gvec_sudot_idx_4b) +TRANS_FEAT(SDOT_nx_2h, aa64_sme2, do_dot_nx, a, gen_helper_gvec_sdot_idx_2h) +TRANS_FEAT(UDOT_nx_2h, aa64_sme2, do_dot_nx, a, gen_helper_gvec_udot_idx_2h) +TRANS_FEAT(SDOT_nx_4b, aa64_sme2, do_dot_nx, a, gen_helper_gvec_sdot_idx_4b) +TRANS_FEAT(UDOT_nx_4b, aa64_sme2, do_dot_nx, a, gen_helper_gvec_udot_idx_4b) +TRANS_FEAT(SDOT_nx_4h, aa64_sme2_i16i64, do_dot_nx, a, gen_helper_gvec_sdot_idx_4h) +TRANS_FEAT(UDOT_nx_4h, aa64_sme2_i16i64, do_dot_nx, a, gen_helper_gvec_udot_idx_4h) + +static bool do_vdot_nx(DisasContext *s, arg_azx_n *a, gen_helper_gvec_3 *fn) +{ + if (sme_smza_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + fn(get_zarray(s, a->rv, a->off, a->n, 0), + vec_full_reg_ptr(s, a->zn), + vec_full_reg_ptr(s, a->zm), + tcg_constant_i32(simd_desc(svl, svl, a->idx))); + } + return true; +} + +TRANS_FEAT(SVDOT_nx_2h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_svdot_idx_2h) +TRANS_FEAT(SVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_svdot_idx_4b) +TRANS_FEAT(SVDOT_nx_4h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_svdot_idx_4h) + +TRANS_FEAT(UVDOT_nx_2h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_2h) +TRANS_FEAT(UVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_4b) +TRANS_FEAT(UVDOT_nx_4h, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_uvdot_idx_4h) + +TRANS_FEAT(SUVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_suvdot_idx_4b) +TRANS_FEAT(USVDOT_nx_4b, aa64_sme2, do_vdot_nx, a, gen_helper_sme2_usvdot_idx_4b) + +static bool do_smlal(DisasContext *s, arg_azz_n *a, bool multi, + gen_helper_gvec_4 *fn) +{ + return do_azz_acc(s, a->n, 2, a->rv, a->off, a->zn, a->zm, + 0, 0, multi, fn); +} + +TRANS_FEAT(SMLAL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_smlal_zzzw_s) +TRANS_FEAT(SMLSL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_smlsl_zzzw_s) +TRANS_FEAT(UMLAL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_umlal_zzzw_s) +TRANS_FEAT(UMLSL_n1, aa64_sme2, do_smlal, a, false, gen_helper_sve2_umlsl_zzzw_s) + +TRANS_FEAT(SMLAL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_smlal_zzzw_s) +TRANS_FEAT(SMLSL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_smlsl_zzzw_s) +TRANS_FEAT(UMLAL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_umlal_zzzw_s) +TRANS_FEAT(UMLSL_nn, aa64_sme2, do_smlal, a, true, gen_helper_sve2_umlsl_zzzw_s) + +static bool do_smlal_nx(DisasContext *s, arg_azx_n *a, + gen_helper_gvec_4 *fn) +{ + return do_azz_acc(s, a->n, 2, a->rv, a->off, a->zn, a->zm, + a->idx << 1, 0, false, fn); +} + +TRANS_FEAT(SMLAL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_smlal_idx_s) +TRANS_FEAT(SMLSL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_smlsl_idx_s) +TRANS_FEAT(UMLAL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_umlal_idx_s) +TRANS_FEAT(UMLSL_nx, aa64_sme2, do_smlal_nx, a, gen_helper_sve2_umlsl_idx_s) + +static bool do_smlall(DisasContext *s, arg_azz_n *a, bool multi, + gen_helper_gvec_4 *fn) +{ + return do_azz_acc(s, a->n, 4, a->rv, a->off, a->zn, a->zm, + 0, 0, multi, fn); +} + +static void gen_helper_sme2_sumlall_s(TCGv_ptr d, TCGv_ptr n, TCGv_ptr m, + TCGv_ptr a, TCGv_i32 desc) +{ + gen_helper_sme2_usmlall_s(d, m, n, a, desc); +} + +TRANS_FEAT(SMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_smlall_s) +TRANS_FEAT(SMLSLL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_smlsll_s) +TRANS_FEAT(UMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_umlall_s) +TRANS_FEAT(UMLSLL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_umlsll_s) +TRANS_FEAT(USMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_usmlall_s) +TRANS_FEAT(SUMLALL_n1_s, aa64_sme2, do_smlall, a, false, gen_helper_sme2_sumlall_s) + +TRANS_FEAT(SMLALL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_smlall_d) +TRANS_FEAT(SMLSLL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_smlsll_d) +TRANS_FEAT(UMLALL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_umlall_d) +TRANS_FEAT(UMLSLL_n1_d, aa64_sme2_i16i64, do_smlall, a, false, gen_helper_sme2_umlsll_d) + +TRANS_FEAT(SMLALL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_smlall_s) +TRANS_FEAT(SMLSLL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_smlsll_s) +TRANS_FEAT(UMLALL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_umlall_s) +TRANS_FEAT(UMLSLL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_umlsll_s) +TRANS_FEAT(USMLALL_nn_s, aa64_sme2, do_smlall, a, true, gen_helper_sme2_usmlall_s) + +TRANS_FEAT(SMLALL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_smlall_d) +TRANS_FEAT(SMLSLL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_smlsll_d) +TRANS_FEAT(UMLALL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_umlall_d) +TRANS_FEAT(UMLSLL_nn_d, aa64_sme2_i16i64, do_smlall, a, true, gen_helper_sme2_umlsll_d) + +static bool do_smlall_nx(DisasContext *s, arg_azx_n *a, + gen_helper_gvec_4 *fn) +{ + return do_azz_acc(s, a->n, 4, a->rv, a->off, a->zn, a->zm, + a->idx << 2, 0, false, fn); +} + +TRANS_FEAT(SMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_smlall_idx_s) +TRANS_FEAT(SMLSLL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_smlsll_idx_s) +TRANS_FEAT(UMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_umlall_idx_s) +TRANS_FEAT(UMLSLL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_umlsll_idx_s) +TRANS_FEAT(USMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_usmlall_idx_s) +TRANS_FEAT(SUMLALL_nx_s, aa64_sme2, do_smlall_nx, a, gen_helper_sme2_sumlall_idx_s) + +TRANS_FEAT(SMLALL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_smlall_idx_d) +TRANS_FEAT(SMLSLL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_smlsll_idx_d) +TRANS_FEAT(UMLALL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_umlall_idx_d) +TRANS_FEAT(UMLSLL_nx_d, aa64_sme2_i16i64, do_smlall_nx, a, gen_helper_sme2_umlsll_idx_d) + +static bool do_zz_fpst(DisasContext *s, arg_zz_n *a, int data, + ARMFPStatusFlavour type, gen_helper_gvec_2_ptr *fn) +{ + if (sme_sm_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + TCGv_ptr fpst = fpstatus_ptr(type); + + for (int i = 0, n = a->n; i < n; ++i) { + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->zd + i), + vec_full_reg_offset(s, a->zn + i), + fpst, svl, svl, data, fn); + } + } + return true; +} + +TRANS_FEAT(BFCVT, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_sme2_bfcvt) +TRANS_FEAT(BFCVTN, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_sme2_bfcvtn) +TRANS_FEAT(FCVT_n, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_sme2_fcvt_n) +TRANS_FEAT(FCVTN, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_sme2_fcvtn) + +TRANS_FEAT(FCVT_w, aa64_sme_f16f16, do_zz_fpst, a, 0, + FPST_A64_F16, gen_helper_sme2_fcvt_w) +TRANS_FEAT(FCVTL, aa64_sme_f16f16, do_zz_fpst, a, 0, + FPST_A64_F16, gen_helper_sme2_fcvtl) + +TRANS_FEAT(FCVTZS, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_gvec_vcvt_rz_fs) +TRANS_FEAT(FCVTZU, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_gvec_vcvt_rz_fu) + +TRANS_FEAT(SCVTF, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_sme2_scvtf) +TRANS_FEAT(UCVTF, aa64_sme2, do_zz_fpst, a, 0, + FPST_A64, gen_helper_sme2_ucvtf) + +TRANS_FEAT(FRINTN, aa64_sme2, do_zz_fpst, a, float_round_nearest_even, + FPST_A64, gen_helper_gvec_vrint_rm_s) +TRANS_FEAT(FRINTP, aa64_sme2, do_zz_fpst, a, float_round_up, + FPST_A64, gen_helper_gvec_vrint_rm_s) +TRANS_FEAT(FRINTM, aa64_sme2, do_zz_fpst, a, float_round_down, + FPST_A64, gen_helper_gvec_vrint_rm_s) +TRANS_FEAT(FRINTA, aa64_sme2, do_zz_fpst, a, float_round_ties_away, + FPST_A64, gen_helper_gvec_vrint_rm_s) + +static bool do_zz(DisasContext *s, arg_zz_n *a, int data, + gen_helper_gvec_2 *fn) +{ + if (sme_sm_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + + for (int i = 0, n = a->n; i < n; ++i) { + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->zd + i), + vec_full_reg_offset(s, a->zn + i), + svl, svl, data, fn); + } + } + return true; +} + +TRANS_FEAT(SQCVT_sh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvt_sh) +TRANS_FEAT(UQCVT_sh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvt_sh) +TRANS_FEAT(SQCVTU_sh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtu_sh) + +TRANS_FEAT(SQCVT_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvt_sb) +TRANS_FEAT(UQCVT_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvt_sb) +TRANS_FEAT(SQCVTU_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtu_sb) + +TRANS_FEAT(SQCVT_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvt_dh) +TRANS_FEAT(UQCVT_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvt_dh) +TRANS_FEAT(SQCVTU_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtu_dh) + +TRANS_FEAT(SQCVTN_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtn_sb) +TRANS_FEAT(UQCVTN_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvtn_sb) +TRANS_FEAT(SQCVTUN_sb, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtun_sb) + +TRANS_FEAT(SQCVTN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtn_dh) +TRANS_FEAT(UQCVTN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uqcvtn_dh) +TRANS_FEAT(SQCVTUN_dh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sqcvtun_dh) + +TRANS_FEAT(SUNPK_2bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_bh) +TRANS_FEAT(SUNPK_2hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_hs) +TRANS_FEAT(SUNPK_2sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk2_sd) + +TRANS_FEAT(SUNPK_4bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_bh) +TRANS_FEAT(SUNPK_4hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_hs) +TRANS_FEAT(SUNPK_4sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_sunpk4_sd) + +TRANS_FEAT(UUNPK_2bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_bh) +TRANS_FEAT(UUNPK_2hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_hs) +TRANS_FEAT(UUNPK_2sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk2_sd) + +TRANS_FEAT(UUNPK_4bh, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_bh) +TRANS_FEAT(UUNPK_4hs, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_hs) +TRANS_FEAT(UUNPK_4sd, aa64_sme2, do_zz, a, 0, gen_helper_sme2_uunpk4_sd) + +static bool do_zipuzp_4(DisasContext *s, arg_zz_e *a, + gen_helper_gvec_2 * const fn[5]) +{ + int bytes_per_op = 4 << a->esz; + + /* Both MO_64 and MO_128 can fail the size test. */ + if (s->max_svl < bytes_per_op) { + unallocated_encoding(s); + } else if (sme_sm_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + if (svl < bytes_per_op) { + unallocated_encoding(s); + } else { + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->zd), + vec_full_reg_offset(s, a->zn), + svl, svl, 0, fn[a->esz]); + } + } + return true; +} + +static gen_helper_gvec_2 * const zip4_fns[] = { + gen_helper_sme2_zip4_b, + gen_helper_sme2_zip4_h, + gen_helper_sme2_zip4_s, + gen_helper_sme2_zip4_d, + gen_helper_sme2_zip4_q, +}; +TRANS_FEAT(ZIP_4, aa64_sme2, do_zipuzp_4, a, zip4_fns) + +static gen_helper_gvec_2 * const uzp4_fns[] = { + gen_helper_sme2_uzp4_b, + gen_helper_sme2_uzp4_h, + gen_helper_sme2_uzp4_s, + gen_helper_sme2_uzp4_d, + gen_helper_sme2_uzp4_q, +}; +TRANS_FEAT(UZP_4, aa64_sme2, do_zipuzp_4, a, uzp4_fns) + +static bool do_zz_rshr(DisasContext *s, arg_rshr *a, gen_helper_gvec_2 *fn) +{ + if (sve_access_check(s)) { + int vl = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->zd), + vec_full_reg_offset(s, a->zn), + vl, vl, a->shift, fn); + } + return true; +} + +TRANS_FEAT(SQRSHR_sh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshr_sh) +TRANS_FEAT(UQRSHR_sh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshr_sh) +TRANS_FEAT(SQRSHRU_sh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshru_sh) + +TRANS_FEAT(SQRSHR_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshr_sb) +TRANS_FEAT(SQRSHR_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshr_dh) +TRANS_FEAT(UQRSHR_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshr_sb) +TRANS_FEAT(UQRSHR_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshr_dh) +TRANS_FEAT(SQRSHRU_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshru_sb) +TRANS_FEAT(SQRSHRU_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshru_dh) + +TRANS_FEAT(SQRSHRN_sh, aa64_sme2_or_sve2p1, do_zz_rshr, a, gen_helper_sme2_sqrshrn_sh) +TRANS_FEAT(UQRSHRN_sh, aa64_sme2_or_sve2p1, do_zz_rshr, a, gen_helper_sme2_uqrshrn_sh) +TRANS_FEAT(SQRSHRUN_sh, aa64_sme2_or_sve2p1, do_zz_rshr, a, gen_helper_sme2_sqrshrun_sh) + +TRANS_FEAT(SQRSHRN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrn_sb) +TRANS_FEAT(SQRSHRN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrn_dh) +TRANS_FEAT(UQRSHRN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshrn_sb) +TRANS_FEAT(UQRSHRN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshrn_dh) +TRANS_FEAT(SQRSHRUN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrun_sb) +TRANS_FEAT(SQRSHRUN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrun_dh) + +static bool do_zipuzp_2(DisasContext *s, arg_zzz_e *a, + gen_helper_gvec_3 * const fn[5]) +{ + int bytes_per_op = 2 << a->esz; + + /* MO_128 can fail the size test. */ + if (s->max_svl < bytes_per_op) { + unallocated_encoding(s); + } else if (sme_sm_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + if (svl < bytes_per_op) { + unallocated_encoding(s); + } else { + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->zd), + vec_full_reg_offset(s, a->zn), + vec_full_reg_offset(s, a->zm), + svl, svl, 0, fn[a->esz]); + } + } + return true; +} + +static gen_helper_gvec_3 * const zip2_fns[] = { + gen_helper_sme2_zip2_b, + gen_helper_sme2_zip2_h, + gen_helper_sme2_zip2_s, + gen_helper_sme2_zip2_d, + gen_helper_sme2_zip2_q, +}; +TRANS_FEAT(ZIP_2, aa64_sme2, do_zipuzp_2, a, zip2_fns) + +static gen_helper_gvec_3 * const uzp2_fns[] = { + gen_helper_sme2_uzp2_b, + gen_helper_sme2_uzp2_h, + gen_helper_sme2_uzp2_s, + gen_helper_sme2_uzp2_d, + gen_helper_sme2_uzp2_q, +}; +TRANS_FEAT(UZP_2, aa64_sme2, do_zipuzp_2, a, uzp2_fns) + +static bool trans_FCLAMP(DisasContext *s, arg_zzz_en *a) +{ + static gen_helper_gvec_3_ptr * const fn[] = { + gen_helper_sme2_bfclamp, + gen_helper_sme2_fclamp_h, + gen_helper_sme2_fclamp_s, + gen_helper_sme2_fclamp_d, + }; + TCGv_ptr fpst; + int vl; + + if (!dc_isar_feature(aa64_sme2, s)) { + return false; + } + /* This insn uses MO_8 to encode BFloat16. */ + if (a->esz == MO_8 && !dc_isar_feature(aa64_sme_b16b16, s)) { + return false; + } + if (!sme_sm_enabled_check(s)) { + return true; + } + + fpst = fpstatus_ptr(a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); + vl = vec_full_reg_size(s); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->zd), + vec_full_reg_offset(s, a->zn), + vec_full_reg_offset(s, a->zm), + fpst, vl, vl, a->n, fn[a->esz]); + return true; +} + +static bool do_clamp(DisasContext *s, arg_zzz_en *a, + gen_helper_gvec_3 * const fn[4]) +{ + int vl; + + if (!dc_isar_feature(aa64_sme2, s)) { + return false; + } + if (!sme_sm_enabled_check(s)) { + return true; + } + + /* + * Clamp is just a min+max, easily supported by most host + * vector operations -- we already have such an expansion in + * translate-sve.c for a single output. + * TODO: Add support in gvec for multiple simultaneous output, + * and/or copy to temporary upon overlap. + */ + vl = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->zd), + vec_full_reg_offset(s, a->zn), + vec_full_reg_offset(s, a->zm), + vl, vl, a->n, fn[a->esz]); + return true; +} + +static gen_helper_gvec_3 * const sclamp_fns[] = { + gen_helper_sme2_sclamp_b, + gen_helper_sme2_sclamp_h, + gen_helper_sme2_sclamp_s, + gen_helper_sme2_sclamp_d, +}; +TRANS(SCLAMP, do_clamp, a, sclamp_fns) + +static gen_helper_gvec_3 * const uclamp_fns[] = { + gen_helper_sme2_uclamp_b, + gen_helper_sme2_uclamp_h, + gen_helper_sme2_uclamp_s, + gen_helper_sme2_uclamp_d, +}; +TRANS(UCLAMP, do_clamp, a, uclamp_fns) + +static bool trans_SEL(DisasContext *s, arg_SEL *a) +{ + typedef void sme_sel_fn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32); + static sme_sel_fn * const fns[4] = { + gen_helper_sme2_sel_b, gen_helper_sme2_sel_h, + gen_helper_sme2_sel_s, gen_helper_sme2_sel_d + }; + + if (!dc_isar_feature(aa64_sme2, s)) { + return false; + } + if (sme_sm_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, a->n); + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_ptr t_m = tcg_temp_new_ptr(); + TCGv_i32 png = tcg_temp_new_i32(); + + tcg_gen_addi_ptr(t_d, tcg_env, vec_full_reg_offset(s, a->zd)); + tcg_gen_addi_ptr(t_n, tcg_env, vec_full_reg_offset(s, a->zn)); + tcg_gen_addi_ptr(t_m, tcg_env, vec_full_reg_offset(s, a->zm)); + + tcg_gen_ld16u_i32(png, tcg_env, pred_full_reg_offset(s, a->pg) + ^ (HOST_BIG_ENDIAN ? 6 : 0)); + + fns[a->esz](t_d, t_n, t_m, png, tcg_constant_i32(desc)); + } + return true; +} + +static bool do_lut(DisasContext *s, arg_lut *a, + gen_helper_gvec_2_ptr *fn, bool strided) +{ + if (sme_sm_enabled_check(s) && sme2_zt0_enabled_check(s)) { + int svl = streaming_vec_reg_size(s); + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->zd), + vec_full_reg_offset(s, a->zn), + tcg_env, svl, svl, strided | (a->idx << 1), fn); + } + return true; +} + +TRANS_FEAT(LUTI2_c_1b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1b, false) +TRANS_FEAT(LUTI2_c_1h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1h, false) +TRANS_FEAT(LUTI2_c_1s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_1s, false) + +TRANS_FEAT(LUTI2_c_2b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2b, false) +TRANS_FEAT(LUTI2_c_2h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2h, false) +TRANS_FEAT(LUTI2_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_2s, false) + +TRANS_FEAT(LUTI2_c_4b, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4b, false) +TRANS_FEAT(LUTI2_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4h, false) +TRANS_FEAT(LUTI2_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti2_4s, false) + +TRANS_FEAT(LUTI4_c_1b, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1b, false) +TRANS_FEAT(LUTI4_c_1h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1h, false) +TRANS_FEAT(LUTI4_c_1s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_1s, false) + +TRANS_FEAT(LUTI4_c_2b, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2b, false) +TRANS_FEAT(LUTI4_c_2h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2h, false) +TRANS_FEAT(LUTI4_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2s, false) + +TRANS_FEAT(LUTI4_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4h, false) +TRANS_FEAT(LUTI4_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4s, false) + +static bool do_lut_s4(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn) +{ + return !(a->zd & 0b01100) && do_lut(s, a, fn, true); +} + +static bool do_lut_s8(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn) +{ + return !(a->zd & 0b01000) && do_lut(s, a, fn, true); +} + +TRANS_FEAT(LUTI2_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti2_2b) +TRANS_FEAT(LUTI2_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti2_2h) + +TRANS_FEAT(LUTI2_s_4b, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti2_4b) +TRANS_FEAT(LUTI2_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti2_4h) + +TRANS_FEAT(LUTI4_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2b) +TRANS_FEAT(LUTI4_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2h) + +TRANS_FEAT(LUTI4_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti4_4h) diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index d23be47..07b827f 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -31,9 +31,9 @@ typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i64); typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr, - TCGv_ptr, TCGv_i64, TCGv_i32); + TCGv_ptr, TCGv_i64, TCGv_i64); /* * Helpers for extracting complex instruction fields. @@ -89,7 +89,7 @@ static inline int expand_imm_sh8u(DisasContext *s, int x) */ static inline int msz_dtype(DisasContext *s, int msz) { - static const uint8_t dtype[4] = { 0, 5, 10, 15 }; + static const uint8_t dtype[5] = { 0, 5, 10, 15, 18 }; return dtype[msz]; } @@ -190,6 +190,10 @@ static bool gen_gvec_fpst_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, static bool gen_gvec_fpst_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, arg_rrr_esz *a, int data) { + /* These insns use MO_8 to encode BFloat16 */ + if (a->esz == MO_8 && !dc_isar_feature(aa64_sve_b16b16, s)) { + return false; + } return gen_gvec_fpst_zzz(s, fn, a->rd, a->rn, a->rm, data, a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); } @@ -403,6 +407,10 @@ static bool gen_gvec_fpst_zzzp(DisasContext *s, gen_helper_gvec_4_ptr *fn, static bool gen_gvec_fpst_arg_zpzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, arg_rprr_esz *a) { + /* These insns use MO_8 to encode BFloat16. */ + if (a->esz == MO_8 && !dc_isar_feature(aa64_sve_b16b16, s)) { + return false; + } return gen_gvec_fpst_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, 0, a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); } @@ -629,7 +637,7 @@ static void gen_bsl2n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) * = | ~(m | k) */ tcg_gen_and_i64(n, n, k); - if (tcg_op_supported(INDEX_op_orc_i64, TCG_TYPE_I64, 0)) { + if (tcg_op_supported(INDEX_op_orc, TCG_TYPE_I64, 0)) { tcg_gen_or_i64(m, m, k); tcg_gen_orc_i64(d, n, m); } else { @@ -778,6 +786,9 @@ DO_ZPZ(NOT_zpz, aa64_sve, sve_not_zpz) DO_ZPZ(ABS, aa64_sve, sve_abs) DO_ZPZ(NEG, aa64_sve, sve_neg) DO_ZPZ(RBIT, aa64_sve, sve_rbit) +DO_ZPZ(ORQV, aa64_sme2p1_or_sve2p1, sve2p1_orqv) +DO_ZPZ(EORQV, aa64_sme2p1_or_sve2p1, sve2p1_eorqv) +DO_ZPZ(ANDQV, aa64_sme2p1_or_sve2p1, sve2p1_andqv) static gen_helper_gvec_3 * const fabs_fns[4] = { NULL, gen_helper_sve_fabs_h, @@ -828,6 +839,41 @@ TRANS_FEAT(SXTW, aa64_sve, gen_gvec_ool_arg_zpz, TRANS_FEAT(UXTW, aa64_sve, gen_gvec_ool_arg_zpz, a->esz == 3 ? gen_helper_sve_uxtw_d : NULL, a, 0) +static gen_helper_gvec_3 * const addqv_fns[4] = { + gen_helper_sve2p1_addqv_b, gen_helper_sve2p1_addqv_h, + gen_helper_sve2p1_addqv_s, gen_helper_sve2p1_addqv_d, +}; +TRANS_FEAT(ADDQV, aa64_sme2p1_or_sve2p1, + gen_gvec_ool_arg_zpz, addqv_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const smaxqv_fns[4] = { + gen_helper_sve2p1_smaxqv_b, gen_helper_sve2p1_smaxqv_h, + gen_helper_sve2p1_smaxqv_s, gen_helper_sve2p1_smaxqv_d, +}; +TRANS_FEAT(SMAXQV, aa64_sme2p1_or_sve2p1, + gen_gvec_ool_arg_zpz, smaxqv_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sminqv_fns[4] = { + gen_helper_sve2p1_sminqv_b, gen_helper_sve2p1_sminqv_h, + gen_helper_sve2p1_sminqv_s, gen_helper_sve2p1_sminqv_d, +}; +TRANS_FEAT(SMINQV, aa64_sme2p1_or_sve2p1, + gen_gvec_ool_arg_zpz, sminqv_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const umaxqv_fns[4] = { + gen_helper_sve2p1_umaxqv_b, gen_helper_sve2p1_umaxqv_h, + gen_helper_sve2p1_umaxqv_s, gen_helper_sve2p1_umaxqv_d, +}; +TRANS_FEAT(UMAXQV, aa64_sme2p1_or_sve2p1, + gen_gvec_ool_arg_zpz, umaxqv_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const uminqv_fns[4] = { + gen_helper_sve2p1_uminqv_b, gen_helper_sve2p1_uminqv_h, + gen_helper_sve2p1_uminqv_s, gen_helper_sve2p1_uminqv_d, +}; +TRANS_FEAT(UMINQV, aa64_sme2p1_or_sve2p1, + gen_gvec_ool_arg_zpz, uminqv_fns[a->esz], a, 0) + /* *** SVE Integer Reduction Group */ @@ -1679,6 +1725,22 @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s) +static bool trans_PTRUE_cnt(DisasContext *s, arg_PTRUE_cnt *a) +{ + if (!dc_isar_feature(aa64_sme2_or_sve2p1, s)) { + return false; + } + if (sve_access_check(s)) { + /* Canonical TRUE is 0 count, invert bit, plus element size. */ + int val = (1 << 15) | (1 << a->esz); + + /* Write val to the first uint64_t; clear all of the rest. */ + tcg_gen_gvec_dup_imm(MO_64, pred_full_reg_offset(s, a->rd), + 8, size_for_gvec(pred_full_reg_size(s)), val); + } + return true; +} + /* Note pat == 31 is #all, to set all elements. */ TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve, do_predset, 0, FFR_PRED_NUM, 31, false) @@ -2148,6 +2210,55 @@ static bool do_EXT(DisasContext *s, int rd, int rn, int rm, int imm) TRANS_FEAT(EXT, aa64_sve, do_EXT, a->rd, a->rn, a->rm, a->imm) TRANS_FEAT(EXT_sve2, aa64_sve2, do_EXT, a->rd, a->rn, (a->rn + 1) % 32, a->imm) +static bool trans_EXTQ(DisasContext *s, arg_EXTQ *a) +{ + unsigned vl, dofs, sofs0, sofs1, sofs2, imm; + + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + imm = a->imm; + if (imm == 0) { + /* So far we never optimize Zdn with MOVPRFX, so zd = zn is a nop. */ + return true; + } + + vl = vec_full_reg_size(s); + dofs = vec_full_reg_offset(s, a->rd); + sofs2 = vec_full_reg_offset(s, a->rn); + + if (imm & 8) { + sofs0 = dofs + 8; + sofs1 = sofs2; + sofs2 += 8; + } else { + sofs0 = dofs; + sofs1 = dofs + 8; + } + imm = (imm & 7) << 3; + + for (unsigned i = 0; i < vl; i += 16) { + TCGv_i64 s0 = tcg_temp_new_i64(); + TCGv_i64 s1 = tcg_temp_new_i64(); + TCGv_i64 s2 = tcg_temp_new_i64(); + + tcg_gen_ld_i64(s0, tcg_env, sofs0 + i); + tcg_gen_ld_i64(s1, tcg_env, sofs1 + i); + tcg_gen_ld_i64(s2, tcg_env, sofs2 + i); + + tcg_gen_extract2_i64(s0, s0, s1, imm); + tcg_gen_extract2_i64(s1, s1, s2, imm); + + tcg_gen_st_i64(s0, tcg_env, dofs + i); + tcg_gen_st_i64(s1, tcg_env, dofs + i + 8); + } + return true; +} + /* *** SVE Permute - Unpredicated Group */ @@ -2195,6 +2306,27 @@ static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a) return true; } +static bool trans_DUPQ(DisasContext *s, arg_DUPQ *a) +{ + unsigned vl, dofs, nofs; + + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vl = vec_full_reg_size(s); + dofs = vec_full_reg_offset(s, a->rd); + nofs = vec_reg_offset(s, a->rn, a->imm, a->esz); + + for (unsigned i = 0; i < vl; i += 16) { + tcg_gen_gvec_dup_mem(a->esz, dofs + i, nofs + i, 16, 16); + } + return true; +} + static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) { typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); @@ -2256,12 +2388,124 @@ static gen_helper_gvec_4 * const sve2_tbl_fns[4] = { TRANS_FEAT(TBL_sve2, aa64_sve2, gen_gvec_ool_zzzz, sve2_tbl_fns[a->esz], a->rd, a->rn, (a->rn + 1) % 32, a->rm, 0) +static gen_helper_gvec_3 * const tblq_fns[4] = { + gen_helper_sve2p1_tblq_b, gen_helper_sve2p1_tblq_h, + gen_helper_sve2p1_tblq_s, gen_helper_sve2p1_tblq_d +}; +TRANS_FEAT(TBLQ, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz, + tblq_fns[a->esz], a, 0) + static gen_helper_gvec_3 * const tbx_fns[4] = { gen_helper_sve2_tbx_b, gen_helper_sve2_tbx_h, gen_helper_sve2_tbx_s, gen_helper_sve2_tbx_d }; TRANS_FEAT(TBX, aa64_sve2, gen_gvec_ool_arg_zzz, tbx_fns[a->esz], a, 0) +static gen_helper_gvec_3 * const tbxq_fns[4] = { + gen_helper_sve2p1_tbxq_b, gen_helper_sve2p1_tbxq_h, + gen_helper_sve2p1_tbxq_s, gen_helper_sve2p1_tbxq_d +}; +TRANS_FEAT(TBXQ, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz, + tbxq_fns[a->esz], a, 0) + +static bool trans_PMOV_pv(DisasContext *s, arg_PMOV_pv *a) +{ + static gen_helper_gvec_2 * const fns[4] = { + NULL, gen_helper_pmov_pv_h, + gen_helper_pmov_pv_s, gen_helper_pmov_pv_d + }; + unsigned vl, pl, vofs, pofs; + TCGv_i64 tmp; + + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vl = vec_full_reg_size(s); + if (a->esz != MO_8) { + tcg_gen_gvec_2_ool(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vl, vl, a->imm, fns[a->esz]); + return true; + } + + /* + * Copy the low PL bytes from vector Zn, zero-extending to a + * multiple of 8 bytes, so that Pd is properly cleared. + */ + + pl = vl / 8; + pofs = pred_full_reg_offset(s, a->rd); + vofs = vec_full_reg_offset(s, a->rn); + + QEMU_BUILD_BUG_ON(sizeof(ARMPredicateReg) != 32); + for (unsigned i = 32; i >= 8; i >>= 1) { + if (pl & i) { + tcg_gen_gvec_mov(MO_64, pofs, vofs, i, i); + pofs += i; + vofs += i; + } + } + switch (pl & 7) { + case 0: + return true; + case 2: + tmp = tcg_temp_new_i64(); + tcg_gen_ld16u_i64(tmp, tcg_env, vofs + (HOST_BIG_ENDIAN ? 6 : 0)); + break; + case 4: + tmp = tcg_temp_new_i64(); + tcg_gen_ld32u_i64(tmp, tcg_env, vofs + (HOST_BIG_ENDIAN ? 4 : 0)); + break; + case 6: + tmp = tcg_temp_new_i64(); + tcg_gen_ld_i64(tmp, tcg_env, vofs); + tcg_gen_extract_i64(tmp, tmp, 0, 48); + break; + default: + g_assert_not_reached(); + } + tcg_gen_st_i64(tmp, tcg_env, pofs); + return true; +} + +static bool trans_PMOV_vp(DisasContext *s, arg_PMOV_pv *a) +{ + static gen_helper_gvec_2 * const fns[4] = { + NULL, gen_helper_pmov_vp_h, + gen_helper_pmov_vp_s, gen_helper_pmov_vp_d + }; + unsigned vl; + + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vl = vec_full_reg_size(s); + + if (a->esz == MO_8) { + /* + * The low PL bytes are copied from Pn to Zd unchanged. + * We know that the unused portion of Pn is zero, and + * that imm == 0, so the balance of Zd must be zeroed. + */ + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->rn), + size_for_gvec(vl / 8), vl); + } else { + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->rn), + vl, vl, a->imm, fns[a->esz]); + } + return true; +} + static bool trans_UNPK(DisasContext *s, arg_UNPK *a) { static gen_helper_gvec_2 * const fns[4][2] = { @@ -2352,6 +2596,23 @@ TRANS_FEAT(PUNPKHI, aa64_sve, do_perm_pred2, a, 1, gen_helper_sve_punpk_p) *** SVE Permute - Interleaving Group */ +static bool do_interleave_q(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rrr_esz *a, int data) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + if (vsz < 32) { + unallocated_encoding(s); + } else { + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, data, fn); + } + } + return true; +} + static gen_helper_gvec_3 * const zip_fns[4] = { gen_helper_sve_zip_b, gen_helper_sve_zip_h, gen_helper_sve_zip_s, gen_helper_sve_zip_d, @@ -2361,26 +2622,43 @@ TRANS_FEAT(ZIP1_z, aa64_sve, gen_gvec_ool_arg_zzz, TRANS_FEAT(ZIP2_z, aa64_sve, gen_gvec_ool_arg_zzz, zip_fns[a->esz], a, vec_full_reg_size(s) / 2) -TRANS_FEAT(ZIP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_zip_q, a, 0) -TRANS_FEAT(ZIP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_zip_q, a, - QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2) +TRANS_FEAT_NONSTREAMING(ZIP1_q, aa64_sve_f64mm, do_interleave_q, + gen_helper_sve2_zip_q, a, 0) +TRANS_FEAT_NONSTREAMING(ZIP2_q, aa64_sve_f64mm, do_interleave_q, + gen_helper_sve2_zip_q, a, + QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2) + +static gen_helper_gvec_3 * const zipq_fns[4] = { + gen_helper_sve2p1_zipq_b, gen_helper_sve2p1_zipq_h, + gen_helper_sve2p1_zipq_s, gen_helper_sve2p1_zipq_d, +}; +TRANS_FEAT(ZIPQ1, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz, + zipq_fns[a->esz], a, 0) +TRANS_FEAT(ZIPQ2, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz, + zipq_fns[a->esz], a, 16 / 2) static gen_helper_gvec_3 * const uzp_fns[4] = { gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, }; - TRANS_FEAT(UZP1_z, aa64_sve, gen_gvec_ool_arg_zzz, uzp_fns[a->esz], a, 0) TRANS_FEAT(UZP2_z, aa64_sve, gen_gvec_ool_arg_zzz, uzp_fns[a->esz], a, 1 << a->esz) -TRANS_FEAT(UZP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_uzp_q, a, 0) -TRANS_FEAT(UZP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_uzp_q, a, 16) +TRANS_FEAT_NONSTREAMING(UZP1_q, aa64_sve_f64mm, do_interleave_q, + gen_helper_sve2_uzp_q, a, 0) +TRANS_FEAT_NONSTREAMING(UZP2_q, aa64_sve_f64mm, do_interleave_q, + gen_helper_sve2_uzp_q, a, 16) + +static gen_helper_gvec_3 * const uzpq_fns[4] = { + gen_helper_sve2p1_uzpq_b, gen_helper_sve2p1_uzpq_h, + gen_helper_sve2p1_uzpq_s, gen_helper_sve2p1_uzpq_d, +}; +TRANS_FEAT(UZPQ1, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz, + uzpq_fns[a->esz], a, 0) +TRANS_FEAT(UZPQ2, aa64_sme2p1_or_sve2p1, gen_gvec_ool_arg_zzz, + uzpq_fns[a->esz], a, 1 << a->esz) static gen_helper_gvec_3 * const trn_fns[4] = { gen_helper_sve_trn_b, gen_helper_sve_trn_h, @@ -2392,10 +2670,10 @@ TRANS_FEAT(TRN1_z, aa64_sve, gen_gvec_ool_arg_zzz, TRANS_FEAT(TRN2_z, aa64_sve, gen_gvec_ool_arg_zzz, trn_fns[a->esz], a, 1 << a->esz) -TRANS_FEAT(TRN1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_trn_q, a, 0) -TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_trn_q, a, 16) +TRANS_FEAT_NONSTREAMING(TRN1_q, aa64_sve_f64mm, do_interleave_q, + gen_helper_sve2_trn_q, a, 0) +TRANS_FEAT_NONSTREAMING(TRN2_q, aa64_sve_f64mm, do_interleave_q, + gen_helper_sve2_trn_q, a, 16) /* *** SVE Permute Vector - Predicated Group @@ -2981,6 +3259,36 @@ static bool trans_CNTP(DisasContext *s, arg_CNTP *a) return true; } +static bool trans_CNTP_c(DisasContext *s, arg_CNTP_c *a) +{ + TCGv_i32 t_png; + uint32_t desc = 0; + + if (dc_isar_feature(aa64_sve2p1, s)) { + if (!sve_access_check(s)) { + return true; + } + } else if (dc_isar_feature(aa64_sme2, s)) { + if (!sme_sm_enabled_check(s)) { + return true; + } + } else { + return false; + } + + t_png = tcg_temp_new_i32(); + tcg_gen_ld16u_i32(t_png, tcg_env, + pred_full_reg_offset(s, a->rn) ^ + (HOST_BIG_ENDIAN ? 6 : 0)); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, a->vl); + + gen_helper_sve2p1_cntp_c(cpu_reg(s, a->rd), t_png, tcg_constant_i32(desc)); + return true; +} + static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a) { if (!dc_isar_feature(aa64_sve, s)) { @@ -3091,7 +3399,9 @@ static bool trans_CTERM(DisasContext *s, arg_CTERM *a) return true; } -static bool trans_WHILE(DisasContext *s, arg_WHILE *a) +typedef void gen_while_fn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); +static bool do_WHILE(DisasContext *s, arg_while *a, + bool lt, int scale, int data, gen_while_fn *fn) { TCGv_i64 op0, op1, t0, t1, tmax; TCGv_i32 t2; @@ -3101,14 +3411,8 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a) TCGCond cond; uint64_t maxval; /* Note that GE/HS has a->eq == 0 and GT/HI has a->eq == 1. */ - bool eq = a->eq == a->lt; + bool eq = a->eq == lt; - /* The greater-than conditions are all SVE2. */ - if (a->lt - ? !dc_isar_feature(aa64_sve, s) - : !dc_isar_feature(aa64_sve2, s)) { - return false; - } if (!sve_access_check(s)) { return true; } @@ -3132,7 +3436,7 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a) t0 = tcg_temp_new_i64(); t1 = tcg_temp_new_i64(); - if (a->lt) { + if (lt) { tcg_gen_sub_i64(t0, op1, op0); if (a->u) { maxval = a->sf ? UINT64_MAX : UINT32_MAX; @@ -3152,7 +3456,7 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a) } } - tmax = tcg_constant_i64(vsz >> a->esz); + tmax = tcg_constant_i64((vsz << scale) >> a->esz); if (eq) { /* Equality means one more iteration. */ tcg_gen_addi_i64(t0, t0, 1); @@ -3181,24 +3485,38 @@ static bool trans_WHILE(DisasContext *s, arg_WHILE *a) t2 = tcg_temp_new_i32(); tcg_gen_extrl_i64_i32(t2, t0); - /* Scale elements to bits. */ - tcg_gen_shli_i32(t2, t2, a->esz); - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, data); ptr = tcg_temp_new_ptr(); tcg_gen_addi_ptr(ptr, tcg_env, pred_full_reg_offset(s, a->rd)); - if (a->lt) { - gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); - } else { - gen_helper_sve_whileg(t2, ptr, t2, tcg_constant_i32(desc)); - } + fn(t2, ptr, t2, tcg_constant_i32(desc)); + do_pred_flags(t2); return true; } +TRANS_FEAT(WHILE_lt, aa64_sve, do_WHILE, + a, true, 0, 0, gen_helper_sve_whilel) +TRANS_FEAT(WHILE_gt, aa64_sve2, do_WHILE, + a, false, 0, 0, gen_helper_sve_whileg) + +TRANS_FEAT(WHILE_lt_pair, aa64_sme2_or_sve2p1, do_WHILE, + a, true, 1, 0, gen_helper_sve_while2l) +TRANS_FEAT(WHILE_gt_pair, aa64_sme2_or_sve2p1, do_WHILE, + a, false, 1, 0, gen_helper_sve_while2g) + +TRANS_FEAT(WHILE_lt_cnt2, aa64_sme2_or_sve2p1, do_WHILE, + a, true, 1, 1, gen_helper_sve_whilecl) +TRANS_FEAT(WHILE_lt_cnt4, aa64_sme2_or_sve2p1, do_WHILE, + a, true, 2, 2, gen_helper_sve_whilecl) +TRANS_FEAT(WHILE_gt_cnt2, aa64_sme2_or_sve2p1, do_WHILE, + a, false, 1, 1, gen_helper_sve_whilecg) +TRANS_FEAT(WHILE_gt_cnt4, aa64_sme2_or_sve2p1, do_WHILE, + a, false, 2, 2, gen_helper_sve_whilecg) + static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) { TCGv_i64 op0, op1, diff, t1, tmax; @@ -3217,7 +3535,7 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) op0 = read_cpu_reg(s, a->rn, 1); op1 = read_cpu_reg(s, a->rm, 1); - tmax = tcg_constant_i64(vsz); + tmax = tcg_constant_i64(vsz >> a->esz); diff = tcg_temp_new_i64(); if (a->rw) { @@ -3227,15 +3545,15 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) tcg_gen_sub_i64(diff, op0, op1); tcg_gen_sub_i64(t1, op1, op0); tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, diff, t1); - /* Round down to a multiple of ESIZE. */ - tcg_gen_andi_i64(diff, diff, -1 << a->esz); + /* Divide, rounding down, by ESIZE. */ + tcg_gen_shri_i64(diff, diff, a->esz); /* If op1 == op0, diff == 0, and the condition is always true. */ tcg_gen_movcond_i64(TCG_COND_EQ, diff, op0, op1, tmax, diff); } else { /* WHILEWR */ tcg_gen_sub_i64(diff, op1, op0); - /* Round down to a multiple of ESIZE. */ - tcg_gen_andi_i64(diff, diff, -1 << a->esz); + /* Divide, rounding down, by ESIZE. */ + tcg_gen_shri_i64(diff, diff, a->esz); /* If op0 >= op1, diff <= 0, the condition is always true. */ tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, tmax, diff); } @@ -3258,6 +3576,42 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) return true; } +static bool do_pext(DisasContext *s, arg_pext *a, int n) +{ + TCGv_i32 t_png; + TCGv_ptr t_pd; + int pl; + + if (!sve_access_check(s)) { + return true; + } + + t_png = tcg_temp_new_i32(); + tcg_gen_ld16u_i32(t_png, tcg_env, + pred_full_reg_offset(s, a->rn) ^ + (HOST_BIG_ENDIAN ? 6 : 0)); + + t_pd = tcg_temp_new_ptr(); + pl = pred_full_reg_size(s); + + for (int i = 0; i < n; ++i) { + int rd = (a->rd + i) % 16; + int part = a->imm * n + i; + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pl); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, part); + + tcg_gen_addi_ptr(t_pd, tcg_env, pred_full_reg_offset(s, rd)); + gen_helper_pext(t_pd, t_png, tcg_constant_i32(desc)); + } + return true; +} + +TRANS_FEAT(PEXT_1, aa64_sme2_or_sve2p1, do_pext, a, 1) +TRANS_FEAT(PEXT_2, aa64_sme2_or_sve2p1, do_pext, a, 2) + /* *** SVE Integer Wide Immediate - Unpredicated Group */ @@ -3385,8 +3739,8 @@ DO_ZZI(UMIN, umin) #undef DO_ZZI static gen_helper_gvec_4 * const dot_fns[2][2] = { - { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h }, - { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h } + { gen_helper_gvec_sdot_4b, gen_helper_gvec_sdot_4h }, + { gen_helper_gvec_udot_4b, gen_helper_gvec_udot_4h } }; TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz, dot_fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0) @@ -3395,19 +3749,24 @@ TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz, * SVE Multiply - Indexed */ -TRANS_FEAT(SDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_sdot_idx_b, a) -TRANS_FEAT(SDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_sdot_idx_h, a) -TRANS_FEAT(UDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_udot_idx_b, a) -TRANS_FEAT(UDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_udot_idx_h, a) - -TRANS_FEAT(SUDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_sudot_idx_b, a) -TRANS_FEAT(USDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_usdot_idx_b, a) +TRANS_FEAT(SDOT_zzxw_4s, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_4b, a) +TRANS_FEAT(SDOT_zzxw_4d, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_4h, a) +TRANS_FEAT(UDOT_zzxw_4s, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_udot_idx_4b, a) +TRANS_FEAT(UDOT_zzxw_4d, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_udot_idx_4h, a) + +TRANS_FEAT(SUDOT_zzxw_4s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sudot_idx_4b, a) +TRANS_FEAT(USDOT_zzxw_4s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_usdot_idx_4b, a) + +TRANS_FEAT(SDOT_zzxw_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_2h, a) +TRANS_FEAT(UDOT_zzxw_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_udot_idx_2h, a) #define DO_SVE2_RRX(NAME, FUNC) \ TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ @@ -3524,31 +3883,38 @@ DO_SVE2_RRXR_ROT(CDOT_zzxw_d, gen_helper_sve2_cdot_idx_d) *** SVE Floating Point Multiply-Add Indexed Group */ +static bool do_fmla_zzxz(DisasContext *s, arg_rrxr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + /* These insns use MO_8 to encode BFloat16 */ + if (a->esz == MO_8 && !dc_isar_feature(aa64_sve_b16b16, s)) { + return false; + } + return gen_gvec_fpst_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, a->index, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); +} + static gen_helper_gvec_4_ptr * const fmla_idx_fns[4] = { - NULL, gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_bfmla_idx, gen_helper_gvec_fmla_idx_h, gen_helper_gvec_fmla_idx_s, gen_helper_gvec_fmla_idx_d }; -TRANS_FEAT(FMLA_zzxz, aa64_sve, gen_gvec_fpst_zzzz, - fmla_idx_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->index, - a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) +TRANS_FEAT(FMLA_zzxz, aa64_sve, do_fmla_zzxz, a, fmla_idx_fns[a->esz]) static gen_helper_gvec_4_ptr * const fmls_idx_fns[4][2] = { - { NULL, NULL }, + { gen_helper_gvec_bfmls_idx, gen_helper_gvec_ah_bfmls_idx }, { gen_helper_gvec_fmls_idx_h, gen_helper_gvec_ah_fmls_idx_h }, { gen_helper_gvec_fmls_idx_s, gen_helper_gvec_ah_fmls_idx_s }, { gen_helper_gvec_fmls_idx_d, gen_helper_gvec_ah_fmls_idx_d }, }; -TRANS_FEAT(FMLS_zzxz, aa64_sve, gen_gvec_fpst_zzzz, - fmls_idx_fns[a->esz][s->fpcr_ah], - a->rd, a->rn, a->rm, a->ra, a->index, - a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) +TRANS_FEAT(FMLS_zzxz, aa64_sve, do_fmla_zzxz, a, + fmls_idx_fns[a->esz][s->fpcr_ah]) /* *** SVE Floating Point Multiply Indexed Group */ static gen_helper_gvec_3_ptr * const fmul_idx_fns[4] = { - NULL, gen_helper_gvec_fmul_idx_h, + gen_helper_gvec_fmul_idx_b16, gen_helper_gvec_fmul_idx_h, gen_helper_gvec_fmul_idx_s, gen_helper_gvec_fmul_idx_d, }; TRANS_FEAT(FMUL_zzx, aa64_sve, gen_gvec_fpst_zzz, @@ -3621,6 +3987,54 @@ DO_VPZ_AH(FMAXV, fmaxv) #undef DO_VPZ +static gen_helper_gvec_3_ptr * const faddqv_fns[4] = { + NULL, gen_helper_sve2p1_faddqv_h, + gen_helper_sve2p1_faddqv_s, gen_helper_sve2p1_faddqv_d, +}; +TRANS_FEAT(FADDQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz, + faddqv_fns[a->esz], a, 0, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + +static gen_helper_gvec_3_ptr * const fmaxnmqv_fns[4] = { + NULL, gen_helper_sve2p1_fmaxnmqv_h, + gen_helper_sve2p1_fmaxnmqv_s, gen_helper_sve2p1_fmaxnmqv_d, +}; +TRANS_FEAT(FMAXNMQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz, + fmaxnmqv_fns[a->esz], a, 0, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + +static gen_helper_gvec_3_ptr * const fminnmqv_fns[4] = { + NULL, gen_helper_sve2p1_fminnmqv_h, + gen_helper_sve2p1_fminnmqv_s, gen_helper_sve2p1_fminnmqv_d, +}; +TRANS_FEAT(FMINNMQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz, + fminnmqv_fns[a->esz], a, 0, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + +static gen_helper_gvec_3_ptr * const fmaxqv_fns[4] = { + NULL, gen_helper_sve2p1_fmaxqv_h, + gen_helper_sve2p1_fmaxqv_s, gen_helper_sve2p1_fmaxqv_d, +}; +static gen_helper_gvec_3_ptr * const fmaxqv_ah_fns[4] = { + NULL, gen_helper_sve2p1_ah_fmaxqv_h, + gen_helper_sve2p1_ah_fmaxqv_s, gen_helper_sve2p1_ah_fmaxqv_d, +}; +TRANS_FEAT(FMAXQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz, + (s->fpcr_ah ? fmaxqv_ah_fns : fmaxqv_fns)[a->esz], a, 0, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + +static gen_helper_gvec_3_ptr * const fminqv_fns[4] = { + NULL, gen_helper_sve2p1_fminqv_h, + gen_helper_sve2p1_fminqv_s, gen_helper_sve2p1_fminqv_d, +}; +static gen_helper_gvec_3_ptr * const fminqv_ah_fns[4] = { + NULL, gen_helper_sve2p1_ah_fminqv_h, + gen_helper_sve2p1_ah_fminqv_s, gen_helper_sve2p1_ah_fminqv_d, +}; +TRANS_FEAT(FMINQV, aa64_sme2p1_or_sve2p1, gen_gvec_fpst_arg_zpz, + (s->fpcr_ah ? fminqv_ah_fns : fminqv_fns)[a->esz], a, 0, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + /* *** SVE Floating Point Unary Operations - Unpredicated Group */ @@ -3747,7 +4161,7 @@ static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) #define DO_FP3(NAME, name) \ static gen_helper_gvec_3_ptr * const name##_fns[4] = { \ - NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_b16, gen_helper_gvec_##name##_h, \ gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ }; \ TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_arg_zzz, name##_fns[a->esz], a, 0) @@ -3803,13 +4217,34 @@ TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz, s->fpcr_ah ? name##_ah_zpzz_fns[a->esz] : \ name##_zpzz_fns[a->esz], a) -DO_ZPZZ_FP(FADD_zpzz, aa64_sve, sve_fadd) -DO_ZPZZ_FP(FSUB_zpzz, aa64_sve, sve_fsub) -DO_ZPZZ_FP(FMUL_zpzz, aa64_sve, sve_fmul) -DO_ZPZZ_AH_FP(FMIN_zpzz, aa64_sve, sve_fmin, sve_ah_fmin) -DO_ZPZZ_AH_FP(FMAX_zpzz, aa64_sve, sve_fmax, sve_ah_fmax) -DO_ZPZZ_FP(FMINNM_zpzz, aa64_sve, sve_fminnum) -DO_ZPZZ_FP(FMAXNM_zpzz, aa64_sve, sve_fmaxnum) +/* Similar, but for insns where sz == 0 encodes bfloat16 */ +#define DO_ZPZZ_FP_B16(NAME, FEAT, name) \ + static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \ + gen_helper_##name##_b16, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, name##_zpzz_fns[a->esz], a) + +#define DO_ZPZZ_AH_FP_B16(NAME, FEAT, name, ah_name) \ + static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \ + gen_helper_##name##_b16, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d \ + }; \ + static gen_helper_gvec_4_ptr * const name##_ah_zpzz_fns[4] = { \ + gen_helper_##ah_name##_b16, gen_helper_##ah_name##_h, \ + gen_helper_##ah_name##_s, gen_helper_##ah_name##_d \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, \ + s->fpcr_ah ? name##_ah_zpzz_fns[a->esz] : \ + name##_zpzz_fns[a->esz], a) + +DO_ZPZZ_FP_B16(FADD_zpzz, aa64_sve, sve_fadd) +DO_ZPZZ_FP_B16(FSUB_zpzz, aa64_sve, sve_fsub) +DO_ZPZZ_FP_B16(FMUL_zpzz, aa64_sve, sve_fmul) +DO_ZPZZ_AH_FP_B16(FMIN_zpzz, aa64_sve, sve_fmin, sve_ah_fmin) +DO_ZPZZ_AH_FP_B16(FMAX_zpzz, aa64_sve, sve_fmax, sve_ah_fmax) +DO_ZPZZ_FP_B16(FMINNM_zpzz, aa64_sve, sve_fminnum) +DO_ZPZZ_FP_B16(FMAXNM_zpzz, aa64_sve, sve_fmaxnum) DO_ZPZZ_AH_FP(FABD, aa64_sve, sve_fabd, sve_ah_fabd) DO_ZPZZ_FP(FSCALE, aa64_sve, sve_fscalbn) DO_ZPZZ_FP(FDIV, aa64_sve, sve_fdiv) @@ -3940,19 +4375,28 @@ TRANS_FEAT(FCADD, aa64_sve, gen_gvec_fpst_zzzp, fcadd_fns[a->esz], a->rd, a->rn, a->rm, a->pg, a->rot | (s->fpcr_ah << 1), a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) +static bool do_fmla_zpzzz(DisasContext *s, arg_rprrr_esz *a, + gen_helper_gvec_5_ptr *fn) +{ + /* These insns use MO_8 to encode BFloat16 */ + if (a->esz == MO_8 && !dc_isar_feature(aa64_sve_b16b16, s)) { + return false; + } + return gen_gvec_fpst_zzzzp(s, fn, a->rd, a->rn, a->rm, a->ra, a->pg, 0, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); +} + #define DO_FMLA(NAME, name, ah_name) \ static gen_helper_gvec_5_ptr * const name##_fns[4] = { \ - NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_b16, gen_helper_sve_##name##_h, \ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ }; \ static gen_helper_gvec_5_ptr * const name##_ah_fns[4] = { \ - NULL, gen_helper_sve_##ah_name##_h, \ + gen_helper_sve_##ah_name##_b16, gen_helper_sve_##ah_name##_h, \ gen_helper_sve_##ah_name##_s, gen_helper_sve_##ah_name##_d \ }; \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, \ - s->fpcr_ah ? name##_ah_fns[a->esz] : name##_fns[a->esz], \ - a->rd, a->rn, a->rm, a->ra, a->pg, 0, \ - a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + TRANS_FEAT(NAME, aa64_sve, do_fmla_zpzzz, a, \ + s->fpcr_ah ? name##_ah_fns[a->esz] : name##_fns[a->esz]) /* We don't need an ah_fmla_zpzzz because fmla doesn't negate anything */ DO_FMLA(FMLA_zpzzz, fmla_zpzzz, fmla_zpzzz) @@ -4143,7 +4587,7 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, */ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, - int len, int rn, int imm) + int len, int rn, int imm, MemOp align) { int len_align = QEMU_ALIGN_DOWN(len, 16); int len_remain = len % 16; @@ -4172,12 +4616,15 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, for (i = 0; i < len_align; i += 16) { tcg_gen_qemu_ld_i128(t16, clean_addr, midx, - MO_LE | MO_128 | MO_ATOM_NONE); + MO_LE | MO_128 | MO_ATOM_NONE | align); tcg_gen_extr_i128_i64(t0, t1, t16); tcg_gen_st_i64(t0, base, vofs + i); tcg_gen_st_i64(t1, base, vofs + i + 8); tcg_gen_addi_i64(clean_addr, clean_addr, 16); } + if (len_align) { + align = MO_UNALN; + } } else { TCGLabel *loop = gen_new_label(); TCGv_ptr tp, i = tcg_temp_new_ptr(); @@ -4187,7 +4634,7 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, t16 = tcg_temp_new_i128(); tcg_gen_qemu_ld_i128(t16, clean_addr, midx, - MO_LE | MO_128 | MO_ATOM_NONE); + MO_LE | MO_128 | MO_ATOM_NONE | align); tcg_gen_addi_i64(clean_addr, clean_addr, 16); tp = tcg_temp_new_ptr(); @@ -4202,6 +4649,7 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, tcg_gen_st_i64(t1, tp, vofs + 8); tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + align = MO_UNALN; } /* @@ -4210,7 +4658,9 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, */ if (len_remain >= 8) { t0 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, + MO_LEUQ | MO_ATOM_NONE | align); + align = MO_UNALN; tcg_gen_st_i64(t0, base, vofs + len_align); len_remain -= 8; len_align += 8; @@ -4225,12 +4675,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, case 4: case 8: tcg_gen_qemu_ld_i64(t0, clean_addr, midx, - MO_LE | ctz32(len_remain) | MO_ATOM_NONE); + MO_LE | ctz32(len_remain) + | MO_ATOM_NONE | align); break; case 6: t1 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, + MO_LEUL | MO_ATOM_NONE | align); tcg_gen_addi_i64(clean_addr, clean_addr, 4); tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | MO_ATOM_NONE); tcg_gen_deposit_i64(t0, t0, t1, 32, 32); @@ -4245,7 +4697,7 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, /* Similarly for stores. */ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, - int len, int rn, int imm) + int len, int rn, int imm, MemOp align) { int len_align = QEMU_ALIGN_DOWN(len, 16); int len_remain = len % 16; @@ -4277,9 +4729,12 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, tcg_gen_ld_i64(t1, base, vofs + i + 8); tcg_gen_concat_i64_i128(t16, t0, t1); tcg_gen_qemu_st_i128(t16, clean_addr, midx, - MO_LE | MO_128 | MO_ATOM_NONE); + MO_LE | MO_128 | MO_ATOM_NONE | align); tcg_gen_addi_i64(clean_addr, clean_addr, 16); } + if (len_align) { + align = MO_UNALN; + } } else { TCGLabel *loop = gen_new_label(); TCGv_ptr tp, i = tcg_temp_new_ptr(); @@ -4303,13 +4758,16 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, tcg_gen_addi_i64(clean_addr, clean_addr, 16); tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + align = MO_UNALN; } /* Predicate register stores can be any multiple of 2. */ if (len_remain >= 8) { t0 = tcg_temp_new_i64(); tcg_gen_ld_i64(t0, base, vofs + len_align); - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, + MO_LEUQ | MO_ATOM_NONE | align); + align = MO_UNALN; len_remain -= 8; len_align += 8; if (len_remain) { @@ -4325,11 +4783,13 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, case 4: case 8: tcg_gen_qemu_st_i64(t0, clean_addr, midx, - MO_LE | ctz32(len_remain) | MO_ATOM_NONE); + MO_LE | ctz32(len_remain) + | MO_ATOM_NONE | align); break; case 6: - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, + MO_LEUL | MO_ATOM_NONE | align); tcg_gen_addi_i64(clean_addr, clean_addr, 4); tcg_gen_shri_i64(t0, t0, 32); tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | MO_ATOM_NONE); @@ -4349,7 +4809,8 @@ static bool trans_LDR_zri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = vec_full_reg_size(s); int off = vec_full_reg_offset(s, a->rd); - gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size); + gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size, + s->align_mem ? MO_ALIGN_16 : MO_UNALN); } return true; } @@ -4362,7 +4823,8 @@ static bool trans_LDR_pri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = pred_full_reg_size(s); int off = pred_full_reg_offset(s, a->rd); - gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size); + gen_sve_ldr(s, tcg_env, off, size, a->rn, a->imm * size, + s->align_mem ? MO_ALIGN_2 : MO_UNALN); } return true; } @@ -4375,7 +4837,8 @@ static bool trans_STR_zri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = vec_full_reg_size(s); int off = vec_full_reg_offset(s, a->rd); - gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size); + gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size, + s->align_mem ? MO_ALIGN_16 : MO_UNALN); } return true; } @@ -4388,7 +4851,8 @@ static bool trans_STR_pri(DisasContext *s, arg_rri *a) if (sve_access_check(s)) { int size = pred_full_reg_size(s); int off = pred_full_reg_offset(s, a->rd); - gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size); + gen_sve_str(s, tcg_env, off, size, a->rn, a->imm * size, + s->align_mem ? MO_ALIGN_2 : MO_UNALN); } return true; } @@ -4398,34 +4862,37 @@ static bool trans_STR_pri(DisasContext *s, arg_rri *a) */ /* The memory mode of the dtype. */ -static const MemOp dtype_mop[16] = { +static const MemOp dtype_mop[19] = { MO_UB, MO_UB, MO_UB, MO_UB, MO_SL, MO_UW, MO_UW, MO_UW, MO_SW, MO_SW, MO_UL, MO_UL, - MO_SB, MO_SB, MO_SB, MO_UQ + MO_SB, MO_SB, MO_SB, MO_UQ, + /* Artificial values used by decode */ + MO_UL, MO_UQ, MO_128, }; #define dtype_msz(x) (dtype_mop[x] & MO_SIZE) /* The vector element size of dtype. */ -static const uint8_t dtype_esz[16] = { +static const uint8_t dtype_esz[19] = { 0, 1, 2, 3, 3, 1, 2, 3, 3, 2, 2, 3, - 3, 2, 1, 3 + 3, 2, 1, 3, + /* Artificial values used by decode */ + 4, 4, 4, }; -uint32_t make_svemte_desc(DisasContext *s, unsigned vsz, uint32_t nregs, +uint64_t make_svemte_desc(DisasContext *s, unsigned vsz, uint32_t nregs, uint32_t msz, bool is_write, uint32_t data) { uint32_t sizem1; - uint32_t desc = 0; + uint64_t desc = 0; /* Assert all of the data fits, with or without MTE enabled. */ assert(nregs >= 1 && nregs <= 4); sizem1 = (nregs << msz) - 1; assert(sizem1 <= R_MTEDESC_SIZEM1_MASK >> R_MTEDESC_SIZEM1_SHIFT); - assert(data < 1u << SVE_MTEDESC_SHIFT); if (s->mte_active[0]) { desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); @@ -4433,9 +4900,9 @@ uint32_t make_svemte_desc(DisasContext *s, unsigned vsz, uint32_t nregs, desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); desc = FIELD_DP32(desc, MTEDESC, SIZEM1, sizem1); - desc <<= SVE_MTEDESC_SHIFT; + desc <<= 32; } - return simd_desc(vsz, vsz, desc | data); + return simd_desc(vsz, vsz, data) | desc; } static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, @@ -4443,7 +4910,7 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, gen_helper_gvec_mem *fn) { TCGv_ptr t_pg; - uint32_t desc; + uint64_t desc; if (!s->mte_active[0]) { addr = clean_data_tbi(s, addr); @@ -4459,11 +4926,11 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, t_pg = tcg_temp_new_ptr(); tcg_gen_addi_ptr(t_pg, tcg_env, pred_full_reg_offset(s, pg)); - fn(tcg_env, t_pg, addr, tcg_constant_i32(desc)); + fn(tcg_env, t_pg, addr, tcg_constant_i64(desc)); } /* Indexed by [mte][be][dtype][nreg] */ -static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { +static gen_helper_gvec_mem * const ldr_fns[2][2][19][4] = { { /* mte inactive, little-endian */ { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, @@ -4487,7 +4954,13 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r, - gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } }, + gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r }, + + { gen_helper_sve_ld1squ_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dqu_le_r, NULL, NULL, NULL }, + { NULL, gen_helper_sve_ld2qq_le_r, + gen_helper_sve_ld3qq_le_r, gen_helper_sve_ld4qq_le_r }, + }, /* mte inactive, big-endian */ { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, @@ -4512,7 +4985,14 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r, - gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } }, + gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r }, + + { gen_helper_sve_ld1squ_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dqu_be_r, NULL, NULL, NULL }, + { NULL, gen_helper_sve_ld2qq_be_r, + gen_helper_sve_ld3qq_be_r, gen_helper_sve_ld4qq_be_r }, + }, + }, { /* mte active, little-endian */ { { gen_helper_sve_ld1bb_r_mte, @@ -4545,7 +5025,15 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { { gen_helper_sve_ld1dd_le_r_mte, gen_helper_sve_ld2dd_le_r_mte, gen_helper_sve_ld3dd_le_r_mte, - gen_helper_sve_ld4dd_le_r_mte } }, + gen_helper_sve_ld4dd_le_r_mte }, + + { gen_helper_sve_ld1squ_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dqu_le_r_mte, NULL, NULL, NULL }, + { NULL, + gen_helper_sve_ld2qq_le_r_mte, + gen_helper_sve_ld3qq_le_r_mte, + gen_helper_sve_ld4qq_le_r_mte }, + }, /* mte active, big-endian */ { { gen_helper_sve_ld1bb_r_mte, @@ -4578,7 +5066,16 @@ static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { { gen_helper_sve_ld1dd_be_r_mte, gen_helper_sve_ld2dd_be_r_mte, gen_helper_sve_ld3dd_be_r_mte, - gen_helper_sve_ld4dd_be_r_mte } } }, + gen_helper_sve_ld4dd_be_r_mte }, + + { gen_helper_sve_ld1squ_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dqu_be_r_mte, NULL, NULL, NULL }, + { NULL, + gen_helper_sve_ld2qq_be_r_mte, + gen_helper_sve_ld3qq_be_r_mte, + gen_helper_sve_ld4qq_be_r_mte }, + }, + }, }; static void do_ld_zpa(DisasContext *s, int zt, int pg, @@ -4597,9 +5094,32 @@ static void do_ld_zpa(DisasContext *s, int zt, int pg, static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) { - if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + if (a->rm == 31) { return false; } + + /* dtypes 16-18 are artificial, representing 128-bit element */ + switch (a->dtype) { + case 0 ... 15: + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + break; + case 16: case 17: + if (!dc_isar_feature(aa64_sve2p1, s)) { + return false; + } + s->is_nonstreaming = true; + break; + case 18: + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + break; + default: + g_assert_not_reached(); + } + if (sve_access_check(s)) { TCGv_i64 addr = tcg_temp_new_i64(); tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); @@ -4611,9 +5131,28 @@ static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a) { - if (!dc_isar_feature(aa64_sve, s)) { - return false; + /* dtypes 16-18 are artificial, representing 128-bit element */ + switch (a->dtype) { + case 0 ... 15: + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + break; + case 16: case 17: + if (!dc_isar_feature(aa64_sve2p1, s)) { + return false; + } + s->is_nonstreaming = true; + break; + case 18: + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + break; + default: + g_assert_not_reached(); } + if (sve_access_check(s)) { int vsz = vec_full_reg_size(s); int elements = vsz >> dtype_esz[a->dtype]; @@ -4839,7 +5378,7 @@ static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) unsigned vsz = vec_full_reg_size(s); TCGv_ptr t_pg; int poff; - uint32_t desc; + uint64_t desc; /* Load the first quadword using the normal predicated load helpers. */ if (!s->mte_active[0]) { @@ -4870,7 +5409,7 @@ static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) gen_helper_gvec_mem *fn = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; desc = make_svemte_desc(s, 16, 1, dtype_msz(dtype), false, zt); - fn(tcg_env, t_pg, addr, tcg_constant_i32(desc)); + fn(tcg_env, t_pg, addr, tcg_constant_i64(desc)); /* Replicate that first quadword. */ if (vsz > 16) { @@ -4913,7 +5452,7 @@ static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) unsigned vsz_r32; TCGv_ptr t_pg; int poff, doff; - uint32_t desc; + uint64_t desc; if (vsz < 32) { /* @@ -4954,7 +5493,7 @@ static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) gen_helper_gvec_mem *fn = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; desc = make_svemte_desc(s, 32, 1, dtype_msz(dtype), false, zt); - fn(tcg_env, t_pg, addr, tcg_constant_i32(desc)); + fn(tcg_env, t_pg, addr, tcg_constant_i64(desc)); /* * Replicate that first octaword. @@ -5060,7 +5599,7 @@ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, int msz, int esz, int nreg) { - static gen_helper_gvec_mem * const fn_single[2][2][4][4] = { + static gen_helper_gvec_mem * const fn_single[2][2][4][5] = { { { { gen_helper_sve_st1bb_r, gen_helper_sve_st1bh_r, gen_helper_sve_st1bs_r, @@ -5071,9 +5610,11 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, gen_helper_sve_st1hd_le_r }, { NULL, NULL, gen_helper_sve_st1ss_le_r, - gen_helper_sve_st1sd_le_r }, + gen_helper_sve_st1sd_le_r, + gen_helper_sve_st1sq_le_r, }, { NULL, NULL, NULL, - gen_helper_sve_st1dd_le_r } }, + gen_helper_sve_st1dd_le_r, + gen_helper_sve_st1dq_le_r, } }, { { gen_helper_sve_st1bb_r, gen_helper_sve_st1bh_r, gen_helper_sve_st1bs_r, @@ -5084,9 +5625,11 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, gen_helper_sve_st1hd_be_r }, { NULL, NULL, gen_helper_sve_st1ss_be_r, - gen_helper_sve_st1sd_be_r }, + gen_helper_sve_st1sd_be_r, + gen_helper_sve_st1sq_be_r }, { NULL, NULL, NULL, - gen_helper_sve_st1dd_be_r } } }, + gen_helper_sve_st1dd_be_r, + gen_helper_sve_st1dq_be_r } } }, { { { gen_helper_sve_st1bb_r_mte, gen_helper_sve_st1bh_r_mte, @@ -5098,9 +5641,11 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, gen_helper_sve_st1hd_le_r_mte }, { NULL, NULL, gen_helper_sve_st1ss_le_r_mte, - gen_helper_sve_st1sd_le_r_mte }, + gen_helper_sve_st1sd_le_r_mte, + gen_helper_sve_st1sq_le_r_mte }, { NULL, NULL, NULL, - gen_helper_sve_st1dd_le_r_mte } }, + gen_helper_sve_st1dd_le_r_mte, + gen_helper_sve_st1dq_le_r_mte } }, { { gen_helper_sve_st1bb_r_mte, gen_helper_sve_st1bh_r_mte, gen_helper_sve_st1bs_r_mte, @@ -5111,59 +5656,73 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, gen_helper_sve_st1hd_be_r_mte }, { NULL, NULL, gen_helper_sve_st1ss_be_r_mte, - gen_helper_sve_st1sd_be_r_mte }, + gen_helper_sve_st1sd_be_r_mte, + gen_helper_sve_st1sq_be_r_mte }, { NULL, NULL, NULL, - gen_helper_sve_st1dd_be_r_mte } } }, + gen_helper_sve_st1dd_be_r_mte, + gen_helper_sve_st1dq_be_r_mte } } }, }; - static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = { + static gen_helper_gvec_mem * const fn_multiple[2][2][3][5] = { { { { gen_helper_sve_st2bb_r, gen_helper_sve_st2hh_le_r, gen_helper_sve_st2ss_le_r, - gen_helper_sve_st2dd_le_r }, + gen_helper_sve_st2dd_le_r, + gen_helper_sve_st2qq_le_r }, { gen_helper_sve_st3bb_r, gen_helper_sve_st3hh_le_r, gen_helper_sve_st3ss_le_r, - gen_helper_sve_st3dd_le_r }, + gen_helper_sve_st3dd_le_r, + gen_helper_sve_st3qq_le_r }, { gen_helper_sve_st4bb_r, gen_helper_sve_st4hh_le_r, gen_helper_sve_st4ss_le_r, - gen_helper_sve_st4dd_le_r } }, + gen_helper_sve_st4dd_le_r, + gen_helper_sve_st4qq_le_r } }, { { gen_helper_sve_st2bb_r, gen_helper_sve_st2hh_be_r, gen_helper_sve_st2ss_be_r, - gen_helper_sve_st2dd_be_r }, + gen_helper_sve_st2dd_be_r, + gen_helper_sve_st2qq_be_r }, { gen_helper_sve_st3bb_r, gen_helper_sve_st3hh_be_r, gen_helper_sve_st3ss_be_r, - gen_helper_sve_st3dd_be_r }, + gen_helper_sve_st3dd_be_r, + gen_helper_sve_st3qq_be_r }, { gen_helper_sve_st4bb_r, gen_helper_sve_st4hh_be_r, gen_helper_sve_st4ss_be_r, - gen_helper_sve_st4dd_be_r } } }, + gen_helper_sve_st4dd_be_r, + gen_helper_sve_st4qq_be_r } } }, { { { gen_helper_sve_st2bb_r_mte, gen_helper_sve_st2hh_le_r_mte, gen_helper_sve_st2ss_le_r_mte, - gen_helper_sve_st2dd_le_r_mte }, + gen_helper_sve_st2dd_le_r_mte, + gen_helper_sve_st2qq_le_r_mte }, { gen_helper_sve_st3bb_r_mte, gen_helper_sve_st3hh_le_r_mte, gen_helper_sve_st3ss_le_r_mte, - gen_helper_sve_st3dd_le_r_mte }, + gen_helper_sve_st3dd_le_r_mte, + gen_helper_sve_st3qq_le_r_mte }, { gen_helper_sve_st4bb_r_mte, gen_helper_sve_st4hh_le_r_mte, gen_helper_sve_st4ss_le_r_mte, - gen_helper_sve_st4dd_le_r_mte } }, + gen_helper_sve_st4dd_le_r_mte, + gen_helper_sve_st4qq_le_r_mte } }, { { gen_helper_sve_st2bb_r_mte, gen_helper_sve_st2hh_be_r_mte, gen_helper_sve_st2ss_be_r_mte, - gen_helper_sve_st2dd_be_r_mte }, + gen_helper_sve_st2dd_be_r_mte, + gen_helper_sve_st2qq_be_r_mte }, { gen_helper_sve_st3bb_r_mte, gen_helper_sve_st3hh_be_r_mte, gen_helper_sve_st3ss_be_r_mte, - gen_helper_sve_st3dd_be_r_mte }, + gen_helper_sve_st3dd_be_r_mte, + gen_helper_sve_st3qq_be_r_mte }, { gen_helper_sve_st4bb_r_mte, gen_helper_sve_st4hh_be_r_mte, gen_helper_sve_st4ss_be_r_mte, - gen_helper_sve_st4dd_be_r_mte } } }, + gen_helper_sve_st4dd_be_r_mte, + gen_helper_sve_st4qq_be_r_mte } } }, }; gen_helper_gvec_mem *fn; int be = s->be_data == MO_BE; @@ -5182,12 +5741,32 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) { - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } if (a->rm == 31 || a->msz > a->esz) { return false; } + switch (a->esz) { + case MO_8 ... MO_64: + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + break; + case MO_128: + if (a->nreg == 0) { + assert(a->msz < a->esz); + if (!dc_isar_feature(aa64_sve2p1, s)) { + return false; + } + s->is_nonstreaming = true; + } else { + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + } + break; + default: + g_assert_not_reached(); + } + if (sve_access_check(s)) { TCGv_i64 addr = tcg_temp_new_i64(); tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->msz); @@ -5199,12 +5778,32 @@ static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a) { - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } if (a->msz > a->esz) { return false; } + switch (a->esz) { + case MO_8 ... MO_64: + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + break; + case MO_128: + if (a->nreg == 0) { + assert(a->msz < a->esz); + if (!dc_isar_feature(aa64_sve2p1, s)) { + return false; + } + s->is_nonstreaming = true; + } else { + if (!dc_isar_feature(aa64_sme2p1_or_sve2p1, s)) { + return false; + } + } + break; + default: + g_assert_not_reached(); + } + if (sve_access_check(s)) { int vsz = vec_full_reg_size(s); int elements = vsz >> a->esz; @@ -5228,14 +5827,14 @@ static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, TCGv_ptr t_zm = tcg_temp_new_ptr(); TCGv_ptr t_pg = tcg_temp_new_ptr(); TCGv_ptr t_zt = tcg_temp_new_ptr(); - uint32_t desc; + uint64_t desc; tcg_gen_addi_ptr(t_pg, tcg_env, pred_full_reg_offset(s, pg)); tcg_gen_addi_ptr(t_zm, tcg_env, vec_full_reg_offset(s, zm)); tcg_gen_addi_ptr(t_zt, tcg_env, vec_full_reg_offset(s, zt)); desc = make_svemte_desc(s, vec_full_reg_size(s), 1, msz, is_write, scale); - fn(tcg_env, t_zt, t_pg, t_zm, scalar, tcg_constant_i32(desc)); + fn(tcg_env, t_zt, t_pg, t_zm, scalar, tcg_constant_i64(desc)); } /* Indexed by [mte][be][ff][xs][u][msz]. */ @@ -5566,6 +6165,14 @@ gather_load_fn64[2][2][2][3][2][4] = { gen_helper_sve_ldffdd_be_zd_mte, } } } } }, }; +static gen_helper_gvec_mem_scatter * const +gather_load_fn128[2][2] = { + { gen_helper_sve_ldqq_le_zd, + gen_helper_sve_ldqq_be_zd }, + { gen_helper_sve_ldqq_le_zd_mte, + gen_helper_sve_ldqq_be_zd_mte } +}; + static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) { gen_helper_gvec_mem_scatter *fn = NULL; @@ -5587,6 +6194,8 @@ static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) case MO_64: fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz]; break; + default: + g_assert_not_reached(); } assert(fn != NULL); @@ -5595,6 +6204,32 @@ static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) return true; } +static bool trans_LD1Q(DisasContext *s, arg_LD1Q *a) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (!dc_isar_feature(aa64_sve2p1, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + fn = gather_load_fn128[mte][be]; + assert(fn != NULL); + + /* + * Unlike LD1_zprz, a->rm is the scalar register and it can be XZR, not XSP. + * a->rn is the vector register. + */ + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + cpu_reg(s, a->rm), MO_128, false, fn); + return true; +} + static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) { gen_helper_gvec_mem_scatter *fn = NULL; @@ -5754,6 +6389,14 @@ static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = { gen_helper_sve_stdd_be_zd_mte, } } }, }; +static gen_helper_gvec_mem_scatter * const +scatter_store_fn128[2][2] = { + { gen_helper_sve_stqq_le_zd, + gen_helper_sve_stqq_be_zd }, + { gen_helper_sve_stqq_le_zd_mte, + gen_helper_sve_stqq_be_zd_mte } +}; + static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) { gen_helper_gvec_mem_scatter *fn; @@ -5785,6 +6428,29 @@ static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) return true; } +static bool trans_ST1Q(DisasContext *s, arg_ST1Q *a) +{ + gen_helper_gvec_mem_scatter *fn; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (!dc_isar_feature(aa64_sve2p1, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + fn = scatter_store_fn128[mte][be]; + /* + * Unlike ST1_zprz, a->rm is the scalar register, and it + * can be XZR, not XSP. a->rn is the vector register. + */ + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + cpu_reg(s, a->rm), MO_128, true, fn); + return true; +} + static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) { gen_helper_gvec_mem_scatter *fn = NULL; @@ -5911,6 +6577,7 @@ TRANS_FEAT(MOVPRFX_z, aa64_sve, do_movz_zpz, a->rd, a->rn, a->pg, a->esz, false) */ TRANS_FEAT(MUL_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, tcg_gen_gvec_mul, a) +TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_sve2_sqdmulh, a) static gen_helper_gvec_3 * const smulh_zzz_fns[4] = { gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h, @@ -5929,13 +6596,6 @@ TRANS_FEAT(UMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, TRANS_FEAT(PMUL_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, gen_helper_gvec_pmul_b, a, 0) -static gen_helper_gvec_3 * const sqdmulh_zzz_fns[4] = { - gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h, - gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d, -}; -TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - sqdmulh_zzz_fns[a->esz], a, 0) - static gen_helper_gvec_3 * const sqrdmulh_zzz_fns[4] = { gen_helper_sve2_sqrdmulh_b, gen_helper_sve2_sqrdmulh_h, gen_helper_sve2_sqrdmulh_s, gen_helper_sve2_sqrdmulh_d, @@ -7008,17 +7668,26 @@ DO_ZPZZ_FP(FMINNMP, aa64_sve2, sve2_fminnmp_zpzz) DO_ZPZZ_FP(FMAXP, aa64_sve2, sve2_fmaxp_zpzz) DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz) +static bool do_fmmla(DisasContext *s, arg_rrrr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (sve_access_check(s)) { + if (vec_full_reg_size(s) < 4 * memop_size(a->esz)) { + unallocated_encoding(s); + } else { + gen_gvec_fpst_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, 0, FPST_A64); + } + } + return true; +} + +TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, do_fmmla, a, gen_helper_fmmla_s) +TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, do_fmmla, a, gen_helper_fmmla_d) + /* * SVE Integer Multiply-Add (unpredicated) */ -TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz, - gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra, - 0, FPST_A64) -TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz, - gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra, - 0, FPST_A64) - static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = { NULL, gen_helper_sve2_sqdmlal_zzzw_h, gen_helper_sve2_sqdmlal_zzzw_s, gen_helper_sve2_sqdmlal_zzzw_d, @@ -7111,8 +7780,13 @@ static gen_helper_gvec_4 * const sqrdcmlah_fns[] = { TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz, sqrdcmlah_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) -TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0) +TRANS_FEAT(USDOT_zzzz_4s, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_usdot_4b, a, 0) + +TRANS_FEAT(SDOT_zzzz_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_sdot_2h, a, 0) +TRANS_FEAT(UDOT_zzzz_2s, aa64_sme2_or_sve2p1, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_udot_2h, a, 0) TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz, gen_helper_crypto_aesmc, a->rd, a->rd, 0) @@ -7174,7 +7848,7 @@ static bool do_FMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sub, bool sel) { return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzxw_s, a->rd, a->rn, a->rm, a->ra, - (a->index << 2) | (sel << 1) | sub, tcg_env); + (a->index << 3) | (sel << 1) | sub, tcg_env); } TRANS_FEAT(FMLALB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, false) @@ -7189,6 +7863,11 @@ TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, gen_helper_gvec_ummla_b, a, 0) +TRANS_FEAT(FDOT_zzzz, aa64_sme2_or_sve2p1, gen_gvec_env_arg_zzzz, + gen_helper_sme2_fdot_h, a, 0) +TRANS_FEAT(FDOT_zzxz, aa64_sme2_or_sve2p1, gen_gvec_env_arg_zzxz, + gen_helper_sme2_fdot_idx_h, a) + TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_env_arg_zzzz, gen_helper_gvec_bfdot, a, 0) TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_env_arg_zzxz, @@ -7218,6 +7897,36 @@ static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false) TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true) +static bool do_BFMLSL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) +{ + if (s->fpcr_ah) { + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_ah_bfmlsl, + a->rd, a->rn, a->rm, a->ra, sel, FPST_AH); + } else { + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlsl, + a->rd, a->rn, a->rm, a->ra, sel, FPST_A64); + } +} + +TRANS_FEAT(BFMLSLB_zzzw, aa64_sme2_or_sve2p1, do_BFMLSL_zzzw, a, false) +TRANS_FEAT(BFMLSLT_zzzw, aa64_sme2_or_sve2p1, do_BFMLSL_zzzw, a, true) + +static bool do_BFMLSL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) +{ + if (s->fpcr_ah) { + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_ah_bfmlsl_idx, + a->rd, a->rn, a->rm, a->ra, + (a->index << 1) | sel, FPST_AH); + } else { + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlsl_idx, + a->rd, a->rn, a->rm, a->ra, + (a->index << 1) | sel, FPST_A64); + } +} + +TRANS_FEAT(BFMLSLB_zzxw, aa64_sme2_or_sve2p1, do_BFMLSL_zzxw, a, false) +TRANS_FEAT(BFMLSLT_zzxw, aa64_sme2_or_sve2p1, do_BFMLSL_zzxw, a, true) + static bool trans_PSEL(DisasContext *s, arg_psel *a) { int vl = vec_full_reg_size(s); @@ -7226,7 +7935,7 @@ static bool trans_PSEL(DisasContext *s, arg_psel *a) TCGv_i64 tmp, didx, dbit; TCGv_ptr ptr; - if (!dc_isar_feature(aa64_sme, s)) { + if (!dc_isar_feature(aa64_sme_or_sve2p1, s)) { return false; } if (!sve_access_check(s)) { @@ -7265,6 +7974,7 @@ static bool trans_PSEL(DisasContext *s, arg_psel *a) tcg_gen_neg_i64(tmp, tmp); /* Apply to either copy the source, or write zeros. */ + pl = size_for_gvec(pl); tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd), pred_full_reg_offset(s, a->pn), tmp, pl, pl); return true; @@ -7319,7 +8029,7 @@ static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); } -TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a) +TRANS_FEAT(SCLAMP, aa64_sme_or_sve2p1, gen_gvec_fn_arg_zzzz, gen_sclamp, a) static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) { @@ -7370,4 +8080,137 @@ static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); } -TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a) +TRANS_FEAT(UCLAMP, aa64_sme_or_sve2p1, gen_gvec_fn_arg_zzzz, gen_uclamp, a) + +static bool trans_FCLAMP(DisasContext *s, arg_FCLAMP *a) +{ + static gen_helper_gvec_3_ptr * const fn[] = { + gen_helper_sme2_bfclamp, + gen_helper_sme2_fclamp_h, + gen_helper_sme2_fclamp_s, + gen_helper_sme2_fclamp_d, + }; + + /* This insn uses MO_8 to encode BFloat16. */ + if (a->esz == MO_8 + ? !dc_isar_feature(aa64_sve_b16b16, s) + : !dc_isar_feature(aa64_sme2_or_sve2p1, s)) { + return false; + } + + /* So far we never optimize rda with MOVPRFX */ + assert(a->rd == a->ra); + return gen_gvec_fpst_zzz(s, fn[a->esz], a->rd, a->rn, a->rm, 1, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); +} + +TRANS_FEAT(SQCVTN_sh, aa64_sme2_or_sve2p1, gen_gvec_ool_zz, + gen_helper_sme2_sqcvtn_sh, a->rd, a->rn, 0) +TRANS_FEAT(UQCVTN_sh, aa64_sme2_or_sve2p1, gen_gvec_ool_zz, + gen_helper_sme2_uqcvtn_sh, a->rd, a->rn, 0) +TRANS_FEAT(SQCVTUN_sh, aa64_sme2_or_sve2p1, gen_gvec_ool_zz, + gen_helper_sme2_sqcvtun_sh, a->rd, a->rn, 0) + +static bool gen_ldst_c(DisasContext *s, TCGv_i64 addr, int zd, int png, + MemOp esz, bool is_write, int n, bool strided) +{ + typedef void ldst_c_fn(TCGv_env, TCGv_ptr, TCGv_i64, + TCGv_i32, TCGv_i64); + static ldst_c_fn * const f_ldst[2][2][4] = { + { { gen_helper_sve2p1_ld1bb_c, + gen_helper_sve2p1_ld1hh_le_c, + gen_helper_sve2p1_ld1ss_le_c, + gen_helper_sve2p1_ld1dd_le_c, }, + { gen_helper_sve2p1_ld1bb_c, + gen_helper_sve2p1_ld1hh_be_c, + gen_helper_sve2p1_ld1ss_be_c, + gen_helper_sve2p1_ld1dd_be_c, } }, + + { { gen_helper_sve2p1_st1bb_c, + gen_helper_sve2p1_st1hh_le_c, + gen_helper_sve2p1_st1ss_le_c, + gen_helper_sve2p1_st1dd_le_c, }, + { gen_helper_sve2p1_st1bb_c, + gen_helper_sve2p1_st1hh_be_c, + gen_helper_sve2p1_st1ss_be_c, + gen_helper_sve2p1_st1dd_be_c, } } + }; + + TCGv_i32 t_png; + TCGv_i64 t_desc; + TCGv_ptr t_zd; + uint64_t desc, lg2_rstride = 0; + bool be = s->be_data == MO_BE; + + assert(n == 2 || n == 4); + if (strided) { + lg2_rstride = 3; + if (n == 4) { + /* Validate ZD alignment. */ + if (zd & 4) { + return false; + } + lg2_rstride = 2; + } + /* Ignore non-temporal bit */ + zd &= ~8; + } + + if (strided || !dc_isar_feature(aa64_sve2p1, s) + ? !sme_sm_enabled_check(s) + : !sve_access_check(s)) { + return true; + } + + if (!s->mte_active[0]) { + addr = clean_data_tbi(s, addr); + } + + desc = n == 2 ? 0 : 1; + desc = desc | (lg2_rstride << 1); + desc = make_svemte_desc(s, vec_full_reg_size(s), 1, esz, is_write, desc); + t_desc = tcg_constant_i64(desc); + + t_png = tcg_temp_new_i32(); + tcg_gen_ld16u_i32(t_png, tcg_env, + pred_full_reg_offset(s, png) ^ + (HOST_BIG_ENDIAN ? 6 : 0)); + + t_zd = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_zd, tcg_env, vec_full_reg_offset(s, zd)); + + f_ldst[is_write][be][esz](tcg_env, t_zd, addr, t_png, t_desc); + return true; +} + +static bool gen_ldst_zcrr_c(DisasContext *s, arg_zcrr_ldst *a, + bool is_write, bool strided) +{ + TCGv_i64 addr = tcg_temp_new_i64(); + + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + return gen_ldst_c(s, addr, a->rd, a->png, a->esz, is_write, + a->nreg, strided); +} + +static bool gen_ldst_zcri_c(DisasContext *s, arg_zcri_ldst *a, + bool is_write, bool strided) +{ + TCGv_i64 addr = tcg_temp_new_i64(); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + a->imm * a->nreg * vec_full_reg_size(s)); + return gen_ldst_c(s, addr, a->rd, a->png, a->esz, is_write, + a->nreg, strided); +} + +TRANS_FEAT(LD1_zcrr, aa64_sme2_or_sve2p1, gen_ldst_zcrr_c, a, false, false) +TRANS_FEAT(LD1_zcri, aa64_sme2_or_sve2p1, gen_ldst_zcri_c, a, false, false) +TRANS_FEAT(ST1_zcrr, aa64_sme2_or_sve2p1, gen_ldst_zcrr_c, a, true, false) +TRANS_FEAT(ST1_zcri, aa64_sme2_or_sve2p1, gen_ldst_zcri_c, a, true, false) + +TRANS_FEAT(LD1_zcrr_stride, aa64_sme2, gen_ldst_zcrr_c, a, false, true) +TRANS_FEAT(LD1_zcri_stride, aa64_sme2, gen_ldst_zcri_c, a, false, true) +TRANS_FEAT(ST1_zcrr_stride, aa64_sme2, gen_ldst_zcrr_c, a, true, true) +TRANS_FEAT(ST1_zcri_stride, aa64_sme2, gen_ldst_zcri_c, a, true, true) diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index d280018..f7d6d8c 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -27,6 +27,7 @@ #include "semihosting/semihost.h" #include "cpregs.h" #include "exec/helper-proto.h" +#include "exec/target_page.h" #define HELPER_H "helper.h" #include "exec/helper-info.c.inc" @@ -371,7 +372,7 @@ static void gen_rebuild_hflags(DisasContext *s, bool new_el) } } -static void gen_exception_internal(int excp) +void gen_exception_internal(int excp) { assert(excp_is_internal(excp)); gen_helper_exception_internal(tcg_env, tcg_constant_i32(excp)); @@ -493,20 +494,9 @@ static void gen_add_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) static void gen_adc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) { TCGv_i32 tmp = tcg_temp_new_i32(); - if (tcg_op_supported(INDEX_op_add2_i32, TCG_TYPE_I32, 0)) { - tcg_gen_movi_i32(tmp, 0); - tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, cpu_CF, tmp); - tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1, tmp); - } else { - TCGv_i64 q0 = tcg_temp_new_i64(); - TCGv_i64 q1 = tcg_temp_new_i64(); - tcg_gen_extu_i32_i64(q0, t0); - tcg_gen_extu_i32_i64(q1, t1); - tcg_gen_add_i64(q0, q0, q1); - tcg_gen_extu_i32_i64(q1, cpu_CF); - tcg_gen_add_i64(q0, q0, q1); - tcg_gen_extr_i64_i32(cpu_NF, cpu_CF, q0); - } + + tcg_gen_addcio_i32(cpu_NF, cpu_CF, t0, t1, cpu_CF); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); tcg_gen_xor_i32(tmp, t0, t1); @@ -7770,7 +7760,8 @@ static bool arm_check_ss_active(DisasContext *dc) static void arm_post_translate_insn(DisasContext *dc) { - if (dc->condjmp && dc->base.is_jmp == DISAS_NEXT) { + if (dc->condjmp && + (dc->base.is_jmp == DISAS_NEXT || dc->base.is_jmp == DISAS_TOO_MANY)) { if (dc->pc_save != dc->condlabel.pc_save) { gen_update_pc(dc, dc->condlabel.pc_save - dc->pc_save); } @@ -7800,7 +7791,7 @@ static void arm_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) * be possible after an indirect branch, at the start of the TB. */ assert(dc->base.num_insns == 1); - gen_helper_exception_pc_alignment(tcg_env, tcg_constant_tl(pc)); + gen_helper_exception_pc_alignment(tcg_env, tcg_constant_vaddr(pc)); dc->base.is_jmp = DISAS_NORETURN; dc->base.pc_next = QEMU_ALIGN_UP(pc, 4); return; diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index 53e485d..f974996 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -4,7 +4,6 @@ #include "cpu.h" #include "tcg/tcg-op.h" #include "tcg/tcg-op-gvec.h" -#include "exec/exec-all.h" #include "exec/translator.h" #include "exec/translation-block.h" #include "exec/helper-gen.h" @@ -71,8 +70,10 @@ typedef struct DisasContext { int fp_excp_el; /* FP exception EL or 0 if enabled */ int sve_excp_el; /* SVE exception EL or 0 if enabled */ int sme_excp_el; /* SME exception EL or 0 if enabled */ + int zt0_excp_el; /* ZT0 exception EL or 0 if enabled */ int vl; /* current vector length in bytes */ int svl; /* current streaming vector length in bytes */ + int max_svl; /* maximum implemented streaming vector length */ bool vfp_enabled; /* FP enabled via FPSCR.EN */ int vec_len; int vec_stride; @@ -209,6 +210,11 @@ static inline int plus_2(DisasContext *s, int x) return x + 2; } +static inline int plus_8(DisasContext *s, int x) +{ + return x + 8; +} + static inline int plus_12(DisasContext *s, int x) { return x + 12; @@ -348,6 +354,7 @@ void arm_jump_cc(DisasCompare *cmp, TCGLabel *label); void arm_gen_test_cc(int cc, TCGLabel *label); MemOp pow2_align(unsigned i); void unallocated_encoding(DisasContext *s); +void gen_exception_internal(int excp); void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, uint32_t syn, uint32_t target_el); void gen_exception_insn(DisasContext *s, target_long pc_diff, @@ -636,6 +643,8 @@ typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); +typedef void GVecGen3FnVar(unsigned, TCGv_ptr, uint32_t, TCGv_ptr, uint32_t, + TCGv_ptr, uint32_t, uint32_t, uint32_t); /* Function prototype for gen_ functions for calling Neon helpers */ typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32); diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 986eaf8..33a136b 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -825,11 +825,11 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ clear_tail(d, opr_sz, simd_maxsz(desc)); \ } -DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) -DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) -DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) -DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) -DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) +DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t) +DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t) +DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t) +DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t) +DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t) #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ @@ -865,12 +865,63 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ clear_tail(d, opr_sz, simd_maxsz(desc)); \ } -DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) -DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) -DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) -DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) -DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) -DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) +DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4) +DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4) +DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4) +DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4) +DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8) +DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8) + +#undef DO_DOT +#undef DO_DOT_IDX + +/* Similar for 2-way dot product */ +#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd, *a = va; \ + TYPEN *n = vn; \ + TYPEM *m = vm; \ + for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ + d[i] = (a[i] + \ + (TYPED)n[i * 2 + 0] * m[i * 2 + 0] + \ + (TYPED)n[i * 2 + 1] * m[i * 2 + 1]); \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i = 0, opr_sz = simd_oprsz(desc); \ + intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ + intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ + intptr_t index = simd_data(desc); \ + TYPED *d = vd, *a = va; \ + TYPEN *n = vn; \ + TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2; \ + do { \ + TYPED m0 = m_indexed[i * 2 + 0]; \ + TYPED m1 = m_indexed[i * 2 + 1]; \ + do { \ + d[i] = (a[i] + \ + n[i * 2 + 0] * m0 + \ + n[i * 2 + 1] * m1); \ + } while (++i < segend); \ + segend = i + (16 / sizeof(TYPED)); \ + } while (i < opr_sz_n); \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_DOT(gvec_sdot_2h, int32_t, int16_t, int16_t) +DO_DOT(gvec_udot_2h, uint32_t, uint16_t, uint16_t) + +DO_DOT_IDX(gvec_sdot_idx_2h, int32_t, int16_t, int16_t, H4) +DO_DOT_IDX(gvec_udot_idx_2h, uint32_t, uint16_t, uint16_t, H4) + +#undef DO_DOT +#undef DO_DOT_IDX void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, float_status *fpst, uint32_t desc) @@ -1416,14 +1467,19 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \ clear_tail(d, oprsz, simd_maxsz(desc)); \ } +DO_3OP(gvec_fadd_b16, bfloat16_add, float16) DO_3OP(gvec_fadd_h, float16_add, float16) DO_3OP(gvec_fadd_s, float32_add, float32) DO_3OP(gvec_fadd_d, float64_add, float64) +DO_3OP(gvec_bfadd, bfloat16_add, bfloat16) +DO_3OP(gvec_fsub_b16, bfloat16_sub, float16) DO_3OP(gvec_fsub_h, float16_sub, float16) DO_3OP(gvec_fsub_s, float32_sub, float32) DO_3OP(gvec_fsub_d, float64_sub, float64) +DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16) +DO_3OP(gvec_fmul_b16, bfloat16_mul, float16) DO_3OP(gvec_fmul_h, float16_mul, float16) DO_3OP(gvec_fmul_s, float32_mul, float32) DO_3OP(gvec_fmul_d, float64_mul, float64) @@ -1515,6 +1571,13 @@ DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16) DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32) DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64) +DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16) +DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16) +DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16) +DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16) +DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16) +DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16) + #endif #undef DO_3OP @@ -1550,6 +1613,12 @@ static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, return float16_muladd(op1, op2, dest, 0, stat); } +static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2, + float_status *stat) +{ + return bfloat16_muladd(op1, op2, dest, 0, stat); +} + static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, float_status *stat) { @@ -1568,6 +1637,12 @@ static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, return float16_muladd(float16_chs(op1), op2, dest, 0, stat); } +static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2, + float_status *stat) +{ + return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat); +} + static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, float_status *stat) { @@ -1586,6 +1661,12 @@ static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2, return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat); } +static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2, + float_status *stat) +{ + return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat); +} + static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2, float_status *stat) { @@ -1610,23 +1691,28 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \ clear_tail(d, oprsz, simd_maxsz(desc)); \ } -DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) -DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) +DO_MULADD(gvec_fmla_nf_h, float16_muladd_nf, float16) +DO_MULADD(gvec_fmla_nf_s, float32_muladd_nf, float32) -DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) -DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) +DO_MULADD(gvec_fmls_nf_h, float16_mulsub_nf, float16) +DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32) DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) +DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16) DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) +DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16) DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16) DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32) DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64) +DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16) + +#undef DO_MULADD /* For the indexed ops, SVE applies the index per 128-bit vector segment. * For AdvSIMD, there is of course only one such vector segment. @@ -1699,6 +1785,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, \ #define nop(N, M, S) (M) +DO_FMUL_IDX(gvec_fmul_idx_b16, nop, bfloat16_mul, float16, H2) DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) @@ -1745,14 +1832,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) +DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0) DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) +DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0) DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) +DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product) #undef DO_FMLA_IDX @@ -2184,7 +2274,8 @@ void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, intptr_t i, oprsz = simd_oprsz(desc); bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - float_status *status = &env->vfp.fp_status[FPST_A64]; + bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64]; bool fz16 = env->vfp.fpcr & FPCR_FZ16; int negx = 0, negf = 0; @@ -2267,8 +2358,9 @@ void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, intptr_t i, j, oprsz = simd_oprsz(desc); bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); - float_status *status = &env->vfp.fp_status[FPST_A64]; + bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16); + float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64]; bool fz16 = env->vfp.fpcr & FPCR_FZ16; int negx = 0, negf = 0; @@ -2989,31 +3081,62 @@ float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst, float_status *fpst_odd) { - /* - * Compare f16_dotadd() in sme_helper.c, but here we have - * bfloat16 inputs. In particular that means that we do not - * want the FPCR.FZ16 flush semantics, so we use the normal - * float_status for the input handling here. - */ - float64 e1r = float32_to_float64(e1 << 16, fpst); - float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); - float64 e2r = float32_to_float64(e2 << 16, fpst); - float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); - float64 t64; + float32 s1r = e1 << 16; + float32 s1c = e1 & 0xffff0000u; + float32 s2r = e2 << 16; + float32 s2c = e2 & 0xffff0000u; float32 t32; - /* - * The ARM pseudocode function FPDot performs both multiplies - * and the add with a single rounding operation. Emulate this - * by performing the first multiply in round-to-odd, then doing - * the second multiply as fused multiply-add, and rounding to - * float32 all in one step. - */ - t64 = float64_mul(e1r, e2r, fpst_odd); - t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); + /* C.f. FPProcessNaNs4 */ + if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) || + float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) { + if (float32_is_signaling_nan(s1r, fpst)) { + t32 = s1r; + } else if (float32_is_signaling_nan(s1c, fpst)) { + t32 = s1c; + } else if (float32_is_signaling_nan(s2r, fpst)) { + t32 = s2r; + } else if (float32_is_signaling_nan(s2c, fpst)) { + t32 = s2c; + } else if (float32_is_any_nan(s1r)) { + t32 = s1r; + } else if (float32_is_any_nan(s1c)) { + t32 = s1c; + } else if (float32_is_any_nan(s2r)) { + t32 = s2r; + } else { + t32 = s2c; + } + /* + * FPConvertNaN(FPProcessNaN(t32)) will be done as part + * of the final addition below. + */ + } else { + /* + * Compare f16_dotadd() in sme_helper.c, but here we have + * bfloat16 inputs. In particular that means that we do not + * want the FPCR.FZ16 flush semantics, so we use the normal + * float_status for the input handling here. + */ + float64 e1r = float32_to_float64(s1r, fpst); + float64 e1c = float32_to_float64(s1c, fpst); + float64 e2r = float32_to_float64(s2r, fpst); + float64 e2c = float32_to_float64(s2c, fpst); + float64 t64; + + /* + * The ARM pseudocode function FPDot performs both multiplies + * and the add with a single rounding operation. Emulate this + * by performing the first multiply in round-to-odd, then doing + * the second multiply as fused multiply-add, and rounding to + * float32 all in one step. + */ + t64 = float64_mul(e1r, e2r, fpst_odd); + t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); - /* This conversion is exact, because we've already rounded. */ - t32 = float64_to_float32(t64, fpst); + /* This conversion is exact, because we've already rounded. */ + t32 = float64_to_float32(t64, fpst); + } /* The final accumulation step is not fused. */ return float32_add(sum, t32, fpst); @@ -3070,6 +3193,45 @@ void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, clear_tail(d, opr_sz, simd_maxsz(desc)); } +void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm, + void *va, CPUARMState *env, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2); + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); + float32 *d = vd, *a = va; + uint16_t *n0 = vn; + uint16_t *n1 = vn + sizeof(ARMVectorReg); + uint32_t *m = vm; + float_status fpst, fpst_odd; + + if (is_ebf(env, &fpst, &fpst_odd)) { + for (i = 0; i < elements; i += eltspersegment) { + uint32_t m_idx = m[i + H4(idx)]; + + for (j = 0; j < eltspersegment; j++) { + uint32_t nn = (n0[H2(2 * (i + j) + sel)]) + | (n1[H2(2 * (i + j) + sel)] << 16); + d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx, + &fpst, &fpst_odd); + } + } + } else { + for (i = 0; i < elements; i += eltspersegment) { + uint32_t m_idx = m[i + H4(idx)]; + + for (j = 0; j < eltspersegment; j++) { + uint32_t nn = (n0[H2(2 * (i + j) + sel)]) + | (n1[H2(2 * (i + j) + sel)] << 16); + d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst); + } + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, CPUARMState *env, uint32_t desc) { @@ -3146,44 +3308,76 @@ void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, clear_tail(d, opr_sz, simd_maxsz(desc)); } -void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, - float_status *stat, uint32_t desc) +static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a, + float_status *stat, uint32_t desc, int negx, int negf) { intptr_t i, opr_sz = simd_oprsz(desc); - intptr_t sel = simd_data(desc); - float32 *d = vd, *a = va; - bfloat16 *n = vn, *m = vm; + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); for (i = 0; i < opr_sz / 4; ++i) { - float32 nn = n[H2(i * 2 + sel)] << 16; + float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16; float32 mm = m[H2(i * 2 + sel)] << 16; - d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); + d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat); } clear_tail(d, opr_sz, simd_maxsz(desc)); } -void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, - void *va, float_status *stat, uint32_t desc) +void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, + float_status *stat, uint32_t desc) +{ + do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0); +} + +void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va, + float_status *stat, uint32_t desc) +{ + do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0); +} + +void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va, + float_status *stat, uint32_t desc) +{ + do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product); +} + +static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a, + float_status *stat, uint32_t desc, int negx, int negf) { intptr_t i, j, opr_sz = simd_oprsz(desc); intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); intptr_t elements = opr_sz / 4; intptr_t eltspersegment = MIN(16 / 4, elements); - float32 *d = vd, *a = va; - bfloat16 *n = vn, *m = vm; for (i = 0; i < elements; i += eltspersegment) { float32 m_idx = m[H2(2 * i + index)] << 16; for (j = i; j < i + eltspersegment; j++) { - float32 n_j = n[H2(2 * j + sel)] << 16; - d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); + float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16; + d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat); } } clear_tail(d, opr_sz, simd_maxsz(desc)); } +void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va, + float_status *stat, uint32_t desc) +{ + do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0); +} + +void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va, + float_status *stat, uint32_t desc) +{ + do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0); +} + +void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va, + float_status *stat, uint32_t desc) +{ + do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product); +} + #define DO_CLAMP(NAME, TYPE) \ void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ { \ @@ -3253,3 +3447,90 @@ void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) } clear_tail(d, opr_sz, simd_maxsz(desc)); } + +static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table, + unsigned elements, unsigned segbase, + unsigned dstride, unsigned isize, + unsigned tsize, unsigned nreg) +{ + for (unsigned r = 0; r < nreg; ++r) { + uint8_t *dst = zd + dstride * r; + unsigned base = segbase + r * elements; + + for (unsigned e = 0; e < elements; ++e) { + unsigned index = extractn(indexes, (base + e) * isize, isize); + dst[H1(e)] = extractn(table, index * tsize, 8); + } + } +} + +static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table, + unsigned elements, unsigned segbase, + unsigned dstride, unsigned isize, + unsigned tsize, unsigned nreg) +{ + for (unsigned r = 0; r < nreg; ++r) { + uint16_t *dst = zd + dstride * r; + unsigned base = segbase + r * elements; + + for (unsigned e = 0; e < elements; ++e) { + unsigned index = extractn(indexes, (base + e) * isize, isize); + dst[H2(e)] = extractn(table, index * tsize, 16); + } + } +} + +static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table, + unsigned elements, unsigned segbase, + unsigned dstride, unsigned isize, + unsigned tsize, unsigned nreg) +{ + for (unsigned r = 0; r < nreg; ++r) { + uint32_t *dst = zd + dstride * r; + unsigned base = segbase + r * elements; + + for (unsigned e = 0; e < elements; ++e) { + unsigned index = extractn(indexes, (base + e) * isize, isize); + dst[H4(e)] = table[H4(index)]; + } + } +} + +#define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \ +void helper_sme2_luti##ISIZE##_##NREG##SUFF \ + (void *zd, void *zn, CPUARMState *env, uint32_t desc) \ +{ \ + unsigned vl = simd_oprsz(desc); \ + unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1); \ + unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4); \ + unsigned elements = vl / ESIZE; \ + unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8); \ + unsigned segments = (ESIZE * 8) / (ISIZE * NREG); \ + unsigned segment = idx & (segments - 1); \ + ARMVectorReg indexes; \ + memcpy(&indexes, zn, vl); \ + do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, \ + segment * NREG * elements, \ + dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG); \ +} + +DO_SME2_LUT(2,1,b, 1) +DO_SME2_LUT(2,1,h, 2) +DO_SME2_LUT(2,1,s, 4) +DO_SME2_LUT(2,2,b, 1) +DO_SME2_LUT(2,2,h, 2) +DO_SME2_LUT(2,2,s, 4) +DO_SME2_LUT(2,4,b, 1) +DO_SME2_LUT(2,4,h, 2) +DO_SME2_LUT(2,4,s, 4) + +DO_SME2_LUT(4,1,b, 1) +DO_SME2_LUT(4,1,h, 2) +DO_SME2_LUT(4,1,s, 4) +DO_SME2_LUT(4,2,b, 1) +DO_SME2_LUT(4,2,h, 2) +DO_SME2_LUT(4,2,s, 4) +DO_SME2_LUT(4,4,h, 2) +DO_SME2_LUT(4,4,s, 4) + +#undef DO_SME2_LUT diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index 6b93b5a..cf41b03 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -22,6 +22,8 @@ #include "fpu/softfloat.h" +typedef struct CPUArchState CPUARMState; + /* * Note that vector data is stored in host-endian 64-bit chunks, * so addressing units smaller than that needs a host-endian fixup. @@ -221,6 +223,34 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); +#define do_ssat_b(val) MIN(MAX(val, INT8_MIN), INT8_MAX) +#define do_ssat_h(val) MIN(MAX(val, INT16_MIN), INT16_MAX) +#define do_ssat_s(val) MIN(MAX(val, INT32_MIN), INT32_MAX) +#define do_usat_b(val) MIN(MAX(val, 0), UINT8_MAX) +#define do_usat_h(val) MIN(MAX(val, 0), UINT16_MAX) +#define do_usat_s(val) MIN(MAX(val, 0), UINT32_MAX) + +static inline uint64_t do_urshr(uint64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else if (sh == 64) { + return x >> 63; + } else { + return 0; + } +} + +static inline int64_t do_srshr(int64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else { + /* Rounding the sign bit always produces 0. */ + return 0; + } +} + /** * bfdotadd: * @sum: addend @@ -270,6 +300,11 @@ bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp); /* * Negate as for FPCR.AH=1 -- do not negate NaNs. */ +static inline float16 bfloat16_ah_chs(float16 a) +{ + return bfloat16_is_any_nan(a) ? a : bfloat16_chs(a); +} + static inline float16 float16_ah_chs(float16 a) { return float16_is_any_nan(a) ? a : float16_chs(a); @@ -300,4 +335,119 @@ static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah) return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a); } +/* Not actually called directly as a helper, but uses similar machinery. */ +bfloat16 helper_sme2_ah_fmax_b16(bfloat16 a, bfloat16 b, float_status *fpst); +bfloat16 helper_sme2_ah_fmin_b16(bfloat16 a, bfloat16 b, float_status *fpst); + +float32 sve_f16_to_f32(float16 f, float_status *fpst); +float16 sve_f32_to_f16(float32 f, float_status *fpst); + +/* + * Decode helper functions for predicate as counter. + */ + +typedef struct { + unsigned count; + unsigned lg2_stride; + bool invert; +} DecodeCounter; + +static inline DecodeCounter +decode_counter(unsigned png, unsigned vl, unsigned v_esz) +{ + DecodeCounter ret = { }; + + /* C.f. Arm pseudocode CounterToPredicate. */ + if (likely(png & 0xf)) { + unsigned p_esz = ctz32(png); + + /* + * maxbit = log2(pl(bits) * 4) + * = log2(vl(bytes) * 4) + * = log2(vl) + 2 + * maxbit_mask = ones<maxbit:0> + * = (1 << (maxbit + 1)) - 1 + * = (1 << (log2(vl) + 2 + 1)) - 1 + * = (1 << (log2(vl) + 3)) - 1 + * = (pow2ceil(vl) << 3) - 1 + */ + ret.count = png & (((unsigned)pow2ceil(vl) << 3) - 1); + ret.count >>= p_esz + 1; + + ret.invert = (png >> 15) & 1; + + /* + * The Arm pseudocode for CounterToPredicate expands the count to + * a set of bits, and then the operation proceeds as for the original + * interpretation of predicates as a set of bits. + * + * We can avoid the expansion by adjusting the count and supplying + * an element stride. + */ + if (unlikely(p_esz != v_esz)) { + if (p_esz < v_esz) { + /* + * For predicate esz < vector esz, the expanded predicate + * will have more bits set than will be consumed. + * Adjust the count down, rounding up. + * Consider p_esz = MO_8, v_esz = MO_64, count 14: + * The expanded predicate would be + * 0011 1111 1111 1111 + * The significant bits are + * ...1 ...1 ...1 ...1 + */ + unsigned shift = v_esz - p_esz; + unsigned trunc = ret.count >> shift; + ret.count = trunc + (ret.count != (trunc << shift)); + } else { + /* + * For predicate esz > vector esz, the expanded predicate + * will have bits set only at power-of-two multiples of + * the vector esz. Bits at other multiples will all be + * false. Adjust the count up, and supply the caller + * with a stride of elements to skip. + */ + unsigned shift = p_esz - v_esz; + ret.count <<= shift; + ret.lg2_stride = shift; + } + } + } + return ret; +} + +/* Extract @len bits from an array of uint64_t at offset @pos bits. */ +static inline uint64_t extractn(uint64_t *p, unsigned pos, unsigned len) +{ + uint64_t x; + + p += pos / 64; + pos = pos % 64; + + x = p[0]; + if (pos + len > 64) { + x = (x >> pos) | (p[1] << (-pos & 63)); + pos = 0; + } + return extract64(x, pos, len); +} + +/* Deposit @len bits into an array of uint64_t at offset @pos bits. */ +static inline void depositn(uint64_t *p, unsigned pos, + unsigned len, uint64_t val) +{ + p += pos / 64; + pos = pos % 64; + + if (pos + len <= 64) { + p[0] = deposit64(p[0], pos, len, val); + } else { + unsigned len0 = 64 - pos; + unsigned len1 = len - len0; + + p[0] = deposit64(p[0], pos, len0, val); + p[1] = deposit64(p[1], 0, len1, val >> len0); + } +} + #endif /* TARGET_ARM_VEC_INTERNAL_H */ diff --git a/target/arm/tcg/vfp_helper.c b/target/arm/tcg/vfp_helper.c index b32e2f4..e156e37 100644 --- a/target/arm/tcg/vfp_helper.c +++ b/target/arm/tcg/vfp_helper.c @@ -19,12 +19,14 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "exec/helper-proto.h" #include "internals.h" #include "cpu-features.h" #include "fpu/softfloat.h" #include "qemu/log.h" +#define HELPER_H "tcg/helper.h" +#include "exec/helper-proto.h.inc" + /* * Set the float_status behaviour to match the Arm defaults: * * tininess-before-rounding @@ -121,7 +123,7 @@ uint32_t vfp_get_fpsr_from_host(CPUARMState *env) a64_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A64_F16]) & ~(float_flag_input_denormal_flushed | float_flag_input_denormal_used)); /* - * We do not merge in flags from FPST_AH or FPST_AH_F16, because + * We do not merge in flags from FPST_{AH,ZA} or FPST_{AH,ZA}_F16, because * they are used for insns that must not set the cumulative exception bits. */ @@ -194,6 +196,8 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64]); set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32_F16]); set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64_F16]); + set_float_rounding_mode(i, &env->vfp.fp_status[FPST_ZA]); + set_float_rounding_mode(i, &env->vfp.fp_status[FPST_ZA_F16]); } if (changed & FPCR_FZ16) { bool ftz_enabled = val & FPCR_FZ16; @@ -201,15 +205,18 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]); set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]); set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_ZA_F16]); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]); + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_ZA_F16]); } if (changed & FPCR_FZ) { bool ftz_enabled = val & FPCR_FZ; set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]); set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64]); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_ZA]); /* FIZ is A64 only so FZ always makes A32 code flush inputs to zero */ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]); } @@ -221,6 +228,7 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) bool fitz_enabled = (val & FPCR_FIZ) || (val & (FPCR_FZ | FPCR_AH)) == FPCR_FZ; set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_A64]); + set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_ZA]); } if (changed & FPCR_DN) { bool dnan_enabled = val & FPCR_DN; @@ -238,9 +246,13 @@ void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) /* Change behaviours for A64 FP operations */ arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64]); arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]); + arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_ZA]); + arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_ZA_F16]); } else { arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]); arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]); + arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_ZA]); + arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_ZA_F16]); } } /* |