From 212154821e6dc400315a67ad8204f5e5c4b3023c Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 21 Feb 2023 13:37:14 +0000 Subject: include/hw/arm/allwinner-a10.h: Remove superfluous includes from the header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_device.h is not needed at all in allwinner-a10.h, and serial.h is only needed by the corresponding .c file. Signed-off-by: Thomas Huth Reviewed-by: Alex Bennée Message-id: 20230215152233.210024-1-thuth@redhat.com Signed-off-by: Peter Maydell --- hw/arm/allwinner-a10.c | 1 + include/hw/arm/allwinner-a10.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/arm/allwinner-a10.c b/hw/arm/allwinner-a10.c index dc1966f..b7ca795 100644 --- a/hw/arm/allwinner-a10.c +++ b/hw/arm/allwinner-a10.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/module.h" +#include "hw/char/serial.h" #include "hw/sysbus.h" #include "hw/arm/allwinner-a10.h" #include "hw/misc/unimp.h" diff --git a/include/hw/arm/allwinner-a10.h b/include/hw/arm/allwinner-a10.h index 79e0c80..095afb2 100644 --- a/include/hw/arm/allwinner-a10.h +++ b/include/hw/arm/allwinner-a10.h @@ -1,9 +1,7 @@ #ifndef HW_ARM_ALLWINNER_A10_H #define HW_ARM_ALLWINNER_A10_H -#include "hw/char/serial.h" #include "hw/arm/boot.h" -#include "hw/pci/pci_device.h" #include "hw/timer/allwinner-a10-pit.h" #include "hw/intc/allwinner-a10-pic.h" #include "hw/net/allwinner_emac.h" -- cgit v1.1 From fa05d1abb998a3272f97a70db2f8a01852ebc06c Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:27 -0300 Subject: target/arm: Wrap breakpoint/watchpoint updates with tcg_enabled This is in preparation for restricting compilation of some parts of debug_helper.c to TCG only. 
Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Signed-off-by: Peter Maydell --- target/arm/cpu.c | 6 ++++-- target/arm/debug_helper.c | 16 ++++++++++++---- target/arm/machine.c | 7 +++++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 876ab8f..da416f7 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -539,8 +539,10 @@ static void arm_cpu_reset_hold(Object *obj) } #endif - hw_breakpoint_update_all(cpu); - hw_watchpoint_update_all(cpu); + if (tcg_enabled()) { + hw_breakpoint_update_all(cpu); + hw_watchpoint_update_all(cpu); + } arm_rebuild_hflags(env); } diff --git a/target/arm/debug_helper.c b/target/arm/debug_helper.c index 3c671c8..3325eb9 100644 --- a/target/arm/debug_helper.c +++ b/target/arm/debug_helper.c @@ -939,7 +939,9 @@ static void dbgwvr_write(CPUARMState *env, const ARMCPRegInfo *ri, value &= ~3ULL; raw_write(env, ri, value); - hw_watchpoint_update(cpu, i); + if (tcg_enabled()) { + hw_watchpoint_update(cpu, i); + } } static void dbgwcr_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -949,7 +951,9 @@ static void dbgwcr_write(CPUARMState *env, const ARMCPRegInfo *ri, int i = ri->crm; raw_write(env, ri, value); - hw_watchpoint_update(cpu, i); + if (tcg_enabled()) { + hw_watchpoint_update(cpu, i); + } } void hw_breakpoint_update(ARMCPU *cpu, int n) @@ -1062,7 +1066,9 @@ static void dbgbvr_write(CPUARMState *env, const ARMCPRegInfo *ri, int i = ri->crm; raw_write(env, ri, value); - hw_breakpoint_update(cpu, i); + if (tcg_enabled()) { + hw_breakpoint_update(cpu, i); + } } static void dbgbcr_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -1079,7 +1085,9 @@ static void dbgbcr_write(CPUARMState *env, const ARMCPRegInfo *ri, value = deposit64(value, 8, 1, extract64(value, 7, 1)); raw_write(env, ri, value); - hw_breakpoint_update(cpu, i); + if (tcg_enabled()) { + hw_breakpoint_update(cpu, i); + } } void define_debug_regs(ARMCPU *cpu) diff --git a/target/arm/machine.c b/target/arm/machine.c index b4c3850..fd6323f 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -2,6 +2,7 @@ #include "cpu.h" #include "qemu/error-report.h" #include "sysemu/kvm.h" +#include "sysemu/tcg.h" #include "kvm_arm.h" #include "internals.h" #include "migration/cpu.h" @@ -848,8 +849,10 @@ static int cpu_post_load(void *opaque, int version_id) return -1; } - hw_breakpoint_update_all(cpu); - hw_watchpoint_update_all(cpu); + if (tcg_enabled()) { + hw_breakpoint_update_all(cpu); + hw_watchpoint_update_all(cpu); + } /* * TCG gen_update_fp_context() relies on the invariant that -- cgit v1.1 From 2059ec754f9040a6a9f62a9abfeb76a9d8655e11 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:28 -0300 Subject: target/arm: Wrap TCG-only code in debug_helper.c The next few patches will move helpers under CONFIG_TCG. We'd prefer to keep the debug helpers and debug registers close together, so rearrange the file a bit to be able to wrap the helpers with a TCG ifdef. 
Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Signed-off-by: Peter Maydell --- target/arm/debug_helper.c | 476 +++++++++++++++++++++++----------------------- 1 file changed, 239 insertions(+), 237 deletions(-) diff --git a/target/arm/debug_helper.c b/target/arm/debug_helper.c index 3325eb9..dfc8b2a 100644 --- a/target/arm/debug_helper.c +++ b/target/arm/debug_helper.c @@ -12,8 +12,9 @@ #include "cpregs.h" #include "exec/exec-all.h" #include "exec/helper-proto.h" +#include "sysemu/tcg.h" - +#ifdef CONFIG_TCG /* Return the Exception Level targeted by debug exceptions. */ static int arm_debug_target_el(CPUARMState *env) { @@ -536,6 +537,243 @@ void HELPER(exception_swstep)(CPUARMState *env, uint32_t syndrome) raise_exception_debug(env, EXCP_UDEF, syndrome); } +void hw_watchpoint_update(ARMCPU *cpu, int n) +{ + CPUARMState *env = &cpu->env; + vaddr len = 0; + vaddr wvr = env->cp15.dbgwvr[n]; + uint64_t wcr = env->cp15.dbgwcr[n]; + int mask; + int flags = BP_CPU | BP_STOP_BEFORE_ACCESS; + + if (env->cpu_watchpoint[n]) { + cpu_watchpoint_remove_by_ref(CPU(cpu), env->cpu_watchpoint[n]); + env->cpu_watchpoint[n] = NULL; + } + + if (!FIELD_EX64(wcr, DBGWCR, E)) { + /* E bit clear : watchpoint disabled */ + return; + } + + switch (FIELD_EX64(wcr, DBGWCR, LSC)) { + case 0: + /* LSC 00 is reserved and must behave as if the wp is disabled */ + return; + case 1: + flags |= BP_MEM_READ; + break; + case 2: + flags |= BP_MEM_WRITE; + break; + case 3: + flags |= BP_MEM_ACCESS; + break; + } + + /* + * Attempts to use both MASK and BAS fields simultaneously are + * CONSTRAINED UNPREDICTABLE; we opt to ignore BAS in this case, + * thus generating a watchpoint for every byte in the masked region. + */ + mask = FIELD_EX64(wcr, DBGWCR, MASK); + if (mask == 1 || mask == 2) { + /* + * Reserved values of MASK; we must act as if the mask value was + * some non-reserved value, or as if the watchpoint were disabled. + * We choose the latter. + */ + return; + } else if (mask) { + /* Watchpoint covers an aligned area up to 2GB in size */ + len = 1ULL << mask; + /* + * If masked bits in WVR are not zero it's CONSTRAINED UNPREDICTABLE + * whether the watchpoint fires when the unmasked bits match; we opt + * to generate the exceptions. + */ + wvr &= ~(len - 1); + } else { + /* Watchpoint covers bytes defined by the byte address select bits */ + int bas = FIELD_EX64(wcr, DBGWCR, BAS); + int basstart; + + if (extract64(wvr, 2, 1)) { + /* + * Deprecated case of an only 4-aligned address. BAS[7:4] are + * ignored, and BAS[3:0] define which bytes to watch. + */ + bas &= 0xf; + } + + if (bas == 0) { + /* This must act as if the watchpoint is disabled */ + return; + } + + /* + * The BAS bits are supposed to be programmed to indicate a contiguous + * range of bytes. Otherwise it is CONSTRAINED UNPREDICTABLE whether + * we fire for each byte in the word/doubleword addressed by the WVR. + * We choose to ignore any non-zero bits after the first range of 1s. + */ + basstart = ctz32(bas); + len = cto32(bas >> basstart); + wvr += basstart; + } + + cpu_watchpoint_insert(CPU(cpu), wvr, len, flags, + &env->cpu_watchpoint[n]); +} + +void hw_watchpoint_update_all(ARMCPU *cpu) +{ + int i; + CPUARMState *env = &cpu->env; + + /* + * Completely clear out existing QEMU watchpoints and our array, to + * avoid possible stale entries following migration load. 
+ */ + cpu_watchpoint_remove_all(CPU(cpu), BP_CPU); + memset(env->cpu_watchpoint, 0, sizeof(env->cpu_watchpoint)); + + for (i = 0; i < ARRAY_SIZE(cpu->env.cpu_watchpoint); i++) { + hw_watchpoint_update(cpu, i); + } +} + +void hw_breakpoint_update(ARMCPU *cpu, int n) +{ + CPUARMState *env = &cpu->env; + uint64_t bvr = env->cp15.dbgbvr[n]; + uint64_t bcr = env->cp15.dbgbcr[n]; + vaddr addr; + int bt; + int flags = BP_CPU; + + if (env->cpu_breakpoint[n]) { + cpu_breakpoint_remove_by_ref(CPU(cpu), env->cpu_breakpoint[n]); + env->cpu_breakpoint[n] = NULL; + } + + if (!extract64(bcr, 0, 1)) { + /* E bit clear : watchpoint disabled */ + return; + } + + bt = extract64(bcr, 20, 4); + + switch (bt) { + case 4: /* unlinked address mismatch (reserved if AArch64) */ + case 5: /* linked address mismatch (reserved if AArch64) */ + qemu_log_mask(LOG_UNIMP, + "arm: address mismatch breakpoint types not implemented\n"); + return; + case 0: /* unlinked address match */ + case 1: /* linked address match */ + { + /* + * Bits [1:0] are RES0. + * + * It is IMPLEMENTATION DEFINED whether bits [63:49] + * ([63:53] for FEAT_LVA) are hardwired to a copy of the sign bit + * of the VA field ([48] or [52] for FEAT_LVA), or whether the + * value is read as written. It is CONSTRAINED UNPREDICTABLE + * whether the RESS bits are ignored when comparing an address. + * Therefore we are allowed to compare the entire register, which + * lets us avoid considering whether FEAT_LVA is actually enabled. + * + * The BAS field is used to allow setting breakpoints on 16-bit + * wide instructions; it is CONSTRAINED UNPREDICTABLE whether + * a bp will fire if the addresses covered by the bp and the addresses + * covered by the insn overlap but the insn doesn't start at the + * start of the bp address range. We choose to require the insn and + * the bp to have the same address. The constraints on writing to + * BAS enforced in dbgbcr_write mean we have only four cases: + * 0b0000 => no breakpoint + * 0b0011 => breakpoint on addr + * 0b1100 => breakpoint on addr + 2 + * 0b1111 => breakpoint on addr + * See also figure D2-3 in the v8 ARM ARM (DDI0487A.c). + */ + int bas = extract64(bcr, 5, 4); + addr = bvr & ~3ULL; + if (bas == 0) { + return; + } + if (bas == 0xc) { + addr += 2; + } + break; + } + case 2: /* unlinked context ID match */ + case 8: /* unlinked VMID match (reserved if no EL2) */ + case 10: /* unlinked context ID and VMID match (reserved if no EL2) */ + qemu_log_mask(LOG_UNIMP, + "arm: unlinked context breakpoint types not implemented\n"); + return; + case 9: /* linked VMID match (reserved if no EL2) */ + case 11: /* linked context ID and VMID match (reserved if no EL2) */ + case 3: /* linked context ID match */ + default: + /* + * We must generate no events for Linked context matches (unless + * they are linked to by some other bp/wp, which is handled in + * updates for the linking bp/wp). We choose to also generate no events + * for reserved values. + */ + return; + } + + cpu_breakpoint_insert(CPU(cpu), addr, flags, &env->cpu_breakpoint[n]); +} + +void hw_breakpoint_update_all(ARMCPU *cpu) +{ + int i; + CPUARMState *env = &cpu->env; + + /* + * Completely clear out existing QEMU breakpoints and our array, to + * avoid possible stale entries following migration load. 
+ */ + cpu_breakpoint_remove_all(CPU(cpu), BP_CPU); + memset(env->cpu_breakpoint, 0, sizeof(env->cpu_breakpoint)); + + for (i = 0; i < ARRAY_SIZE(cpu->env.cpu_breakpoint); i++) { + hw_breakpoint_update(cpu, i); + } +} + +#if !defined(CONFIG_USER_ONLY) + +vaddr arm_adjust_watchpoint_address(CPUState *cs, vaddr addr, int len) +{ + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; + + /* + * In BE32 system mode, target memory is stored byteswapped (on a + * little-endian host system), and by the time we reach here (via an + * opcode helper) the addresses of subword accesses have been adjusted + * to account for that, which means that watchpoints will not match. + * Undo the adjustment here. + */ + if (arm_sctlr_b(env)) { + if (len == 1) { + addr ^= 3; + } else if (len == 2) { + addr ^= 2; + } + } + + return addr; +} + +#endif /* !CONFIG_USER_ONLY */ +#endif /* CONFIG_TCG */ + /* * Check for traps to "powerdown debug" registers, which are controlled * by MDCR.TDOSA @@ -813,112 +1051,6 @@ static const ARMCPRegInfo debug_lpae_cp_reginfo[] = { .access = PL0_R, .type = ARM_CP_CONST | ARM_CP_64BIT, .resetvalue = 0 }, }; -void hw_watchpoint_update(ARMCPU *cpu, int n) -{ - CPUARMState *env = &cpu->env; - vaddr len = 0; - vaddr wvr = env->cp15.dbgwvr[n]; - uint64_t wcr = env->cp15.dbgwcr[n]; - int mask; - int flags = BP_CPU | BP_STOP_BEFORE_ACCESS; - - if (env->cpu_watchpoint[n]) { - cpu_watchpoint_remove_by_ref(CPU(cpu), env->cpu_watchpoint[n]); - env->cpu_watchpoint[n] = NULL; - } - - if (!FIELD_EX64(wcr, DBGWCR, E)) { - /* E bit clear : watchpoint disabled */ - return; - } - - switch (FIELD_EX64(wcr, DBGWCR, LSC)) { - case 0: - /* LSC 00 is reserved and must behave as if the wp is disabled */ - return; - case 1: - flags |= BP_MEM_READ; - break; - case 2: - flags |= BP_MEM_WRITE; - break; - case 3: - flags |= BP_MEM_ACCESS; - break; - } - - /* - * Attempts to use both MASK and BAS fields simultaneously are - * CONSTRAINED UNPREDICTABLE; we opt to ignore BAS in this case, - * thus generating a watchpoint for every byte in the masked region. - */ - mask = FIELD_EX64(wcr, DBGWCR, MASK); - if (mask == 1 || mask == 2) { - /* - * Reserved values of MASK; we must act as if the mask value was - * some non-reserved value, or as if the watchpoint were disabled. - * We choose the latter. - */ - return; - } else if (mask) { - /* Watchpoint covers an aligned area up to 2GB in size */ - len = 1ULL << mask; - /* - * If masked bits in WVR are not zero it's CONSTRAINED UNPREDICTABLE - * whether the watchpoint fires when the unmasked bits match; we opt - * to generate the exceptions. - */ - wvr &= ~(len - 1); - } else { - /* Watchpoint covers bytes defined by the byte address select bits */ - int bas = FIELD_EX64(wcr, DBGWCR, BAS); - int basstart; - - if (extract64(wvr, 2, 1)) { - /* - * Deprecated case of an only 4-aligned address. BAS[7:4] are - * ignored, and BAS[3:0] define which bytes to watch. - */ - bas &= 0xf; - } - - if (bas == 0) { - /* This must act as if the watchpoint is disabled */ - return; - } - - /* - * The BAS bits are supposed to be programmed to indicate a contiguous - * range of bytes. Otherwise it is CONSTRAINED UNPREDICTABLE whether - * we fire for each byte in the word/doubleword addressed by the WVR. - * We choose to ignore any non-zero bits after the first range of 1s. 
- */ - basstart = ctz32(bas); - len = cto32(bas >> basstart); - wvr += basstart; - } - - cpu_watchpoint_insert(CPU(cpu), wvr, len, flags, - &env->cpu_watchpoint[n]); -} - -void hw_watchpoint_update_all(ARMCPU *cpu) -{ - int i; - CPUARMState *env = &cpu->env; - - /* - * Completely clear out existing QEMU watchpoints and our array, to - * avoid possible stale entries following migration load. - */ - cpu_watchpoint_remove_all(CPU(cpu), BP_CPU); - memset(env->cpu_watchpoint, 0, sizeof(env->cpu_watchpoint)); - - for (i = 0; i < ARRAY_SIZE(cpu->env.cpu_watchpoint); i++) { - hw_watchpoint_update(cpu, i); - } -} - static void dbgwvr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { @@ -956,109 +1088,6 @@ static void dbgwcr_write(CPUARMState *env, const ARMCPRegInfo *ri, } } -void hw_breakpoint_update(ARMCPU *cpu, int n) -{ - CPUARMState *env = &cpu->env; - uint64_t bvr = env->cp15.dbgbvr[n]; - uint64_t bcr = env->cp15.dbgbcr[n]; - vaddr addr; - int bt; - int flags = BP_CPU; - - if (env->cpu_breakpoint[n]) { - cpu_breakpoint_remove_by_ref(CPU(cpu), env->cpu_breakpoint[n]); - env->cpu_breakpoint[n] = NULL; - } - - if (!extract64(bcr, 0, 1)) { - /* E bit clear : watchpoint disabled */ - return; - } - - bt = extract64(bcr, 20, 4); - - switch (bt) { - case 4: /* unlinked address mismatch (reserved if AArch64) */ - case 5: /* linked address mismatch (reserved if AArch64) */ - qemu_log_mask(LOG_UNIMP, - "arm: address mismatch breakpoint types not implemented\n"); - return; - case 0: /* unlinked address match */ - case 1: /* linked address match */ - { - /* - * Bits [1:0] are RES0. - * - * It is IMPLEMENTATION DEFINED whether bits [63:49] - * ([63:53] for FEAT_LVA) are hardwired to a copy of the sign bit - * of the VA field ([48] or [52] for FEAT_LVA), or whether the - * value is read as written. It is CONSTRAINED UNPREDICTABLE - * whether the RESS bits are ignored when comparing an address. - * Therefore we are allowed to compare the entire register, which - * lets us avoid considering whether FEAT_LVA is actually enabled. - * - * The BAS field is used to allow setting breakpoints on 16-bit - * wide instructions; it is CONSTRAINED UNPREDICTABLE whether - * a bp will fire if the addresses covered by the bp and the addresses - * covered by the insn overlap but the insn doesn't start at the - * start of the bp address range. We choose to require the insn and - * the bp to have the same address. The constraints on writing to - * BAS enforced in dbgbcr_write mean we have only four cases: - * 0b0000 => no breakpoint - * 0b0011 => breakpoint on addr - * 0b1100 => breakpoint on addr + 2 - * 0b1111 => breakpoint on addr - * See also figure D2-3 in the v8 ARM ARM (DDI0487A.c). - */ - int bas = extract64(bcr, 5, 4); - addr = bvr & ~3ULL; - if (bas == 0) { - return; - } - if (bas == 0xc) { - addr += 2; - } - break; - } - case 2: /* unlinked context ID match */ - case 8: /* unlinked VMID match (reserved if no EL2) */ - case 10: /* unlinked context ID and VMID match (reserved if no EL2) */ - qemu_log_mask(LOG_UNIMP, - "arm: unlinked context breakpoint types not implemented\n"); - return; - case 9: /* linked VMID match (reserved if no EL2) */ - case 11: /* linked context ID and VMID match (reserved if no EL2) */ - case 3: /* linked context ID match */ - default: - /* - * We must generate no events for Linked context matches (unless - * they are linked to by some other bp/wp, which is handled in - * updates for the linking bp/wp). We choose to also generate no events - * for reserved values. 
- */ - return; - } - - cpu_breakpoint_insert(CPU(cpu), addr, flags, &env->cpu_breakpoint[n]); -} - -void hw_breakpoint_update_all(ARMCPU *cpu) -{ - int i; - CPUARMState *env = &cpu->env; - - /* - * Completely clear out existing QEMU breakpoints and our array, to - * avoid possible stale entries following migration load. - */ - cpu_breakpoint_remove_all(CPU(cpu), BP_CPU); - memset(env->cpu_breakpoint, 0, sizeof(env->cpu_breakpoint)); - - for (i = 0; i < ARRAY_SIZE(cpu->env.cpu_breakpoint); i++) { - hw_breakpoint_update(cpu, i); - } -} - static void dbgbvr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) { @@ -1210,30 +1239,3 @@ void define_debug_regs(ARMCPU *cpu) g_free(dbgwcr_el1_name); } } - -#if !defined(CONFIG_USER_ONLY) - -vaddr arm_adjust_watchpoint_address(CPUState *cs, vaddr addr, int len) -{ - ARMCPU *cpu = ARM_CPU(cs); - CPUARMState *env = &cpu->env; - - /* - * In BE32 system mode, target memory is stored byteswapped (on a - * little-endian host system), and by the time we reach here (via an - * opcode helper) the addresses of subword accesses have been adjusted - * to account for that, which means that watchpoints will not match. - * Undo the adjustment here. - */ - if (arm_sctlr_b(env)) { - if (len == 1) { - addr ^= 3; - } else if (len == 2) { - addr ^= 2; - } - } - - return addr; -} - -#endif -- cgit v1.1 From f0984d4040c328d1c021ae6680479cbbe13c485b Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:29 -0300 Subject: target/arm: move translate modules to tcg/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the target/arm/tcg directory. Its purpose is to hold the TCG code that is selected by CONFIG_TCG. Signed-off-by: Claudio Fontana Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- MAINTAINERS | 1 + target/arm/a32-uncond.decode | 74 - target/arm/a32.decode | 557 -- target/arm/m-nocp.decode | 72 - target/arm/meson.build | 30 +- target/arm/mve.decode | 832 -- target/arm/neon-dp.decode | 646 -- target/arm/neon-ls.decode | 52 - target/arm/neon-shared.decode | 99 - target/arm/sme-fa64.decode | 60 - target/arm/sme.decode | 88 - target/arm/sve.decode | 1702 ---- target/arm/t16.decode | 281 - target/arm/t32.decode | 753 -- target/arm/tcg/a32-uncond.decode | 74 + target/arm/tcg/a32.decode | 557 ++ target/arm/tcg/m-nocp.decode | 72 + target/arm/tcg/meson.build | 32 + target/arm/tcg/mve.decode | 832 ++ target/arm/tcg/neon-dp.decode | 646 ++ target/arm/tcg/neon-ls.decode | 52 + target/arm/tcg/neon-shared.decode | 99 + target/arm/tcg/sme-fa64.decode | 60 + target/arm/tcg/sme.decode | 88 + target/arm/tcg/sve.decode | 1702 ++++ target/arm/tcg/t16.decode | 281 + target/arm/tcg/t32.decode | 753 ++ target/arm/tcg/translate-a64.c | 15054 ++++++++++++++++++++++++++++++++++++ target/arm/tcg/translate-a64.h | 201 + target/arm/tcg/translate-m-nocp.c | 788 ++ target/arm/tcg/translate-mve.c | 2310 ++++++ target/arm/tcg/translate-neon.c | 4064 ++++++++++ target/arm/tcg/translate-sme.c | 373 + target/arm/tcg/translate-sve.c | 7583 ++++++++++++++++++ target/arm/tcg/translate-vfp.c | 3619 +++++++++ target/arm/tcg/translate.c | 9990 ++++++++++++++++++++++++ target/arm/tcg/translate.h | 644 ++ target/arm/tcg/vfp-uncond.decode | 82 + target/arm/tcg/vfp.decode | 247 + target/arm/translate-a64.c | 15054 ------------------------------------ target/arm/translate-a64.h | 201 - 
target/arm/translate-m-nocp.c | 788 -- target/arm/translate-mve.c | 2310 ------ target/arm/translate-neon.c | 4064 ---------- target/arm/translate-sme.c | 373 - target/arm/translate-sve.c | 7583 ------------------ target/arm/translate-vfp.c | 3619 --------- target/arm/translate.c | 9990 ------------------------ target/arm/translate.h | 644 -- target/arm/vfp-uncond.decode | 82 - target/arm/vfp.decode | 247 - 51 files changed, 50208 insertions(+), 50197 deletions(-) delete mode 100644 target/arm/a32-uncond.decode delete mode 100644 target/arm/a32.decode delete mode 100644 target/arm/m-nocp.decode delete mode 100644 target/arm/mve.decode delete mode 100644 target/arm/neon-dp.decode delete mode 100644 target/arm/neon-ls.decode delete mode 100644 target/arm/neon-shared.decode delete mode 100644 target/arm/sme-fa64.decode delete mode 100644 target/arm/sme.decode delete mode 100644 target/arm/sve.decode delete mode 100644 target/arm/t16.decode delete mode 100644 target/arm/t32.decode create mode 100644 target/arm/tcg/a32-uncond.decode create mode 100644 target/arm/tcg/a32.decode create mode 100644 target/arm/tcg/m-nocp.decode create mode 100644 target/arm/tcg/meson.build create mode 100644 target/arm/tcg/mve.decode create mode 100644 target/arm/tcg/neon-dp.decode create mode 100644 target/arm/tcg/neon-ls.decode create mode 100644 target/arm/tcg/neon-shared.decode create mode 100644 target/arm/tcg/sme-fa64.decode create mode 100644 target/arm/tcg/sme.decode create mode 100644 target/arm/tcg/sve.decode create mode 100644 target/arm/tcg/t16.decode create mode 100644 target/arm/tcg/t32.decode create mode 100644 target/arm/tcg/translate-a64.c create mode 100644 target/arm/tcg/translate-a64.h create mode 100644 target/arm/tcg/translate-m-nocp.c create mode 100644 target/arm/tcg/translate-mve.c create mode 100644 target/arm/tcg/translate-neon.c create mode 100644 target/arm/tcg/translate-sme.c create mode 100644 target/arm/tcg/translate-sve.c create mode 100644 target/arm/tcg/translate-vfp.c create mode 100644 target/arm/tcg/translate.c create mode 100644 target/arm/tcg/translate.h create mode 100644 target/arm/tcg/vfp-uncond.decode create mode 100644 target/arm/tcg/vfp.decode delete mode 100644 target/arm/translate-a64.c delete mode 100644 target/arm/translate-a64.h delete mode 100644 target/arm/translate-m-nocp.c delete mode 100644 target/arm/translate-mve.c delete mode 100644 target/arm/translate-neon.c delete mode 100644 target/arm/translate-sme.c delete mode 100644 target/arm/translate-sve.c delete mode 100644 target/arm/translate-vfp.c delete mode 100644 target/arm/translate.c delete mode 100644 target/arm/translate.h delete mode 100644 target/arm/vfp-uncond.decode delete mode 100644 target/arm/vfp.decode diff --git a/MAINTAINERS b/MAINTAINERS index 5c1ee41..c6e6549 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -161,6 +161,7 @@ M: Peter Maydell L: qemu-arm@nongnu.org S: Maintained F: target/arm/ +F: target/arm/tcg/ F: tests/tcg/arm/ F: tests/tcg/aarch64/ F: tests/qtest/arm-cpu-features.c diff --git a/target/arm/a32-uncond.decode b/target/arm/a32-uncond.decode deleted file mode 100644 index 2339de2..0000000 --- a/target/arm/a32-uncond.decode +++ /dev/null @@ -1,74 +0,0 @@ -# A32 unconditional instructions -# -# Copyright (c) 2019 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later 
version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# -# All insns that have 0xf in insn[31:28] are decoded here. -# All of those that have a COND field in insn[31:28] are in a32.decode -# - -&empty !extern -&i !extern imm -&setend E - -# Branch with Link and Exchange - -%imm24h 0:s24 24:1 !function=times_2 - -BLX_i 1111 101 . ........................ &i imm=%imm24h - -# System Instructions - -&rfe rn w pu -&srs mode w pu -&cps mode imod M A I F - -RFE 1111 100 pu:2 0 w:1 1 rn:4 0000 1010 0000 0000 &rfe -SRS 1111 100 pu:2 1 w:1 0 1101 0000 0101 000 mode:5 &srs -CPS 1111 0001 0000 imod:2 M:1 0 0000 000 A:1 I:1 F:1 0 mode:5 \ - &cps - -# Clear-Exclusive, Barriers - -# QEMU does not require the option field for the barriers. -CLREX 1111 0101 0111 1111 1111 0000 0001 1111 -DSB 1111 0101 0111 1111 1111 0000 0100 ---- -DMB 1111 0101 0111 1111 1111 0000 0101 ---- -ISB 1111 0101 0111 1111 1111 0000 0110 ---- -SB 1111 0101 0111 1111 1111 0000 0111 0000 - -# Set Endianness -SETEND 1111 0001 0000 0001 0000 00 E:1 0 0000 0000 &setend - -# Preload instructions - -PLD 1111 0101 -101 ---- 1111 ---- ---- ---- # (imm, lit) 5te -PLDW 1111 0101 -001 ---- 1111 ---- ---- ---- # (imm, lit) 7mp -PLI 1111 0100 -101 ---- 1111 ---- ---- ---- # (imm, lit) 7 - -PLD 1111 0111 -101 ---- 1111 ----- -- 0 ---- # (register) 5te -PLDW 1111 0111 -001 ---- 1111 ----- -- 0 ---- # (register) 7mp -PLI 1111 0110 -101 ---- 1111 ----- -- 0 ---- # (register) 7 - -# Unallocated memory hints -# -# Since these are v7MP nops, and PLDW is v7MP and implemented as nop, -# (ab)use the PLDW helper. - -PLDW 1111 0100 -001 ---- ---- ---- ---- ---- -PLDW 1111 0110 -001 ---- ---- ---- ---0 ---- diff --git a/target/arm/a32.decode b/target/arm/a32.decode deleted file mode 100644 index f2ca480..0000000 --- a/target/arm/a32.decode +++ /dev/null @@ -1,557 +0,0 @@ -# A32 conditional instructions -# -# Copyright (c) 2019 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# -# All of the insn that have a COND field in insn[31:28] are here. -# All insns that have 0xf in insn[31:28] are in a32-uncond.decode. 
-# - -&empty -&s_rrr_shi s rd rn rm shim shty -&s_rrr_shr s rn rd rm rs shty -&s_rri_rot s rn rd imm rot -&s_rrrr s rd rn rm ra -&rrrr rd rn rm ra -&rrr_rot rd rn rm rot -&rrr rd rn rm -&rr rd rm -&ri rd imm -&r rm -&i imm -&msr_reg rn r mask -&mrs_reg rd r -&msr_bank rn r sysm -&mrs_bank rd r sysm -&ldst_rr p w u rn rt rm shimm shtype -&ldst_ri p w u rn rt imm -&ldst_block rn i b u w list -&strex rn rd rt rt2 imm -&ldrex rn rt rt2 imm -&bfx rd rn lsb widthm1 -&bfi rd rn lsb msb -&sat rd rn satimm imm sh -&pkh rd rn rm imm tb -&mcr cp opc1 crn crm opc2 rt -&mcrr cp opc1 crm rt rt2 - -# Data-processing (register) - -@s_rrr_shi ---- ... .... s:1 rn:4 rd:4 shim:5 shty:2 . rm:4 \ - &s_rrr_shi -@s_rxr_shi ---- ... .... s:1 .... rd:4 shim:5 shty:2 . rm:4 \ - &s_rrr_shi rn=0 -@S_xrr_shi ---- ... .... . rn:4 .... shim:5 shty:2 . rm:4 \ - &s_rrr_shi s=1 rd=0 - -AND_rrri .... 000 0000 . .... .... ..... .. 0 .... @s_rrr_shi -EOR_rrri .... 000 0001 . .... .... ..... .. 0 .... @s_rrr_shi -SUB_rrri .... 000 0010 . .... .... ..... .. 0 .... @s_rrr_shi -RSB_rrri .... 000 0011 . .... .... ..... .. 0 .... @s_rrr_shi -ADD_rrri .... 000 0100 . .... .... ..... .. 0 .... @s_rrr_shi -ADC_rrri .... 000 0101 . .... .... ..... .. 0 .... @s_rrr_shi -SBC_rrri .... 000 0110 . .... .... ..... .. 0 .... @s_rrr_shi -RSC_rrri .... 000 0111 . .... .... ..... .. 0 .... @s_rrr_shi -TST_xrri .... 000 1000 1 .... 0000 ..... .. 0 .... @S_xrr_shi -TEQ_xrri .... 000 1001 1 .... 0000 ..... .. 0 .... @S_xrr_shi -CMP_xrri .... 000 1010 1 .... 0000 ..... .. 0 .... @S_xrr_shi -CMN_xrri .... 000 1011 1 .... 0000 ..... .. 0 .... @S_xrr_shi -ORR_rrri .... 000 1100 . .... .... ..... .. 0 .... @s_rrr_shi -MOV_rxri .... 000 1101 . 0000 .... ..... .. 0 .... @s_rxr_shi -BIC_rrri .... 000 1110 . .... .... ..... .. 0 .... @s_rrr_shi -MVN_rxri .... 000 1111 . 0000 .... ..... .. 0 .... @s_rxr_shi - -%imm16 16:4 0:12 -@mov16 ---- .... .... .... rd:4 ............ &ri imm=%imm16 - -MOVW .... 0011 0000 .... .... ............ @mov16 -MOVT .... 0011 0100 .... .... ............ @mov16 - -# Data-processing (register-shifted register) - -@s_rrr_shr ---- ... .... s:1 rn:4 rd:4 rs:4 . shty:2 . rm:4 \ - &s_rrr_shr -@s_rxr_shr ---- ... .... s:1 .... rd:4 rs:4 . shty:2 . rm:4 \ - &s_rrr_shr rn=0 -@S_xrr_shr ---- ... .... . rn:4 .... rs:4 . shty:2 . rm:4 \ - &s_rrr_shr rd=0 s=1 - -AND_rrrr .... 000 0000 . .... .... .... 0 .. 1 .... @s_rrr_shr -EOR_rrrr .... 000 0001 . .... .... .... 0 .. 1 .... @s_rrr_shr -SUB_rrrr .... 000 0010 . .... .... .... 0 .. 1 .... @s_rrr_shr -RSB_rrrr .... 000 0011 . .... .... .... 0 .. 1 .... @s_rrr_shr -ADD_rrrr .... 000 0100 . .... .... .... 0 .. 1 .... @s_rrr_shr -ADC_rrrr .... 000 0101 . .... .... .... 0 .. 1 .... @s_rrr_shr -SBC_rrrr .... 000 0110 . .... .... .... 0 .. 1 .... @s_rrr_shr -RSC_rrrr .... 000 0111 . .... .... .... 0 .. 1 .... @s_rrr_shr -TST_xrrr .... 000 1000 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr -TEQ_xrrr .... 000 1001 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr -CMP_xrrr .... 000 1010 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr -CMN_xrrr .... 000 1011 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr -ORR_rrrr .... 000 1100 . .... .... .... 0 .. 1 .... @s_rrr_shr -MOV_rxrr .... 000 1101 . 0000 .... .... 0 .. 1 .... @s_rxr_shr -BIC_rrrr .... 000 1110 . .... .... .... 0 .. 1 .... @s_rrr_shr -MVN_rxrr .... 000 1111 . 0000 .... .... 0 .. 1 .... @s_rxr_shr - -# Data-processing (immediate) - -%a32extrot 8:4 !function=times_2 - -@s_rri_rot ---- ... .... s:1 rn:4 rd:4 .... imm:8 \ - &s_rri_rot rot=%a32extrot -@s_rxi_rot ---- ... .... 
s:1 .... rd:4 .... imm:8 \ - &s_rri_rot rot=%a32extrot rn=0 -@S_xri_rot ---- ... .... . rn:4 .... .... imm:8 \ - &s_rri_rot rot=%a32extrot rd=0 s=1 - -AND_rri .... 001 0000 . .... .... ............ @s_rri_rot -EOR_rri .... 001 0001 . .... .... ............ @s_rri_rot -SUB_rri .... 001 0010 . .... .... ............ @s_rri_rot -RSB_rri .... 001 0011 . .... .... ............ @s_rri_rot -ADD_rri .... 001 0100 . .... .... ............ @s_rri_rot -ADC_rri .... 001 0101 . .... .... ............ @s_rri_rot -SBC_rri .... 001 0110 . .... .... ............ @s_rri_rot -RSC_rri .... 001 0111 . .... .... ............ @s_rri_rot -TST_xri .... 001 1000 1 .... 0000 ............ @S_xri_rot -TEQ_xri .... 001 1001 1 .... 0000 ............ @S_xri_rot -CMP_xri .... 001 1010 1 .... 0000 ............ @S_xri_rot -CMN_xri .... 001 1011 1 .... 0000 ............ @S_xri_rot -ORR_rri .... 001 1100 . .... .... ............ @s_rri_rot -MOV_rxi .... 001 1101 . 0000 .... ............ @s_rxi_rot -BIC_rri .... 001 1110 . .... .... ............ @s_rri_rot -MVN_rxi .... 001 1111 . 0000 .... ............ @s_rxi_rot - -# Multiply and multiply accumulate - -@s_rdamn ---- .... ... s:1 rd:4 ra:4 rm:4 .... rn:4 &s_rrrr -@s_rd0mn ---- .... ... s:1 rd:4 .... rm:4 .... rn:4 &s_rrrr ra=0 -@rdamn ---- .... ... . rd:4 ra:4 rm:4 .... rn:4 &rrrr -@rd0mn ---- .... ... . rd:4 .... rm:4 .... rn:4 &rrrr ra=0 - -MUL .... 0000 000 . .... 0000 .... 1001 .... @s_rd0mn -MLA .... 0000 001 . .... .... .... 1001 .... @s_rdamn -UMAAL .... 0000 010 0 .... .... .... 1001 .... @rdamn -MLS .... 0000 011 0 .... .... .... 1001 .... @rdamn -UMULL .... 0000 100 . .... .... .... 1001 .... @s_rdamn -UMLAL .... 0000 101 . .... .... .... 1001 .... @s_rdamn -SMULL .... 0000 110 . .... .... .... 1001 .... @s_rdamn -SMLAL .... 0000 111 . .... .... .... 1001 .... @s_rdamn - -# Saturating addition and subtraction - -@rndm ---- .... .... rn:4 rd:4 .... .... rm:4 &rrr - -QADD .... 0001 0000 .... .... 0000 0101 .... @rndm -QSUB .... 0001 0010 .... .... 0000 0101 .... @rndm -QDADD .... 0001 0100 .... .... 0000 0101 .... @rndm -QDSUB .... 0001 0110 .... .... 0000 0101 .... @rndm - -# Halfword multiply and multiply accumulate - -SMLABB .... 0001 0000 .... .... .... 1000 .... @rdamn -SMLABT .... 0001 0000 .... .... .... 1100 .... @rdamn -SMLATB .... 0001 0000 .... .... .... 1010 .... @rdamn -SMLATT .... 0001 0000 .... .... .... 1110 .... @rdamn -SMLAWB .... 0001 0010 .... .... .... 1000 .... @rdamn -SMULWB .... 0001 0010 .... 0000 .... 1010 .... @rd0mn -SMLAWT .... 0001 0010 .... .... .... 1100 .... @rdamn -SMULWT .... 0001 0010 .... 0000 .... 1110 .... @rd0mn -SMLALBB .... 0001 0100 .... .... .... 1000 .... @rdamn -SMLALBT .... 0001 0100 .... .... .... 1100 .... @rdamn -SMLALTB .... 0001 0100 .... .... .... 1010 .... @rdamn -SMLALTT .... 0001 0100 .... .... .... 1110 .... @rdamn -SMULBB .... 0001 0110 .... 0000 .... 1000 .... @rd0mn -SMULBT .... 0001 0110 .... 0000 .... 1100 .... @rd0mn -SMULTB .... 0001 0110 .... 0000 .... 1010 .... @rd0mn -SMULTT .... 0001 0110 .... 0000 .... 1110 .... @rd0mn - -# MSR (immediate) and hints - -&msr_i r mask rot imm -@msr_i ---- .... .... mask:4 .... rot:4 imm:8 &msr_i - -{ - { - [ - YIELD ---- 0011 0010 0000 1111 ---- 0000 0001 - WFE ---- 0011 0010 0000 1111 ---- 0000 0010 - WFI ---- 0011 0010 0000 1111 ---- 0000 0011 - - # TODO: Implement SEV, SEVL; may help SMP performance. 
- # SEV ---- 0011 0010 0000 1111 ---- 0000 0100 - # SEVL ---- 0011 0010 0000 1111 ---- 0000 0101 - - ESB ---- 0011 0010 0000 1111 ---- 0001 0000 - ] - - # The canonical nop ends in 00000000, but the whole of the - # rest of the space executes as nop if otherwise unsupported. - NOP ---- 0011 0010 0000 1111 ---- ---- ---- - } - # Note mask = 0 is covered by NOP - MSR_imm .... 0011 0010 .... 1111 .... .... .... @msr_i r=0 -} -MSR_imm .... 0011 0110 .... 1111 .... .... .... @msr_i r=1 - -# Cyclic Redundancy Check - -CRC32B .... 0001 0000 .... .... 0000 0100 .... @rndm -CRC32H .... 0001 0010 .... .... 0000 0100 .... @rndm -CRC32W .... 0001 0100 .... .... 0000 0100 .... @rndm -CRC32CB .... 0001 0000 .... .... 0010 0100 .... @rndm -CRC32CH .... 0001 0010 .... .... 0010 0100 .... @rndm -CRC32CW .... 0001 0100 .... .... 0010 0100 .... @rndm - -# Miscellaneous instructions - -%sysm 8:1 16:4 -%imm16_8_0 8:12 0:4 - -@rm ---- .... .... .... .... .... .... rm:4 &r -@rdm ---- .... .... .... rd:4 .... .... rm:4 &rr -@i16 ---- .... .... .... .... .... .... .... &i imm=%imm16_8_0 - -MRS_bank ---- 0001 0 r:1 00 .... rd:4 001. 0000 0000 &mrs_bank %sysm -MSR_bank ---- 0001 0 r:1 10 .... 1111 001. 0000 rn:4 &msr_bank %sysm - -MRS_reg ---- 0001 0 r:1 00 1111 rd:4 0000 0000 0000 &mrs_reg -MSR_reg ---- 0001 0 r:1 10 mask:4 1111 0000 0000 rn:4 &msr_reg - -BX .... 0001 0010 1111 1111 1111 0001 .... @rm -BXJ .... 0001 0010 1111 1111 1111 0010 .... @rm -BLX_r .... 0001 0010 1111 1111 1111 0011 .... @rm - -CLZ .... 0001 0110 1111 .... 1111 0001 .... @rdm - -ERET ---- 0001 0110 0000 0000 0000 0110 1110 - -HLT .... 0001 0000 .... .... .... 0111 .... @i16 -BKPT .... 0001 0010 .... .... .... 0111 .... @i16 -HVC .... 0001 0100 .... .... .... 0111 .... @i16 -SMC ---- 0001 0110 0000 0000 0000 0111 imm:4 &i - -# Load/Store Dual, Half, Signed Byte (register) - -@ldst_rr_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... rm:4 \ - &ldst_rr p=1 shimm=0 shtype=0 -@ldst_rr_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... rm:4 \ - &ldst_rr p=0 w=0 shimm=0 shtype=0 - -STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_pw0 -STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p1w - -LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_pw0 -LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_p1w - -STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_pw0 -STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_p1w - -LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_pw0 -LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p1w - -LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_pw0 -LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p1w - -LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_pw0 -LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p1w - -# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding, -# and act as normal post-indexed (P=0, W=0). -@ldst_rr_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... rm:4 \ - &ldst_rr p=0 w=0 shimm=0 shtype=0 - -STRHT_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p0w1 -LDRHT_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p0w1 -LDRSBT_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p0w1 -LDRSHT_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p0w1 - -# Load/Store word and unsigned byte (register) - -@ldst_rs_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ - &ldst_rr p=1 -@ldst_rs_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ - &ldst_rr p=0 w=0 - -STR_rr .... 011. .0.0 .... .... 
.... ...0 .... @ldst_rs_pw0 -STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p1w -STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_pw0 -STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p1w - -LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_pw0 -LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p1w -LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_pw0 -LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p1w - -@ldst_rs_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ - &ldst_rr p=0 w=0 - -STRT_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p0w1 -STRBT_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p0w1 -LDRT_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p0w1 -LDRBT_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p0w1 - -# Load/Store Dual, Half, Signed Byte (immediate) - -%imm8s_8_0 8:4 0:4 -@ldst_ri8_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... .... \ - &ldst_ri imm=%imm8s_8_0 p=1 -@ldst_ri8_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... .... \ - &ldst_ri imm=%imm8s_8_0 p=0 w=0 - -STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_pw0 -STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p1w - -LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_pw0 -LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_p1w - -STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_pw0 -STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_p1w - -LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_pw0 -LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p1w - -LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_pw0 -LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p1w - -LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_pw0 -LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p1w - -# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding, -# and act as normal post-indexed (P=0, W=0). -@ldst_ri8_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... .... \ - &ldst_ri imm=%imm8s_8_0 p=0 w=0 - -STRHT_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p0w1 -LDRHT_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p0w1 -LDRSBT_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p0w1 -LDRSHT_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p0w1 - -# Load/Store word and unsigned byte (immediate) - -@ldst_ri12_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 imm:12 &ldst_ri p=1 -@ldst_ri12_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0 - -STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p1w -STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_pw0 -STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p1w -STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_pw0 - -LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p1w -LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_pw0 -LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p1w -LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_pw0 - -@ldst_ri12_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0 - -STRT_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p0w1 -STRBT_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p0w1 -LDRT_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p0w1 -LDRBT_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p0w1 - -# Synchronization primitives - -@swp ---- .... .... rn:4 rt:4 .... .... rt2:4 - -SWP .... 0001 0000 .... .... 0000 1001 .... @swp -SWPB .... 0001 0100 .... .... 0000 1001 .... 
@swp - -# Load/Store Exclusive and Load-Acquire/Store-Release -# -# Note rt2 for STREXD/LDREXD is set by the helper after checking rt is even. - -@strex ---- .... .... rn:4 rd:4 .... .... rt:4 \ - &strex imm=0 rt2=15 -@ldrex ---- .... .... rn:4 rt:4 .... .... .... \ - &ldrex imm=0 rt2=15 -@stl ---- .... .... rn:4 .... .... .... rt:4 \ - &ldrex imm=0 rt2=15 - -STREX .... 0001 1000 .... .... 1111 1001 .... @strex -STREXD_a32 .... 0001 1010 .... .... 1111 1001 .... @strex -STREXB .... 0001 1100 .... .... 1111 1001 .... @strex -STREXH .... 0001 1110 .... .... 1111 1001 .... @strex - -STLEX .... 0001 1000 .... .... 1110 1001 .... @strex -STLEXD_a32 .... 0001 1010 .... .... 1110 1001 .... @strex -STLEXB .... 0001 1100 .... .... 1110 1001 .... @strex -STLEXH .... 0001 1110 .... .... 1110 1001 .... @strex - -STL .... 0001 1000 .... 1111 1100 1001 .... @stl -STLB .... 0001 1100 .... 1111 1100 1001 .... @stl -STLH .... 0001 1110 .... 1111 1100 1001 .... @stl - -LDREX .... 0001 1001 .... .... 1111 1001 1111 @ldrex -LDREXD_a32 .... 0001 1011 .... .... 1111 1001 1111 @ldrex -LDREXB .... 0001 1101 .... .... 1111 1001 1111 @ldrex -LDREXH .... 0001 1111 .... .... 1111 1001 1111 @ldrex - -LDAEX .... 0001 1001 .... .... 1110 1001 1111 @ldrex -LDAEXD_a32 .... 0001 1011 .... .... 1110 1001 1111 @ldrex -LDAEXB .... 0001 1101 .... .... 1110 1001 1111 @ldrex -LDAEXH .... 0001 1111 .... .... 1110 1001 1111 @ldrex - -LDA .... 0001 1001 .... .... 1100 1001 1111 @ldrex -LDAB .... 0001 1101 .... .... 1100 1001 1111 @ldrex -LDAH .... 0001 1111 .... .... 1100 1001 1111 @ldrex - -# Media instructions - -# usad8 is usada8 w/ ra=15 -USADA8 ---- 0111 1000 rd:4 ra:4 rm:4 0001 rn:4 - -# ubfx and sbfx -@bfx ---- .... ... widthm1:5 rd:4 lsb:5 ... rn:4 &bfx - -SBFX .... 0111 101 ..... .... ..... 101 .... @bfx -UBFX .... 0111 111 ..... .... ..... 101 .... @bfx - -# bfc is bfi w/ rn=15 -BFCI ---- 0111 110 msb:5 rd:4 lsb:5 001 rn:4 &bfi - -# While we could get UDEF by not including this, add the pattern for -# documentation and to conflict with any other typos in this file. -UDF 1110 0111 1111 ---- ---- ---- 1111 ---- - -# Parallel addition and subtraction - -SADD16 .... 0110 0001 .... .... 1111 0001 .... @rndm -SASX .... 0110 0001 .... .... 1111 0011 .... @rndm -SSAX .... 0110 0001 .... .... 1111 0101 .... @rndm -SSUB16 .... 0110 0001 .... .... 1111 0111 .... @rndm -SADD8 .... 0110 0001 .... .... 1111 1001 .... @rndm -SSUB8 .... 0110 0001 .... .... 1111 1111 .... @rndm - -QADD16 .... 0110 0010 .... .... 1111 0001 .... @rndm -QASX .... 0110 0010 .... .... 1111 0011 .... @rndm -QSAX .... 0110 0010 .... .... 1111 0101 .... @rndm -QSUB16 .... 0110 0010 .... .... 1111 0111 .... @rndm -QADD8 .... 0110 0010 .... .... 1111 1001 .... @rndm -QSUB8 .... 0110 0010 .... .... 1111 1111 .... @rndm - -SHADD16 .... 0110 0011 .... .... 1111 0001 .... @rndm -SHASX .... 0110 0011 .... .... 1111 0011 .... @rndm -SHSAX .... 0110 0011 .... .... 1111 0101 .... @rndm -SHSUB16 .... 0110 0011 .... .... 1111 0111 .... @rndm -SHADD8 .... 0110 0011 .... .... 1111 1001 .... @rndm -SHSUB8 .... 0110 0011 .... .... 1111 1111 .... @rndm - -UADD16 .... 0110 0101 .... .... 1111 0001 .... @rndm -UASX .... 0110 0101 .... .... 1111 0011 .... @rndm -USAX .... 0110 0101 .... .... 1111 0101 .... @rndm -USUB16 .... 0110 0101 .... .... 1111 0111 .... @rndm -UADD8 .... 0110 0101 .... .... 1111 1001 .... @rndm -USUB8 .... 0110 0101 .... .... 1111 1111 .... @rndm - -UQADD16 .... 0110 0110 .... .... 1111 0001 .... @rndm -UQASX .... 0110 0110 .... .... 1111 0011 .... 
@rndm -UQSAX .... 0110 0110 .... .... 1111 0101 .... @rndm -UQSUB16 .... 0110 0110 .... .... 1111 0111 .... @rndm -UQADD8 .... 0110 0110 .... .... 1111 1001 .... @rndm -UQSUB8 .... 0110 0110 .... .... 1111 1111 .... @rndm - -UHADD16 .... 0110 0111 .... .... 1111 0001 .... @rndm -UHASX .... 0110 0111 .... .... 1111 0011 .... @rndm -UHSAX .... 0110 0111 .... .... 1111 0101 .... @rndm -UHSUB16 .... 0110 0111 .... .... 1111 0111 .... @rndm -UHADD8 .... 0110 0111 .... .... 1111 1001 .... @rndm -UHSUB8 .... 0110 0111 .... .... 1111 1111 .... @rndm - -# Packing, unpacking, saturation, and reversal - -PKH ---- 0110 1000 rn:4 rd:4 imm:5 tb:1 01 rm:4 &pkh - -@sat ---- .... ... satimm:5 rd:4 imm:5 sh:1 .. rn:4 &sat -@sat16 ---- .... .... satimm:4 rd:4 .... .... rn:4 \ - &sat imm=0 sh=0 - -SSAT .... 0110 101. .... .... .... ..01 .... @sat -USAT .... 0110 111. .... .... .... ..01 .... @sat - -SSAT16 .... 0110 1010 .... .... 1111 0011 .... @sat16 -USAT16 .... 0110 1110 .... .... 1111 0011 .... @sat16 - -@rrr_rot ---- .... .... rn:4 rd:4 rot:2 ...... rm:4 &rrr_rot - -SXTAB16 .... 0110 1000 .... .... ..00 0111 .... @rrr_rot -SXTAB .... 0110 1010 .... .... ..00 0111 .... @rrr_rot -SXTAH .... 0110 1011 .... .... ..00 0111 .... @rrr_rot -UXTAB16 .... 0110 1100 .... .... ..00 0111 .... @rrr_rot -UXTAB .... 0110 1110 .... .... ..00 0111 .... @rrr_rot -UXTAH .... 0110 1111 .... .... ..00 0111 .... @rrr_rot - -SEL .... 0110 1000 .... .... 1111 1011 .... @rndm -REV .... 0110 1011 1111 .... 1111 0011 .... @rdm -REV16 .... 0110 1011 1111 .... 1111 1011 .... @rdm -REVSH .... 0110 1111 1111 .... 1111 1011 .... @rdm -RBIT .... 0110 1111 1111 .... 1111 0011 .... @rdm - -# Signed multiply, signed and unsigned divide - -@rdmn ---- .... .... rd:4 .... rm:4 .... rn:4 &rrr - -SMLAD .... 0111 0000 .... .... .... 0001 .... @rdamn -SMLADX .... 0111 0000 .... .... .... 0011 .... @rdamn -SMLSD .... 0111 0000 .... .... .... 0101 .... @rdamn -SMLSDX .... 0111 0000 .... .... .... 0111 .... @rdamn - -SDIV .... 0111 0001 .... 1111 .... 0001 .... @rdmn -UDIV .... 0111 0011 .... 1111 .... 0001 .... @rdmn - -SMLALD .... 0111 0100 .... .... .... 0001 .... @rdamn -SMLALDX .... 0111 0100 .... .... .... 0011 .... @rdamn -SMLSLD .... 0111 0100 .... .... .... 0101 .... @rdamn -SMLSLDX .... 0111 0100 .... .... .... 0111 .... @rdamn - -SMMLA .... 0111 0101 .... .... .... 0001 .... @rdamn -SMMLAR .... 0111 0101 .... .... .... 0011 .... @rdamn -SMMLS .... 0111 0101 .... .... .... 1101 .... @rdamn -SMMLSR .... 0111 0101 .... .... .... 1111 .... @rdamn - -# Block data transfer - -STM ---- 100 b:1 i:1 u:1 w:1 0 rn:4 list:16 &ldst_block -LDM_a32 ---- 100 b:1 i:1 u:1 w:1 1 rn:4 list:16 &ldst_block - -# Branch, branch with link - -%imm26 0:s24 !function=times_4 -@branch ---- .... ........................ &i imm=%imm26 - -B .... 1010 ........................ @branch -BL .... 1011 ........................ @branch - -# Coprocessor instructions - -# We decode MCR, MCR, MRRC and MCRR only, because for QEMU the -# other coprocessor instructions always UNDEF. -# The trans_ functions for these will ignore cp values 8..13 for v7 or -# earlier, and 0..13 for v8 and later, because those areas of the -# encoding space may be used for other things, such as VFP or Neon. - -@mcr ---- .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4 &mcr -@mcrr ---- .... .... rt2:4 rt:4 cp:4 opc1:4 crm:4 &mcrr - -MCRR .... 1100 0100 .... .... .... .... .... @mcrr -MRRC .... 1100 0101 .... .... .... .... .... @mcrr - -MCR .... 1110 ... 0 .... .... .... ... 1 .... @mcr -MRC .... 1110 ... 
1 .... .... .... ... 1 .... @mcr - -# Supervisor call - -SVC ---- 1111 imm:24 &i diff --git a/target/arm/m-nocp.decode b/target/arm/m-nocp.decode deleted file mode 100644 index b65c801..0000000 --- a/target/arm/m-nocp.decode +++ /dev/null @@ -1,72 +0,0 @@ -# M-profile UserFault.NOCP exception handling -# -# Copyright (c) 2020 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# -# For M-profile, the architecture specifies that NOCP UsageFaults -# should take precedence over UNDEF faults over the whole wide -# range of coprocessor-space encodings, with the exception of -# VLLDM and VLSTM. (Compare v8.1M IsCPInstruction() pseudocode and -# v8M Arm ARM rule R_QLGM.) This isn't mandatory for v8.0M but we choose -# to behave the same as v8.1M. -# This decode is handled before any others (and in particular before -# decoding FP instructions which are in the coprocessor space). -# If the coprocessor is not present or disabled then we will generate -# the NOCP exception; otherwise we let the insn through to the main decode. - -%vd_dp 22:1 12:4 -%vd_sp 12:4 22:1 - -&nocp cp - -# M-profile VLDR/VSTR to sysreg -%vldr_sysreg 22:1 13:3 -%imm7_0x4 0:7 !function=times_4 - -&vldr_sysreg rn reg imm a w p -@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \ - reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg - -{ - # Special cases which do not take an early NOCP: VLLDM and VLSTM - VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000 - # VSCCLRM (new in v8.1M) is similar: - VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3 - VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2 - - # FP system register accesses: these are a special case because accesses - # to FPCXT_NS succeed even if the FPU is disabled. We therefore need - # to handle them before the big NOCP blocks. Note that within these - # insns NOCP still has higher priority than UNDEFs; this is implemented - # by their returning 'false' for UNDEF so as to fall through into the - # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding() - # for the UNDEFs there that must take precedence over NOCP.) - - VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 - - # P=0 W=0 is SEE "Related encodings", so split into two patterns - VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1 - VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 - VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1 - VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... 
@vldr_sysreg p=0 w=1 - - NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp - NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp - # From v8.1M onwards this range will also NOCP: - NOCP_8_1 111- 1111 ---- ---- ---- ---- ---- ---- &nocp cp=10 -} diff --git a/target/arm/meson.build b/target/arm/meson.build index 87e911b..b2904b6 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -1,22 +1,4 @@ -gen = [ - decodetree.process('sve.decode', extra_args: '--decode=disas_sve'), - decodetree.process('sme.decode', extra_args: '--decode=disas_sme'), - decodetree.process('sme-fa64.decode', extra_args: '--static-decode=disas_sme_fa64'), - decodetree.process('neon-shared.decode', extra_args: '--decode=disas_neon_shared'), - decodetree.process('neon-dp.decode', extra_args: '--decode=disas_neon_dp'), - decodetree.process('neon-ls.decode', extra_args: '--decode=disas_neon_ls'), - decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'), - decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'), - decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'), - decodetree.process('mve.decode', extra_args: '--decode=disas_mve'), - decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'), - decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'), - decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'), - decodetree.process('t16.decode', extra_args: ['-w', '16', '--static-decode=disas_t16']), -] - arm_ss = ss.source_set() -arm_ss.add(gen) arm_ss.add(files( 'cpu.c', 'crypto_helper.c', @@ -29,11 +11,6 @@ arm_ss.add(files( 'neon_helper.c', 'op_helper.c', 'tlb_helper.c', - 'translate.c', - 'translate-m-nocp.c', - 'translate-mve.c', - 'translate-neon.c', - 'translate-vfp.c', 'vec_helper.c', 'vfp_helper.c', 'cpu_tcg.c', @@ -50,9 +27,6 @@ arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'pauth_helper.c', 'sve_helper.c', 'sme_helper.c', - 'translate-a64.c', - 'translate-sve.c', - 'translate-sme.c', )) arm_softmmu_ss = ss.source_set() @@ -67,5 +41,9 @@ arm_softmmu_ss.add(files( subdir('hvf') +if 'CONFIG_TCG' in config_all + subdir('tcg') +endif + target_arch += {'arm': arm_ss} target_softmmu_arch += {'arm': arm_softmmu_ss} diff --git a/target/arm/mve.decode b/target/arm/mve.decode deleted file mode 100644 index 14a4f39..0000000 --- a/target/arm/mve.decode +++ /dev/null @@ -1,832 +0,0 @@ -# M-profile MVE instruction descriptions -# -# Copyright (c) 2021 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# - -%qd 22:1 13:3 -%qm 5:1 1:3 -%qn 7:1 17:3 - -# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit -%size_28 28:1 !function=plus_1 - -# 2 operand fp insns have size in bit 20: 1 for 16 bit, 0 for 32 bit, -# like Neon FP insns. 
-%2op_fp_size 20:1 !function=neon_3same_fp_size -# VCADD is an exception, where bit 20 is 0 for 16 bit and 1 for 32 bit -%2op_fp_size_rev 20:1 !function=plus_1 -# FP scalars have size in bit 28, 1 for 16 bit, 0 for 32 bit -%2op_fp_scalar_size 28:1 !function=neon_3same_fp_size - -# 1imm format immediate -%imm_28_16_0 28:1 16:3 0:4 - -&vldr_vstr rn qd imm p a w size l u -&1op qd qm size -&2op qd qm qn size -&2scalar qd qn rm size -&1imm qd imm cmode op -&2shift qd qm shift size -&vidup qd rn size imm -&viwdup qd rn rm size imm -&vcmp qm qn size mask -&vcmp_scalar qn rm size mask -&shl_scalar qda rm size -&vmaxv qm rda size -&vabav qn qm rda size -&vldst_sg qd qm rn size msize os -&vldst_sg_imm qd qm a w imm -&vldst_il qd rn size pat w - -# scatter-gather memory size is in bits 6:4 -%sg_msize 6:1 4:1 - -@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 -# Note that both Rn and Qd are 3 bits only (no D bit) -@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr - -@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \ - qd=%qd qm=%qm msize=%sg_msize - -# Qm is in the fields usually labeled Qn -@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . imm:7 &vldst_sg_imm \ - qd=%qd qm=%qn - -# Deinterleaving load/interleaving store -@vldst_il .... .... .. w:1 . rn:4 .... ... size:2 pat:2 ..... &vldst_il \ - qd=%qd - -@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm -@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 -@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn -@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 -@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \ - size=%size_28 -@1imm .... .... .... .... .... cmode:4 .. op:1 . .... &1imm qd=%qd imm=%imm_28_16_0 - -# The _rev suffix indicates that Vn and Vm are reversed. This is -# the case for shifts. In the Arm ARM these insns are documented -# with the Vm and Vn fields in their usual places, but in the -# assembly the operands are listed "backwards", ie in the order -# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose -# to consider Vm and Vn as being in different fields in the insn. -# This gives us consistency with A64 and Neon. -@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm - -@2scalar .... .... .. size:2 .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn -@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn - -@2_shl_b .... .... .. 001 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0 -@2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1 -@2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2 - -@2_shll_b .... .... ... 01 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0 -@2_shll_h .... .... ... 1 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1 -# VSHLL encoding T2 where shift == esize -@2_shll_esize_b .... .... .... 00 .. .... .... .... .... &2shift \ - qd=%qd qm=%qm size=0 shift=8 -@2_shll_esize_h .... .... .... 01 .. .... .... .... .... &2shift \ - qd=%qd qm=%qm size=1 shift=16 - -# Right shifts are encoded as N - shift, where N is the element size in bits. -%rshift_i5 16:5 !function=rsub_32 -%rshift_i4 16:4 !function=rsub_16 -%rshift_i3 16:3 !function=rsub_8 - -@2_shr_b .... .... .. 001 ... .... .... .... .... &2shift qd=%qd qm=%qm \ - size=0 shift=%rshift_i3 -@2_shr_h .... .... .. 01 .... .... .... .... 
.... &2shift qd=%qd qm=%qm \ - size=1 shift=%rshift_i4 -@2_shr_w .... .... .. 1 ..... .... .... .... .... &2shift qd=%qd qm=%qm \ - size=2 shift=%rshift_i5 - -@shl_scalar .... .... .... size:2 .. .... .... .... rm:4 &shl_scalar qda=%qd - -# Vector comparison; 4-bit Qm but 3-bit Qn -%mask_22_13 22:1 13:3 -@vcmp .... .... .. size:2 qn:3 . .... .... .... .... &vcmp qm=%qm mask=%mask_22_13 -@vcmp_scalar .... .... .. size:2 qn:3 . .... .... .... rm:4 &vcmp_scalar \ - mask=%mask_22_13 - -@vcmp_fp .... .... .... qn:3 . .... .... .... .... &vcmp \ - qm=%qm size=%2op_fp_scalar_size mask=%mask_22_13 - -# Bit 28 is a 2op_fp_scalar_size bit, but we do not decode it in this -# format to avoid complicated overlapping-instruction-groups -@vcmp_fp_scalar .... .... .... qn:3 . .... .... .... rm:4 &vcmp_scalar \ - mask=%mask_22_13 - -@vmaxv .... .... .... size:2 .. rda:4 .... .... .... &vmaxv qm=%qm - -@2op_fp .... .... .... .... .... .... .... .... &2op \ - qd=%qd qn=%qn qm=%qm size=%2op_fp_size - -@2op_fp_size_rev .... .... .... .... .... .... .... .... &2op \ - qd=%qd qn=%qn qm=%qm size=%2op_fp_size_rev - -# 2-operand, but Qd and Qn share a field. Size is in bit 28, but we -# don't decode it in this format -@vmaxnma .... .... .... .... .... .... .... .... &2op \ - qd=%qd qn=%qd qm=%qm - -# Here also we don't decode the bit 28 size in the format to avoid -# awkward nested overlap groups -@vmaxnmv .... .... .... .... rda:4 .... .... .... &vmaxv qm=%qm - -@2op_fp_scalar .... .... .... .... .... .... .... rm:4 &2scalar \ - qd=%qd qn=%qn size=%2op_fp_scalar_size - -# Vector loads and stores - -# Widening loads and narrowing stores: -# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding' -# This means we need to expand out to multiple patterns for P, W, SZ. -# For stores the U bit must be 0 but we catch that in the trans_ function. -# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from -# signed halfword element in register", etc. -VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \ - p=0 w=1 size=1 -VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \ - p=1 size=1 -VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \ - p=0 w=1 size=2 -VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \ - p=1 size=2 -VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \ - p=0 w=1 size=2 -VLDSTH_W 111 . 110 1 a:1 0 w:1 . 1 ... ... 0 111 10 ....... @vldst_wn \ - p=1 size=2 - -# Non-widening loads/stores (P=0 W=0 is 'related encoding') -VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \ - size=0 p=0 w=1 -VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \ - size=1 p=0 w=1 -VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \ - size=2 p=0 w=1 -VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... @vldr_vstr \ - size=0 p=1 -VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ - size=1 p=1 -VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ - size=2 p=1 - -# gather loads/scatter stores -VLDR_S_sg 111 0 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg -VLDR_U_sg 111 1 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg -VSTR_sg 111 0 1100 1 . 00 .... ... 0 111 . .... .... @vldst_sg - -VLDRW_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1110 .... .... @vldst_sg_imm -VLDRD_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1111 .... .... @vldst_sg_imm -VSTRW_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1110 .... .... @vldst_sg_imm -VSTRD_sg_imm 111 1 1101 ... 0 ... 
0 ... 1 1111 .... .... @vldst_sg_imm - -# deinterleaving loads/interleaving stores -VLD2 1111 1100 1 .. 1 .... ... 1 111 .. .. 00000 @vldst_il -VLD4 1111 1100 1 .. 1 .... ... 1 111 .. .. 00001 @vldst_il -VST2 1111 1100 1 .. 0 .... ... 1 111 .. .. 00000 @vldst_il -VST4 1111 1100 1 .. 0 .... ... 1 111 .. .. 00001 @vldst_il - -# Moves between 2 32-bit vector lanes and 2 general purpose registers -VMOV_to_2gp 1110 1100 0 . 00 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd -VMOV_from_2gp 1110 1100 0 . 01 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd - -# Vector 2-op -VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz -VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz -VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz -VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz -VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz - -VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op -VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op -VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op - -# The VSHLL T2 encoding is not a @2op pattern, but is here because it -# overlaps what would be size=0b11 VMULH/VRMULH -{ - VCVTB_SH 111 0 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz - - VMAXNMA 111 0 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=2 - - VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b - VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h - - VQMOVUNB 111 0 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op - VQMOVN_BS 111 0 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op - - VMAXA 111 0 1110 0 . 11 .. 11 ... 0 1110 1 0 . 0 ... 1 @1op - - VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op -} - -{ - VCVTB_HS 111 1 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz - - VMAXNMA 111 1 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=1 - - VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b - VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h - - VMOVNB 111 1 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op - VQMOVN_BU 111 1 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op - - VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op -} - -{ - VCVTT_SH 111 0 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz - - VMINNMA 111 0 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=2 - VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b - VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h - - VQMOVUNT 111 0 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op - VQMOVN_TS 111 0 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op - - VMINA 111 0 1110 0 . 11 .. 11 ... 1 1110 1 0 . 0 ... 1 @1op - - VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op -} - -{ - VCVTT_HS 111 1 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz - - VMINNMA 111 1 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=1 - VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b - VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h - - VMOVNT 111 1 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op - VQMOVN_TU 111 1 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op - - VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op -} - -VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op -VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op -VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op -VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 
1 ... 0 @2op - -VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op -VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op - -VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op -VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op -VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op -VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op - -{ - VMULLP_B 111 . 1110 0 . 11 ... 1 ... 0 1110 . 0 . 0 ... 0 @2op_sz28 - VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op - VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op -} -{ - VMULLP_T 111 . 1110 0 . 11 ... 1 ... 1 1110 . 0 . 0 ... 0 @2op_sz28 - VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op - VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op -} - -VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op -VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op - -VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op -VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op -VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op -VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op - -VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev -VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev - -VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev -VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev - -VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev -VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev - -VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev -VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev - -{ - VCMUL0 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 0 @2op_sz28 - VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op - VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op -} - -{ - VCMUL180 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 0 @2op_sz28 - VQDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op - VQDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op -} - -{ - VCMUL90 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 1 @2op_sz28 - VQRDMLADH 111 0 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op - VQRDMLSDH 111 1 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op -} - -{ - VCMUL270 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 1 @2op_sz28 - VQRDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op - VQRDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op -} - -VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28 -VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 - -VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op -VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op - -{ - VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz - VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz - VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op - VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op -} - -{ - VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz - VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz - VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op - VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op -} - -# Vector miscellaneous - -VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op -VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op - -VREV16 1111 1111 1 . 11 .. 00 ... 
0 0001 01 . 0 ... 0 @1op -VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op -VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op - -VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz - -VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op -VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op -VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op -VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op - -VQABS 1111 1111 1 . 11 .. 00 ... 0 0111 01 . 0 ... 0 @1op -VQNEG 1111 1111 1 . 11 .. 00 ... 0 0111 11 . 0 ... 0 @1op - -&vdup qd rt size -# Qd is in the fields usually named Qn -@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup - -# B and E bits encode size, which we decode here to the usual size values -VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0 -VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1 -VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 - -# Incrementing and decrementing dup - -# VIDUP, VDDUP format immediate: 1 << (immh:imml) -%imm_vidup 7:1 0:1 !function=vidup_imm - -# VIDUP, VDDUP registers: Rm bits [3:1] from insn, bit 0 is 1; -# Rn bits [3:1] from insn, bit 0 is 0 -%vidup_rm 1:3 !function=times_2_plus_1 -%vidup_rn 17:3 !function=times_2 - -@vidup .... .... . . size:2 .... .... .... .... .... \ - qd=%qd imm=%imm_vidup rn=%vidup_rn &vidup -@viwdup .... .... . . size:2 .... .... .... .... .... \ - qd=%qd imm=%imm_vidup rm=%vidup_rm rn=%vidup_rn &viwdup -{ - VIDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 111 . @vidup - VIWDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 ... . @viwdup -} -{ - VCMPGT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=2 - VCMPLE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=2 - VDDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 111 . @vidup - VDWDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 ... . @viwdup -} - -# multiply-add long dual accumulate -# rdahi: bits [3:1] from insn, bit 0 is 1 -# rdalo: bits [3:1] from insn, bit 0 is 0 -%rdahi 20:3 !function=times_2_plus_1 -%rdalo 13:3 !function=times_2 -# size bit is 0 for 16 bit, 1 for 32 bit -%size_16 16:1 !function=plus_1 - -&vmlaldav rdahi rdalo size qn qm x a -&vmladav rda size qn qm x a - -@vmlaldav .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \ - qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav -@vmlaldav_nosz .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \ - qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav -@vmladav .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \ - qn=%qn rda=%rdalo size=%size_16 &vmladav -@vmladav_nosz .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \ - qn=%qn rda=%rdalo size=0 &vmladav - -{ - VMLADAV_S 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav - VMLALDAV_S 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav -} -{ - VMLADAV_U 1111 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav - VMLALDAV_U 1111 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav -} - -{ - VMLSDAV 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 1 @vmladav - VMLSLDAV 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 1 @vmlaldav -} - -{ - VMLSDAV 1111 1110 1111 ... 0 ... . 1110 . 0 . 0 ... 1 @vmladav_nosz - VRMLSLDAVH 1111 1110 1 ... ... 0 ... . 1110 . 0 . 0 ... 1 @vmlaldav_nosz -} - -VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz -VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz - -{ - [ - VMAXNMAV 1110 1110 1110 11 00 .... 1111 0 0 . 0 ... 
0 @vmaxnmv size=2 - VMINNMAV 1110 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2 - VMAXNMV 1110 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=2 - VMINNMV 1110 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2 - ] - [ - VMAXV_S 1110 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv - VMINV_S 1110 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv - VMAXAV 1110 1110 1110 .. 00 .... 1111 0 0 . 0 ... 0 @vmaxv - VMINAV 1110 1110 1110 .. 00 .... 1111 1 0 . 0 ... 0 @vmaxv - ] - VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz - VRMLALDAVH_S 1110 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz -} - -{ - [ - VMAXNMAV 1111 1110 1110 11 00 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1 - VMINNMAV 1111 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1 - VMAXNMV 1111 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1 - VMINNMV 1111 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1 - ] - [ - VMAXV_U 1111 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv - VMINV_U 1111 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv - ] - VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz - VRMLALDAVH_U 1111 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz -} - -# Scalar operations - -{ - VCMPEQ_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=2 - VCMPNE_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=2 - VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar -} - -{ - VCMPLT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=2 - VCMPGE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=2 - VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar -} - -{ - VSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar - VRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar - VQSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar - VQRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar - VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar -} - -{ - VSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar - VRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar - VQSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar - VQRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar - VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar -} - -{ - VADD_fp_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 100 .... @2op_fp_scalar - VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar - VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar -} - -{ - VSUB_fp_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 100 .... @2op_fp_scalar - VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar - VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar -} - -{ - VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar - VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar - VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \ - size=%size_28 -} - -{ - VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar - VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar - VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... @2scalar_nosz \ - size=%size_28 -} - -{ - VMUL_fp_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 110 .... @2op_fp_scalar - VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 
0 1110 . 110 .... @2scalar - VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar -} - -{ - VFMA_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 100 .... @2op_fp_scalar - # The U bit (28) is don't-care because it does not affect the result - VMLA 111 - 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar -} - -{ - VFMAS_scalar 111 . 1110 0 . 11 ... 1 ... 1 1110 . 100 .... @2op_fp_scalar - # The U bit (28) is don't-care because it does not affect the result - VMLAS 111 - 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar -} - -VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar -VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar -VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar -VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar - -# Vector add across vector -{ - VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo - VADDLV 111 u:1 1110 1 ... 1001 ... 0 1111 00 a:1 0 qm:3 0 \ - rdahi=%rdahi rdalo=%rdalo -} - -@vabav .... .... .. size:2 .... rda:4 .... .... .... &vabav qn=%qn qm=%qm - -VABAV_S 111 0 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav -VABAV_U 111 1 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav - -# Logical immediate operations (1 reg and modified-immediate) - -# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but -# not in a way we can conveniently represent in decodetree without -# a lot of repetition: -# VORR: op=0, (cmode & 1) && cmode < 12 -# VBIC: op=1, (cmode & 1) && cmode < 12 -# VMOV: everything else -# So we have a single decode line and check the cmode/op in the -# trans function. -Vimm_1r 111 . 1111 1 . 00 0 ... ... 0 .... 0 1 . 1 .... @1imm - -# Shifts by immediate - -VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b -VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h -VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w - -VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b -VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h -VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w - -VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b -VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h -VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w - -VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_b -VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_h -VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_w - -VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b -VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h -VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w - -VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b -VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h -VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w - -VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b -VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h -VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w - -VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b -VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h -VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w - -# VSHLL T1 encoding; the T2 VSHLL encoding is elsewhere in this file -# Note that VMOVL is encoded as "VSHLL with a zero shift count"; we -# implement it that way rather than special-casing it in the decode. -VSHLL_BS 111 0 1110 1 . 1 .. ... ... 
0 1111 0 1 . 0 ... 0 @2_shll_b -VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h - -VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b -VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h - -VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b -VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h - -VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b -VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h - -# Shift-and-insert -VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_b -VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_h -VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_w - -VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b -VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h -VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w - -# Narrowing shifts (which only support b and h sizes) -VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b -VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h -VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b -VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h - -VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b -VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h -VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b -VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h - -VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b -VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h -VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b -VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h -VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b -VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h -VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b -VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h - -VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b -VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h -VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b -VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h - -VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b -VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h -VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b -VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h -VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b -VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h -VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b -VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h - -VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b -VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h -VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b -VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h - -VSHLC 111 0 1110 1 . 1 imm:5 ... 0 1111 1100 rdm:4 qd=%qd - -# Comparisons. We expand out the conditions which are split across -# encodings T1, T2, T3 and the fc bits. These include VPT, which is -# effectively "VCMP then VPST". A plain "VCMP" has a mask field of zero. -{ - VCMPEQ_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp_fp - VCMPEQ 111 1 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 
0 @vcmp -} - -{ - VCMPNE_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp_fp - VCMPNE 111 1 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp -} - -{ - VCMPGE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp_fp - VCMPGE 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp -} - -{ - VCMPLT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp_fp - VCMPLT 111 1 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp -} - -{ - VCMPGT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp_fp - VCMPGT 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp -} - -{ - VCMPLE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp_fp - VCMPLE 1111 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp -} - -{ - VPSEL 1111 1110 0 . 11 ... 1 ... 0 1111 . 0 . 0 ... 1 @2op_nosz - VCMPCS 1111 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 1 @vcmp - VCMPHI 1111 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 1 @vcmp -} - -{ - VPNOT 1111 1110 0 0 11 000 1 000 0 1111 0100 1101 - VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13 - VCMPEQ_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=1 - VCMPEQ_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0100 .... @vcmp_scalar -} - -{ - VCMPNE_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=1 - VCMPNE_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1100 .... @vcmp_scalar -} - -{ - VCMPGT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=1 - VCMPGT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0110 .... @vcmp_scalar -} - -{ - VCMPLE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=1 - VCMPLE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1110 .... @vcmp_scalar -} - -{ - VCMPGE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=1 - VCMPGE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0100 .... @vcmp_scalar -} -{ - VCMPLT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=1 - VCMPLT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1100 .... @vcmp_scalar -} - -VCMPCS_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0 1 1 0 .... @vcmp_scalar -VCMPHI_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1 1 1 0 .... @vcmp_scalar - -# 2-operand FP -VADD_fp 1110 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp -VSUB_fp 1110 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp -VMUL_fp 1111 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 1 ... 0 @2op_fp -VABD_fp 1111 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp - -VMAXNM 1111 1111 0 . 0 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp -VMINNM 1111 1111 0 . 1 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp - -VCADD90_fp 1111 1100 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev -VCADD270_fp 1111 1101 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev - -VFMA 1110 1111 0 . 0 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp -VFMS 1110 1111 0 . 1 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp - -VCMLA0 1111 110 00 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev -VCMLA90 1111 110 01 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev -VCMLA180 1111 110 10 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev -VCMLA270 1111 110 11 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev - -# floating-point <-> fixed-point conversions. Naming convention: -# VCVT_, S = signed int, U = unsigned int, H = halfprec, F = singleprec -@vcvt .... .... .. 1 ..... .... .. 1 . .... .... &2shift \ - qd=%qd qm=%qm shift=%rshift_i5 size=2 -@vcvt_f16 .... .... .. 11 .... .... .. 0 . .... .... 
&2shift \ - qd=%qd qm=%qm shift=%rshift_i4 size=1 - -VCVT_SH_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16 -VCVT_UH_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16 - -VCVT_HS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16 -VCVT_HU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16 - -VCVT_SF_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt -VCVT_UF_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt - -VCVT_FS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt -VCVT_FU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt - -# VCVT between floating point and integer (halfprec and single); -# VCVT_, S = signed int, U = unsigned int, F = float -VCVT_SF 1111 1111 1 . 11 .. 11 ... 0 011 00 1 . 0 ... 0 @1op -VCVT_UF 1111 1111 1 . 11 .. 11 ... 0 011 01 1 . 0 ... 0 @1op -VCVT_FS 1111 1111 1 . 11 .. 11 ... 0 011 10 1 . 0 ... 0 @1op -VCVT_FU 1111 1111 1 . 11 .. 11 ... 0 011 11 1 . 0 ... 0 @1op - -# VCVT from floating point to integer with specified rounding mode -VCVTAS 1111 1111 1 . 11 .. 11 ... 000 00 0 1 . 0 ... 0 @1op -VCVTAU 1111 1111 1 . 11 .. 11 ... 000 00 1 1 . 0 ... 0 @1op -VCVTNS 1111 1111 1 . 11 .. 11 ... 000 01 0 1 . 0 ... 0 @1op -VCVTNU 1111 1111 1 . 11 .. 11 ... 000 01 1 1 . 0 ... 0 @1op -VCVTPS 1111 1111 1 . 11 .. 11 ... 000 10 0 1 . 0 ... 0 @1op -VCVTPU 1111 1111 1 . 11 .. 11 ... 000 10 1 1 . 0 ... 0 @1op -VCVTMS 1111 1111 1 . 11 .. 11 ... 000 11 0 1 . 0 ... 0 @1op -VCVTMU 1111 1111 1 . 11 .. 11 ... 000 11 1 1 . 0 ... 0 @1op - -VRINTN 1111 1111 1 . 11 .. 10 ... 001 000 1 . 0 ... 0 @1op -VRINTX 1111 1111 1 . 11 .. 10 ... 001 001 1 . 0 ... 0 @1op -VRINTA 1111 1111 1 . 11 .. 10 ... 001 010 1 . 0 ... 0 @1op -VRINTZ 1111 1111 1 . 11 .. 10 ... 001 011 1 . 0 ... 0 @1op -VRINTM 1111 1111 1 . 11 .. 10 ... 001 101 1 . 0 ... 0 @1op -VRINTP 1111 1111 1 . 11 .. 10 ... 001 111 1 . 0 ... 0 @1op diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode deleted file mode 100644 index fd3a01b..0000000 --- a/target/arm/neon-dp.decode +++ /dev/null @@ -1,646 +0,0 @@ -# AArch32 Neon data-processing instruction descriptions -# -# Copyright (c) 2020 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# -# VFP/Neon register fields; same as vfp.decode -%vm_dp 5:1 0:4 -%vn_dp 7:1 16:4 -%vd_dp 22:1 12:4 - -# Encodings for Neon data processing instructions where the T32 encoding -# is a simple transformation of the A32 encoding. -# More specifically, this file covers instructions where the A32 encoding is -# 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq -# and the T32 encoding is -# 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq -# This file works on the A32 encoding only; calling code for T32 has to -# transform the insn into the A32 version first. 
- -###################################################################### -# 3-reg-same grouping: -# 1111 001 U 0 D sz:2 Vn:4 Vd:4 opc:4 N Q M op Vm:4 -###################################################################### - -&3same vm vn vd q size - -@3same .... ... . . . size:2 .... .... .... . q:1 . . .... \ - &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp - -@3same_q0 .... ... . . . size:2 .... .... .... . 0 . . .... \ - &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 - -# For FP insns the high bit of 'size' is used as part of opcode decode, -# and the 'size' bit is 0 for 32-bit float and 1 for 16-bit float. -# This converts this encoding to the same MO_8/16/32/64 values that the -# integer neon insns use. -%3same_fp_size 20:1 !function=neon_3same_fp_size - -@3same_fp .... ... . . . . . .... .... .... . q:1 . . .... \ - &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%3same_fp_size -@3same_fp_q0 .... ... . . . . . .... .... .... . 0 . . .... \ - &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 size=%3same_fp_size - -VHADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 0 .... @3same -VHADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 0 .... @3same -VQADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 1 .... @3same -VQADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 1 .... @3same - -VRHADD_S_3s 1111 001 0 0 . .. .... .... 0001 . . . 0 .... @3same -VRHADD_U_3s 1111 001 1 0 . .. .... .... 0001 . . . 0 .... @3same - -@3same_logic .... ... . . . .. .... .... .... . q:1 .. .... \ - &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 - -VAND_3s 1111 001 0 0 . 00 .... .... 0001 ... 1 .... @3same_logic -VBIC_3s 1111 001 0 0 . 01 .... .... 0001 ... 1 .... @3same_logic -VORR_3s 1111 001 0 0 . 10 .... .... 0001 ... 1 .... @3same_logic -VORN_3s 1111 001 0 0 . 11 .... .... 0001 ... 1 .... @3same_logic -VEOR_3s 1111 001 1 0 . 00 .... .... 0001 ... 1 .... @3same_logic -VBSL_3s 1111 001 1 0 . 01 .... .... 0001 ... 1 .... @3same_logic -VBIT_3s 1111 001 1 0 . 10 .... .... 0001 ... 1 .... @3same_logic -VBIF_3s 1111 001 1 0 . 11 .... .... 0001 ... 1 .... @3same_logic - -VHSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 0 .... @3same -VHSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 0 .... @3same - -VQSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 1 .... @3same -VQSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 1 .... @3same - -VCGT_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 0 .... @3same -VCGT_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 0 .... @3same -VCGE_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 1 .... @3same -VCGE_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 1 .... @3same - -# The _rev suffix indicates that Vn and Vm are reversed. This is -# the case for shifts. In the Arm ARM these insns are documented -# with the Vm and Vn fields in their usual places, but in the -# assembly the operands are listed "backwards", ie in the order -# Dd, Dm, Dn where other insns use Dd, Dn, Dm. For QEMU we choose -# to consider Vm and Vn as being in different fields in the insn, -# which allows us to avoid special-casing shifts in the trans_ -# function code. We would otherwise need to manually swap the operands -# over to call Neon helper functions that are shared with AArch64, -# which does not have this odd reversed-operand situation. -@3same_rev .... ... . . . size:2 .... .... .... . q:1 . . .... \ - &3same vn=%vm_dp vm=%vn_dp vd=%vd_dp - -VSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 0 .... @3same_rev -VSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 0 .... 
@3same_rev - -# Insns operating on 64-bit elements (size!=0b11 handled elsewhere) -# The _rev suffix indicates that Vn and Vm are reversed (as explained -# by the comment for the @3same_rev format). -@3same_64_rev .... ... . . . 11 .... .... .... . q:1 . . .... \ - &3same vm=%vn_dp vn=%vm_dp vd=%vd_dp size=3 - -{ - VQSHL_S64_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev - VQSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_rev -} -{ - VQSHL_U64_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev - VQSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_rev -} -{ - VRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev - VRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_rev -} -{ - VRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev - VRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_rev -} -{ - VQRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev - VQRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_rev -} -{ - VQRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev - VQRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_rev -} - -VMAX_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 0 .... @3same -VMAX_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 0 .... @3same -VMIN_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 1 .... @3same -VMIN_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 1 .... @3same - -VABD_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 0 .... @3same -VABD_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 0 .... @3same - -VABA_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 1 .... @3same -VABA_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 1 .... @3same - -VADD_3s 1111 001 0 0 . .. .... .... 1000 . . . 0 .... @3same -VSUB_3s 1111 001 1 0 . .. .... .... 1000 . . . 0 .... @3same - -VTST_3s 1111 001 0 0 . .. .... .... 1000 . . . 1 .... @3same -VCEQ_3s 1111 001 1 0 . .. .... .... 1000 . . . 1 .... @3same - -VMLA_3s 1111 001 0 0 . .. .... .... 1001 . . . 0 .... @3same -VMLS_3s 1111 001 1 0 . .. .... .... 1001 . . . 0 .... @3same - -VMUL_3s 1111 001 0 0 . .. .... .... 1001 . . . 1 .... @3same -VMUL_p_3s 1111 001 1 0 . .. .... .... 1001 . . . 1 .... @3same - -VPMAX_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 0 .... @3same_q0 -VPMAX_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 0 .... @3same_q0 - -VPMIN_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 1 .... @3same_q0 -VPMIN_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 1 .... @3same_q0 - -VQDMULH_3s 1111 001 0 0 . .. .... .... 1011 . . . 0 .... @3same -VQRDMULH_3s 1111 001 1 0 . .. .... .... 1011 . . . 0 .... @3same - -VPADD_3s 1111 001 0 0 . .. .... .... 1011 . . . 1 .... @3same_q0 - -VQRDMLAH_3s 1111 001 1 0 . .. .... .... 1011 ... 1 .... @3same - -@3same_crypto .... .... .... .... .... .... .... .... \ - &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 q=1 - -SHA1C_3s 1111 001 0 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto -SHA1P_3s 1111 001 0 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto -SHA1M_3s 1111 001 0 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto -SHA1SU0_3s 1111 001 0 0 . 11 .... .... 1100 . 1 . 0 .... @3same_crypto -SHA256H_3s 1111 001 1 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto -SHA256H2_3s 1111 001 1 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto -SHA256SU1_3s 1111 001 1 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto - -VFMA_fp_3s 1111 001 0 0 . 0 . .... .... 1100 ... 1 .... @3same_fp -VFMS_fp_3s 1111 001 0 0 . 1 . .... .... 1100 ... 1 .... 
@3same_fp - -VQRDMLSH_3s 1111 001 1 0 . .. .... .... 1100 ... 1 .... @3same - -VADD_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 0 .... @3same_fp -VSUB_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 0 .... @3same_fp -VPADD_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 0 .... @3same_fp_q0 -VABD_fp_3s 1111 001 1 0 . 1 . .... .... 1101 ... 0 .... @3same_fp -VMLA_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 1 .... @3same_fp -VMLS_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 1 .... @3same_fp -VMUL_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 1 .... @3same_fp -VCEQ_fp_3s 1111 001 0 0 . 0 . .... .... 1110 ... 0 .... @3same_fp -VCGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 0 .... @3same_fp -VACGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 1 .... @3same_fp -VCGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 0 .... @3same_fp -VACGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 1 .... @3same_fp -VMAX_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 0 .... @3same_fp -VMIN_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 0 .... @3same_fp -VPMAX_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 0 .... @3same_fp_q0 -VPMIN_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 0 .... @3same_fp_q0 -VRECPS_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 1 .... @3same_fp -VRSQRTS_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 1 .... @3same_fp -VMAXNM_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 1 .... @3same_fp -VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp - -###################################################################### -# 2-reg-and-shift grouping: -# 1111 001 U 1 D immH:3 immL:3 Vd:4 opc:4 L Q M 1 Vm:4 -###################################################################### -&2reg_shift vm vd q shift size - -# Right shifts are encoded as N - shift, where N is the element size in bits. -%neon_rshift_i6 16:6 !function=rsub_64 -%neon_rshift_i5 16:5 !function=rsub_32 -%neon_rshift_i4 16:4 !function=rsub_16 -%neon_rshift_i3 16:3 !function=rsub_8 - -@2reg_shr_d .... ... . . . ...... .... .... 1 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=3 shift=%neon_rshift_i6 -@2reg_shr_s .... ... . . . 1 ..... .... .... 0 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5 -@2reg_shr_h .... ... . . . 01 .... .... .... 0 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4 -@2reg_shr_b .... ... . . . 001 ... .... .... 0 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=0 shift=%neon_rshift_i3 - -@2reg_shl_d .... ... . . . shift:6 .... .... 1 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=3 -@2reg_shl_s .... ... . . . 1 shift:5 .... .... 0 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=2 -@2reg_shl_h .... ... . . . 01 shift:4 .... .... 0 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=1 -@2reg_shl_b .... ... . . . 001 shift:3 .... .... 0 q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=0 - -# Narrowing right shifts: here the Q bit is part of the opcode decode -@2reg_shrn_d .... ... . . . 1 ..... .... .... 0 . . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=3 q=0 \ - shift=%neon_rshift_i5 -@2reg_shrn_s .... ... . . . 01 .... .... .... 0 . . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 \ - shift=%neon_rshift_i4 -@2reg_shrn_h .... ... . . . 001 ... .... .... 0 . . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \ - shift=%neon_rshift_i3 - -# Long left shifts: again Q is part of opcode decode -@2reg_shll_s .... ... . . . 1 shift:5 .... .... 0 . . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 -@2reg_shll_h .... ... . . . 
01 shift:4 .... .... 0 . . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 -@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0 - -@2reg_vcvt .... ... . . . 1 ..... .... .... . q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5 -@2reg_vcvt_f16 .... ... . . . 11 .... .... .... . q:1 . . .... \ - &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4 - -VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d -VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s -VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h -VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b - -VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d -VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s -VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h -VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b - -VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d -VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s -VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h -VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b - -VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d -VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s -VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h -VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b - -VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d -VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s -VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h -VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b - -VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d -VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s -VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h -VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b - -VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d -VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s -VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h -VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b - -VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d -VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s -VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h -VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b - -VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_d -VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_s -VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_h -VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_b - -VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d -VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s -VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h -VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b - -VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d -VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s -VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h -VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b - -VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 
1 .... @2reg_shl_d -VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s -VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h -VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b - -VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d -VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s -VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h -VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b - -VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d -VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s -VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h -VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b - -VSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d -VSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s -VSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h - -VRSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d -VRSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s -VRSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h - -VQSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d -VQSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s -VQSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h - -VQRSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d -VQRSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s -VQRSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h - -# VQSHRN with signed input -VQSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d -VQSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s -VQSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h - -# VQRSHRN with signed input -VQRSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d -VQRSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s -VQRSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h - -# VQSHRN with unsigned input -VQSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d -VQSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s -VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h - -# VQRSHRN with unsigned input -VQRSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d -VQRSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s -VQRSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h - -VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s -VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h -VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b - -VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s -VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h -VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b - -# VCVT fixed<->float conversions -VCVT_SH_2sh 1111 001 0 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16 -VCVT_UH_2sh 1111 001 1 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16 -VCVT_HS_2sh 1111 001 0 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16 -VCVT_HU_2sh 1111 001 1 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16 - -VCVT_SF_2sh 1111 001 0 1 . ...... .... 1110 0 . . 1 .... 
@2reg_vcvt -VCVT_UF_2sh 1111 001 1 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt -VCVT_FS_2sh 1111 001 0 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt -VCVT_FU_2sh 1111 001 1 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt - -###################################################################### -# 1-reg-and-modified-immediate grouping: -# 1111 001 i 1 D 000 imm:3 Vd:4 cmode:4 0 Q op 1 Vm:4 -###################################################################### - -&1reg_imm vd q imm cmode op - -%asimd_imm_value 24:1 16:3 0:4 - -@1reg_imm .... ... . . . ... ... .... .... . q:1 . . .... \ - &1reg_imm imm=%asimd_imm_value vd=%vd_dp - -# The cmode/op bits here decode VORR/VBIC/VMOV/VMNV, but -# not in a way we can conveniently represent in decodetree without -# a lot of repetition: -# VORR: op=0, (cmode & 1) && cmode < 12 -# VBIC: op=1, (cmode & 1) && cmode < 12 -# VMOV: everything else -# So we have a single decode line and check the cmode/op in the -# trans function. -Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm - -###################################################################### -# Within the "two registers, or three registers of different lengths" -# grouping ([23,4]=0b10), bits [21:20] are either part of the opcode -# decode: 0b11 for VEXT, two-reg-misc, VTBL, and duplicate-scalar; -# or they are a size field for the three-reg-different-lengths and -# two-reg-and-scalar insn groups (where size cannot be 0b11). This -# is slightly awkward for decodetree: we handle it with this -# non-exclusive group which contains within it two exclusive groups: -# one for the size=0b11 patterns, and one for the size-not-0b11 -# patterns. This allows us to check that none of the insns within -# each subgroup accidentally overlap each other. Note that all the -# trans functions for the size-not-0b11 patterns must check and -# return false for size==3. -###################################################################### -{ - [ - ################################################################## - # Miscellaneous size=0b11 insns - ################################################################## - VEXT 1111 001 0 1 . 11 .... .... imm:4 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp - - VTBL 1111 001 1 1 . 11 .... .... 10 len:2 . op:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp - - VDUP_scalar 1111 001 1 1 . 11 index:3 1 .... 11 000 q:1 . 0 .... \ - vm=%vm_dp vd=%vd_dp size=0 - VDUP_scalar 1111 001 1 1 . 11 index:2 10 .... 11 000 q:1 . 0 .... \ - vm=%vm_dp vd=%vd_dp size=1 - VDUP_scalar 1111 001 1 1 . 11 index:1 100 .... 11 000 q:1 . 0 .... \ - vm=%vm_dp vd=%vd_dp size=2 - - ################################################################## - # 2-reg-misc grouping: - # 1111 001 11 D 11 size:2 opc1:2 Vd:4 0 opc2:4 q:1 M 0 Vm:4 - ################################################################## - - &2misc vd vm q size - - @2misc .... ... .. . .. size:2 .. .... . .... q:1 . . .... \ - &2misc vm=%vm_dp vd=%vd_dp - @2misc_q0 .... ... .. . .. size:2 .. .... . .... . . . .... \ - &2misc vm=%vm_dp vd=%vd_dp q=0 - @2misc_q1 .... ... .. . .. size:2 .. .... . .... . . . .... \ - &2misc vm=%vm_dp vd=%vd_dp q=1 - - VREV64 1111 001 11 . 11 .. 00 .... 0 0000 . . 0 .... @2misc - VREV32 1111 001 11 . 11 .. 00 .... 0 0001 . . 0 .... @2misc - VREV16 1111 001 11 . 11 .. 00 .... 0 0010 . . 0 .... @2misc - - VPADDL_S 1111 001 11 . 11 .. 00 .... 0 0100 . . 0 .... @2misc - VPADDL_U 1111 001 11 . 11 .. 00 .... 0 0101 . . 0 .... @2misc - - AESE 1111 001 11 . 11 .. 00 .... 0 0110 0 . 0 .... 
@2misc_q1 - AESD 1111 001 11 . 11 .. 00 .... 0 0110 1 . 0 .... @2misc_q1 - AESMC 1111 001 11 . 11 .. 00 .... 0 0111 0 . 0 .... @2misc_q1 - AESIMC 1111 001 11 . 11 .. 00 .... 0 0111 1 . 0 .... @2misc_q1 - - VCLS 1111 001 11 . 11 .. 00 .... 0 1000 . . 0 .... @2misc - VCLZ 1111 001 11 . 11 .. 00 .... 0 1001 . . 0 .... @2misc - VCNT 1111 001 11 . 11 .. 00 .... 0 1010 . . 0 .... @2misc - - VMVN 1111 001 11 . 11 .. 00 .... 0 1011 . . 0 .... @2misc - - VPADAL_S 1111 001 11 . 11 .. 00 .... 0 1100 . . 0 .... @2misc - VPADAL_U 1111 001 11 . 11 .. 00 .... 0 1101 . . 0 .... @2misc - - VQABS 1111 001 11 . 11 .. 00 .... 0 1110 . . 0 .... @2misc - VQNEG 1111 001 11 . 11 .. 00 .... 0 1111 . . 0 .... @2misc - - VCGT0 1111 001 11 . 11 .. 01 .... 0 0000 . . 0 .... @2misc - VCGE0 1111 001 11 . 11 .. 01 .... 0 0001 . . 0 .... @2misc - VCEQ0 1111 001 11 . 11 .. 01 .... 0 0010 . . 0 .... @2misc - VCLE0 1111 001 11 . 11 .. 01 .... 0 0011 . . 0 .... @2misc - VCLT0 1111 001 11 . 11 .. 01 .... 0 0100 . . 0 .... @2misc - - SHA1H 1111 001 11 . 11 .. 01 .... 0 0101 1 . 0 .... @2misc_q1 - - VABS 1111 001 11 . 11 .. 01 .... 0 0110 . . 0 .... @2misc - VNEG 1111 001 11 . 11 .. 01 .... 0 0111 . . 0 .... @2misc - - VCGT0_F 1111 001 11 . 11 .. 01 .... 0 1000 . . 0 .... @2misc - VCGE0_F 1111 001 11 . 11 .. 01 .... 0 1001 . . 0 .... @2misc - VCEQ0_F 1111 001 11 . 11 .. 01 .... 0 1010 . . 0 .... @2misc - VCLE0_F 1111 001 11 . 11 .. 01 .... 0 1011 . . 0 .... @2misc - VCLT0_F 1111 001 11 . 11 .. 01 .... 0 1100 . . 0 .... @2misc - - VABS_F 1111 001 11 . 11 .. 01 .... 0 1110 . . 0 .... @2misc - VNEG_F 1111 001 11 . 11 .. 01 .... 0 1111 . . 0 .... @2misc - - VSWP 1111 001 11 . 11 .. 10 .... 0 0000 . . 0 .... @2misc - VTRN 1111 001 11 . 11 .. 10 .... 0 0001 . . 0 .... @2misc - VUZP 1111 001 11 . 11 .. 10 .... 0 0010 . . 0 .... @2misc - VZIP 1111 001 11 . 11 .. 10 .... 0 0011 . . 0 .... @2misc - - VMOVN 1111 001 11 . 11 .. 10 .... 0 0100 0 . 0 .... @2misc_q0 - # VQMOVUN: unsigned result (source is always signed) - VQMOVUN 1111 001 11 . 11 .. 10 .... 0 0100 1 . 0 .... @2misc_q0 - # VQMOVN: signed result, source may be signed (_S) or unsigned (_U) - VQMOVN_S 1111 001 11 . 11 .. 10 .... 0 0101 0 . 0 .... @2misc_q0 - VQMOVN_U 1111 001 11 . 11 .. 10 .... 0 0101 1 . 0 .... @2misc_q0 - - VSHLL 1111 001 11 . 11 .. 10 .... 0 0110 0 . 0 .... @2misc_q0 - - SHA1SU1 1111 001 11 . 11 .. 10 .... 0 0111 0 . 0 .... @2misc_q1 - SHA256SU0 1111 001 11 . 11 .. 10 .... 0 0111 1 . 0 .... @2misc_q1 - - VRINTN 1111 001 11 . 11 .. 10 .... 0 1000 . . 0 .... @2misc - VRINTX 1111 001 11 . 11 .. 10 .... 0 1001 . . 0 .... @2misc - VRINTA 1111 001 11 . 11 .. 10 .... 0 1010 . . 0 .... @2misc - VRINTZ 1111 001 11 . 11 .. 10 .... 0 1011 . . 0 .... @2misc - - VCVT_F16_F32 1111 001 11 . 11 .. 10 .... 0 1100 0 . 0 .... @2misc_q0 - VCVT_B16_F32 1111 001 11 . 11 .. 10 .... 0 1100 1 . 0 .... @2misc_q0 - - VRINTM 1111 001 11 . 11 .. 10 .... 0 1101 . . 0 .... @2misc - - VCVT_F32_F16 1111 001 11 . 11 .. 10 .... 0 1110 0 . 0 .... @2misc_q0 - - VRINTP 1111 001 11 . 11 .. 10 .... 0 1111 . . 0 .... @2misc - - VCVTAS 1111 001 11 . 11 .. 11 .... 0 0000 . . 0 .... @2misc - VCVTAU 1111 001 11 . 11 .. 11 .... 0 0001 . . 0 .... @2misc - VCVTNS 1111 001 11 . 11 .. 11 .... 0 0010 . . 0 .... @2misc - VCVTNU 1111 001 11 . 11 .. 11 .... 0 0011 . . 0 .... @2misc - VCVTPS 1111 001 11 . 11 .. 11 .... 0 0100 . . 0 .... @2misc - VCVTPU 1111 001 11 . 11 .. 11 .... 0 0101 . . 0 .... @2misc - VCVTMS 1111 001 11 . 11 .. 11 .... 0 0110 . . 0 .... @2misc - VCVTMU 1111 001 11 . 11 .. 11 .... 0 0111 . . 
0 .... @2misc - - VRECPE 1111 001 11 . 11 .. 11 .... 0 1000 . . 0 .... @2misc - VRSQRTE 1111 001 11 . 11 .. 11 .... 0 1001 . . 0 .... @2misc - VRECPE_F 1111 001 11 . 11 .. 11 .... 0 1010 . . 0 .... @2misc - VRSQRTE_F 1111 001 11 . 11 .. 11 .... 0 1011 . . 0 .... @2misc - VCVT_FS 1111 001 11 . 11 .. 11 .... 0 1100 . . 0 .... @2misc - VCVT_FU 1111 001 11 . 11 .. 11 .... 0 1101 . . 0 .... @2misc - VCVT_SF 1111 001 11 . 11 .. 11 .... 0 1110 . . 0 .... @2misc - VCVT_UF 1111 001 11 . 11 .. 11 .... 0 1111 . . 0 .... @2misc - ] - - # Subgroup for size != 0b11 - [ - ################################################################## - # 3-reg-different-length grouping: - # 1111 001 U 1 D sz!=11 Vn:4 Vd:4 opc:4 N 0 M 0 Vm:4 - ################################################################## - - &3diff vm vn vd size - - @3diff .... ... . . . size:2 .... .... .... . . . . .... \ - &3diff vm=%vm_dp vn=%vn_dp vd=%vd_dp - - VADDL_S_3d 1111 001 0 1 . .. .... .... 0000 . 0 . 0 .... @3diff - VADDL_U_3d 1111 001 1 1 . .. .... .... 0000 . 0 . 0 .... @3diff - - VADDW_S_3d 1111 001 0 1 . .. .... .... 0001 . 0 . 0 .... @3diff - VADDW_U_3d 1111 001 1 1 . .. .... .... 0001 . 0 . 0 .... @3diff - - VSUBL_S_3d 1111 001 0 1 . .. .... .... 0010 . 0 . 0 .... @3diff - VSUBL_U_3d 1111 001 1 1 . .. .... .... 0010 . 0 . 0 .... @3diff - - VSUBW_S_3d 1111 001 0 1 . .. .... .... 0011 . 0 . 0 .... @3diff - VSUBW_U_3d 1111 001 1 1 . .. .... .... 0011 . 0 . 0 .... @3diff - - VADDHN_3d 1111 001 0 1 . .. .... .... 0100 . 0 . 0 .... @3diff - VRADDHN_3d 1111 001 1 1 . .. .... .... 0100 . 0 . 0 .... @3diff - - VABAL_S_3d 1111 001 0 1 . .. .... .... 0101 . 0 . 0 .... @3diff - VABAL_U_3d 1111 001 1 1 . .. .... .... 0101 . 0 . 0 .... @3diff - - VSUBHN_3d 1111 001 0 1 . .. .... .... 0110 . 0 . 0 .... @3diff - VRSUBHN_3d 1111 001 1 1 . .. .... .... 0110 . 0 . 0 .... @3diff - - VABDL_S_3d 1111 001 0 1 . .. .... .... 0111 . 0 . 0 .... @3diff - VABDL_U_3d 1111 001 1 1 . .. .... .... 0111 . 0 . 0 .... @3diff - - VMLAL_S_3d 1111 001 0 1 . .. .... .... 1000 . 0 . 0 .... @3diff - VMLAL_U_3d 1111 001 1 1 . .. .... .... 1000 . 0 . 0 .... @3diff - - VQDMLAL_3d 1111 001 0 1 . .. .... .... 1001 . 0 . 0 .... @3diff - - VMLSL_S_3d 1111 001 0 1 . .. .... .... 1010 . 0 . 0 .... @3diff - VMLSL_U_3d 1111 001 1 1 . .. .... .... 1010 . 0 . 0 .... @3diff - - VQDMLSL_3d 1111 001 0 1 . .. .... .... 1011 . 0 . 0 .... @3diff - - VMULL_S_3d 1111 001 0 1 . .. .... .... 1100 . 0 . 0 .... @3diff - VMULL_U_3d 1111 001 1 1 . .. .... .... 1100 . 0 . 0 .... @3diff - - VQDMULL_3d 1111 001 0 1 . .. .... .... 1101 . 0 . 0 .... @3diff - - VMULL_P_3d 1111 001 0 1 . .. .... .... 1110 . 0 . 0 .... @3diff - - ################################################################## - # 2-regs-plus-scalar grouping: - # 1111 001 Q 1 D sz!=11 Vn:4 Vd:4 opc:4 N 1 M 0 Vm:4 - ################################################################## - &2scalar vm vn vd size q - - @2scalar .... ... q:1 . . size:2 .... .... .... . . . . .... \ - &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp - # For the 'long' ops the Q bit is part of insn decode - @2scalar_q0 .... ... . . . size:2 .... .... .... . . . . .... \ - &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 - - VMLA_2sc 1111 001 . 1 . .. .... .... 0000 . 1 . 0 .... @2scalar - VMLA_F_2sc 1111 001 . 1 . .. .... .... 0001 . 1 . 0 .... @2scalar - - VMLAL_S_2sc 1111 001 0 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0 - VMLAL_U_2sc 1111 001 1 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0 - - VQDMLAL_2sc 1111 001 0 1 . .. .... .... 0011 . 1 . 0 .... 
@2scalar_q0 - - VMLS_2sc 1111 001 . 1 . .. .... .... 0100 . 1 . 0 .... @2scalar - VMLS_F_2sc 1111 001 . 1 . .. .... .... 0101 . 1 . 0 .... @2scalar - - VMLSL_S_2sc 1111 001 0 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0 - VMLSL_U_2sc 1111 001 1 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0 - - VQDMLSL_2sc 1111 001 0 1 . .. .... .... 0111 . 1 . 0 .... @2scalar_q0 - - VMUL_2sc 1111 001 . 1 . .. .... .... 1000 . 1 . 0 .... @2scalar - VMUL_F_2sc 1111 001 . 1 . .. .... .... 1001 . 1 . 0 .... @2scalar - - VMULL_S_2sc 1111 001 0 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0 - VMULL_U_2sc 1111 001 1 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0 - - VQDMULL_2sc 1111 001 0 1 . .. .... .... 1011 . 1 . 0 .... @2scalar_q0 - - VQDMULH_2sc 1111 001 . 1 . .. .... .... 1100 . 1 . 0 .... @2scalar - VQRDMULH_2sc 1111 001 . 1 . .. .... .... 1101 . 1 . 0 .... @2scalar - - VQRDMLAH_2sc 1111 001 . 1 . .. .... .... 1110 . 1 . 0 .... @2scalar - VQRDMLSH_2sc 1111 001 . 1 . .. .... .... 1111 . 1 . 0 .... @2scalar - ] -} diff --git a/target/arm/neon-ls.decode b/target/arm/neon-ls.decode deleted file mode 100644 index c5f364c..0000000 --- a/target/arm/neon-ls.decode +++ /dev/null @@ -1,52 +0,0 @@ -# AArch32 Neon load/store instruction descriptions -# -# Copyright (c) 2020 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# - -# Encodings for Neon load/store instructions where the T32 encoding -# is a simple transformation of the A32 encoding. -# More specifically, this file covers instructions where the A32 encoding is -# 0b1111_0100_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx -# and the T32 encoding is -# 0b1111_1001_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx -# This file works on the A32 encoding only; calling code for T32 has to -# transform the insn into the A32 version first. - -%vd_dp 22:1 12:4 - -# Neon load/store multiple structures - -VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \ - vd=%vd_dp - -# Neon load single element to all lanes - -VLD_all_lanes 1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \ - vd=%vd_dp - -# Neon load/store single structure to one lane -%imm1_5_p1 5:1 !function=plus_1 -%imm1_6_p1 6:1 !function=plus_1 - -VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 00 n:2 reg_idx:3 align:1 rm:4 \ - vd=%vd_dp size=0 stride=1 -VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 01 n:2 reg_idx:2 . align:1 rm:4 \ - vd=%vd_dp size=1 stride=%imm1_5_p1 -VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 10 n:2 reg_idx:1 . 
align:2 rm:4 \ - vd=%vd_dp size=2 stride=%imm1_6_p1 diff --git a/target/arm/neon-shared.decode b/target/arm/neon-shared.decode deleted file mode 100644 index 8e6bd0b..0000000 --- a/target/arm/neon-shared.decode +++ /dev/null @@ -1,99 +0,0 @@ -# AArch32 Neon instruction descriptions -# -# Copyright (c) 2020 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# - -# Encodings for Neon instructions whose encoding is the same for -# both A32 and T32. - -# More specifically, this covers: -# 2reg scalar ext: 0b1111_1110_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx -# 3same ext: 0b1111_110x_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx - -# VFP/Neon register fields; same as vfp.decode -%vm_dp 5:1 0:4 -%vm_sp 0:4 5:1 -%vn_dp 7:1 16:4 -%vn_sp 16:4 7:1 -%vd_dp 22:1 12:4 -%vd_sp 12:4 22:1 - -# For VCMLA/VCADD insns, convert the single-bit size field -# which is 0 for fp16 and 1 for fp32 into a MO_* constant. -# (Note that this is the reverse of the sense of the 1-bit size -# field in the 3same_fp Neon insns.) -%vcadd_size 20:1 !function=plus_1 - -VCMLA 1111 110 rot:2 . 1 . .... .... 1000 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size - -VCADD 1111 110 rot:1 1 . 0 . .... .... 1000 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size - -VSDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp -VUDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 1 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp -VUSDOT 1111 110 01 . 10 .... .... 1101 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp -VDOT_b16 1111 110 00 . 00 .... .... 1101 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp - -# VFM[AS]L -VFML 1111 110 0 s:1 . 10 .... .... 1000 . 0 . 1 .... \ - vm=%vm_sp vn=%vn_sp vd=%vd_dp q=0 -VFML 1111 110 0 s:1 . 10 .... .... 1000 . 1 . 1 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp q=1 - -VSMMLA 1111 1100 0.10 .... .... 1100 .1.0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp -VUMMLA 1111 1100 0.10 .... .... 1100 .1.1 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp -VUSMMLA 1111 1100 1.10 .... .... 1100 .1.0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp -VMMLA_b16 1111 1100 0.00 .... .... 1100 .1.0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp - -VFMA_b16 1111 110 0 0.11 .... .... 1000 . q:1 . 1 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp - -VCMLA_scalar 1111 1110 0 . rot:2 .... .... 1000 . q:1 index:1 0 vm:4 \ - vn=%vn_dp vd=%vd_dp size=1 -VCMLA_scalar 1111 1110 1 . rot:2 .... .... 1000 . q:1 . 0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp size=2 index=0 - -VSDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 0 vm:4 \ - vn=%vn_dp vd=%vd_dp -VUDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 1 vm:4 \ - vn=%vn_dp vd=%vd_dp -VUSDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \ - vn=%vn_dp vd=%vd_dp -VSUDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 1 vm:4 \ - vn=%vn_dp vd=%vd_dp -VDOT_b16_scal 1111 1110 0 . 00 .... .... 1101 . 
q:1 index:1 0 vm:4 \
-                vn=%vn_dp vd=%vd_dp
-
-%vfml_scalar_q0_rm 0:3 5:1
-%vfml_scalar_q1_index 5:1 3:1
-VFML_scalar     1111 1110 0 . 0 s:1 .... .... 1000 . 0 . 1 index:1 ... \
-                rm=%vfml_scalar_q0_rm vn=%vn_sp vd=%vd_dp q=0
-VFML_scalar     1111 1110 0 . 0 s:1 .... .... 1000 . 1 . 1 . rm:3 \
-                index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp q=1
-VFMA_b16_scal   1111 1110 0.11 .... .... 1000 . q:1 . 1 . vm:3 \
-                index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp
diff --git a/target/arm/sme-fa64.decode b/target/arm/sme-fa64.decode
deleted file mode 100644
index 47708cc..0000000
--- a/target/arm/sme-fa64.decode
+++ /dev/null
@@ -1,60 +0,0 @@
-# AArch64 SME allowed instruction decoding
-#
-# Copyright (c) 2022 Linaro, Ltd
-#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, see .
-
-#
-# This file is processed by scripts/decodetree.py
-#
-
-# These patterns are taken from Appendix E1.1 of DDI0616 A.a,
-# Arm Architecture Reference Manual Supplement,
-# The Scalable Matrix Extension (SME), for Armv9-A
-
-{
-  [
-    OK  0-00 1110 0000 0001 0010 11-- ---- ----   # SMOV W|Xd,Vn.B[0]
-    OK  0-00 1110 0000 0010 0010 11-- ---- ----   # SMOV W|Xd,Vn.H[0]
-    OK  0100 1110 0000 0100 0010 11-- ---- ----   # SMOV Xd,Vn.S[0]
-    OK  0000 1110 0000 0001 0011 11-- ---- ----   # UMOV Wd,Vn.B[0]
-    OK  0000 1110 0000 0010 0011 11-- ---- ----   # UMOV Wd,Vn.H[0]
-    OK  0000 1110 0000 0100 0011 11-- ---- ----   # UMOV Wd,Vn.S[0]
-    OK  0100 1110 0000 1000 0011 11-- ---- ----   # UMOV Xd,Vn.D[0]
-  ]
-  FAIL  0--0 111- ---- ---- ---- ---- ---- ----   # Advanced SIMD vector operations
-}
-
-{
-  [
-    OK  0101 1110 --1- ---- 11-1 11-- ---- ----   # FMULX/FRECPS/FRSQRTS (scalar)
-    OK  0101 1110 -10- ---- 00-1 11-- ---- ----   # FMULX/FRECPS/FRSQRTS (scalar, FP16)
-    OK  01-1 1110 1-10 0001 11-1 10-- ---- ----   # FRECPE/FRSQRTE/FRECPX (scalar)
-    OK  01-1 1110 1111 1001 11-1 10-- ---- ----   # FRECPE/FRSQRTE/FRECPX (scalar, FP16)
-  ]
-  FAIL  01-1 111- ---- ---- ---- ---- ---- ----   # Advanced SIMD single-element operations
-}
-
-FAIL    0-00 110- ---- ---- ---- ---- ---- ----   # Advanced SIMD structure load/store
-FAIL    1100 1110 ---- ---- ---- ---- ---- ----   # Advanced SIMD cryptography extensions
-FAIL    0001 1110 0111 1110 0000 00-- ---- ----   # FJCVTZS
-
-# These are the "avoidance of doubt" final table of Illegal Advanced SIMD instructions
-# We don't actually need to include these, as the default is OK.
-#   -001 111- ---- ---- ---- ---- ---- ----   # Scalar floating-point operations
-#   --10 110- ---- ---- ---- ---- ---- ----   # Load/store pair of FP registers
-#   --01 1100 ---- ---- ---- ---- ---- ----   # Load FP register (PC-relative literal)
-#   --11 1100 --0- ---- ---- ---- ---- ----   # Load/store FP register (unscaled imm)
-#   --11 1100 --1- ---- ---- ---- ---- --10   # Load/store FP register (register offset)
-#   --11 1101 ---- ---- ---- ---- ---- ----   # Load/store FP register (scaled imm)
diff --git a/target/arm/sme.decode b/target/arm/sme.decode
deleted file mode 100644
index 628804e..0000000
--- a/target/arm/sme.decode
+++ /dev/null
@@ -1,88 +0,0 @@
-# AArch64 SME instruction descriptions
-#
-# Copyright (c) 2022 Linaro, Ltd
-#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, see .
-
-#
-# This file is processed by scripts/decodetree.py
-#
-
-### SME Misc
-
-ZERO            11000000 00 001 00000000000 imm:8
-
-### SME Move into/from Array
-
-%mova_rs        13:2 !function=plus_12
-&mova           esz rs pg zr za_imm v:bool to_vec:bool
-
-MOVA            11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4 \
-                &mova to_vec=0 rs=%mova_rs
-MOVA            11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za_imm:4 \
-                &mova to_vec=0 rs=%mova_rs esz=4
-
-MOVA            11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5 \
-                &mova to_vec=1 rs=%mova_rs
-MOVA            11000000 11 00001 1 v:1 .. pg:3 0 za_imm:4 zr:5 \
-                &mova to_vec=1 rs=%mova_rs esz=4
-
-### SME Memory
-
-&ldst           esz rs pg rn rm za_imm v:bool st:bool
-
-LDST1           1110000 0 esz:2 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
-                &ldst rs=%mova_rs
-LDST1           1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \
-                &ldst esz=4 rs=%mova_rs
-
-&ldstr          rv rn imm
-@ldstr          ....... ... . ...... .. ... rn:5 . imm:4 \
-                &ldstr rv=%mova_rs
-
-LDR             1110000 100 0 000000 .. 000 ..... 0 ....   @ldstr
-STR             1110000 100 1 000000 .. 000 ..... 0 ....   @ldstr
-
-### SME Add Vector to Array
-
-&adda           zad zn pm pn
-@adda_32        ........ .. ..... . pm:3 pn:3 zn:5 ... zad:2   &adda
-@adda_64        ........ .. ..... . pm:3 pn:3 zn:5 .. zad:3    &adda
-
-ADDHA_s         11000000 10 01000 0 ... ... ..... 000 ..   @adda_32
-ADDVA_s         11000000 10 01000 1 ... ... ..... 000 ..   @adda_32
-ADDHA_d         11000000 11 01000 0 ... ... ..... 00 ...   @adda_64
-ADDVA_d         11000000 11 01000 1 ... ... ..... 00 ...   @adda_64
-
-### SME Outer Product
-
-&op             zad zn zm pm pn sub:bool
-@op_32          ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 .. zad:2   &op
-@op_64          ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 . zad:3    &op
-
-FMOPA_s         10000000 100 ..... ... ... ..... . 00 ..   @op_32
-FMOPA_d         10000000 110 ..... ... ... ..... . 0 ...   @op_64
-
-BFMOPA          10000001 100 ..... ... ... ..... . 00 ..   @op_32
-FMOPA_h         10000001 101 ..... ... ... ..... . 00 ..   @op_32
-
-SMOPA_s         1010000 0 10 0 ..... ... ... ..... . 00 ..   @op_32
-SUMOPA_s        1010000 0 10 1 ..... ... ... ..... . 00 ..   @op_32
-USMOPA_s        1010000 1 10 0 ..... ... ... ..... . 00 ..   @op_32
-UMOPA_s         1010000 1 10 1 ..... ... ... ..... . 00 ..   @op_32
-
-SMOPA_d         1010000 0 11 0 ..... ... ... ..... . 0 ...   @op_64
-SUMOPA_d        1010000 0 11 1 ..... ...
... ..... . 0 ... @op_64 -USMOPA_d 1010000 1 11 0 ..... ... ... ..... . 0 ... @op_64 -UMOPA_d 1010000 1 11 1 ..... ... ... ..... . 0 ... @op_64 diff --git a/target/arm/sve.decode b/target/arm/sve.decode deleted file mode 100644 index 14b3a69..0000000 --- a/target/arm/sve.decode +++ /dev/null @@ -1,1702 +0,0 @@ -# AArch64 SVE instruction descriptions -# -# Copyright (c) 2017 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# - -########################################################################### -# Named fields. These are primarily for disjoint fields. - -%imm4_16_p1 16:4 !function=plus_1 -%imm6_22_5 22:1 5:5 -%imm7_22_16 22:2 16:5 -%imm8_16_10 16:5 10:3 -%imm9_16_10 16:s6 10:3 -%size_23 23:2 -%dtype_23_13 23:2 13:2 -%index3_22_19 22:1 19:2 -%index3_19_11 19:2 11:1 -%index2_20_11 20:1 11:1 - -# A combination of tsz:imm3 -- extract esize. -%tszimm_esz 22:2 5:5 !function=tszimm_esz -# A combination of tsz:imm3 -- extract (2 * esize) - (tsz:imm3) -%tszimm_shr 22:2 5:5 !function=tszimm_shr -# A combination of tsz:imm3 -- extract (tsz:imm3) - esize -%tszimm_shl 22:2 5:5 !function=tszimm_shl - -# Similarly for the tszh/tszl pair at 22/16 for zzi -%tszimm16_esz 22:2 16:5 !function=tszimm_esz -%tszimm16_shr 22:2 16:5 !function=tszimm_shr -%tszimm16_shl 22:2 16:5 !function=tszimm_shl - -# Signed 8-bit immediate, optionally shifted left by 8. -%sh8_i8s 5:9 !function=expand_imm_sh8s -# Unsigned 8-bit immediate, optionally shifted left by 8. -%sh8_i8u 5:9 !function=expand_imm_sh8u - -# Unsigned load of msz into esz=2, represented as a dtype. -%msz_dtype 23:2 !function=msz_dtype - -# Either a copy of rd (at bit 0), or a different source -# as propagated via the MOVPRFX instruction. -%reg_movprfx 0:5 - -########################################################################### -# Named attribute sets. These are used to make nice(er) names -# when creating helpers common to those for the individual -# instruction patterns. 
- -&rr_esz rd rn esz -&rri rd rn imm -&rr_dbm rd rn dbm -&rrri rd rn rm imm -&rri_esz rd rn imm esz -&rrri_esz rd rn rm imm esz -&rrr_esz rd rn rm esz -&rrx_esz rd rn rm index esz -&rpr_esz rd pg rn esz -&rpr_s rd pg rn s -&rprr_s rd pg rn rm s -&rprr_esz rd pg rn rm esz -&rrrr_esz rd ra rn rm esz -&rrxr_esz rd rn rm ra index esz -&rprrr_esz rd pg rn rm ra esz -&rpri_esz rd pg rn imm esz -&ptrue rd esz pat s -&incdec_cnt rd pat esz imm d u -&incdec2_cnt rd rn pat esz imm d u -&incdec_pred rd pg esz d u -&incdec2_pred rd rn pg esz d u -&rprr_load rd pg rn rm dtype nreg -&rpri_load rd pg rn imm dtype nreg -&rprr_store rd pg rn rm msz esz nreg -&rpri_store rd pg rn imm msz esz nreg -&rprr_gather_load rd pg rn rm esz msz u ff xs scale -&rpri_gather_load rd pg rn imm esz msz u ff -&rprr_scatter_store rd pg rn rm esz msz xs scale -&rpri_scatter_store rd pg rn imm esz msz - -########################################################################### -# Named instruction formats. These are generally used to -# reduce the amount of duplication between instruction patterns. - -# Two operand with unused vector element size -@pd_pn_e0 ........ ........ ....... rn:4 . rd:4 &rr_esz esz=0 - -# Two operand -@pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz -@rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz - -# Two operand with governing predicate, flags setting -@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s -@pd_pg_pn_s0 ........ . . ...... .. pg:4 . rn:4 . rd:4 &rpr_s s=0 - -# Three operand with unused vector element size -@rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0 - -# Three predicate operand, with governing predicate, flag setting -@pd_pg_pn_pm_s ........ . s:1 .. rm:4 .. pg:4 . rn:4 . rd:4 &rprr_s - -# Three operand, vector element size -@rd_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 &rrr_esz -@pd_pn_pm ........ esz:2 .. rm:4 ....... rn:4 . rd:4 &rrr_esz -@rdn_rm ........ esz:2 ...... ...... rm:5 rd:5 \ - &rrr_esz rn=%reg_movprfx -@rdn_rm_e0 ........ .. ...... ...... rm:5 rd:5 \ - &rrr_esz rn=%reg_movprfx esz=0 -@rdn_sh_i8u ........ esz:2 ...... ...... ..... rd:5 \ - &rri_esz rn=%reg_movprfx imm=%sh8_i8u -@rdn_i8u ........ esz:2 ...... ... imm:8 rd:5 \ - &rri_esz rn=%reg_movprfx -@rdn_i8s ........ esz:2 ...... ... imm:s8 rd:5 \ - &rri_esz rn=%reg_movprfx - -# Four operand, vector element size -@rda_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 \ - &rrrr_esz ra=%reg_movprfx - -# Four operand with unused vector element size -@rda_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 \ - &rrrr_esz esz=0 ra=%reg_movprfx -@rdn_ra_rm_e0 ........ ... rm:5 ... ... ra:5 rd:5 \ - &rrrr_esz esz=0 rn=%reg_movprfx - -# Three operand with "memory" size, aka immediate left shift -@rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri - -# Two register operand, with governing predicate, vector element size -@rdn_pg_rm ........ esz:2 ... ... ... pg:3 rm:5 rd:5 \ - &rprr_esz rn=%reg_movprfx -@rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \ - &rprr_esz rm=%reg_movprfx -@rd_pg4_rn_rm ........ esz:2 . rm:5 .. pg:4 rn:5 rd:5 &rprr_esz -@pd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 . rd:4 &rprr_esz - -# Three register operand, with governing predicate, vector element size -@rda_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 \ - &rprrr_esz ra=%reg_movprfx -@rdn_pg_ra_rm ........ esz:2 . rm:5 ... pg:3 ra:5 rd:5 \ - &rprrr_esz rn=%reg_movprfx -@rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \ - &rprrr_esz rn=%reg_movprfx -@rd_pg_rn_rm ........ esz:2 . rm:5 ... 
pg:3 rn:5 rd:5 &rprr_esz - -# One register operand, with governing predicate, vector element size -@rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz -@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz -@pd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 . rd:4 &rpr_esz - -# One register operand, with governing predicate, no vector element size -@rd_pg_rn_e0 ........ .. ... ... ... pg:3 rn:5 rd:5 &rpr_esz esz=0 - -# Two register operands with a 6-bit signed immediate. -@rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri - -# Two register operand, one immediate operand, with predicate, -# element size encoded as TSZHL. -@rdn_pg_tszimm_shl ........ .. ... ... ... pg:3 ..... rd:5 \ - &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shl -@rdn_pg_tszimm_shr ........ .. ... ... ... pg:3 ..... rd:5 \ - &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shr - -# Similarly without predicate. -@rd_rn_tszimm_shl ........ .. ... ... ...... rn:5 rd:5 \ - &rri_esz esz=%tszimm16_esz imm=%tszimm16_shl -@rd_rn_tszimm_shr ........ .. ... ... ...... rn:5 rd:5 \ - &rri_esz esz=%tszimm16_esz imm=%tszimm16_shr - -# Two register operand, one immediate operand, with 4-bit predicate. -# User must fill in imm. -@rdn_pg4 ........ esz:2 .. pg:4 ... ........ rd:5 \ - &rpri_esz rn=%reg_movprfx - -# Two register operand, one one-bit floating-point operand. -@rdn_i1 ........ esz:2 ......... pg:3 .... imm:1 rd:5 \ - &rpri_esz rn=%reg_movprfx - -# Two register operand, one encoded bitmask. -@rdn_dbm ........ .. .... dbm:13 rd:5 \ - &rr_dbm rn=%reg_movprfx - -# Predicate output, vector and immediate input, -# controlling predicate, element size. -@pd_pg_rn_i7 ........ esz:2 . imm:7 . pg:3 rn:5 . rd:4 &rpri_esz -@pd_pg_rn_i5 ........ esz:2 . imm:s5 ... pg:3 rn:5 . rd:4 &rpri_esz - -# Basic Load/Store with 9-bit immediate offset -@pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \ - &rri imm=%imm9_16_10 -@rd_rn_i9 ........ ........ ...... rn:5 rd:5 \ - &rri imm=%imm9_16_10 - -# One register, pattern, and uint4+1. -# User must fill in U and D. -@incdec_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ - &incdec_cnt imm=%imm4_16_p1 -@incdec2_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ - &incdec2_cnt imm=%imm4_16_p1 rn=%reg_movprfx - -# One register, predicate. -# User must fill in U and D. -@incdec_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 &incdec_pred -@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \ - &incdec2_pred rn=%reg_movprfx - -# Loads; user must fill in NREG. -@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load -@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load - -@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \ - &rprr_load dtype=%msz_dtype -@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \ - &rpri_load dtype=%msz_dtype - -# Gather Loads. -@rprr_g_load_u ....... .. . . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load xs=2 -@rprr_g_load_xs_u ....... .. xs:1 . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load -@rprr_g_load_xs_u_sc ....... .. xs:1 scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load -@rprr_g_load_xs_sc ....... .. xs:1 scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load -@rprr_g_load_u_sc ....... .. . scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load xs=2 -@rprr_g_load_sc ....... .. . scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load xs=2 -@rpri_g_load ....... msz:2 .. imm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ - &rpri_gather_load - -# Stores; user must fill in ESZ, MSZ, NREG as needed. 
-@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store -@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store -@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \ - &rprr_store nreg=0 -@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \ - &rprr_scatter_store -@rpri_scatter_store ....... msz:2 .. imm:5 ... pg:3 rn:5 rd:5 \ - &rpri_scatter_store - -# Two registers and a scalar by N-bit index -@rrx_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \ - &rrx_esz index=%index3_22_19 -@rrx_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 &rrx_esz -@rrx_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 &rrx_esz - -# Two registers and a scalar by N-bit index, alternate -@rrx_3a ........ .. . .. rm:3 ...... rn:5 rd:5 \ - &rrx_esz index=%index3_19_11 -@rrx_2a ........ .. . . rm:4 ...... rn:5 rd:5 \ - &rrx_esz index=%index2_20_11 - -# Three registers and a scalar by N-bit index -@rrxr_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \ - &rrxr_esz ra=%reg_movprfx index=%index3_22_19 -@rrxr_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 \ - &rrxr_esz ra=%reg_movprfx -@rrxr_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 \ - &rrxr_esz ra=%reg_movprfx - -# Three registers and a scalar by N-bit index, alternate -@rrxr_3a ........ .. ... rm:3 ...... rn:5 rd:5 \ - &rrxr_esz ra=%reg_movprfx index=%index3_19_11 -@rrxr_2a ........ .. .. rm:4 ...... rn:5 rd:5 \ - &rrxr_esz ra=%reg_movprfx index=%index2_20_11 - -########################################################################### -# Instruction patterns. Grouped according to the SVE encodingindex.xhtml. - -### SVE Integer Arithmetic - Binary Predicated Group - -# SVE bitwise logical vector operations (predicated) -ORR_zpzz 00000100 .. 011 000 000 ... ..... ..... @rdn_pg_rm -EOR_zpzz 00000100 .. 011 001 000 ... ..... ..... @rdn_pg_rm -AND_zpzz 00000100 .. 011 010 000 ... ..... ..... @rdn_pg_rm -BIC_zpzz 00000100 .. 011 011 000 ... ..... ..... @rdn_pg_rm - -# SVE integer add/subtract vectors (predicated) -ADD_zpzz 00000100 .. 000 000 000 ... ..... ..... @rdn_pg_rm -SUB_zpzz 00000100 .. 000 001 000 ... ..... ..... @rdn_pg_rm -SUB_zpzz 00000100 .. 000 011 000 ... ..... ..... @rdm_pg_rn # SUBR - -# SVE integer min/max/difference (predicated) -SMAX_zpzz 00000100 .. 001 000 000 ... ..... ..... @rdn_pg_rm -UMAX_zpzz 00000100 .. 001 001 000 ... ..... ..... @rdn_pg_rm -SMIN_zpzz 00000100 .. 001 010 000 ... ..... ..... @rdn_pg_rm -UMIN_zpzz 00000100 .. 001 011 000 ... ..... ..... @rdn_pg_rm -SABD_zpzz 00000100 .. 001 100 000 ... ..... ..... @rdn_pg_rm -UABD_zpzz 00000100 .. 001 101 000 ... ..... ..... @rdn_pg_rm - -# SVE integer multiply/divide (predicated) -MUL_zpzz 00000100 .. 010 000 000 ... ..... ..... @rdn_pg_rm -SMULH_zpzz 00000100 .. 010 010 000 ... ..... ..... @rdn_pg_rm -UMULH_zpzz 00000100 .. 010 011 000 ... ..... ..... @rdn_pg_rm -# Note that divide requires size >= 2; below 2 is unallocated. -SDIV_zpzz 00000100 .. 010 100 000 ... ..... ..... @rdn_pg_rm -UDIV_zpzz 00000100 .. 010 101 000 ... ..... ..... @rdn_pg_rm -SDIV_zpzz 00000100 .. 010 110 000 ... ..... ..... @rdm_pg_rn # SDIVR -UDIV_zpzz 00000100 .. 010 111 000 ... ..... ..... @rdm_pg_rn # UDIVR - -### SVE Integer Reduction Group - -# SVE bitwise logical reduction (predicated) -ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn -EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn -ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn - -# SVE constructive prefix (predicated) -MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn -MOVPRFX_m 00000100 .. 010 001 001 ... 
..... ..... @rd_pg_rn - -# SVE integer add reduction (predicated) -# Note that saddv requires size != 3. -UADDV 00000100 .. 000 001 001 ... ..... ..... @rd_pg_rn -SADDV 00000100 .. 000 000 001 ... ..... ..... @rd_pg_rn - -# SVE integer min/max reduction (predicated) -SMAXV 00000100 .. 001 000 001 ... ..... ..... @rd_pg_rn -UMAXV 00000100 .. 001 001 001 ... ..... ..... @rd_pg_rn -SMINV 00000100 .. 001 010 001 ... ..... ..... @rd_pg_rn -UMINV 00000100 .. 001 011 001 ... ..... ..... @rd_pg_rn - -### SVE Shift by Immediate - Predicated Group - -# SVE bitwise shift by immediate (predicated) -ASR_zpzi 00000100 .. 000 000 100 ... .. ... ..... @rdn_pg_tszimm_shr -LSR_zpzi 00000100 .. 000 001 100 ... .. ... ..... @rdn_pg_tszimm_shr -LSL_zpzi 00000100 .. 000 011 100 ... .. ... ..... @rdn_pg_tszimm_shl -ASRD 00000100 .. 000 100 100 ... .. ... ..... @rdn_pg_tszimm_shr -SQSHL_zpzi 00000100 .. 000 110 100 ... .. ... ..... @rdn_pg_tszimm_shl -UQSHL_zpzi 00000100 .. 000 111 100 ... .. ... ..... @rdn_pg_tszimm_shl -SRSHR 00000100 .. 001 100 100 ... .. ... ..... @rdn_pg_tszimm_shr -URSHR 00000100 .. 001 101 100 ... .. ... ..... @rdn_pg_tszimm_shr -SQSHLU 00000100 .. 001 111 100 ... .. ... ..... @rdn_pg_tszimm_shl - -# SVE bitwise shift by vector (predicated) -ASR_zpzz 00000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm -LSR_zpzz 00000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm -LSL_zpzz 00000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm -ASR_zpzz 00000100 .. 010 100 100 ... ..... ..... @rdm_pg_rn # ASRR -LSR_zpzz 00000100 .. 010 101 100 ... ..... ..... @rdm_pg_rn # LSRR -LSL_zpzz 00000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # LSLR - -# SVE bitwise shift by wide elements (predicated) -# Note these require size != 3. -ASR_zpzw 00000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm -LSR_zpzw 00000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm -LSL_zpzw 00000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm - -### SVE Integer Arithmetic - Unary Predicated Group - -# SVE unary bit operations (predicated) -# Note esz != 0 for FABS and FNEG. -CLS 00000100 .. 011 000 101 ... ..... ..... @rd_pg_rn -CLZ 00000100 .. 011 001 101 ... ..... ..... @rd_pg_rn -CNT_zpz 00000100 .. 011 010 101 ... ..... ..... @rd_pg_rn -CNOT 00000100 .. 011 011 101 ... ..... ..... @rd_pg_rn -NOT_zpz 00000100 .. 011 110 101 ... ..... ..... @rd_pg_rn -FABS 00000100 .. 011 100 101 ... ..... ..... @rd_pg_rn -FNEG 00000100 .. 011 101 101 ... ..... ..... @rd_pg_rn - -# SVE integer unary operations (predicated) -# Note esz > original size for extensions. -ABS 00000100 .. 010 110 101 ... ..... ..... @rd_pg_rn -NEG 00000100 .. 010 111 101 ... ..... ..... @rd_pg_rn -SXTB 00000100 .. 010 000 101 ... ..... ..... @rd_pg_rn -UXTB 00000100 .. 010 001 101 ... ..... ..... @rd_pg_rn -SXTH 00000100 .. 010 010 101 ... ..... ..... @rd_pg_rn -UXTH 00000100 .. 010 011 101 ... ..... ..... @rd_pg_rn -SXTW 00000100 .. 010 100 101 ... ..... ..... @rd_pg_rn -UXTW 00000100 .. 010 101 101 ... ..... ..... @rd_pg_rn - -### SVE Floating Point Compare - Vectors Group - -# SVE floating-point compare vectors -FCMGE_ppzz 01100101 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm -FCMGT_ppzz 01100101 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm -FCMEQ_ppzz 01100101 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm -FCMNE_ppzz 01100101 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm -FCMUO_ppzz 01100101 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm -FACGE_ppzz 01100101 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm -FACGT_ppzz 01100101 .. 0 ..... 111 ... ..... 1 .... 
@pd_pg_rn_rm - -### SVE Integer Multiply-Add Group - -# SVE integer multiply-add writing addend (predicated) -MLA 00000100 .. 0 ..... 010 ... ..... ..... @rda_pg_rn_rm -MLS 00000100 .. 0 ..... 011 ... ..... ..... @rda_pg_rn_rm - -# SVE integer multiply-add writing multiplicand (predicated) -MLA 00000100 .. 0 ..... 110 ... ..... ..... @rdn_pg_ra_rm # MAD -MLS 00000100 .. 0 ..... 111 ... ..... ..... @rdn_pg_ra_rm # MSB - -### SVE Integer Arithmetic - Unpredicated Group - -# SVE integer add/subtract vectors (unpredicated) -ADD_zzz 00000100 .. 1 ..... 000 000 ..... ..... @rd_rn_rm -SUB_zzz 00000100 .. 1 ..... 000 001 ..... ..... @rd_rn_rm -SQADD_zzz 00000100 .. 1 ..... 000 100 ..... ..... @rd_rn_rm -UQADD_zzz 00000100 .. 1 ..... 000 101 ..... ..... @rd_rn_rm -SQSUB_zzz 00000100 .. 1 ..... 000 110 ..... ..... @rd_rn_rm -UQSUB_zzz 00000100 .. 1 ..... 000 111 ..... ..... @rd_rn_rm - -### SVE Logical - Unpredicated Group - -# SVE bitwise logical operations (unpredicated) -AND_zzz 00000100 00 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 -ORR_zzz 00000100 01 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 -EOR_zzz 00000100 10 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 -BIC_zzz 00000100 11 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 - -XAR 00000100 .. 1 ..... 001 101 rm:5 rd:5 &rrri_esz \ - rn=%reg_movprfx esz=%tszimm16_esz imm=%tszimm16_shr - -# SVE2 bitwise ternary operations -EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 -BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 -BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 -BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 -BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 -NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 - -### SVE Index Generation Group - -# SVE index generation (immediate start, immediate increment) -INDEX_ii 00000100 esz:2 1 imm2:s5 010000 imm1:s5 rd:5 - -# SVE index generation (immediate start, register increment) -INDEX_ir 00000100 esz:2 1 rm:5 010010 imm:s5 rd:5 - -# SVE index generation (register start, immediate increment) -INDEX_ri 00000100 esz:2 1 imm:s5 010001 rn:5 rd:5 - -# SVE index generation (register start, register increment) -INDEX_rr 00000100 .. 1 ..... 010011 ..... ..... @rd_rn_rm - -### SVE / Streaming SVE Stack Allocation Group - -# SVE stack frame adjustment -ADDVL 00000100 001 ..... 01010 ...... ..... @rd_rn_i6 -ADDSVL 00000100 001 ..... 01011 ...... ..... @rd_rn_i6 -ADDPL 00000100 011 ..... 01010 ...... ..... @rd_rn_i6 -ADDSPL 00000100 011 ..... 01011 ...... ..... @rd_rn_i6 - -# SVE stack frame size -RDVL 00000100 101 11111 01010 imm:s6 rd:5 -RDSVL 00000100 101 11111 01011 imm:s6 rd:5 - -### SVE Bitwise Shift - Unpredicated Group - -# SVE bitwise shift by immediate (unpredicated) -ASR_zzi 00000100 .. 1 ..... 1001 00 ..... ..... @rd_rn_tszimm_shr -LSR_zzi 00000100 .. 1 ..... 1001 01 ..... ..... @rd_rn_tszimm_shr -LSL_zzi 00000100 .. 1 ..... 1001 11 ..... ..... @rd_rn_tszimm_shl - -# SVE bitwise shift by wide elements (unpredicated) -# Note esz != 3 -ASR_zzw 00000100 .. 1 ..... 1000 00 ..... ..... @rd_rn_rm -LSR_zzw 00000100 .. 1 ..... 1000 01 ..... ..... @rd_rn_rm -LSL_zzw 00000100 .. 1 ..... 1000 11 ..... ..... @rd_rn_rm - -### SVE Compute Vector Address Group - -# SVE vector address generation -ADR_s32 00000100 00 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm -ADR_u32 00000100 01 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm -ADR_p32 00000100 10 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm -ADR_p64 00000100 11 1 ..... 1010 .. ..... ..... 
@rd_rn_msz_rm - -### SVE Integer Misc - Unpredicated Group - -# SVE constructive prefix (unpredicated) -MOVPRFX 00000100 00 1 00000 101111 rn:5 rd:5 - -# SVE floating-point exponential accelerator -# Note esz != 0 -FEXPA 00000100 .. 1 00000 101110 ..... ..... @rd_rn - -# SVE floating-point trig select coefficient -# Note esz != 0 -FTSSEL 00000100 .. 1 ..... 101100 ..... ..... @rd_rn_rm - -### SVE Element Count Group - -# SVE element count -CNT_r 00000100 .. 10 .... 1110 0 0 ..... ..... @incdec_cnt d=0 u=1 - -# SVE inc/dec register by element count -INCDEC_r 00000100 .. 11 .... 1110 0 d:1 ..... ..... @incdec_cnt u=1 - -# SVE saturating inc/dec register by element count -SINCDEC_r_32 00000100 .. 10 .... 1111 d:1 u:1 ..... ..... @incdec_cnt -SINCDEC_r_64 00000100 .. 11 .... 1111 d:1 u:1 ..... ..... @incdec_cnt - -# SVE inc/dec vector by element count -# Note this requires esz != 0. -INCDEC_v 00000100 .. 1 1 .... 1100 0 d:1 ..... ..... @incdec2_cnt u=1 - -# SVE saturating inc/dec vector by element count -# Note these require esz != 0. -SINCDEC_v 00000100 .. 1 0 .... 1100 d:1 u:1 ..... ..... @incdec2_cnt - -### SVE Bitwise Immediate Group - -# SVE bitwise logical with immediate (unpredicated) -ORR_zzi 00000101 00 0000 ............. ..... @rdn_dbm -EOR_zzi 00000101 01 0000 ............. ..... @rdn_dbm -AND_zzi 00000101 10 0000 ............. ..... @rdn_dbm - -# SVE broadcast bitmask immediate -DUPM 00000101 11 0000 dbm:13 rd:5 - -### SVE Integer Wide Immediate - Predicated Group - -# SVE copy floating-point immediate (predicated) -FCPY 00000101 .. 01 .... 110 imm:8 ..... @rdn_pg4 - -# SVE copy integer immediate (predicated) -{ - INVALID 00000101 00 01 ---- 01 1 -------- ----- - CPY_m_i 00000101 .. 01 .... 01 . ........ ..... @rdn_pg4 imm=%sh8_i8s -} -{ - INVALID 00000101 00 01 ---- 00 1 -------- ----- - CPY_z_i 00000101 .. 01 .... 00 . ........ ..... @rdn_pg4 imm=%sh8_i8s -} - -### SVE Permute - Extract Group - -# SVE extract vector (destructive) -EXT 00000101 001 ..... 000 ... rm:5 rd:5 \ - &rrri rn=%reg_movprfx imm=%imm8_16_10 - -# SVE2 extract vector (constructive) -EXT_sve2 00000101 011 ..... 000 ... rn:5 rd:5 \ - &rri imm=%imm8_16_10 - -### SVE Permute - Unpredicated Group - -# SVE broadcast general register -DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn - -# SVE broadcast indexed element -DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \ - &rri imm=%imm7_22_16 - -# SVE insert SIMD&FP scalar register -INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm - -# SVE insert general register -INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm - -# SVE reverse vector elements -REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn - -# SVE vector table lookup -TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm - -# SVE unpack vector elements -UNPK 00000101 esz:2 1100 u:1 h:1 001110 rn:5 rd:5 - -# SVE2 Table Lookup (three sources) - -TBL_sve2 00000101 .. 1 ..... 001010 ..... ..... @rd_rn_rm -TBX 00000101 .. 1 ..... 001011 ..... ..... @rd_rn_rm - -### SVE Permute - Predicates Group - -# SVE permute predicate elements -ZIP1_p 00000101 .. 10 .... 010 000 0 .... 0 .... @pd_pn_pm -ZIP2_p 00000101 .. 10 .... 010 001 0 .... 0 .... @pd_pn_pm -UZP1_p 00000101 .. 10 .... 010 010 0 .... 0 .... @pd_pn_pm -UZP2_p 00000101 .. 10 .... 010 011 0 .... 0 .... @pd_pn_pm -TRN1_p 00000101 .. 10 .... 010 100 0 .... 0 .... @pd_pn_pm -TRN2_p 00000101 .. 10 .... 010 101 0 .... 0 .... @pd_pn_pm - -# SVE reverse predicate elements -REV_p 00000101 .. 11 0100 010 000 0 .... 0 .... 
@pd_pn - -# SVE unpack predicate elements -PUNPKLO 00000101 00 11 0000 010 000 0 .... 0 .... @pd_pn_e0 -PUNPKHI 00000101 00 11 0001 010 000 0 .... 0 .... @pd_pn_e0 - -### SVE Permute - Interleaving Group - -# SVE permute vector elements -ZIP1_z 00000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm -ZIP2_z 00000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm -UZP1_z 00000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm -UZP2_z 00000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm -TRN1_z 00000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm -TRN2_z 00000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm - -# SVE2 permute vector segments -ZIP1_q 00000101 10 1 ..... 000 000 ..... ..... @rd_rn_rm_e0 -ZIP2_q 00000101 10 1 ..... 000 001 ..... ..... @rd_rn_rm_e0 -UZP1_q 00000101 10 1 ..... 000 010 ..... ..... @rd_rn_rm_e0 -UZP2_q 00000101 10 1 ..... 000 011 ..... ..... @rd_rn_rm_e0 -TRN1_q 00000101 10 1 ..... 000 110 ..... ..... @rd_rn_rm_e0 -TRN2_q 00000101 10 1 ..... 000 111 ..... ..... @rd_rn_rm_e0 - -### SVE Permute - Predicated Group - -# SVE compress active elements -# Note esz >= 2 -COMPACT 00000101 .. 100001 100 ... ..... ..... @rd_pg_rn - -# SVE conditionally broadcast element to vector -CLASTA_z 00000101 .. 10100 0 100 ... ..... ..... @rdn_pg_rm -CLASTB_z 00000101 .. 10100 1 100 ... ..... ..... @rdn_pg_rm - -# SVE conditionally copy element to SIMD&FP scalar -CLASTA_v 00000101 .. 10101 0 100 ... ..... ..... @rd_pg_rn -CLASTB_v 00000101 .. 10101 1 100 ... ..... ..... @rd_pg_rn - -# SVE conditionally copy element to general register -CLASTA_r 00000101 .. 11000 0 101 ... ..... ..... @rd_pg_rn -CLASTB_r 00000101 .. 11000 1 101 ... ..... ..... @rd_pg_rn - -# SVE copy element to SIMD&FP scalar register -LASTA_v 00000101 .. 10001 0 100 ... ..... ..... @rd_pg_rn -LASTB_v 00000101 .. 10001 1 100 ... ..... ..... @rd_pg_rn - -# SVE copy element to general register -LASTA_r 00000101 .. 10000 0 101 ... ..... ..... @rd_pg_rn -LASTB_r 00000101 .. 10000 1 101 ... ..... ..... @rd_pg_rn - -# SVE copy element from SIMD&FP scalar register -CPY_m_v 00000101 .. 100000 100 ... ..... ..... @rd_pg_rn - -# SVE copy element from general register to vector (predicated) -CPY_m_r 00000101 .. 101000 101 ... ..... ..... @rd_pg_rn - -# SVE reverse within elements -# Note esz >= operation size -REVB 00000101 .. 1001 00 100 ... ..... ..... @rd_pg_rn -REVH 00000101 .. 1001 01 100 ... ..... ..... @rd_pg_rn -REVW 00000101 .. 1001 10 100 ... ..... ..... @rd_pg_rn -RBIT 00000101 .. 1001 11 100 ... ..... ..... @rd_pg_rn -REVD 00000101 00 1011 10 100 ... ..... ..... @rd_pg_rn_e0 - -# SVE vector splice (predicated, destructive) -SPLICE 00000101 .. 101 100 100 ... ..... ..... @rdn_pg_rm - -# SVE2 vector splice (predicated, constructive) -SPLICE_sve2 00000101 .. 101 101 100 ... ..... ..... @rd_pg_rn - -### SVE Select Vectors Group - -# SVE select vector elements (predicated) -SEL_zpzz 00000101 .. 1 ..... 11 .... ..... ..... @rd_pg4_rn_rm - -### SVE Integer Compare - Vectors Group - -# SVE integer compare_vectors -CMPHS_ppzz 00100100 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_rm -CMPHI_ppzz 00100100 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_rm -CMPGE_ppzz 00100100 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_rm -CMPGT_ppzz 00100100 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_rm -CMPEQ_ppzz 00100100 .. 0 ..... 101 ... ..... 0 .... @pd_pg_rn_rm -CMPNE_ppzz 00100100 .. 0 ..... 101 ... ..... 1 .... @pd_pg_rn_rm - -# SVE integer compare with wide elements -# Note these require esz != 3. -CMPEQ_ppzw 00100100 .. 0 ..... 001 ... ..... 0 .... 
@pd_pg_rn_rm -CMPNE_ppzw 00100100 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_rm -CMPGE_ppzw 00100100 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm -CMPGT_ppzw 00100100 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm -CMPLT_ppzw 00100100 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm -CMPLE_ppzw 00100100 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm -CMPHS_ppzw 00100100 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm -CMPHI_ppzw 00100100 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm -CMPLO_ppzw 00100100 .. 0 ..... 111 ... ..... 0 .... @pd_pg_rn_rm -CMPLS_ppzw 00100100 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm - -### SVE Integer Compare - Unsigned Immediate Group - -# SVE integer compare with unsigned immediate -CMPHS_ppzi 00100100 .. 1 ....... 0 ... ..... 0 .... @pd_pg_rn_i7 -CMPHI_ppzi 00100100 .. 1 ....... 0 ... ..... 1 .... @pd_pg_rn_i7 -CMPLO_ppzi 00100100 .. 1 ....... 1 ... ..... 0 .... @pd_pg_rn_i7 -CMPLS_ppzi 00100100 .. 1 ....... 1 ... ..... 1 .... @pd_pg_rn_i7 - -### SVE Integer Compare - Signed Immediate Group - -# SVE integer compare with signed immediate -CMPGE_ppzi 00100101 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_i5 -CMPGT_ppzi 00100101 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_i5 -CMPLT_ppzi 00100101 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_i5 -CMPLE_ppzi 00100101 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_i5 -CMPEQ_ppzi 00100101 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_i5 -CMPNE_ppzi 00100101 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_i5 - -### SVE Predicate Logical Operations Group - -# SVE predicate logical operations -AND_pppp 00100101 0. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s -BIC_pppp 00100101 0. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s -EOR_pppp 00100101 0. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s -SEL_pppp 00100101 0. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s -ORR_pppp 00100101 1. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s -ORN_pppp 00100101 1. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s -NOR_pppp 00100101 1. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s -NAND_pppp 00100101 1. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s - -### SVE Predicate Misc Group - -# SVE predicate test -PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000 - -# SVE predicate initialize -PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4 - -# SVE initialize FFR -SETFFR 00100101 0010 1100 1001 0000 0000 0000 - -# SVE zero predicate register -PFALSE 00100101 0001 1000 1110 0100 0000 rd:4 - -# SVE predicate read from FFR (predicated) -RDFFR_p 00100101 0 s:1 0110001111000 pg:4 0 rd:4 - -# SVE predicate read from FFR (unpredicated) -RDFFR 00100101 0001 1001 1111 0000 0000 rd:4 - -# SVE FFR write from predicate (WRFFR) -WRFFR 00100101 0010 1000 1001 000 rn:4 00000 - -# SVE predicate first active -PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0 - -# SVE predicate next active -PNEXT 00100101 .. 011 001 11000 10 .... 0 .... @pd_pn - -### SVE Partition Break Group - -# SVE propagate break from previous partition -BRKPA 00100101 0. 00 .... 11 .... 0 .... 0 .... @pd_pg_pn_pm_s -BRKPB 00100101 0. 00 .... 11 .... 0 .... 1 .... @pd_pg_pn_pm_s - -# SVE partition break condition -BRKA_z 00100101 0. 01000001 .... 0 .... 0 .... @pd_pg_pn_s -BRKB_z 00100101 1. 01000001 .... 0 .... 0 .... @pd_pg_pn_s -BRKA_m 00100101 00 01000001 .... 0 .... 1 .... @pd_pg_pn_s0 -BRKB_m 00100101 10 01000001 .... 0 .... 1 .... @pd_pg_pn_s0 - -# SVE propagate break to next partition -BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s - -### SVE Predicate Count Group - -# SVE predicate count -CNTP 00100101 .. 
100 000 10 .... 0 .... ..... @rd_pg4_pn - -# SVE inc/dec register by predicate count -INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1 - -# SVE inc/dec vector by predicate count -INCDECP_z 00100101 .. 10110 d:1 10000 00 .... ..... @incdec2_pred u=1 - -# SVE saturating inc/dec register by predicate count -SINCDECP_r_32 00100101 .. 1010 d:1 u:1 10001 00 .... ..... @incdec_pred -SINCDECP_r_64 00100101 .. 1010 d:1 u:1 10001 10 .... ..... @incdec_pred - -# SVE saturating inc/dec vector by predicate count -SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred - -### SVE Integer Compare - Scalars Group - -# SVE conditionally terminate scalars -CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000 - -# SVE integer compare scalar count and limit -WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 lt:1 rn:5 eq:1 rd:4 - -# SVE2 pointer conflict compare -WHILE_ptr 00100101 esz:2 1 rm:5 001 100 rn:5 rw:1 rd:4 - -### SVE Integer Wide Immediate - Unpredicated Group - -# SVE broadcast floating-point immediate (unpredicated) -FDUP 00100101 esz:2 111 00 1110 imm:8 rd:5 - -# SVE broadcast integer immediate (unpredicated) -{ - INVALID 00100101 00 111 00 011 1 -------- ----- - DUP_i 00100101 esz:2 111 00 011 . ........ rd:5 imm=%sh8_i8s -} - -# SVE integer add/subtract immediate (unpredicated) -{ - INVALID 00100101 00 100 000 11 1 -------- ----- - ADD_zzi 00100101 .. 100 000 11 . ........ ..... @rdn_sh_i8u -} -{ - INVALID 00100101 00 100 001 11 1 -------- ----- - SUB_zzi 00100101 .. 100 001 11 . ........ ..... @rdn_sh_i8u -} -{ - INVALID 00100101 00 100 011 11 1 -------- ----- - SUBR_zzi 00100101 .. 100 011 11 . ........ ..... @rdn_sh_i8u -} -{ - INVALID 00100101 00 100 100 11 1 -------- ----- - SQADD_zzi 00100101 .. 100 100 11 . ........ ..... @rdn_sh_i8u -} -{ - INVALID 00100101 00 100 101 11 1 -------- ----- - UQADD_zzi 00100101 .. 100 101 11 . ........ ..... @rdn_sh_i8u -} -{ - INVALID 00100101 00 100 110 11 1 -------- ----- - SQSUB_zzi 00100101 .. 100 110 11 . ........ ..... @rdn_sh_i8u -} -{ - INVALID 00100101 00 100 111 11 1 -------- ----- - UQSUB_zzi 00100101 .. 100 111 11 . ........ ..... @rdn_sh_i8u -} - -# SVE integer min/max immediate (unpredicated) -SMAX_zzi 00100101 .. 101 000 110 ........ ..... @rdn_i8s -UMAX_zzi 00100101 .. 101 001 110 ........ ..... @rdn_i8u -SMIN_zzi 00100101 .. 101 010 110 ........ ..... @rdn_i8s -UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u - -# SVE integer multiply immediate (unpredicated) -MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s - -# SVE integer dot product (unpredicated) -DOT_zzzz 01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 \ - ra=%reg_movprfx - -# SVE2 complex dot product (vectors) -CDOT_zzzz 01000100 esz:2 0 rm:5 0001 rot:2 rn:5 rd:5 ra=%reg_movprfx - -#### SVE Multiply - Indexed - -# SVE integer dot product (indexed) -SDOT_zzxw_s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 -SDOT_zzxw_d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 -UDOT_zzxw_s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 -UDOT_zzxw_d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 - -# SVE2 integer multiply-add (indexed) -MLA_zzxz_h 01000100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=1 -MLA_zzxz_s 01000100 10 1 ..... 000010 ..... ..... @rrxr_2 esz=2 -MLA_zzxz_d 01000100 11 1 ..... 000010 ..... ..... @rrxr_1 esz=3 -MLS_zzxz_h 01000100 0. 1 ..... 000011 ..... ..... @rrxr_3 esz=1 -MLS_zzxz_s 01000100 10 1 ..... 000011 ..... ..... @rrxr_2 esz=2 -MLS_zzxz_d 01000100 11 1 ..... 000011 ..... ..... 
@rrxr_1 esz=3 - -# SVE2 saturating multiply-add high (indexed) -SQRDMLAH_zzxz_h 01000100 0. 1 ..... 000100 ..... ..... @rrxr_3 esz=1 -SQRDMLAH_zzxz_s 01000100 10 1 ..... 000100 ..... ..... @rrxr_2 esz=2 -SQRDMLAH_zzxz_d 01000100 11 1 ..... 000100 ..... ..... @rrxr_1 esz=3 -SQRDMLSH_zzxz_h 01000100 0. 1 ..... 000101 ..... ..... @rrxr_3 esz=1 -SQRDMLSH_zzxz_s 01000100 10 1 ..... 000101 ..... ..... @rrxr_2 esz=2 -SQRDMLSH_zzxz_d 01000100 11 1 ..... 000101 ..... ..... @rrxr_1 esz=3 - -# SVE mixed sign dot product (indexed) -USDOT_zzxw_s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2 -SUDOT_zzxw_s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2 - -# SVE2 saturating multiply-add (indexed) -SQDMLALB_zzxw_s 01000100 10 1 ..... 0010.0 ..... ..... @rrxr_3a esz=2 -SQDMLALB_zzxw_d 01000100 11 1 ..... 0010.0 ..... ..... @rrxr_2a esz=3 -SQDMLALT_zzxw_s 01000100 10 1 ..... 0010.1 ..... ..... @rrxr_3a esz=2 -SQDMLALT_zzxw_d 01000100 11 1 ..... 0010.1 ..... ..... @rrxr_2a esz=3 -SQDMLSLB_zzxw_s 01000100 10 1 ..... 0011.0 ..... ..... @rrxr_3a esz=2 -SQDMLSLB_zzxw_d 01000100 11 1 ..... 0011.0 ..... ..... @rrxr_2a esz=3 -SQDMLSLT_zzxw_s 01000100 10 1 ..... 0011.1 ..... ..... @rrxr_3a esz=2 -SQDMLSLT_zzxw_d 01000100 11 1 ..... 0011.1 ..... ..... @rrxr_2a esz=3 - -# SVE2 complex integer dot product (indexed) -CDOT_zzxw_s 01000100 10 1 index:2 rm:3 0100 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx -CDOT_zzxw_d 01000100 11 1 index:1 rm:4 0100 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx - -# SVE2 complex integer multiply-add (indexed) -CMLA_zzxz_h 01000100 10 1 index:2 rm:3 0110 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx -CMLA_zzxz_s 01000100 11 1 index:1 rm:4 0110 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx - -# SVE2 complex saturating integer multiply-add (indexed) -SQRDCMLAH_zzxz_h 01000100 10 1 index:2 rm:3 0111 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx -SQRDCMLAH_zzxz_s 01000100 11 1 index:1 rm:4 0111 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx - -# SVE2 multiply-add long (indexed) -SMLALB_zzxw_s 01000100 10 1 ..... 1000.0 ..... ..... @rrxr_3a esz=2 -SMLALB_zzxw_d 01000100 11 1 ..... 1000.0 ..... ..... @rrxr_2a esz=3 -SMLALT_zzxw_s 01000100 10 1 ..... 1000.1 ..... ..... @rrxr_3a esz=2 -SMLALT_zzxw_d 01000100 11 1 ..... 1000.1 ..... ..... @rrxr_2a esz=3 -UMLALB_zzxw_s 01000100 10 1 ..... 1001.0 ..... ..... @rrxr_3a esz=2 -UMLALB_zzxw_d 01000100 11 1 ..... 1001.0 ..... ..... @rrxr_2a esz=3 -UMLALT_zzxw_s 01000100 10 1 ..... 1001.1 ..... ..... @rrxr_3a esz=2 -UMLALT_zzxw_d 01000100 11 1 ..... 1001.1 ..... ..... @rrxr_2a esz=3 -SMLSLB_zzxw_s 01000100 10 1 ..... 1010.0 ..... ..... @rrxr_3a esz=2 -SMLSLB_zzxw_d 01000100 11 1 ..... 1010.0 ..... ..... @rrxr_2a esz=3 -SMLSLT_zzxw_s 01000100 10 1 ..... 1010.1 ..... ..... @rrxr_3a esz=2 -SMLSLT_zzxw_d 01000100 11 1 ..... 1010.1 ..... ..... @rrxr_2a esz=3 -UMLSLB_zzxw_s 01000100 10 1 ..... 1011.0 ..... ..... @rrxr_3a esz=2 -UMLSLB_zzxw_d 01000100 11 1 ..... 1011.0 ..... ..... @rrxr_2a esz=3 -UMLSLT_zzxw_s 01000100 10 1 ..... 1011.1 ..... ..... @rrxr_3a esz=2 -UMLSLT_zzxw_d 01000100 11 1 ..... 1011.1 ..... ..... @rrxr_2a esz=3 - -# SVE2 integer multiply long (indexed) -SMULLB_zzx_s 01000100 10 1 ..... 1100.0 ..... ..... @rrx_3a esz=2 -SMULLB_zzx_d 01000100 11 1 ..... 1100.0 ..... ..... @rrx_2a esz=3 -SMULLT_zzx_s 01000100 10 1 ..... 1100.1 ..... ..... @rrx_3a esz=2 -SMULLT_zzx_d 01000100 11 1 ..... 1100.1 ..... ..... @rrx_2a esz=3 -UMULLB_zzx_s 01000100 10 1 ..... 1101.0 ..... ..... @rrx_3a esz=2 -UMULLB_zzx_d 01000100 11 1 ..... 1101.0 ..... ..... @rrx_2a esz=3 -UMULLT_zzx_s 01000100 10 1 ..... 
1101.1 ..... ..... @rrx_3a esz=2 -UMULLT_zzx_d 01000100 11 1 ..... 1101.1 ..... ..... @rrx_2a esz=3 - -# SVE2 saturating multiply (indexed) -SQDMULLB_zzx_s 01000100 10 1 ..... 1110.0 ..... ..... @rrx_3a esz=2 -SQDMULLB_zzx_d 01000100 11 1 ..... 1110.0 ..... ..... @rrx_2a esz=3 -SQDMULLT_zzx_s 01000100 10 1 ..... 1110.1 ..... ..... @rrx_3a esz=2 -SQDMULLT_zzx_d 01000100 11 1 ..... 1110.1 ..... ..... @rrx_2a esz=3 - -# SVE2 saturating multiply high (indexed) -SQDMULH_zzx_h 01000100 0. 1 ..... 111100 ..... ..... @rrx_3 esz=1 -SQDMULH_zzx_s 01000100 10 1 ..... 111100 ..... ..... @rrx_2 esz=2 -SQDMULH_zzx_d 01000100 11 1 ..... 111100 ..... ..... @rrx_1 esz=3 -SQRDMULH_zzx_h 01000100 0. 1 ..... 111101 ..... ..... @rrx_3 esz=1 -SQRDMULH_zzx_s 01000100 10 1 ..... 111101 ..... ..... @rrx_2 esz=2 -SQRDMULH_zzx_d 01000100 11 1 ..... 111101 ..... ..... @rrx_1 esz=3 - -# SVE2 integer multiply (indexed) -MUL_zzx_h 01000100 0. 1 ..... 111110 ..... ..... @rrx_3 esz=1 -MUL_zzx_s 01000100 10 1 ..... 111110 ..... ..... @rrx_2 esz=2 -MUL_zzx_d 01000100 11 1 ..... 111110 ..... ..... @rrx_1 esz=3 - -# SVE floating-point complex add (predicated) -FCADD 01100100 esz:2 00000 rot:1 100 pg:3 rm:5 rd:5 \ - rn=%reg_movprfx - -# SVE floating-point complex multiply-add (predicated) -FCMLA_zpzzz 01100100 esz:2 0 rm:5 0 rot:2 pg:3 rn:5 rd:5 \ - ra=%reg_movprfx - -# SVE floating-point complex multiply-add (indexed) -FCMLA_zzxz 01100100 10 1 index:2 rm:3 0001 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx esz=1 -FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \ - ra=%reg_movprfx esz=2 - -### SVE FP Multiply-Add Indexed Group - -# SVE floating-point multiply-add (indexed) -FMLA_zzxz 01100100 0. 1 ..... 000000 ..... ..... @rrxr_3 esz=1 -FMLA_zzxz 01100100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 -FMLA_zzxz 01100100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 -FMLS_zzxz 01100100 0. 1 ..... 000001 ..... ..... @rrxr_3 esz=1 -FMLS_zzxz 01100100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 -FMLS_zzxz 01100100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 - -### SVE FP Multiply Indexed Group - -# SVE floating-point multiply (indexed) -FMUL_zzx 01100100 0. 1 ..... 001000 ..... ..... @rrx_3 esz=1 -FMUL_zzx 01100100 10 1 ..... 001000 ..... ..... @rrx_2 esz=2 -FMUL_zzx 01100100 11 1 ..... 001000 ..... ..... @rrx_1 esz=3 - -### SVE FP Fast Reduction Group - -FADDV 01100101 .. 000 000 001 ... ..... ..... @rd_pg_rn -FMAXNMV 01100101 .. 000 100 001 ... ..... ..... @rd_pg_rn -FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn -FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn -FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn - -## SVE Floating Point Unary Operations - Unpredicated Group - -FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn -FRSQRTE 01100101 .. 001 111 001100 ..... ..... @rd_rn - -### SVE FP Compare with Zero Group - -FCMGE_ppz0 01100101 .. 0100 00 001 ... ..... 0 .... @pd_pg_rn -FCMGT_ppz0 01100101 .. 0100 00 001 ... ..... 1 .... @pd_pg_rn -FCMLT_ppz0 01100101 .. 0100 01 001 ... ..... 0 .... @pd_pg_rn -FCMLE_ppz0 01100101 .. 0100 01 001 ... ..... 1 .... @pd_pg_rn -FCMEQ_ppz0 01100101 .. 0100 10 001 ... ..... 0 .... @pd_pg_rn -FCMNE_ppz0 01100101 .. 0100 11 001 ... ..... 0 .... @pd_pg_rn - -### SVE FP Accumulating Reduction Group - -# SVE floating-point serial reduction (predicated) -FADDA 01100101 .. 011 000 001 ... ..... ..... @rdn_pg_rm - -### SVE Floating Point Arithmetic - Unpredicated Group - -# SVE floating-point arithmetic (unpredicated) -FADD_zzz 01100101 .. 0 ..... 000 000 ..... ..... 
@rd_rn_rm -FSUB_zzz 01100101 .. 0 ..... 000 001 ..... ..... @rd_rn_rm -FMUL_zzz 01100101 .. 0 ..... 000 010 ..... ..... @rd_rn_rm -FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm -FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm -FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm - -### SVE FP Arithmetic Predicated Group - -# SVE floating-point arithmetic (predicated) -FADD_zpzz 01100101 .. 00 0000 100 ... ..... ..... @rdn_pg_rm -FSUB_zpzz 01100101 .. 00 0001 100 ... ..... ..... @rdn_pg_rm -FMUL_zpzz 01100101 .. 00 0010 100 ... ..... ..... @rdn_pg_rm -FSUB_zpzz 01100101 .. 00 0011 100 ... ..... ..... @rdm_pg_rn # FSUBR -FMAXNM_zpzz 01100101 .. 00 0100 100 ... ..... ..... @rdn_pg_rm -FMINNM_zpzz 01100101 .. 00 0101 100 ... ..... ..... @rdn_pg_rm -FMAX_zpzz 01100101 .. 00 0110 100 ... ..... ..... @rdn_pg_rm -FMIN_zpzz 01100101 .. 00 0111 100 ... ..... ..... @rdn_pg_rm -FABD 01100101 .. 00 1000 100 ... ..... ..... @rdn_pg_rm -FSCALE 01100101 .. 00 1001 100 ... ..... ..... @rdn_pg_rm -FMULX 01100101 .. 00 1010 100 ... ..... ..... @rdn_pg_rm -FDIV 01100101 .. 00 1100 100 ... ..... ..... @rdm_pg_rn # FDIVR -FDIV 01100101 .. 00 1101 100 ... ..... ..... @rdn_pg_rm - -# SVE floating-point arithmetic with immediate (predicated) -FADD_zpzi 01100101 .. 011 000 100 ... 0000 . ..... @rdn_i1 -FSUB_zpzi 01100101 .. 011 001 100 ... 0000 . ..... @rdn_i1 -FMUL_zpzi 01100101 .. 011 010 100 ... 0000 . ..... @rdn_i1 -FSUBR_zpzi 01100101 .. 011 011 100 ... 0000 . ..... @rdn_i1 -FMAXNM_zpzi 01100101 .. 011 100 100 ... 0000 . ..... @rdn_i1 -FMINNM_zpzi 01100101 .. 011 101 100 ... 0000 . ..... @rdn_i1 -FMAX_zpzi 01100101 .. 011 110 100 ... 0000 . ..... @rdn_i1 -FMIN_zpzi 01100101 .. 011 111 100 ... 0000 . ..... @rdn_i1 - -# SVE floating-point trig multiply-add coefficient -FTMAD 01100101 esz:2 010 imm:3 100000 rm:5 rd:5 rn=%reg_movprfx - -### SVE FP Multiply-Add Group - -# SVE floating-point multiply-accumulate writing addend -FMLA_zpzzz 01100101 .. 1 ..... 000 ... ..... ..... @rda_pg_rn_rm -FMLS_zpzzz 01100101 .. 1 ..... 001 ... ..... ..... @rda_pg_rn_rm -FNMLA_zpzzz 01100101 .. 1 ..... 010 ... ..... ..... @rda_pg_rn_rm -FNMLS_zpzzz 01100101 .. 1 ..... 011 ... ..... ..... @rda_pg_rn_rm - -# SVE floating-point multiply-accumulate writing multiplicand -# Alter the operand extraction order and reuse the helpers from above. -# FMAD, FMSB, FNMAD, FNMS -FMLA_zpzzz 01100101 .. 1 ..... 100 ... ..... ..... @rdn_pg_rm_ra -FMLS_zpzzz 01100101 .. 1 ..... 101 ... ..... ..... @rdn_pg_rm_ra -FNMLA_zpzzz 01100101 .. 1 ..... 110 ... ..... ..... @rdn_pg_rm_ra -FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra - -### SVE FP Unary Operations Predicated Group - -# SVE floating-point convert precision -FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0 -FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0 -BFCVT 01100101 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0 -FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0 -FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0 -FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0 -FCVT_sd 01100101 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 - -# SVE floating-point convert to integer -FCVTZS_hh 01100101 01 011 01 0 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZU_hh 01100101 01 011 01 1 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZS_hs 01100101 01 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZU_hs 01100101 01 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZS_hd 01100101 01 011 11 0 101 ... ..... ..... 
@rd_pg_rn_e0 -FCVTZU_hd 01100101 01 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZS_ss 01100101 10 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZU_ss 01100101 10 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZS_ds 01100101 11 011 00 0 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZU_ds 01100101 11 011 00 1 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZS_sd 01100101 11 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZU_sd 01100101 11 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZS_dd 01100101 11 011 11 0 101 ... ..... ..... @rd_pg_rn_e0 -FCVTZU_dd 01100101 11 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 - -# SVE floating-point round to integral value -FRINTN 01100101 .. 000 000 101 ... ..... ..... @rd_pg_rn -FRINTP 01100101 .. 000 001 101 ... ..... ..... @rd_pg_rn -FRINTM 01100101 .. 000 010 101 ... ..... ..... @rd_pg_rn -FRINTZ 01100101 .. 000 011 101 ... ..... ..... @rd_pg_rn -FRINTA 01100101 .. 000 100 101 ... ..... ..... @rd_pg_rn -FRINTX 01100101 .. 000 110 101 ... ..... ..... @rd_pg_rn -FRINTI 01100101 .. 000 111 101 ... ..... ..... @rd_pg_rn - -# SVE floating-point unary operations -FRECPX 01100101 .. 001 100 101 ... ..... ..... @rd_pg_rn -FSQRT 01100101 .. 001 101 101 ... ..... ..... @rd_pg_rn - -# SVE integer convert to floating-point -SCVTF_hh 01100101 01 010 01 0 101 ... ..... ..... @rd_pg_rn_e0 -SCVTF_sh 01100101 01 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 -SCVTF_dh 01100101 01 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 -SCVTF_ss 01100101 10 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 -SCVTF_sd 01100101 11 010 00 0 101 ... ..... ..... @rd_pg_rn_e0 -SCVTF_ds 01100101 11 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 -SCVTF_dd 01100101 11 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 - -UCVTF_hh 01100101 01 010 01 1 101 ... ..... ..... @rd_pg_rn_e0 -UCVTF_sh 01100101 01 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 -UCVTF_dh 01100101 01 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 -UCVTF_ss 01100101 10 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 -UCVTF_sd 01100101 11 010 00 1 101 ... ..... ..... @rd_pg_rn_e0 -UCVTF_ds 01100101 11 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 -UCVTF_dd 01100101 11 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 - -### SVE Memory - 32-bit Gather and Unsized Contiguous Group - -# SVE load predicate register -LDR_pri 10000101 10 ...... 000 ... ..... 0 .... @pd_rn_i9 - -# SVE load vector register -LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9 - -# SVE load and broadcast element -LD1R_zpri 1000010 .. 1 imm:6 1.. pg:3 rn:5 rd:5 \ - &rpri_load dtype=%dtype_23_13 nreg=0 - -# SVE 32-bit gather load (scalar plus 32-bit unscaled offsets) -# SVE 32-bit gather load (scalar plus 32-bit scaled offsets) -LD1_zprz 1000010 00 .0 ..... 0.. ... ..... ..... \ - @rprr_g_load_xs_u esz=2 msz=0 scale=0 -LD1_zprz 1000010 01 .. ..... 0.. ... ..... ..... \ - @rprr_g_load_xs_u_sc esz=2 msz=1 -LD1_zprz 1000010 10 .. ..... 01. ... ..... ..... \ - @rprr_g_load_xs_sc esz=2 msz=2 u=1 - -# SVE 32-bit gather load (vector plus immediate) -LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \ - @rpri_g_load esz=2 - -### SVE Memory Contiguous Load Group - -# SVE contiguous load (scalar plus scalar) -LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 - -# SVE contiguous first-fault load (scalar plus scalar) -LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 - -# SVE contiguous load (scalar plus immediate) -LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 - -# SVE contiguous non-fault load (scalar plus immediate) -LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... 
@rpri_load_dt nreg=0 - -# SVE contiguous non-temporal load (scalar plus scalar) -# LDNT1B, LDNT1H, LDNT1W, LDNT1D -# SVE load multiple structures (scalar plus scalar) -# LD2B, LD2H, LD2W, LD2D; etc. -LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz - -# SVE contiguous non-temporal load (scalar plus immediate) -# LDNT1B, LDNT1H, LDNT1W, LDNT1D -# SVE load multiple structures (scalar plus immediate) -# LD2B, LD2H, LD2W, LD2D; etc. -LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz - -# SVE load and broadcast quadword (scalar plus scalar) -LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \ - @rprr_load_msz nreg=0 -LD1RO_zprr 1010010 .. 01 ..... 000 ... ..... ..... \ - @rprr_load_msz nreg=0 - -# SVE load and broadcast quadword (scalar plus immediate) -# LD1RQB, LD1RQH, LD1RQS, LD1RQD -LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \ - @rpri_load_msz nreg=0 -LD1RO_zpri 1010010 .. 01 0.... 001 ... ..... ..... \ - @rpri_load_msz nreg=0 - -# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets) -PRF_ns 1000010 00 -1 ----- 0-- --- ----- 0 ---- - -# SVE 32-bit gather prefetch (vector plus immediate) -PRF_ns 1000010 -- 00 ----- 111 --- ----- 0 ---- - -# SVE contiguous prefetch (scalar plus immediate) -PRF 1000010 11 1- ----- 0-- --- ----- 0 ---- - -# SVE contiguous prefetch (scalar plus scalar) -PRF_rr 1000010 -- 00 rm:5 110 --- ----- 0 ---- - -### SVE Memory 64-bit Gather Group - -# SVE 64-bit gather load (scalar plus 32-bit unpacked unscaled offsets) -# SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) -LD1_zprz 1100010 00 .0 ..... 0.. ... ..... ..... \ - @rprr_g_load_xs_u esz=3 msz=0 scale=0 -LD1_zprz 1100010 01 .. ..... 0.. ... ..... ..... \ - @rprr_g_load_xs_u_sc esz=3 msz=1 -LD1_zprz 1100010 10 .. ..... 0.. ... ..... ..... \ - @rprr_g_load_xs_u_sc esz=3 msz=2 -LD1_zprz 1100010 11 .. ..... 01. ... ..... ..... \ - @rprr_g_load_xs_sc esz=3 msz=3 u=1 - -# SVE 64-bit gather load (scalar plus 64-bit unscaled offsets) -# SVE 64-bit gather load (scalar plus 64-bit scaled offsets) -LD1_zprz 1100010 00 10 ..... 1.. ... ..... ..... \ - @rprr_g_load_u esz=3 msz=0 scale=0 -LD1_zprz 1100010 01 1. ..... 1.. ... ..... ..... \ - @rprr_g_load_u_sc esz=3 msz=1 -LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \ - @rprr_g_load_u_sc esz=3 msz=2 -LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \ - @rprr_g_load_sc esz=3 msz=3 u=1 - -# SVE 64-bit gather load (vector plus immediate) -LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \ - @rpri_g_load esz=3 - -# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets) -PRF_ns 1100010 00 11 ----- 1-- --- ----- 0 ---- - -# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets) -PRF_ns 1100010 00 -1 ----- 0-- --- ----- 0 ---- - -# SVE 64-bit gather prefetch (vector plus immediate) -PRF_ns 1100010 -- 00 ----- 111 --- ----- 0 ---- - -### SVE Memory Store Group - -# SVE store predicate register -STR_pri 1110010 11 0. ..... 000 ... ..... 0 .... @pd_rn_i9 - -# SVE store vector register -STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9 - -# SVE contiguous store (scalar plus immediate) -# ST1B, ST1H, ST1W, ST1D; require msz <= esz -ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \ - @rpri_store_msz nreg=0 - -# SVE contiguous store (scalar plus scalar) -# ST1B, ST1H, ST1W, ST1D; require msz <= esz -# Enumerate msz lest we conflict with STR_zri. -ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \ - @rprr_store_esz_n0 msz=0 -ST_zprr 1110010 01 .. ..... 010 ... ..... ..... 
\ - @rprr_store_esz_n0 msz=1 -ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \ - @rprr_store_esz_n0 msz=2 -ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \ - @rprr_store msz=3 esz=3 nreg=0 - -# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0) -# SVE store multiple structures (scalar plus immediate) (nreg != 0) -ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \ - @rpri_store_msz esz=%size_23 - -# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0) -# SVE store multiple structures (scalar plus scalar) (nreg != 0) -ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \ - @rprr_store esz=%size_23 - -# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets) -# Require msz > 0 && msz <= esz. -ST1_zprz 1110010 .. 11 ..... 100 ... ..... ..... \ - @rprr_scatter_store xs=0 esz=2 scale=1 -ST1_zprz 1110010 .. 11 ..... 110 ... ..... ..... \ - @rprr_scatter_store xs=1 esz=2 scale=1 - -# SVE 32-bit scatter store (scalar plus 32-bit unscaled offsets) -# Require msz <= esz. -ST1_zprz 1110010 .. 10 ..... 100 ... ..... ..... \ - @rprr_scatter_store xs=0 esz=2 scale=0 -ST1_zprz 1110010 .. 10 ..... 110 ... ..... ..... \ - @rprr_scatter_store xs=1 esz=2 scale=0 - -# SVE 64-bit scatter store (scalar plus 64-bit scaled offset) -# Require msz > 0 -ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \ - @rprr_scatter_store xs=2 esz=3 scale=1 - -# SVE 64-bit scatter store (scalar plus 64-bit unscaled offset) -ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \ - @rprr_scatter_store xs=2 esz=3 scale=0 - -# SVE 64-bit scatter store (vector plus immediate) -ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \ - @rpri_scatter_store esz=3 - -# SVE 32-bit scatter store (vector plus immediate) -ST1_zpiz 1110010 .. 11 ..... 101 ... ..... ..... \ - @rpri_scatter_store esz=2 - -# SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offset) -# Require msz > 0 -ST1_zprz 1110010 .. 01 ..... 100 ... ..... ..... \ - @rprr_scatter_store xs=0 esz=3 scale=1 -ST1_zprz 1110010 .. 01 ..... 110 ... ..... ..... \ - @rprr_scatter_store xs=1 esz=3 scale=1 - -# SVE 64-bit scatter store (scalar plus unpacked 32-bit unscaled offset) -ST1_zprz 1110010 .. 00 ..... 100 ... ..... ..... \ - @rprr_scatter_store xs=0 esz=3 scale=0 -ST1_zprz 1110010 .. 00 ..... 110 ... ..... ..... \ - @rprr_scatter_store xs=1 esz=3 scale=0 - -#### SVE2 Support - -### SVE2 Integer Multiply - Unpredicated - -# SVE2 integer multiply vectors (unpredicated) -MUL_zzz 00000100 .. 1 ..... 0110 00 ..... ..... @rd_rn_rm -SMULH_zzz 00000100 .. 1 ..... 0110 10 ..... ..... @rd_rn_rm -UMULH_zzz 00000100 .. 1 ..... 0110 11 ..... ..... @rd_rn_rm -PMUL_zzz 00000100 00 1 ..... 0110 01 ..... ..... @rd_rn_rm_e0 - -# SVE2 signed saturating doubling multiply high (unpredicated) -SQDMULH_zzz 00000100 .. 1 ..... 0111 00 ..... ..... @rd_rn_rm -SQRDMULH_zzz 00000100 .. 1 ..... 0111 01 ..... ..... @rd_rn_rm - -### SVE2 Integer - Predicated - -SADALP_zpzz 01000100 .. 000 100 101 ... ..... ..... @rdm_pg_rn -UADALP_zpzz 01000100 .. 000 101 101 ... ..... ..... @rdm_pg_rn - -### SVE2 integer unary operations (predicated) - -URECPE 01000100 .. 000 000 101 ... ..... ..... @rd_pg_rn -URSQRTE 01000100 .. 000 001 101 ... ..... ..... @rd_pg_rn -SQABS 01000100 .. 001 000 101 ... ..... ..... @rd_pg_rn -SQNEG 01000100 .. 001 001 101 ... ..... ..... @rd_pg_rn - -### SVE2 saturating/rounding bitwise shift left (predicated) - -SRSHL 01000100 .. 000 010 100 ... ..... ..... @rdn_pg_rm -URSHL 01000100 .. 000 011 100 ... ..... ..... @rdn_pg_rm -SRSHL 01000100 .. 
000 110 100 ... ..... ..... @rdm_pg_rn # SRSHLR -URSHL 01000100 .. 000 111 100 ... ..... ..... @rdm_pg_rn # URSHLR - -SQSHL 01000100 .. 001 000 100 ... ..... ..... @rdn_pg_rm -UQSHL 01000100 .. 001 001 100 ... ..... ..... @rdn_pg_rm -SQSHL 01000100 .. 001 100 100 ... ..... ..... @rdm_pg_rn # SQSHLR -UQSHL 01000100 .. 001 101 100 ... ..... ..... @rdm_pg_rn # UQSHLR - -SQRSHL 01000100 .. 001 010 100 ... ..... ..... @rdn_pg_rm -UQRSHL 01000100 .. 001 011 100 ... ..... ..... @rdn_pg_rm -SQRSHL 01000100 .. 001 110 100 ... ..... ..... @rdm_pg_rn # SQRSHLR -UQRSHL 01000100 .. 001 111 100 ... ..... ..... @rdm_pg_rn # UQRSHLR - -### SVE2 integer halving add/subtract (predicated) - -SHADD 01000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm -UHADD 01000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm -SHSUB 01000100 .. 010 010 100 ... ..... ..... @rdn_pg_rm -UHSUB 01000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm -SRHADD 01000100 .. 010 100 100 ... ..... ..... @rdn_pg_rm -URHADD 01000100 .. 010 101 100 ... ..... ..... @rdn_pg_rm -SHSUB 01000100 .. 010 110 100 ... ..... ..... @rdm_pg_rn # SHSUBR -UHSUB 01000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # UHSUBR - -### SVE2 integer pairwise arithmetic - -ADDP 01000100 .. 010 001 101 ... ..... ..... @rdn_pg_rm -SMAXP 01000100 .. 010 100 101 ... ..... ..... @rdn_pg_rm -UMAXP 01000100 .. 010 101 101 ... ..... ..... @rdn_pg_rm -SMINP 01000100 .. 010 110 101 ... ..... ..... @rdn_pg_rm -UMINP 01000100 .. 010 111 101 ... ..... ..... @rdn_pg_rm - -### SVE2 saturating add/subtract (predicated) - -SQADD_zpzz 01000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm -UQADD_zpzz 01000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm -SQSUB_zpzz 01000100 .. 011 010 100 ... ..... ..... @rdn_pg_rm -UQSUB_zpzz 01000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm -SUQADD 01000100 .. 011 100 100 ... ..... ..... @rdn_pg_rm -USQADD 01000100 .. 011 101 100 ... ..... ..... @rdn_pg_rm -SQSUB_zpzz 01000100 .. 011 110 100 ... ..... ..... @rdm_pg_rn # SQSUBR -UQSUB_zpzz 01000100 .. 011 111 100 ... ..... ..... @rdm_pg_rn # UQSUBR - -#### SVE2 Widening Integer Arithmetic - -## SVE2 integer add/subtract long - -SADDLB 01000101 .. 0 ..... 00 0000 ..... ..... @rd_rn_rm -SADDLT 01000101 .. 0 ..... 00 0001 ..... ..... @rd_rn_rm -UADDLB 01000101 .. 0 ..... 00 0010 ..... ..... @rd_rn_rm -UADDLT 01000101 .. 0 ..... 00 0011 ..... ..... @rd_rn_rm - -SSUBLB 01000101 .. 0 ..... 00 0100 ..... ..... @rd_rn_rm -SSUBLT 01000101 .. 0 ..... 00 0101 ..... ..... @rd_rn_rm -USUBLB 01000101 .. 0 ..... 00 0110 ..... ..... @rd_rn_rm -USUBLT 01000101 .. 0 ..... 00 0111 ..... ..... @rd_rn_rm - -SABDLB 01000101 .. 0 ..... 00 1100 ..... ..... @rd_rn_rm -SABDLT 01000101 .. 0 ..... 00 1101 ..... ..... @rd_rn_rm -UABDLB 01000101 .. 0 ..... 00 1110 ..... ..... @rd_rn_rm -UABDLT 01000101 .. 0 ..... 00 1111 ..... ..... @rd_rn_rm - -## SVE2 integer add/subtract interleaved long - -SADDLBT 01000101 .. 0 ..... 1000 00 ..... ..... @rd_rn_rm -SSUBLBT 01000101 .. 0 ..... 1000 10 ..... ..... @rd_rn_rm -SSUBLTB 01000101 .. 0 ..... 1000 11 ..... ..... @rd_rn_rm - -## SVE2 integer add/subtract wide - -SADDWB 01000101 .. 0 ..... 010 000 ..... ..... @rd_rn_rm -SADDWT 01000101 .. 0 ..... 010 001 ..... ..... @rd_rn_rm -UADDWB 01000101 .. 0 ..... 010 010 ..... ..... @rd_rn_rm -UADDWT 01000101 .. 0 ..... 010 011 ..... ..... @rd_rn_rm - -SSUBWB 01000101 .. 0 ..... 010 100 ..... ..... @rd_rn_rm -SSUBWT 01000101 .. 0 ..... 010 101 ..... ..... @rd_rn_rm -USUBWB 01000101 .. 0 ..... 010 110 ..... ..... @rd_rn_rm -USUBWT 01000101 .. 0 ..... 
010 111 ..... ..... @rd_rn_rm - -## SVE2 integer multiply long - -SQDMULLB_zzz 01000101 .. 0 ..... 011 000 ..... ..... @rd_rn_rm -SQDMULLT_zzz 01000101 .. 0 ..... 011 001 ..... ..... @rd_rn_rm -PMULLB 01000101 .. 0 ..... 011 010 ..... ..... @rd_rn_rm -PMULLT 01000101 .. 0 ..... 011 011 ..... ..... @rd_rn_rm -SMULLB_zzz 01000101 .. 0 ..... 011 100 ..... ..... @rd_rn_rm -SMULLT_zzz 01000101 .. 0 ..... 011 101 ..... ..... @rd_rn_rm -UMULLB_zzz 01000101 .. 0 ..... 011 110 ..... ..... @rd_rn_rm -UMULLT_zzz 01000101 .. 0 ..... 011 111 ..... ..... @rd_rn_rm - -## SVE2 bitwise shift left long - -# Note bit23 == 0 is handled by esz > 0 in do_sve2_shll_tb. -SSHLLB 01000101 .. 0 ..... 1010 00 ..... ..... @rd_rn_tszimm_shl -SSHLLT 01000101 .. 0 ..... 1010 01 ..... ..... @rd_rn_tszimm_shl -USHLLB 01000101 .. 0 ..... 1010 10 ..... ..... @rd_rn_tszimm_shl -USHLLT 01000101 .. 0 ..... 1010 11 ..... ..... @rd_rn_tszimm_shl - -## SVE2 bitwise exclusive-or interleaved - -EORBT 01000101 .. 0 ..... 10010 0 ..... ..... @rd_rn_rm -EORTB 01000101 .. 0 ..... 10010 1 ..... ..... @rd_rn_rm - -## SVE integer matrix multiply accumulate - -SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 -USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 -UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 - -## SVE2 bitwise permute - -BEXT 01000101 .. 0 ..... 1011 00 ..... ..... @rd_rn_rm -BDEP 01000101 .. 0 ..... 1011 01 ..... ..... @rd_rn_rm -BGRP 01000101 .. 0 ..... 1011 10 ..... ..... @rd_rn_rm - -#### SVE2 Accumulate - -## SVE2 complex integer add - -CADD_rot90 01000101 .. 00000 0 11011 0 ..... ..... @rdn_rm -CADD_rot270 01000101 .. 00000 0 11011 1 ..... ..... @rdn_rm -SQCADD_rot90 01000101 .. 00000 1 11011 0 ..... ..... @rdn_rm -SQCADD_rot270 01000101 .. 00000 1 11011 1 ..... ..... @rdn_rm - -## SVE2 integer absolute difference and accumulate long - -SABALB 01000101 .. 0 ..... 1100 00 ..... ..... @rda_rn_rm -SABALT 01000101 .. 0 ..... 1100 01 ..... ..... @rda_rn_rm -UABALB 01000101 .. 0 ..... 1100 10 ..... ..... @rda_rn_rm -UABALT 01000101 .. 0 ..... 1100 11 ..... ..... @rda_rn_rm - -## SVE2 integer add/subtract long with carry - -# ADC and SBC decoded via size in helper dispatch. -ADCLB 01000101 .. 0 ..... 11010 0 ..... ..... @rda_rn_rm -ADCLT 01000101 .. 0 ..... 11010 1 ..... ..... @rda_rn_rm - -## SVE2 bitwise shift right and accumulate - -# TODO: Use @rda and %reg_movprfx here. -SSRA 01000101 .. 0 ..... 1110 00 ..... ..... @rd_rn_tszimm_shr -USRA 01000101 .. 0 ..... 1110 01 ..... ..... @rd_rn_tszimm_shr -SRSRA 01000101 .. 0 ..... 1110 10 ..... ..... @rd_rn_tszimm_shr -URSRA 01000101 .. 0 ..... 1110 11 ..... ..... @rd_rn_tszimm_shr - -## SVE2 bitwise shift and insert - -SRI 01000101 .. 0 ..... 11110 0 ..... ..... @rd_rn_tszimm_shr -SLI 01000101 .. 0 ..... 11110 1 ..... ..... @rd_rn_tszimm_shl - -## SVE2 integer absolute difference and accumulate - -# TODO: Use @rda and %reg_movprfx here. -SABA 01000101 .. 0 ..... 11111 0 ..... ..... @rd_rn_rm -UABA 01000101 .. 0 ..... 11111 1 ..... ..... @rd_rn_rm - -#### SVE2 Narrowing - -## SVE2 saturating extract narrow - -# Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0. -SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl -SQXTNT 01000101 .. 1 ..... 010 001 ..... ..... @rd_rn_tszimm_shl -UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl -UQXTNT 01000101 .. 1 ..... 010 011 ..... ..... @rd_rn_tszimm_shl -SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl -SQXTUNT 01000101 .. 1 ..... 
010 101 ..... ..... @rd_rn_tszimm_shl - -## SVE2 bitwise shift right narrow - -# Bit 23 == 0 is handled by esz > 0 in the translator. -SQSHRUNB 01000101 .. 1 ..... 00 0000 ..... ..... @rd_rn_tszimm_shr -SQSHRUNT 01000101 .. 1 ..... 00 0001 ..... ..... @rd_rn_tszimm_shr -SQRSHRUNB 01000101 .. 1 ..... 00 0010 ..... ..... @rd_rn_tszimm_shr -SQRSHRUNT 01000101 .. 1 ..... 00 0011 ..... ..... @rd_rn_tszimm_shr -SHRNB 01000101 .. 1 ..... 00 0100 ..... ..... @rd_rn_tszimm_shr -SHRNT 01000101 .. 1 ..... 00 0101 ..... ..... @rd_rn_tszimm_shr -RSHRNB 01000101 .. 1 ..... 00 0110 ..... ..... @rd_rn_tszimm_shr -RSHRNT 01000101 .. 1 ..... 00 0111 ..... ..... @rd_rn_tszimm_shr -SQSHRNB 01000101 .. 1 ..... 00 1000 ..... ..... @rd_rn_tszimm_shr -SQSHRNT 01000101 .. 1 ..... 00 1001 ..... ..... @rd_rn_tszimm_shr -SQRSHRNB 01000101 .. 1 ..... 00 1010 ..... ..... @rd_rn_tszimm_shr -SQRSHRNT 01000101 .. 1 ..... 00 1011 ..... ..... @rd_rn_tszimm_shr -UQSHRNB 01000101 .. 1 ..... 00 1100 ..... ..... @rd_rn_tszimm_shr -UQSHRNT 01000101 .. 1 ..... 00 1101 ..... ..... @rd_rn_tszimm_shr -UQRSHRNB 01000101 .. 1 ..... 00 1110 ..... ..... @rd_rn_tszimm_shr -UQRSHRNT 01000101 .. 1 ..... 00 1111 ..... ..... @rd_rn_tszimm_shr - -## SVE2 integer add/subtract narrow high part - -ADDHNB 01000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm -ADDHNT 01000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm -RADDHNB 01000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm -RADDHNT 01000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm -SUBHNB 01000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm -SUBHNT 01000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm -RSUBHNB 01000101 .. 1 ..... 011 110 ..... ..... @rd_rn_rm -RSUBHNT 01000101 .. 1 ..... 011 111 ..... ..... @rd_rn_rm - -### SVE2 Character Match - -MATCH 01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm -NMATCH 01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm - -### SVE2 Histogram Computation - -HISTCNT 01000101 .. 1 ..... 110 ... ..... ..... @rd_pg_rn_rm -HISTSEG 01000101 .. 1 ..... 101 000 ..... ..... @rd_rn_rm - -## SVE2 floating-point pairwise operations - -FADDP 01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm -FMAXNMP 01100100 .. 010 10 0 100 ... ..... ..... @rdn_pg_rm -FMINNMP 01100100 .. 010 10 1 100 ... ..... ..... @rdn_pg_rm -FMAXP 01100100 .. 010 11 0 100 ... ..... ..... @rdn_pg_rm -FMINP 01100100 .. 010 11 1 100 ... ..... ..... @rdn_pg_rm - -#### SVE Integer Multiply-Add (unpredicated) - -## SVE2 saturating multiply-add long - -SQDMLALB_zzzw 01000100 .. 0 ..... 0110 00 ..... ..... @rda_rn_rm -SQDMLALT_zzzw 01000100 .. 0 ..... 0110 01 ..... ..... @rda_rn_rm -SQDMLSLB_zzzw 01000100 .. 0 ..... 0110 10 ..... ..... @rda_rn_rm -SQDMLSLT_zzzw 01000100 .. 0 ..... 0110 11 ..... ..... @rda_rn_rm - -## SVE2 saturating multiply-add interleaved long - -SQDMLALBT 01000100 .. 0 ..... 00001 0 ..... ..... @rda_rn_rm -SQDMLSLBT 01000100 .. 0 ..... 00001 1 ..... ..... @rda_rn_rm - -## SVE2 saturating multiply-add high - -SQRDMLAH_zzzz 01000100 .. 0 ..... 01110 0 ..... ..... @rda_rn_rm -SQRDMLSH_zzzz 01000100 .. 0 ..... 01110 1 ..... ..... @rda_rn_rm - -## SVE2 integer multiply-add long - -SMLALB_zzzw 01000100 .. 0 ..... 010 000 ..... ..... @rda_rn_rm -SMLALT_zzzw 01000100 .. 0 ..... 010 001 ..... ..... @rda_rn_rm -UMLALB_zzzw 01000100 .. 0 ..... 010 010 ..... ..... @rda_rn_rm -UMLALT_zzzw 01000100 .. 0 ..... 010 011 ..... ..... @rda_rn_rm -SMLSLB_zzzw 01000100 .. 0 ..... 010 100 ..... ..... @rda_rn_rm -SMLSLT_zzzw 01000100 .. 0 ..... 010 101 ..... ..... @rda_rn_rm -UMLSLB_zzzw 01000100 .. 0 ..... 
010 110 ..... ..... @rda_rn_rm -UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm - -## SVE2 complex integer multiply-add - -CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx -SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx - -## SVE mixed sign dot product - -USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm - -### SVE2 floating point matrix multiply accumulate -BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 -FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 -FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 - -### SVE2 Memory Gather Load Group - -# SVE2 64-bit gather non-temporal load (scalar plus 64-bit unscaled offsets) -LDNT1_zprz 1100010 msz:2 00 rm:5 1 u:1 0 pg:3 rn:5 rd:5 \ - &rprr_gather_load xs=2 esz=3 scale=0 ff=0 - -# SVE2 32-bit gather non-temporal load (scalar plus 32-bit unscaled offsets) -LDNT1_zprz 1000010 msz:2 00 rm:5 10 u:1 pg:3 rn:5 rd:5 \ - &rprr_gather_load xs=0 esz=2 scale=0 ff=0 - -### SVE2 Memory Store Group - -# SVE2 64-bit scatter non-temporal store (vector plus scalar) -STNT1_zprz 1110010 .. 00 ..... 001 ... ..... ..... \ - @rprr_scatter_store xs=2 esz=3 scale=0 - -# SVE2 32-bit scatter non-temporal store (vector plus scalar) -STNT1_zprz 1110010 .. 10 ..... 001 ... ..... ..... \ - @rprr_scatter_store xs=0 esz=2 scale=0 - -### SVE2 Crypto Extensions - -# SVE2 crypto unary operations -# AESMC and AESIMC -AESMC 01000101 00 10000011100 decrypt:1 00000 rd:5 - -# SVE2 crypto destructive binary operations -AESE 01000101 00 10001 0 11100 0 ..... ..... @rdn_rm_e0 -AESD 01000101 00 10001 0 11100 1 ..... ..... @rdn_rm_e0 -SM4E 01000101 00 10001 1 11100 0 ..... ..... @rdn_rm_e0 - -# SVE2 crypto constructive binary operations -SM4EKEY 01000101 00 1 ..... 11110 0 ..... ..... @rd_rn_rm_e0 -RAX1 01000101 00 1 ..... 11110 1 ..... ..... @rd_rn_rm_e0 - -### SVE2 floating-point convert precision odd elements -FCVTXNT_ds 01100100 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0 -FCVTX_ds 01100101 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0 -FCVTNT_sh 01100100 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0 -BFCVTNT 01100100 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0 -FCVTLT_hs 01100100 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0 -FCVTNT_ds 01100100 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0 -FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 - -### SVE2 floating-point convert to integer -FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz - -### SVE2 floating-point multiply-add long (vectors) -FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 -FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 -FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0 -FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0 - -BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 -BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 - -### SVE2 floating-point bfloat16 dot-product -BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 - -### SVE2 floating-point multiply-add long (indexed) -FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 -FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2 -FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2 -FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2 -BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 -BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... 
@rrxr_3a esz=2 - -### SVE2 floating-point bfloat16 dot-product (indexed) -BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2 - -### SVE broadcast predicate element - -&psel esz pd pn pm rv imm -%psel_rv 16:2 !function=plus_12 -%psel_imm_b 22:2 19:2 -%psel_imm_h 22:2 20:1 -%psel_imm_s 22:2 -%psel_imm_d 23:1 -@psel ........ .. . ... .. .. pn:4 . pm:4 . pd:4 \ - &psel rv=%psel_rv - -PSEL 00100101 .. 1 ..1 .. 01 .... 0 .... 0 .... \ - @psel esz=0 imm=%psel_imm_b -PSEL 00100101 .. 1 .10 .. 01 .... 0 .... 0 .... \ - @psel esz=1 imm=%psel_imm_h -PSEL 00100101 .. 1 100 .. 01 .... 0 .... 0 .... \ - @psel esz=2 imm=%psel_imm_s -PSEL 00100101 .1 1 000 .. 01 .... 0 .... 0 .... \ - @psel esz=3 imm=%psel_imm_d - -### SVE clamp - -SCLAMP 01000100 .. 0 ..... 110000 ..... ..... @rda_rn_rm -UCLAMP 01000100 .. 0 ..... 110001 ..... ..... @rda_rn_rm diff --git a/target/arm/t16.decode b/target/arm/t16.decode deleted file mode 100644 index 646c749..0000000 --- a/target/arm/t16.decode +++ /dev/null @@ -1,281 +0,0 @@ -# Thumb1 instructions -# -# Copyright (c) 2019 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# - -&empty !extern -&s_rrr_shi !extern s rd rn rm shim shty -&s_rrr_shr !extern s rn rd rm rs shty -&s_rri_rot !extern s rn rd imm rot -&s_rrrr !extern s rd rn rm ra -&rrr_rot !extern rd rn rm rot -&rr !extern rd rm -&ri !extern rd imm -&r !extern rm -&i !extern imm -&ldst_rr !extern p w u rn rt rm shimm shtype -&ldst_ri !extern p w u rn rt imm -&ldst_block !extern rn i b u w list -&setend !extern E -&cps !extern mode imod M A I F -&ci !extern cond imm - -# Set S if the instruction is outside of an IT block. -%s !function=t16_setflags - -# Data-processing (two low registers) - -%reg_0 0:3 - -@lll_noshr ...... .... rm:3 rd:3 \ - &s_rrr_shi %s rn=%reg_0 shim=0 shty=0 -@xll_noshr ...... .... rm:3 rn:3 \ - &s_rrr_shi s=1 rd=0 shim=0 shty=0 -@lxl_shr ...... .... rs:3 rd:3 \ - &s_rrr_shr %s rm=%reg_0 rn=0 - -AND_rrri 010000 0000 ... ... @lll_noshr -EOR_rrri 010000 0001 ... ... @lll_noshr -MOV_rxrr 010000 0010 ... ... @lxl_shr shty=0 # LSL -MOV_rxrr 010000 0011 ... ... @lxl_shr shty=1 # LSR -MOV_rxrr 010000 0100 ... ... @lxl_shr shty=2 # ASR -ADC_rrri 010000 0101 ... ... @lll_noshr -SBC_rrri 010000 0110 ... ... @lll_noshr -MOV_rxrr 010000 0111 ... ... @lxl_shr shty=3 # ROR -TST_xrri 010000 1000 ... ... @xll_noshr -RSB_rri 010000 1001 rn:3 rd:3 &s_rri_rot %s imm=0 rot=0 -CMP_xrri 010000 1010 ... ... @xll_noshr -CMN_xrri 010000 1011 ... ... @xll_noshr -ORR_rrri 010000 1100 ... ... @lll_noshr -MUL 010000 1101 rn:3 rd:3 &s_rrrr %s rm=%reg_0 ra=0 -BIC_rrri 010000 1110 ... ... @lll_noshr -MVN_rxri 010000 1111 ... ... @lll_noshr - -# Load/store (register offset) - -@ldst_rr ....... rm:3 rn:3 rt:3 \ - &ldst_rr p=1 w=0 u=1 shimm=0 shtype=0 - -STR_rr 0101 000 ... ... ... @ldst_rr -STRH_rr 0101 001 ... ... ... @ldst_rr -STRB_rr 0101 010 ... ... ... 
@ldst_rr -LDRSB_rr 0101 011 ... ... ... @ldst_rr -LDR_rr 0101 100 ... ... ... @ldst_rr -LDRH_rr 0101 101 ... ... ... @ldst_rr -LDRB_rr 0101 110 ... ... ... @ldst_rr -LDRSH_rr 0101 111 ... ... ... @ldst_rr - -# Load/store word/byte (immediate offset) - -%imm5_6x4 6:5 !function=times_4 - -@ldst_ri_1 ..... imm:5 rn:3 rt:3 \ - &ldst_ri p=1 w=0 u=1 -@ldst_ri_4 ..... ..... rn:3 rt:3 \ - &ldst_ri p=1 w=0 u=1 imm=%imm5_6x4 - -STR_ri 01100 ..... ... ... @ldst_ri_4 -LDR_ri 01101 ..... ... ... @ldst_ri_4 -STRB_ri 01110 ..... ... ... @ldst_ri_1 -LDRB_ri 01111 ..... ... ... @ldst_ri_1 - -# Load/store halfword (immediate offset) - -%imm5_6x2 6:5 !function=times_2 -@ldst_ri_2 ..... ..... rn:3 rt:3 \ - &ldst_ri p=1 w=0 u=1 imm=%imm5_6x2 - -STRH_ri 10000 ..... ... ... @ldst_ri_2 -LDRH_ri 10001 ..... ... ... @ldst_ri_2 - -# Load/store (SP-relative) - -%imm8_0x4 0:8 !function=times_4 -@ldst_spec_i ..... rt:3 ........ \ - &ldst_ri p=1 w=0 u=1 imm=%imm8_0x4 - -STR_ri 10010 ... ........ @ldst_spec_i rn=13 -LDR_ri 10011 ... ........ @ldst_spec_i rn=13 - -# Load (PC-relative) - -LDR_ri 01001 ... ........ @ldst_spec_i rn=15 - -# Add PC/SP (immediate) - -ADR 10100 rd:3 ........ imm=%imm8_0x4 -ADD_rri 10101 rd:3 ........ \ - &s_rri_rot rn=13 s=0 rot=0 imm=%imm8_0x4 # SP - -# Load/store multiple - -@ldstm ..... rn:3 list:8 &ldst_block i=1 b=0 u=0 w=1 - -STM 11000 ... ........ @ldstm -LDM_t16 11001 ... ........ @ldstm - -# Shift (immediate) - -@shift_i ..... shim:5 rm:3 rd:3 &s_rrr_shi %s rn=%reg_0 - -MOV_rxri 000 00 ..... ... ... @shift_i shty=0 # LSL -MOV_rxri 000 01 ..... ... ... @shift_i shty=1 # LSR -MOV_rxri 000 10 ..... ... ... @shift_i shty=2 # ASR - -# Add/subtract (three low registers) - -@addsub_3 ....... rm:3 rn:3 rd:3 \ - &s_rrr_shi %s shim=0 shty=0 - -ADD_rrri 0001100 ... ... ... @addsub_3 -SUB_rrri 0001101 ... ... ... @addsub_3 - -# Add/subtract (two low registers and immediate) - -@addsub_2i ....... imm:3 rn:3 rd:3 \ - &s_rri_rot %s rot=0 - -ADD_rri 0001 110 ... ... ... @addsub_2i -SUB_rri 0001 111 ... ... ... @addsub_2i - -# Add, subtract, compare, move (one low register and immediate) - -%reg_8 8:3 -@arith_1i ..... rd:3 imm:8 \ - &s_rri_rot rot=0 rn=%reg_8 - -MOV_rxi 00100 ... ........ @arith_1i %s -CMP_xri 00101 ... ........ @arith_1i s=1 -ADD_rri 00110 ... ........ @arith_1i %s -SUB_rri 00111 ... ........ @arith_1i %s - -# Add, compare, move (two high registers) - -%reg_0_7 7:1 0:3 -@addsub_2h .... .... . rm:4 ... \ - &s_rrr_shi rd=%reg_0_7 rn=%reg_0_7 shim=0 shty=0 - -ADD_rrri 0100 0100 . .... ... @addsub_2h s=0 -CMP_xrri 0100 0101 . .... ... @addsub_2h s=1 -MOV_rxri 0100 0110 . .... ... @addsub_2h s=0 - -# Adjust SP (immediate) - -%imm7_0x4 0:7 !function=times_4 -@addsub_sp_i .... .... . ....... \ - &s_rri_rot s=0 rd=13 rn=13 rot=0 imm=%imm7_0x4 - -ADD_rri 1011 0000 0 ....... @addsub_sp_i -SUB_rri 1011 0000 1 ....... @addsub_sp_i - -# Branch and exchange - -@branchr .... .... . rm:4 ... &r - -BX 0100 0111 0 .... 000 @branchr -BLX_r 0100 0111 1 .... 000 @branchr -BXNS 0100 0111 0 .... 100 @branchr -BLXNS 0100 0111 1 .... 100 @branchr - -# Extend - -@extend .... .... .. rm:3 rd:3 &rrr_rot rn=15 rot=0 - -SXTAH 1011 0010 00 ... ... @extend -SXTAB 1011 0010 01 ... ... @extend -UXTAH 1011 0010 10 ... ... @extend -UXTAB 1011 0010 11 ... ... @extend - -# Change processor state - -%imod 4:1 !function=plus_2 - -SETEND 1011 0110 010 1 E:1 000 &setend -{ - CPS 1011 0110 011 . 0 A:1 I:1 F:1 &cps mode=0 M=0 %imod - CPS_v7m 1011 0110 011 im:1 00 I:1 F:1 -} - -# Reverse bytes - -@rdm .... .... .. 
rm:3 rd:3 &rr - -REV 1011 1010 00 ... ... @rdm -REV16 1011 1010 01 ... ... @rdm -REVSH 1011 1010 11 ... ... @rdm - -# Hints - -{ - { - YIELD 1011 1111 0001 0000 - WFE 1011 1111 0010 0000 - WFI 1011 1111 0011 0000 - - # TODO: Implement SEV, SEVL; may help SMP performance. - # SEV 1011 1111 0100 0000 - # SEVL 1011 1111 0101 0000 - - # The canonical nop has the second nibble as 0000, but the whole of the - # rest of the space is a reserved hint, behaves as nop. - NOP 1011 1111 ---- 0000 - } - IT 1011 1111 cond_mask:8 -} - -# Miscellaneous 16-bit instructions - -%imm6_9_3 9:1 3:5 !function=times_2 - -HLT 1011 1010 10 imm:6 &i -BKPT 1011 1110 imm:8 &i -CBZ 1011 nz:1 0.1 ..... rn:3 imm=%imm6_9_3 - -# Push and Pop - -%push_list 0:9 !function=t16_push_list -%pop_list 0:9 !function=t16_pop_list - -STM 1011 010 ......... \ - &ldst_block i=0 b=1 u=0 w=1 rn=13 list=%push_list -LDM_t16 1011 110 ......... \ - &ldst_block i=1 b=0 u=0 w=1 rn=13 list=%pop_list - -# Conditional branches, Supervisor call - -%imm8_0x2 0:s8 !function=times_2 - -{ - UDF 1101 1110 ---- ---- - SVC 1101 1111 imm:8 &i - B_cond_thumb 1101 cond:4 ........ &ci imm=%imm8_0x2 -} - -# Unconditional Branch - -%imm11_0x2 0:s11 !function=times_2 - -B 11100 ........... &i imm=%imm11_0x2 - -# thumb_insn_is_16bit() ensures we won't be decoding these as -# T16 instructions for a Thumb2 CPU, so these patterns must be -# a Thumb1 split BL/BLX. -BLX_suffix 11101 imm:11 &i -BL_BLX_prefix 11110 imm:s11 &i -BL_suffix 11111 imm:11 &i diff --git a/target/arm/t32.decode b/target/arm/t32.decode deleted file mode 100644 index f21ad01..0000000 --- a/target/arm/t32.decode +++ /dev/null @@ -1,753 +0,0 @@ -# Thumb2 instructions -# -# Copyright (c) 2019 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . 
- -# -# This file is processed by scripts/decodetree.py -# - -&empty !extern -&s_rrr_shi !extern s rd rn rm shim shty -&s_rrr_shr !extern s rn rd rm rs shty -&s_rri_rot !extern s rn rd imm rot -&s_rrrr !extern s rd rn rm ra -&rrrr !extern rd rn rm ra -&rrr_rot !extern rd rn rm rot -&rrr !extern rd rn rm -&rr !extern rd rm -&ri !extern rd imm -&r !extern rm -&i !extern imm -&msr_reg !extern rn r mask -&mrs_reg !extern rd r -&msr_bank !extern rn r sysm -&mrs_bank !extern rd r sysm -&ldst_rr !extern p w u rn rt rm shimm shtype -&ldst_ri !extern p w u rn rt imm -&ldst_block !extern rn i b u w list -&strex !extern rn rd rt rt2 imm -&ldrex !extern rn rt rt2 imm -&bfx !extern rd rn lsb widthm1 -&bfi !extern rd rn lsb msb -&sat !extern rd rn satimm imm sh -&pkh !extern rd rn rm imm tb -&cps !extern mode imod M A I F -&mcr !extern cp opc1 crn crm opc2 rt -&mcrr !extern cp opc1 crm rt rt2 - -&mve_shl_ri rdalo rdahi shim -&mve_shl_rr rdalo rdahi rm -&mve_sh_ri rda shim -&mve_sh_rr rda rm - -# rdahi: bits [3:1] from insn, bit 0 is 1 -# rdalo: bits [3:1] from insn, bit 0 is 0 -%rdahi_9 9:3 !function=times_2_plus_1 -%rdalo_17 17:3 !function=times_2 - -# Data-processing (register) - -%imm5_12_6 12:3 6:2 - -@s_rrr_shi ....... .... s:1 rn:4 .... rd:4 .. shty:2 rm:4 \ - &s_rrr_shi shim=%imm5_12_6 -@s_rxr_shi ....... .... s:1 .... .... rd:4 .. shty:2 rm:4 \ - &s_rrr_shi shim=%imm5_12_6 rn=0 -@S_xrr_shi ....... .... . rn:4 .... .... .. shty:2 rm:4 \ - &s_rrr_shi shim=%imm5_12_6 s=1 rd=0 - -@mve_shl_ri ....... .... . ... . . ... ... . .. .. .... \ - &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9 -@mve_shl_rr ....... .... . ... . rm:4 ... . .. .. .... \ - &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9 -@mve_sh_ri ....... .... . rda:4 . ... ... . .. .. .... \ - &mve_sh_ri shim=%imm5_12_6 -@mve_sh_rr ....... .... . rda:4 rm:4 .... .... .... &mve_sh_rr - -{ - TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi - AND_rrri 1110101 0000 . .... 0 ... .... .... .... @s_rrr_shi -} -BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi -{ - # The v8.1M MVE shift insns overlap in encoding with MOVS/ORRS - # and are distinguished by having Rm==13 or 15. Those are UNPREDICTABLE - # cases for MOVS/ORRS. We decode the MVE cases first, ensuring that - # they explicitly call unallocated_encoding() for cases that must UNDEF - # (eg "using a new shift insn on a v8.1M CPU without MVE"), and letting - # the rest fall through (where ORR_rrri and MOV_rxri will end up - # handling them as r13 and r15 accesses with the same semantics as A32). - [ - { - UQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 00 1111 @mve_sh_ri - LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri - UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri - } - - { - URSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 01 1111 @mve_sh_ri - LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri - URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri - } - - { - SRSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 10 1111 @mve_sh_ri - ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri - SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri - } - - { - SQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 11 1111 @mve_sh_ri - SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri - } - - { - UQRSHL_rr 1110101 0010 1 .... .... 1111 0000 1101 @mve_sh_rr - LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr - UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 
1 0000 1101 @mve_shl_rr - } - - { - SQRSHR_rr 1110101 0010 1 .... .... 1111 0010 1101 @mve_sh_rr - ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr - SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr - } - - UQRSHLL48_rr 1110101 0010 1 ... 1 .... ... 1 1000 1101 @mve_shl_rr - SQRSHRL48_rr 1110101 0010 1 ... 1 .... ... 1 1010 1101 @mve_shl_rr - ] - - MOV_rxri 1110101 0010 . 1111 0 ... .... .... .... @s_rxr_shi - ORR_rrri 1110101 0010 . .... 0 ... .... .... .... @s_rrr_shi - - # v8.1M CSEL and friends - CSEL 1110101 0010 1 rn:4 10 op:2 rd:4 fcond:4 rm:4 -} -{ - MVN_rxri 1110101 0011 . 1111 0 ... .... .... .... @s_rxr_shi - ORN_rrri 1110101 0011 . .... 0 ... .... .... .... @s_rrr_shi -} -{ - TEQ_xrri 1110101 0100 1 .... 0 ... 1111 .... .... @S_xrr_shi - EOR_rrri 1110101 0100 . .... 0 ... .... .... .... @s_rrr_shi -} -PKH 1110101 0110 0 rn:4 0 ... rd:4 .. tb:1 0 rm:4 \ - &pkh imm=%imm5_12_6 -{ - CMN_xrri 1110101 1000 1 .... 0 ... 1111 .... .... @S_xrr_shi - ADD_rrri 1110101 1000 . .... 0 ... .... .... .... @s_rrr_shi -} -ADC_rrri 1110101 1010 . .... 0 ... .... .... .... @s_rrr_shi -SBC_rrri 1110101 1011 . .... 0 ... .... .... .... @s_rrr_shi -{ - CMP_xrri 1110101 1101 1 .... 0 ... 1111 .... .... @S_xrr_shi - SUB_rrri 1110101 1101 . .... 0 ... .... .... .... @s_rrr_shi -} -RSB_rrri 1110101 1110 . .... 0 ... .... .... .... @s_rrr_shi - -# Data-processing (register-shifted register) - -MOV_rxrr 1111 1010 0 shty:2 s:1 rm:4 1111 rd:4 0000 rs:4 \ - &s_rrr_shr rn=0 - -# Data-processing (immediate) - -%t32extrot 26:1 12:3 0:8 !function=t32_expandimm_rot -%t32extimm 26:1 12:3 0:8 !function=t32_expandimm_imm - -@s_rri_rot ....... .... s:1 rn:4 . ... rd:4 ........ \ - &s_rri_rot imm=%t32extimm rot=%t32extrot -@s_rxi_rot ....... .... s:1 .... . ... rd:4 ........ \ - &s_rri_rot imm=%t32extimm rot=%t32extrot rn=0 -@S_xri_rot ....... .... . rn:4 . ... .... ........ \ - &s_rri_rot imm=%t32extimm rot=%t32extrot s=1 rd=0 - -{ - TST_xri 1111 0.0 0000 1 .... 0 ... 1111 ........ @S_xri_rot - AND_rri 1111 0.0 0000 . .... 0 ... .... ........ @s_rri_rot -} -BIC_rri 1111 0.0 0001 . .... 0 ... .... ........ @s_rri_rot -{ - MOV_rxi 1111 0.0 0010 . 1111 0 ... .... ........ @s_rxi_rot - ORR_rri 1111 0.0 0010 . .... 0 ... .... ........ @s_rri_rot -} -{ - MVN_rxi 1111 0.0 0011 . 1111 0 ... .... ........ @s_rxi_rot - ORN_rri 1111 0.0 0011 . .... 0 ... .... ........ @s_rri_rot -} -{ - TEQ_xri 1111 0.0 0100 1 .... 0 ... 1111 ........ @S_xri_rot - EOR_rri 1111 0.0 0100 . .... 0 ... .... ........ @s_rri_rot -} -{ - CMN_xri 1111 0.0 1000 1 .... 0 ... 1111 ........ @S_xri_rot - ADD_rri 1111 0.0 1000 . .... 0 ... .... ........ @s_rri_rot -} -ADC_rri 1111 0.0 1010 . .... 0 ... .... ........ @s_rri_rot -SBC_rri 1111 0.0 1011 . .... 0 ... .... ........ @s_rri_rot -{ - CMP_xri 1111 0.0 1101 1 .... 0 ... 1111 ........ @S_xri_rot - SUB_rri 1111 0.0 1101 . .... 0 ... .... ........ @s_rri_rot -} -RSB_rri 1111 0.0 1110 . .... 0 ... .... ........ @s_rri_rot - -# Data processing (plain binary immediate) - -%imm12_26_12_0 26:1 12:3 0:8 -%neg12_26_12_0 26:1 12:3 0:8 !function=negate -@s0_rri_12 .... ... .... . rn:4 . ... rd:4 ........ \ - &s_rri_rot imm=%imm12_26_12_0 rot=0 s=0 - -{ - ADR 1111 0.1 0000 0 1111 0 ... rd:4 ........ \ - &ri imm=%imm12_26_12_0 - ADD_rri 1111 0.1 0000 0 .... 0 ... .... ........ @s0_rri_12 -} -{ - ADR 1111 0.1 0101 0 1111 0 ... rd:4 ........ \ - &ri imm=%neg12_26_12_0 - SUB_rri 1111 0.1 0101 0 .... 0 ... .... ........ 
@s0_rri_12 -} - -# Move Wide - -%imm16_26_16_12_0 16:4 26:1 12:3 0:8 -@mov16 .... .... .... .... .... rd:4 .... .... \ - &ri imm=%imm16_26_16_12_0 - -MOVW 1111 0.10 0100 .... 0 ... .... ........ @mov16 -MOVT 1111 0.10 1100 .... 0 ... .... ........ @mov16 - -# Saturate, bitfield - -@sat .... .... .. sh:1 . rn:4 . ... rd:4 .. . satimm:5 \ - &sat imm=%imm5_12_6 -@sat16 .... .... .. . . rn:4 . ... rd:4 .. . satimm:5 \ - &sat sh=0 imm=0 - -{ - SSAT16 1111 0011 001 0 .... 0 000 .... 00 0 ..... @sat16 - SSAT 1111 0011 00. 0 .... 0 ... .... .. 0 ..... @sat -} -{ - USAT16 1111 0011 101 0 .... 0 000 .... 00 0 ..... @sat16 - USAT 1111 0011 10. 0 .... 0 ... .... .. 0 ..... @sat -} - -@bfx .... .... ... . rn:4 . ... rd:4 .. . widthm1:5 \ - &bfx lsb=%imm5_12_6 -@bfi .... .... ... . rn:4 . ... rd:4 .. . msb:5 \ - &bfi lsb=%imm5_12_6 - -SBFX 1111 0011 010 0 .... 0 ... .... ..0..... @bfx -UBFX 1111 0011 110 0 .... 0 ... .... ..0..... @bfx - -# bfc is bfi w/ rn=15 -BFCI 1111 0011 011 0 .... 0 ... .... ..0..... @bfi - -# Multiply and multiply accumulate - -@s0_rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &s_rrrr s=0 -@s0_rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &s_rrrr ra=0 s=0 -@rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &rrrr -@rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &rrrr ra=0 -@rndm .... .... .... rn:4 .... rd:4 .... rm:4 &rrr -@rdm .... .... .... .... .... rd:4 .... rm:4 &rr - -{ - MUL 1111 1011 0000 .... 1111 .... 0000 .... @s0_rn0dm - MLA 1111 1011 0000 .... .... .... 0000 .... @s0_rnadm -} -MLS 1111 1011 0000 .... .... .... 0001 .... @rnadm -SMULL 1111 1011 1000 .... .... .... 0000 .... @s0_rnadm -UMULL 1111 1011 1010 .... .... .... 0000 .... @s0_rnadm -SMLAL 1111 1011 1100 .... .... .... 0000 .... @s0_rnadm -UMLAL 1111 1011 1110 .... .... .... 0000 .... @s0_rnadm -UMAAL 1111 1011 1110 .... .... .... 0110 .... @rnadm -{ - SMULWB 1111 1011 0011 .... 1111 .... 0000 .... @rn0dm - SMLAWB 1111 1011 0011 .... .... .... 0000 .... @rnadm -} -{ - SMULWT 1111 1011 0011 .... 1111 .... 0001 .... @rn0dm - SMLAWT 1111 1011 0011 .... .... .... 0001 .... @rnadm -} -{ - SMULBB 1111 1011 0001 .... 1111 .... 0000 .... @rn0dm - SMLABB 1111 1011 0001 .... .... .... 0000 .... @rnadm -} -{ - SMULBT 1111 1011 0001 .... 1111 .... 0001 .... @rn0dm - SMLABT 1111 1011 0001 .... .... .... 0001 .... @rnadm -} -{ - SMULTB 1111 1011 0001 .... 1111 .... 0010 .... @rn0dm - SMLATB 1111 1011 0001 .... .... .... 0010 .... @rnadm -} -{ - SMULTT 1111 1011 0001 .... 1111 .... 0011 .... @rn0dm - SMLATT 1111 1011 0001 .... .... .... 0011 .... @rnadm -} -SMLALBB 1111 1011 1100 .... .... .... 1000 .... @rnadm -SMLALBT 1111 1011 1100 .... .... .... 1001 .... @rnadm -SMLALTB 1111 1011 1100 .... .... .... 1010 .... @rnadm -SMLALTT 1111 1011 1100 .... .... .... 1011 .... @rnadm - -# usad8 is usada8 w/ ra=15 -USADA8 1111 1011 0111 .... .... .... 0000 .... @rnadm - -SMLAD 1111 1011 0010 .... .... .... 0000 .... @rnadm -SMLADX 1111 1011 0010 .... .... .... 0001 .... @rnadm -SMLSD 1111 1011 0100 .... .... .... 0000 .... @rnadm -SMLSDX 1111 1011 0100 .... .... .... 0001 .... @rnadm - -SMLALD 1111 1011 1100 .... .... .... 1100 .... @rnadm -SMLALDX 1111 1011 1100 .... .... .... 1101 .... @rnadm -SMLSLD 1111 1011 1101 .... .... .... 1100 .... @rnadm -SMLSLDX 1111 1011 1101 .... .... .... 1101 .... @rnadm - -SMMLA 1111 1011 0101 .... .... .... 0000 .... @rnadm -SMMLAR 1111 1011 0101 .... .... .... 0001 .... @rnadm -SMMLS 1111 1011 0110 .... .... .... 0000 .... @rnadm -SMMLSR 1111 1011 0110 .... .... .... 0001 .... 
@rnadm - -SDIV 1111 1011 1001 .... 1111 .... 1111 .... @rndm -UDIV 1111 1011 1011 .... 1111 .... 1111 .... @rndm - -# Data-processing (two source registers) - -QADD 1111 1010 1000 .... 1111 .... 1000 .... @rndm -QSUB 1111 1010 1000 .... 1111 .... 1010 .... @rndm -QDADD 1111 1010 1000 .... 1111 .... 1001 .... @rndm -QDSUB 1111 1010 1000 .... 1111 .... 1011 .... @rndm - -CRC32B 1111 1010 1100 .... 1111 .... 1000 .... @rndm -CRC32H 1111 1010 1100 .... 1111 .... 1001 .... @rndm -CRC32W 1111 1010 1100 .... 1111 .... 1010 .... @rndm -CRC32CB 1111 1010 1101 .... 1111 .... 1000 .... @rndm -CRC32CH 1111 1010 1101 .... 1111 .... 1001 .... @rndm -CRC32CW 1111 1010 1101 .... 1111 .... 1010 .... @rndm - -SEL 1111 1010 1010 .... 1111 .... 1000 .... @rndm - -# Note rn != rm is CONSTRAINED UNPREDICTABLE; we choose to ignore rn. -REV 1111 1010 1001 ---- 1111 .... 1000 .... @rdm -REV16 1111 1010 1001 ---- 1111 .... 1001 .... @rdm -RBIT 1111 1010 1001 ---- 1111 .... 1010 .... @rdm -REVSH 1111 1010 1001 ---- 1111 .... 1011 .... @rdm -CLZ 1111 1010 1011 ---- 1111 .... 1000 .... @rdm - -# Branches and miscellaneous control - -%msr_sysm 4:1 8:4 -%mrs_sysm 4:1 16:4 -%imm16_16_0 16:4 0:12 -%imm21 26:s1 11:1 13:1 16:6 0:11 !function=times_2 -&ci cond imm - -{ - # Group insn[25:23] = 111, which is cond=111x for the branch below, - # or unconditional, which would be illegal for the branch. - [ - # Hints, and CPS - { - [ - YIELD 1111 0011 1010 1111 1000 0000 0000 0001 - WFE 1111 0011 1010 1111 1000 0000 0000 0010 - WFI 1111 0011 1010 1111 1000 0000 0000 0011 - - # TODO: Implement SEV, SEVL; may help SMP performance. - # SEV 1111 0011 1010 1111 1000 0000 0000 0100 - # SEVL 1111 0011 1010 1111 1000 0000 0000 0101 - - ESB 1111 0011 1010 1111 1000 0000 0001 0000 - ] - - # The canonical nop ends in 0000 0000, but the whole rest - # of the space is "reserved hint, behaves as nop". - NOP 1111 0011 1010 1111 1000 0000 ---- ---- - - # If imod == '00' && M == '0' then SEE "Hint instructions", above. - CPS 1111 0011 1010 1111 1000 0 imod:2 M:1 A:1 I:1 F:1 mode:5 \ - &cps - } - - # Miscellaneous control - CLREX 1111 0011 1011 1111 1000 1111 0010 1111 - DSB 1111 0011 1011 1111 1000 1111 0100 ---- - DMB 1111 0011 1011 1111 1000 1111 0101 ---- - ISB 1111 0011 1011 1111 1000 1111 0110 ---- - SB 1111 0011 1011 1111 1000 1111 0111 0000 - - # Note that the v7m insn overlaps both the normal and banked insn. - { - MRS_bank 1111 0011 111 r:1 .... 1000 rd:4 001. 0000 \ - &mrs_bank sysm=%mrs_sysm - MRS_reg 1111 0011 111 r:1 1111 1000 rd:4 0000 0000 &mrs_reg - MRS_v7m 1111 0011 111 0 1111 1000 rd:4 sysm:8 - } - { - MSR_bank 1111 0011 100 r:1 rn:4 1000 .... 001. 0000 \ - &msr_bank sysm=%msr_sysm - MSR_reg 1111 0011 100 r:1 rn:4 1000 mask:4 0000 0000 &msr_reg - MSR_v7m 1111 0011 100 0 rn:4 1000 mask:2 00 sysm:8 - } - BXJ 1111 0011 1100 rm:4 1000 1111 0000 0000 &r - { - # At v6T2, this is the T5 encoding of SUBS PC, LR, #IMM, and works as for - # every other encoding of SUBS. With v7VE, IMM=0 is redefined as ERET. - # The distinction between the two only matters for Hyp mode. - ERET 1111 0011 1101 1110 1000 1111 0000 0000 - SUB_rri 1111 0011 1101 1110 1000 1111 imm:8 \ - &s_rri_rot rot=0 s=1 rd=15 rn=14 - } - SMC 1111 0111 1111 imm:4 1000 0000 0000 0000 &i - HVC 1111 0111 1110 .... 1000 .... .... .... \ - &i imm=%imm16_16_0 - UDF 1111 0111 1111 ---- 1010 ---- ---- ---- - ] - B_cond_thumb 1111 0. cond:4 ...... 10.0 ............ &ci imm=%imm21 -} - -# Load/store (register, immediate, literal) - -@ldst_rr .... .... .... rn:4 rt:4 ...... 
shimm:2 rm:4 \ - &ldst_rr p=1 w=0 u=1 shtype=0 -@ldst_ri_idx .... .... .... rn:4 rt:4 . p:1 u:1 . imm:8 \ - &ldst_ri w=1 -@ldst_ri_neg .... .... .... rn:4 rt:4 .... imm:8 \ - &ldst_ri p=1 w=0 u=0 -@ldst_ri_unp .... .... .... rn:4 rt:4 .... imm:8 \ - &ldst_ri p=1 w=0 u=1 -@ldst_ri_pos .... .... .... rn:4 rt:4 imm:12 \ - &ldst_ri p=1 w=0 u=1 -@ldst_ri_lit .... .... u:1 ... .... rt:4 imm:12 \ - &ldst_ri p=1 w=0 rn=15 - -STRB_rr 1111 1000 0000 .... .... 000000 .. .... @ldst_rr -STRB_ri 1111 1000 0000 .... .... 1..1 ........ @ldst_ri_idx -STRB_ri 1111 1000 0000 .... .... 1100 ........ @ldst_ri_neg -STRBT_ri 1111 1000 0000 .... .... 1110 ........ @ldst_ri_unp -STRB_ri 1111 1000 1000 .... .... ............ @ldst_ri_pos - -STRH_rr 1111 1000 0010 .... .... 000000 .. .... @ldst_rr -STRH_ri 1111 1000 0010 .... .... 1..1 ........ @ldst_ri_idx -STRH_ri 1111 1000 0010 .... .... 1100 ........ @ldst_ri_neg -STRHT_ri 1111 1000 0010 .... .... 1110 ........ @ldst_ri_unp -STRH_ri 1111 1000 1010 .... .... ............ @ldst_ri_pos - -STR_rr 1111 1000 0100 .... .... 000000 .. .... @ldst_rr -STR_ri 1111 1000 0100 .... .... 1..1 ........ @ldst_ri_idx -STR_ri 1111 1000 0100 .... .... 1100 ........ @ldst_ri_neg -STRT_ri 1111 1000 0100 .... .... 1110 ........ @ldst_ri_unp -STR_ri 1111 1000 1100 .... .... ............ @ldst_ri_pos - -# Note that Load, unsigned (literal) overlaps all other load encodings. -{ - { - NOP 1111 1000 -001 1111 1111 ------------ # PLD - LDRB_ri 1111 1000 .001 1111 .... ............ @ldst_ri_lit - } - { - NOP 1111 1000 1001 ---- 1111 ------------ # PLD - LDRB_ri 1111 1000 1001 .... .... ............ @ldst_ri_pos - } - LDRB_ri 1111 1000 0001 .... .... 1..1 ........ @ldst_ri_idx - { - NOP 1111 1000 0001 ---- 1111 1100 -------- # PLD - LDRB_ri 1111 1000 0001 .... .... 1100 ........ @ldst_ri_neg - } - LDRBT_ri 1111 1000 0001 .... .... 1110 ........ @ldst_ri_unp - { - NOP 1111 1000 0001 ---- 1111 000000 -- ---- # PLD - LDRB_rr 1111 1000 0001 .... .... 000000 .. .... @ldst_rr - } -} -{ - { - NOP 1111 1000 -011 1111 1111 ------------ # PLD - LDRH_ri 1111 1000 .011 1111 .... ............ @ldst_ri_lit - } - { - NOP 1111 1000 1011 ---- 1111 ------------ # PLDW - LDRH_ri 1111 1000 1011 .... .... ............ @ldst_ri_pos - } - LDRH_ri 1111 1000 0011 .... .... 1..1 ........ @ldst_ri_idx - { - NOP 1111 1000 0011 ---- 1111 1100 -------- # PLDW - LDRH_ri 1111 1000 0011 .... .... 1100 ........ @ldst_ri_neg - } - LDRHT_ri 1111 1000 0011 .... .... 1110 ........ @ldst_ri_unp - { - NOP 1111 1000 0011 ---- 1111 000000 -- ---- # PLDW - LDRH_rr 1111 1000 0011 .... .... 000000 .. .... @ldst_rr - } -} -{ - LDR_ri 1111 1000 .101 1111 .... ............ @ldst_ri_lit - LDR_ri 1111 1000 1101 .... .... ............ @ldst_ri_pos - LDR_ri 1111 1000 0101 .... .... 1..1 ........ @ldst_ri_idx - LDR_ri 1111 1000 0101 .... .... 1100 ........ @ldst_ri_neg - LDRT_ri 1111 1000 0101 .... .... 1110 ........ @ldst_ri_unp - LDR_rr 1111 1000 0101 .... .... 000000 .. .... @ldst_rr -} -# NOPs here are PLI. -{ - { - NOP 1111 1001 -001 1111 1111 ------------ - LDRSB_ri 1111 1001 .001 1111 .... ............ @ldst_ri_lit - } - { - NOP 1111 1001 1001 ---- 1111 ------------ - LDRSB_ri 1111 1001 1001 .... .... ............ @ldst_ri_pos - } - LDRSB_ri 1111 1001 0001 .... .... 1..1 ........ @ldst_ri_idx - { - NOP 1111 1001 0001 ---- 1111 1100 -------- - LDRSB_ri 1111 1001 0001 .... .... 1100 ........ @ldst_ri_neg - } - LDRSBT_ri 1111 1001 0001 .... .... 1110 ........ 
@ldst_ri_unp - { - NOP 1111 1001 0001 ---- 1111 000000 -- ---- - LDRSB_rr 1111 1001 0001 .... .... 000000 .. .... @ldst_rr - } -} -# NOPs here are unallocated memory hints, treated as NOP. -{ - { - NOP 1111 1001 -011 1111 1111 ------------ - LDRSH_ri 1111 1001 .011 1111 .... ............ @ldst_ri_lit - } - { - NOP 1111 1001 1011 ---- 1111 ------------ - LDRSH_ri 1111 1001 1011 .... .... ............ @ldst_ri_pos - } - LDRSH_ri 1111 1001 0011 .... .... 1..1 ........ @ldst_ri_idx - { - NOP 1111 1001 0011 ---- 1111 1100 -------- - LDRSH_ri 1111 1001 0011 .... .... 1100 ........ @ldst_ri_neg - } - LDRSHT_ri 1111 1001 0011 .... .... 1110 ........ @ldst_ri_unp - { - NOP 1111 1001 0011 ---- 1111 000000 -- ---- - LDRSH_rr 1111 1001 0011 .... .... 000000 .. .... @ldst_rr - } -} - -%imm8x4 0:8 !function=times_4 -&ldst_ri2 p w u rn rt rt2 imm -@ldstd_ri8 .... .... u:1 ... rn:4 rt:4 rt2:4 ........ \ - &ldst_ri2 imm=%imm8x4 - -STRD_ri_t32 1110 1000 .110 .... .... .... ........ @ldstd_ri8 w=1 p=0 -LDRD_ri_t32 1110 1000 .111 .... .... .... ........ @ldstd_ri8 w=1 p=0 - -STRD_ri_t32 1110 1001 .100 .... .... .... ........ @ldstd_ri8 w=0 p=1 -LDRD_ri_t32 1110 1001 .101 .... .... .... ........ @ldstd_ri8 w=0 p=1 - -STRD_ri_t32 1110 1001 .110 .... .... .... ........ @ldstd_ri8 w=1 p=1 -{ - SG 1110 1001 0111 1111 1110 1001 01111111 - LDRD_ri_t32 1110 1001 .111 .... .... .... ........ @ldstd_ri8 w=1 p=1 -} - -# Load/Store Exclusive, Load-Acquire/Store-Release, and Table Branch - -@strex_i .... .... .... rn:4 rt:4 rd:4 .... .... \ - &strex rt2=15 imm=%imm8x4 -@strex_0 .... .... .... rn:4 rt:4 .... .... rd:4 \ - &strex rt2=15 imm=0 -@strex_d .... .... .... rn:4 rt:4 rt2:4 .... rd:4 \ - &strex imm=0 - -@ldrex_i .... .... .... rn:4 rt:4 .... .... .... \ - &ldrex rt2=15 imm=%imm8x4 -@ldrex_0 .... .... .... rn:4 rt:4 .... .... .... \ - &ldrex rt2=15 imm=0 -@ldrex_d .... .... .... rn:4 rt:4 rt2:4 .... .... \ - &ldrex imm=0 - -{ - TT 1110 1000 0100 rn:4 1111 rd:4 A:1 T:1 000000 - STREX 1110 1000 0100 .... .... .... .... .... @strex_i -} -STREXB 1110 1000 1100 .... .... 1111 0100 .... @strex_0 -STREXH 1110 1000 1100 .... .... 1111 0101 .... @strex_0 -STREXD_t32 1110 1000 1100 .... .... .... 0111 .... @strex_d - -STLEX 1110 1000 1100 .... .... 1111 1110 .... @strex_0 -STLEXB 1110 1000 1100 .... .... 1111 1100 .... @strex_0 -STLEXH 1110 1000 1100 .... .... 1111 1101 .... @strex_0 -STLEXD_t32 1110 1000 1100 .... .... .... 1111 .... @strex_d - -STL 1110 1000 1100 .... .... 1111 1010 1111 @ldrex_0 -STLB 1110 1000 1100 .... .... 1111 1000 1111 @ldrex_0 -STLH 1110 1000 1100 .... .... 1111 1001 1111 @ldrex_0 - -LDREX 1110 1000 0101 .... .... 1111 .... .... @ldrex_i -LDREXB 1110 1000 1101 .... .... 1111 0100 1111 @ldrex_0 -LDREXH 1110 1000 1101 .... .... 1111 0101 1111 @ldrex_0 -LDREXD_t32 1110 1000 1101 .... .... .... 0111 1111 @ldrex_d - -LDAEX 1110 1000 1101 .... .... 1111 1110 1111 @ldrex_0 -LDAEXB 1110 1000 1101 .... .... 1111 1100 1111 @ldrex_0 -LDAEXH 1110 1000 1101 .... .... 1111 1101 1111 @ldrex_0 -LDAEXD_t32 1110 1000 1101 .... .... .... 1111 1111 @ldrex_d - -LDA 1110 1000 1101 .... .... 1111 1010 1111 @ldrex_0 -LDAB 1110 1000 1101 .... .... 1111 1000 1111 @ldrex_0 -LDAH 1110 1000 1101 .... .... 1111 1001 1111 @ldrex_0 - -&tbranch rn rm -@tbranch .... .... .... rn:4 .... .... .... rm:4 &tbranch - -TBB 1110 1000 1101 .... 1111 0000 0000 .... @tbranch -TBH 1110 1000 1101 .... 1111 0000 0001 .... @tbranch - -# Parallel addition and subtraction - -SADD8 1111 1010 1000 .... 1111 .... 0000 .... 
@rndm -QADD8 1111 1010 1000 .... 1111 .... 0001 .... @rndm -SHADD8 1111 1010 1000 .... 1111 .... 0010 .... @rndm -UADD8 1111 1010 1000 .... 1111 .... 0100 .... @rndm -UQADD8 1111 1010 1000 .... 1111 .... 0101 .... @rndm -UHADD8 1111 1010 1000 .... 1111 .... 0110 .... @rndm - -SADD16 1111 1010 1001 .... 1111 .... 0000 .... @rndm -QADD16 1111 1010 1001 .... 1111 .... 0001 .... @rndm -SHADD16 1111 1010 1001 .... 1111 .... 0010 .... @rndm -UADD16 1111 1010 1001 .... 1111 .... 0100 .... @rndm -UQADD16 1111 1010 1001 .... 1111 .... 0101 .... @rndm -UHADD16 1111 1010 1001 .... 1111 .... 0110 .... @rndm - -SASX 1111 1010 1010 .... 1111 .... 0000 .... @rndm -QASX 1111 1010 1010 .... 1111 .... 0001 .... @rndm -SHASX 1111 1010 1010 .... 1111 .... 0010 .... @rndm -UASX 1111 1010 1010 .... 1111 .... 0100 .... @rndm -UQASX 1111 1010 1010 .... 1111 .... 0101 .... @rndm -UHASX 1111 1010 1010 .... 1111 .... 0110 .... @rndm - -SSUB8 1111 1010 1100 .... 1111 .... 0000 .... @rndm -QSUB8 1111 1010 1100 .... 1111 .... 0001 .... @rndm -SHSUB8 1111 1010 1100 .... 1111 .... 0010 .... @rndm -USUB8 1111 1010 1100 .... 1111 .... 0100 .... @rndm -UQSUB8 1111 1010 1100 .... 1111 .... 0101 .... @rndm -UHSUB8 1111 1010 1100 .... 1111 .... 0110 .... @rndm - -SSUB16 1111 1010 1101 .... 1111 .... 0000 .... @rndm -QSUB16 1111 1010 1101 .... 1111 .... 0001 .... @rndm -SHSUB16 1111 1010 1101 .... 1111 .... 0010 .... @rndm -USUB16 1111 1010 1101 .... 1111 .... 0100 .... @rndm -UQSUB16 1111 1010 1101 .... 1111 .... 0101 .... @rndm -UHSUB16 1111 1010 1101 .... 1111 .... 0110 .... @rndm - -SSAX 1111 1010 1110 .... 1111 .... 0000 .... @rndm -QSAX 1111 1010 1110 .... 1111 .... 0001 .... @rndm -SHSAX 1111 1010 1110 .... 1111 .... 0010 .... @rndm -USAX 1111 1010 1110 .... 1111 .... 0100 .... @rndm -UQSAX 1111 1010 1110 .... 1111 .... 0101 .... @rndm -UHSAX 1111 1010 1110 .... 1111 .... 0110 .... @rndm - -# Register extends - -@rrr_rot .... .... .... rn:4 .... rd:4 .. rot:2 rm:4 &rrr_rot - -SXTAH 1111 1010 0000 .... 1111 .... 10.. .... @rrr_rot -UXTAH 1111 1010 0001 .... 1111 .... 10.. .... @rrr_rot -SXTAB16 1111 1010 0010 .... 1111 .... 10.. .... @rrr_rot -UXTAB16 1111 1010 0011 .... 1111 .... 10.. .... @rrr_rot -SXTAB 1111 1010 0100 .... 1111 .... 10.. .... @rrr_rot -UXTAB 1111 1010 0101 .... 1111 .... 10.. .... @rrr_rot - -# Load/store multiple - -@ldstm .... .... .. w:1 . rn:4 list:16 &ldst_block u=0 - -STM_t32 1110 1000 10.0 .... ................ @ldstm i=1 b=0 -STM_t32 1110 1001 00.0 .... ................ @ldstm i=0 b=1 -{ - # Rn=15 UNDEFs for LDM; M-profile CLRM uses that encoding - CLRM 1110 1000 1001 1111 list:16 - LDM_t32 1110 1000 10.1 .... ................ @ldstm i=1 b=0 -} -LDM_t32 1110 1001 00.1 .... ................ @ldstm i=0 b=1 - -&rfe !extern rn w pu -@rfe .... .... .. w:1 . rn:4 ................ &rfe - -RFE 1110 1000 00.1 .... 1100000000000000 @rfe pu=2 -RFE 1110 1001 10.1 .... 1100000000000000 @rfe pu=1 - -&srs !extern mode w pu -@srs .... .... .. w:1 . .... ........... mode:5 &srs - -SRS 1110 1000 00.0 1101 1100 0000 000. .... @srs pu=2 -SRS 1110 1001 10.0 1101 1100 0000 000. .... @srs pu=1 - -# Coprocessor instructions - -# We decode MCR, MCR, MRRC and MCRR only, because for QEMU the -# other coprocessor instructions always UNDEF. -# The trans_ functions for these will ignore cp values 8..13 for v7 or -# earlier, and 0..13 for v8 and later, because those areas of the -# encoding space may be used for other things, such as VFP or Neon. - -@mcr .... .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4 -@mcrr .... .... .... 
rt2:4 rt:4 cp:4 opc1:4 crm:4 - -MCRR 1110 1100 0100 .... .... .... .... .... @mcrr -MRRC 1110 1100 0101 .... .... .... .... .... @mcrr - -MCR 1110 1110 ... 0 .... .... .... ... 1 .... @mcr -MRC 1110 1110 ... 1 .... .... .... ... 1 .... @mcr - -# Branches - -%imm24 26:s1 13:1 11:1 16:10 0:11 !function=t32_branch24 -@branch24 ................................ &i imm=%imm24 - -B 1111 0. .......... 10.1 ............ @branch24 -BL 1111 0. .......... 11.1 ............ @branch24 -{ - # BLX_i is non-M-profile only - BLX_i 1111 0. .......... 11.0 ............ @branch24 - # M-profile only: loop and branch insns - [ - # All these BF insns have boff != 0b0000; we NOP them all - BF 1111 0 boff:4 ------- 1100 - ---------- 1 # BFL - BF 1111 0 boff:4 0 ------ 1110 - ---------- 1 # BFCSEL - BF 1111 0 boff:4 10 ----- 1110 - ---------- 1 # BF - BF 1111 0 boff:4 11 ----- 1110 0 0000000000 1 # BFX, BFLX - ] - [ - # LE and WLS immediate - %lob_imm 1:10 11:1 !function=times_2 - - DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001 size=4 - WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm size=4 - { - LE 1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm - # This is WLSTP - WLS 1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm - } - { - LCTP 1111 0 0000 000 1111 1110 0000 0000 0001 - # This is DLSTP - DLS 1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001 - } - VCTP 1111 0 0000 0 size:2 rn:4 1110 1000 0000 0001 - ] -} diff --git a/target/arm/tcg/a32-uncond.decode b/target/arm/tcg/a32-uncond.decode new file mode 100644 index 0000000..2339de2 --- /dev/null +++ b/target/arm/tcg/a32-uncond.decode @@ -0,0 +1,74 @@ +# A32 unconditional instructions +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# +# All insns that have 0xf in insn[31:28] are decoded here. +# All of those that have a COND field in insn[31:28] are in a32.decode +# + +&empty !extern +&i !extern imm +&setend E + +# Branch with Link and Exchange + +%imm24h 0:s24 24:1 !function=times_2 + +BLX_i 1111 101 . ........................ &i imm=%imm24h + +# System Instructions + +&rfe rn w pu +&srs mode w pu +&cps mode imod M A I F + +RFE 1111 100 pu:2 0 w:1 1 rn:4 0000 1010 0000 0000 &rfe +SRS 1111 100 pu:2 1 w:1 0 1101 0000 0101 000 mode:5 &srs +CPS 1111 0001 0000 imod:2 M:1 0 0000 000 A:1 I:1 F:1 0 mode:5 \ + &cps + +# Clear-Exclusive, Barriers + +# QEMU does not require the option field for the barriers. 
+CLREX 1111 0101 0111 1111 1111 0000 0001 1111 +DSB 1111 0101 0111 1111 1111 0000 0100 ---- +DMB 1111 0101 0111 1111 1111 0000 0101 ---- +ISB 1111 0101 0111 1111 1111 0000 0110 ---- +SB 1111 0101 0111 1111 1111 0000 0111 0000 + +# Set Endianness +SETEND 1111 0001 0000 0001 0000 00 E:1 0 0000 0000 &setend + +# Preload instructions + +PLD 1111 0101 -101 ---- 1111 ---- ---- ---- # (imm, lit) 5te +PLDW 1111 0101 -001 ---- 1111 ---- ---- ---- # (imm, lit) 7mp +PLI 1111 0100 -101 ---- 1111 ---- ---- ---- # (imm, lit) 7 + +PLD 1111 0111 -101 ---- 1111 ----- -- 0 ---- # (register) 5te +PLDW 1111 0111 -001 ---- 1111 ----- -- 0 ---- # (register) 7mp +PLI 1111 0110 -101 ---- 1111 ----- -- 0 ---- # (register) 7 + +# Unallocated memory hints +# +# Since these are v7MP nops, and PLDW is v7MP and implemented as nop, +# (ab)use the PLDW helper. + +PLDW 1111 0100 -001 ---- ---- ---- ---- ---- +PLDW 1111 0110 -001 ---- ---- ---- ---0 ---- diff --git a/target/arm/tcg/a32.decode b/target/arm/tcg/a32.decode new file mode 100644 index 0000000..f2ca480 --- /dev/null +++ b/target/arm/tcg/a32.decode @@ -0,0 +1,557 @@ +# A32 conditional instructions +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# +# All of the insn that have a COND field in insn[31:28] are here. +# All insns that have 0xf in insn[31:28] are in a32-uncond.decode. +# + +&empty +&s_rrr_shi s rd rn rm shim shty +&s_rrr_shr s rn rd rm rs shty +&s_rri_rot s rn rd imm rot +&s_rrrr s rd rn rm ra +&rrrr rd rn rm ra +&rrr_rot rd rn rm rot +&rrr rd rn rm +&rr rd rm +&ri rd imm +&r rm +&i imm +&msr_reg rn r mask +&mrs_reg rd r +&msr_bank rn r sysm +&mrs_bank rd r sysm +&ldst_rr p w u rn rt rm shimm shtype +&ldst_ri p w u rn rt imm +&ldst_block rn i b u w list +&strex rn rd rt rt2 imm +&ldrex rn rt rt2 imm +&bfx rd rn lsb widthm1 +&bfi rd rn lsb msb +&sat rd rn satimm imm sh +&pkh rd rn rm imm tb +&mcr cp opc1 crn crm opc2 rt +&mcrr cp opc1 crm rt rt2 + +# Data-processing (register) + +@s_rrr_shi ---- ... .... s:1 rn:4 rd:4 shim:5 shty:2 . rm:4 \ + &s_rrr_shi +@s_rxr_shi ---- ... .... s:1 .... rd:4 shim:5 shty:2 . rm:4 \ + &s_rrr_shi rn=0 +@S_xrr_shi ---- ... .... . rn:4 .... shim:5 shty:2 . rm:4 \ + &s_rrr_shi s=1 rd=0 + +AND_rrri .... 000 0000 . .... .... ..... .. 0 .... @s_rrr_shi +EOR_rrri .... 000 0001 . .... .... ..... .. 0 .... @s_rrr_shi +SUB_rrri .... 000 0010 . .... .... ..... .. 0 .... @s_rrr_shi +RSB_rrri .... 000 0011 . .... .... ..... .. 0 .... @s_rrr_shi +ADD_rrri .... 000 0100 . .... .... ..... .. 0 .... @s_rrr_shi +ADC_rrri .... 000 0101 . .... .... ..... .. 0 .... @s_rrr_shi +SBC_rrri .... 000 0110 . .... .... ..... .. 0 .... @s_rrr_shi +RSC_rrri .... 000 0111 . .... .... ..... .. 0 .... @s_rrr_shi +TST_xrri .... 000 1000 1 .... 0000 ..... .. 0 .... @S_xrr_shi +TEQ_xrri .... 000 1001 1 .... 0000 ..... .. 0 .... @S_xrr_shi +CMP_xrri .... 000 1010 1 .... 0000 ..... .. 
0 .... @S_xrr_shi +CMN_xrri .... 000 1011 1 .... 0000 ..... .. 0 .... @S_xrr_shi +ORR_rrri .... 000 1100 . .... .... ..... .. 0 .... @s_rrr_shi +MOV_rxri .... 000 1101 . 0000 .... ..... .. 0 .... @s_rxr_shi +BIC_rrri .... 000 1110 . .... .... ..... .. 0 .... @s_rrr_shi +MVN_rxri .... 000 1111 . 0000 .... ..... .. 0 .... @s_rxr_shi + +%imm16 16:4 0:12 +@mov16 ---- .... .... .... rd:4 ............ &ri imm=%imm16 + +MOVW .... 0011 0000 .... .... ............ @mov16 +MOVT .... 0011 0100 .... .... ............ @mov16 + +# Data-processing (register-shifted register) + +@s_rrr_shr ---- ... .... s:1 rn:4 rd:4 rs:4 . shty:2 . rm:4 \ + &s_rrr_shr +@s_rxr_shr ---- ... .... s:1 .... rd:4 rs:4 . shty:2 . rm:4 \ + &s_rrr_shr rn=0 +@S_xrr_shr ---- ... .... . rn:4 .... rs:4 . shty:2 . rm:4 \ + &s_rrr_shr rd=0 s=1 + +AND_rrrr .... 000 0000 . .... .... .... 0 .. 1 .... @s_rrr_shr +EOR_rrrr .... 000 0001 . .... .... .... 0 .. 1 .... @s_rrr_shr +SUB_rrrr .... 000 0010 . .... .... .... 0 .. 1 .... @s_rrr_shr +RSB_rrrr .... 000 0011 . .... .... .... 0 .. 1 .... @s_rrr_shr +ADD_rrrr .... 000 0100 . .... .... .... 0 .. 1 .... @s_rrr_shr +ADC_rrrr .... 000 0101 . .... .... .... 0 .. 1 .... @s_rrr_shr +SBC_rrrr .... 000 0110 . .... .... .... 0 .. 1 .... @s_rrr_shr +RSC_rrrr .... 000 0111 . .... .... .... 0 .. 1 .... @s_rrr_shr +TST_xrrr .... 000 1000 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +TEQ_xrrr .... 000 1001 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +CMP_xrrr .... 000 1010 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +CMN_xrrr .... 000 1011 1 .... 0000 .... 0 .. 1 .... @S_xrr_shr +ORR_rrrr .... 000 1100 . .... .... .... 0 .. 1 .... @s_rrr_shr +MOV_rxrr .... 000 1101 . 0000 .... .... 0 .. 1 .... @s_rxr_shr +BIC_rrrr .... 000 1110 . .... .... .... 0 .. 1 .... @s_rrr_shr +MVN_rxrr .... 000 1111 . 0000 .... .... 0 .. 1 .... @s_rxr_shr + +# Data-processing (immediate) + +%a32extrot 8:4 !function=times_2 + +@s_rri_rot ---- ... .... s:1 rn:4 rd:4 .... imm:8 \ + &s_rri_rot rot=%a32extrot +@s_rxi_rot ---- ... .... s:1 .... rd:4 .... imm:8 \ + &s_rri_rot rot=%a32extrot rn=0 +@S_xri_rot ---- ... .... . rn:4 .... .... imm:8 \ + &s_rri_rot rot=%a32extrot rd=0 s=1 + +AND_rri .... 001 0000 . .... .... ............ @s_rri_rot +EOR_rri .... 001 0001 . .... .... ............ @s_rri_rot +SUB_rri .... 001 0010 . .... .... ............ @s_rri_rot +RSB_rri .... 001 0011 . .... .... ............ @s_rri_rot +ADD_rri .... 001 0100 . .... .... ............ @s_rri_rot +ADC_rri .... 001 0101 . .... .... ............ @s_rri_rot +SBC_rri .... 001 0110 . .... .... ............ @s_rri_rot +RSC_rri .... 001 0111 . .... .... ............ @s_rri_rot +TST_xri .... 001 1000 1 .... 0000 ............ @S_xri_rot +TEQ_xri .... 001 1001 1 .... 0000 ............ @S_xri_rot +CMP_xri .... 001 1010 1 .... 0000 ............ @S_xri_rot +CMN_xri .... 001 1011 1 .... 0000 ............ @S_xri_rot +ORR_rri .... 001 1100 . .... .... ............ @s_rri_rot +MOV_rxi .... 001 1101 . 0000 .... ............ @s_rxi_rot +BIC_rri .... 001 1110 . .... .... ............ @s_rri_rot +MVN_rxi .... 001 1111 . 0000 .... ............ @s_rxi_rot + +# Multiply and multiply accumulate + +@s_rdamn ---- .... ... s:1 rd:4 ra:4 rm:4 .... rn:4 &s_rrrr +@s_rd0mn ---- .... ... s:1 rd:4 .... rm:4 .... rn:4 &s_rrrr ra=0 +@rdamn ---- .... ... . rd:4 ra:4 rm:4 .... rn:4 &rrrr +@rd0mn ---- .... ... . rd:4 .... rm:4 .... rn:4 &rrrr ra=0 + +MUL .... 0000 000 . .... 0000 .... 1001 .... @s_rd0mn +MLA .... 0000 001 . .... .... .... 1001 .... @s_rdamn +UMAAL .... 0000 010 0 .... .... .... 1001 .... 
@rdamn +MLS .... 0000 011 0 .... .... .... 1001 .... @rdamn +UMULL .... 0000 100 . .... .... .... 1001 .... @s_rdamn +UMLAL .... 0000 101 . .... .... .... 1001 .... @s_rdamn +SMULL .... 0000 110 . .... .... .... 1001 .... @s_rdamn +SMLAL .... 0000 111 . .... .... .... 1001 .... @s_rdamn + +# Saturating addition and subtraction + +@rndm ---- .... .... rn:4 rd:4 .... .... rm:4 &rrr + +QADD .... 0001 0000 .... .... 0000 0101 .... @rndm +QSUB .... 0001 0010 .... .... 0000 0101 .... @rndm +QDADD .... 0001 0100 .... .... 0000 0101 .... @rndm +QDSUB .... 0001 0110 .... .... 0000 0101 .... @rndm + +# Halfword multiply and multiply accumulate + +SMLABB .... 0001 0000 .... .... .... 1000 .... @rdamn +SMLABT .... 0001 0000 .... .... .... 1100 .... @rdamn +SMLATB .... 0001 0000 .... .... .... 1010 .... @rdamn +SMLATT .... 0001 0000 .... .... .... 1110 .... @rdamn +SMLAWB .... 0001 0010 .... .... .... 1000 .... @rdamn +SMULWB .... 0001 0010 .... 0000 .... 1010 .... @rd0mn +SMLAWT .... 0001 0010 .... .... .... 1100 .... @rdamn +SMULWT .... 0001 0010 .... 0000 .... 1110 .... @rd0mn +SMLALBB .... 0001 0100 .... .... .... 1000 .... @rdamn +SMLALBT .... 0001 0100 .... .... .... 1100 .... @rdamn +SMLALTB .... 0001 0100 .... .... .... 1010 .... @rdamn +SMLALTT .... 0001 0100 .... .... .... 1110 .... @rdamn +SMULBB .... 0001 0110 .... 0000 .... 1000 .... @rd0mn +SMULBT .... 0001 0110 .... 0000 .... 1100 .... @rd0mn +SMULTB .... 0001 0110 .... 0000 .... 1010 .... @rd0mn +SMULTT .... 0001 0110 .... 0000 .... 1110 .... @rd0mn + +# MSR (immediate) and hints + +&msr_i r mask rot imm +@msr_i ---- .... .... mask:4 .... rot:4 imm:8 &msr_i + +{ + { + [ + YIELD ---- 0011 0010 0000 1111 ---- 0000 0001 + WFE ---- 0011 0010 0000 1111 ---- 0000 0010 + WFI ---- 0011 0010 0000 1111 ---- 0000 0011 + + # TODO: Implement SEV, SEVL; may help SMP performance. + # SEV ---- 0011 0010 0000 1111 ---- 0000 0100 + # SEVL ---- 0011 0010 0000 1111 ---- 0000 0101 + + ESB ---- 0011 0010 0000 1111 ---- 0001 0000 + ] + + # The canonical nop ends in 00000000, but the whole of the + # rest of the space executes as nop if otherwise unsupported. + NOP ---- 0011 0010 0000 1111 ---- ---- ---- + } + # Note mask = 0 is covered by NOP + MSR_imm .... 0011 0010 .... 1111 .... .... .... @msr_i r=0 +} +MSR_imm .... 0011 0110 .... 1111 .... .... .... @msr_i r=1 + +# Cyclic Redundancy Check + +CRC32B .... 0001 0000 .... .... 0000 0100 .... @rndm +CRC32H .... 0001 0010 .... .... 0000 0100 .... @rndm +CRC32W .... 0001 0100 .... .... 0000 0100 .... @rndm +CRC32CB .... 0001 0000 .... .... 0010 0100 .... @rndm +CRC32CH .... 0001 0010 .... .... 0010 0100 .... @rndm +CRC32CW .... 0001 0100 .... .... 0010 0100 .... @rndm + +# Miscellaneous instructions + +%sysm 8:1 16:4 +%imm16_8_0 8:12 0:4 + +@rm ---- .... .... .... .... .... .... rm:4 &r +@rdm ---- .... .... .... rd:4 .... .... rm:4 &rr +@i16 ---- .... .... .... .... .... .... .... &i imm=%imm16_8_0 + +MRS_bank ---- 0001 0 r:1 00 .... rd:4 001. 0000 0000 &mrs_bank %sysm +MSR_bank ---- 0001 0 r:1 10 .... 1111 001. 0000 rn:4 &msr_bank %sysm + +MRS_reg ---- 0001 0 r:1 00 1111 rd:4 0000 0000 0000 &mrs_reg +MSR_reg ---- 0001 0 r:1 10 mask:4 1111 0000 0000 rn:4 &msr_reg + +BX .... 0001 0010 1111 1111 1111 0001 .... @rm +BXJ .... 0001 0010 1111 1111 1111 0010 .... @rm +BLX_r .... 0001 0010 1111 1111 1111 0011 .... @rm + +CLZ .... 0001 0110 1111 .... 1111 0001 .... @rdm + +ERET ---- 0001 0110 0000 0000 0000 0110 1110 + +HLT .... 0001 0000 .... .... .... 0111 .... @i16 +BKPT .... 0001 0010 .... .... .... 0111 .... 
@i16 +HVC .... 0001 0100 .... .... .... 0111 .... @i16 +SMC ---- 0001 0110 0000 0000 0000 0111 imm:4 &i + +# Load/Store Dual, Half, Signed Byte (register) + +@ldst_rr_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... rm:4 \ + &ldst_rr p=1 shimm=0 shtype=0 +@ldst_rr_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... rm:4 \ + &ldst_rr p=0 w=0 shimm=0 shtype=0 + +STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_pw0 +STRH_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p1w + +LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_pw0 +LDRD_rr .... 000. .0.0 .... .... 0000 1101 .... @ldst_rr_p1w + +STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_pw0 +STRD_rr .... 000. .0.0 .... .... 0000 1111 .... @ldst_rr_p1w + +LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_pw0 +LDRH_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p1w + +LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_pw0 +LDRSB_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p1w + +LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_pw0 +LDRSH_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p1w + +# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding, +# and act as normal post-indexed (P=0, W=0). +@ldst_rr_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... rm:4 \ + &ldst_rr p=0 w=0 shimm=0 shtype=0 + +STRHT_rr .... 000. .0.0 .... .... 0000 1011 .... @ldst_rr_p0w1 +LDRHT_rr .... 000. .0.1 .... .... 0000 1011 .... @ldst_rr_p0w1 +LDRSBT_rr .... 000. .0.1 .... .... 0000 1101 .... @ldst_rr_p0w1 +LDRSHT_rr .... 000. .0.1 .... .... 0000 1111 .... @ldst_rr_p0w1 + +# Load/Store word and unsigned byte (register) + +@ldst_rs_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ + &ldst_rr p=1 +@ldst_rs_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ + &ldst_rr p=0 w=0 + +STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_pw0 +STR_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p1w +STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_pw0 +STRB_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p1w + +LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_pw0 +LDR_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p1w +LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_pw0 +LDRB_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p1w + +@ldst_rs_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 shimm:5 shtype:2 . rm:4 \ + &ldst_rr p=0 w=0 + +STRT_rr .... 011. .0.0 .... .... .... ...0 .... @ldst_rs_p0w1 +STRBT_rr .... 011. .1.0 .... .... .... ...0 .... @ldst_rs_p0w1 +LDRT_rr .... 011. .0.1 .... .... .... ...0 .... @ldst_rs_p0w1 +LDRBT_rr .... 011. .1.1 .... .... .... ...0 .... @ldst_rs_p0w1 + +# Load/Store Dual, Half, Signed Byte (immediate) + +%imm8s_8_0 8:4 0:4 +@ldst_ri8_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 .... .... .... \ + &ldst_ri imm=%imm8s_8_0 p=1 +@ldst_ri8_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 .... .... .... \ + &ldst_ri imm=%imm8s_8_0 p=0 w=0 + +STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_pw0 +STRH_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p1w + +LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_pw0 +LDRD_ri_a32 .... 000. .1.0 .... .... .... 1101 .... @ldst_ri8_p1w + +STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_pw0 +STRD_ri_a32 .... 000. .1.0 .... .... .... 1111 .... @ldst_ri8_p1w + +LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_pw0 +LDRH_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p1w + +LDRSB_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_pw0 +LDRSB_ri .... 000. .1.1 .... 
.... .... 1101 .... @ldst_ri8_p1w + +LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_pw0 +LDRSH_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p1w + +# Note the unpriv load/stores use the previously invalid P=0, W=1 encoding, +# and act as normal post-indexed (P=0, W=0). +@ldst_ri8_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 .... .... .... \ + &ldst_ri imm=%imm8s_8_0 p=0 w=0 + +STRHT_ri .... 000. .1.0 .... .... .... 1011 .... @ldst_ri8_p0w1 +LDRHT_ri .... 000. .1.1 .... .... .... 1011 .... @ldst_ri8_p0w1 +LDRSBT_ri .... 000. .1.1 .... .... .... 1101 .... @ldst_ri8_p0w1 +LDRSHT_ri .... 000. .1.1 .... .... .... 1111 .... @ldst_ri8_p0w1 + +# Load/Store word and unsigned byte (immediate) + +@ldst_ri12_p1w ---- ...1 u:1 . w:1 . rn:4 rt:4 imm:12 &ldst_ri p=1 +@ldst_ri12_pw0 ---- ...0 u:1 . 0 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0 + +STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p1w +STR_ri .... 010. .0.0 .... .... ............ @ldst_ri12_pw0 +STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p1w +STRB_ri .... 010. .1.0 .... .... ............ @ldst_ri12_pw0 + +LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p1w +LDR_ri .... 010. .0.1 .... .... ............ @ldst_ri12_pw0 +LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p1w +LDRB_ri .... 010. .1.1 .... .... ............ @ldst_ri12_pw0 + +@ldst_ri12_p0w1 ---- ...0 u:1 . 1 . rn:4 rt:4 imm:12 &ldst_ri p=0 w=0 + +STRT_ri .... 010. .0.0 .... .... ............ @ldst_ri12_p0w1 +STRBT_ri .... 010. .1.0 .... .... ............ @ldst_ri12_p0w1 +LDRT_ri .... 010. .0.1 .... .... ............ @ldst_ri12_p0w1 +LDRBT_ri .... 010. .1.1 .... .... ............ @ldst_ri12_p0w1 + +# Synchronization primitives + +@swp ---- .... .... rn:4 rt:4 .... .... rt2:4 + +SWP .... 0001 0000 .... .... 0000 1001 .... @swp +SWPB .... 0001 0100 .... .... 0000 1001 .... @swp + +# Load/Store Exclusive and Load-Acquire/Store-Release +# +# Note rt2 for STREXD/LDREXD is set by the helper after checking rt is even. + +@strex ---- .... .... rn:4 rd:4 .... .... rt:4 \ + &strex imm=0 rt2=15 +@ldrex ---- .... .... rn:4 rt:4 .... .... .... \ + &ldrex imm=0 rt2=15 +@stl ---- .... .... rn:4 .... .... .... rt:4 \ + &ldrex imm=0 rt2=15 + +STREX .... 0001 1000 .... .... 1111 1001 .... @strex +STREXD_a32 .... 0001 1010 .... .... 1111 1001 .... @strex +STREXB .... 0001 1100 .... .... 1111 1001 .... @strex +STREXH .... 0001 1110 .... .... 1111 1001 .... @strex + +STLEX .... 0001 1000 .... .... 1110 1001 .... @strex +STLEXD_a32 .... 0001 1010 .... .... 1110 1001 .... @strex +STLEXB .... 0001 1100 .... .... 1110 1001 .... @strex +STLEXH .... 0001 1110 .... .... 1110 1001 .... @strex + +STL .... 0001 1000 .... 1111 1100 1001 .... @stl +STLB .... 0001 1100 .... 1111 1100 1001 .... @stl +STLH .... 0001 1110 .... 1111 1100 1001 .... @stl + +LDREX .... 0001 1001 .... .... 1111 1001 1111 @ldrex +LDREXD_a32 .... 0001 1011 .... .... 1111 1001 1111 @ldrex +LDREXB .... 0001 1101 .... .... 1111 1001 1111 @ldrex +LDREXH .... 0001 1111 .... .... 1111 1001 1111 @ldrex + +LDAEX .... 0001 1001 .... .... 1110 1001 1111 @ldrex +LDAEXD_a32 .... 0001 1011 .... .... 1110 1001 1111 @ldrex +LDAEXB .... 0001 1101 .... .... 1110 1001 1111 @ldrex +LDAEXH .... 0001 1111 .... .... 1110 1001 1111 @ldrex + +LDA .... 0001 1001 .... .... 1100 1001 1111 @ldrex +LDAB .... 0001 1101 .... .... 1100 1001 1111 @ldrex +LDAH .... 0001 1111 .... .... 1100 1001 1111 @ldrex + +# Media instructions + +# usad8 is usada8 w/ ra=15 +USADA8 ---- 0111 1000 rd:4 ra:4 rm:4 0001 rn:4 + +# ubfx and sbfx +@bfx ---- .... 
... widthm1:5 rd:4 lsb:5 ... rn:4 &bfx + +SBFX .... 0111 101 ..... .... ..... 101 .... @bfx +UBFX .... 0111 111 ..... .... ..... 101 .... @bfx + +# bfc is bfi w/ rn=15 +BFCI ---- 0111 110 msb:5 rd:4 lsb:5 001 rn:4 &bfi + +# While we could get UDEF by not including this, add the pattern for +# documentation and to conflict with any other typos in this file. +UDF 1110 0111 1111 ---- ---- ---- 1111 ---- + +# Parallel addition and subtraction + +SADD16 .... 0110 0001 .... .... 1111 0001 .... @rndm +SASX .... 0110 0001 .... .... 1111 0011 .... @rndm +SSAX .... 0110 0001 .... .... 1111 0101 .... @rndm +SSUB16 .... 0110 0001 .... .... 1111 0111 .... @rndm +SADD8 .... 0110 0001 .... .... 1111 1001 .... @rndm +SSUB8 .... 0110 0001 .... .... 1111 1111 .... @rndm + +QADD16 .... 0110 0010 .... .... 1111 0001 .... @rndm +QASX .... 0110 0010 .... .... 1111 0011 .... @rndm +QSAX .... 0110 0010 .... .... 1111 0101 .... @rndm +QSUB16 .... 0110 0010 .... .... 1111 0111 .... @rndm +QADD8 .... 0110 0010 .... .... 1111 1001 .... @rndm +QSUB8 .... 0110 0010 .... .... 1111 1111 .... @rndm + +SHADD16 .... 0110 0011 .... .... 1111 0001 .... @rndm +SHASX .... 0110 0011 .... .... 1111 0011 .... @rndm +SHSAX .... 0110 0011 .... .... 1111 0101 .... @rndm +SHSUB16 .... 0110 0011 .... .... 1111 0111 .... @rndm +SHADD8 .... 0110 0011 .... .... 1111 1001 .... @rndm +SHSUB8 .... 0110 0011 .... .... 1111 1111 .... @rndm + +UADD16 .... 0110 0101 .... .... 1111 0001 .... @rndm +UASX .... 0110 0101 .... .... 1111 0011 .... @rndm +USAX .... 0110 0101 .... .... 1111 0101 .... @rndm +USUB16 .... 0110 0101 .... .... 1111 0111 .... @rndm +UADD8 .... 0110 0101 .... .... 1111 1001 .... @rndm +USUB8 .... 0110 0101 .... .... 1111 1111 .... @rndm + +UQADD16 .... 0110 0110 .... .... 1111 0001 .... @rndm +UQASX .... 0110 0110 .... .... 1111 0011 .... @rndm +UQSAX .... 0110 0110 .... .... 1111 0101 .... @rndm +UQSUB16 .... 0110 0110 .... .... 1111 0111 .... @rndm +UQADD8 .... 0110 0110 .... .... 1111 1001 .... @rndm +UQSUB8 .... 0110 0110 .... .... 1111 1111 .... @rndm + +UHADD16 .... 0110 0111 .... .... 1111 0001 .... @rndm +UHASX .... 0110 0111 .... .... 1111 0011 .... @rndm +UHSAX .... 0110 0111 .... .... 1111 0101 .... @rndm +UHSUB16 .... 0110 0111 .... .... 1111 0111 .... @rndm +UHADD8 .... 0110 0111 .... .... 1111 1001 .... @rndm +UHSUB8 .... 0110 0111 .... .... 1111 1111 .... @rndm + +# Packing, unpacking, saturation, and reversal + +PKH ---- 0110 1000 rn:4 rd:4 imm:5 tb:1 01 rm:4 &pkh + +@sat ---- .... ... satimm:5 rd:4 imm:5 sh:1 .. rn:4 &sat +@sat16 ---- .... .... satimm:4 rd:4 .... .... rn:4 \ + &sat imm=0 sh=0 + +SSAT .... 0110 101. .... .... .... ..01 .... @sat +USAT .... 0110 111. .... .... .... ..01 .... @sat + +SSAT16 .... 0110 1010 .... .... 1111 0011 .... @sat16 +USAT16 .... 0110 1110 .... .... 1111 0011 .... @sat16 + +@rrr_rot ---- .... .... rn:4 rd:4 rot:2 ...... rm:4 &rrr_rot + +SXTAB16 .... 0110 1000 .... .... ..00 0111 .... @rrr_rot +SXTAB .... 0110 1010 .... .... ..00 0111 .... @rrr_rot +SXTAH .... 0110 1011 .... .... ..00 0111 .... @rrr_rot +UXTAB16 .... 0110 1100 .... .... ..00 0111 .... @rrr_rot +UXTAB .... 0110 1110 .... .... ..00 0111 .... @rrr_rot +UXTAH .... 0110 1111 .... .... ..00 0111 .... @rrr_rot + +SEL .... 0110 1000 .... .... 1111 1011 .... @rndm +REV .... 0110 1011 1111 .... 1111 0011 .... @rdm +REV16 .... 0110 1011 1111 .... 1111 1011 .... @rdm +REVSH .... 0110 1111 1111 .... 1111 1011 .... @rdm +RBIT .... 0110 1111 1111 .... 1111 0011 .... 
@rdm + +# Signed multiply, signed and unsigned divide + +@rdmn ---- .... .... rd:4 .... rm:4 .... rn:4 &rrr + +SMLAD .... 0111 0000 .... .... .... 0001 .... @rdamn +SMLADX .... 0111 0000 .... .... .... 0011 .... @rdamn +SMLSD .... 0111 0000 .... .... .... 0101 .... @rdamn +SMLSDX .... 0111 0000 .... .... .... 0111 .... @rdamn + +SDIV .... 0111 0001 .... 1111 .... 0001 .... @rdmn +UDIV .... 0111 0011 .... 1111 .... 0001 .... @rdmn + +SMLALD .... 0111 0100 .... .... .... 0001 .... @rdamn +SMLALDX .... 0111 0100 .... .... .... 0011 .... @rdamn +SMLSLD .... 0111 0100 .... .... .... 0101 .... @rdamn +SMLSLDX .... 0111 0100 .... .... .... 0111 .... @rdamn + +SMMLA .... 0111 0101 .... .... .... 0001 .... @rdamn +SMMLAR .... 0111 0101 .... .... .... 0011 .... @rdamn +SMMLS .... 0111 0101 .... .... .... 1101 .... @rdamn +SMMLSR .... 0111 0101 .... .... .... 1111 .... @rdamn + +# Block data transfer + +STM ---- 100 b:1 i:1 u:1 w:1 0 rn:4 list:16 &ldst_block +LDM_a32 ---- 100 b:1 i:1 u:1 w:1 1 rn:4 list:16 &ldst_block + +# Branch, branch with link + +%imm26 0:s24 !function=times_4 +@branch ---- .... ........................ &i imm=%imm26 + +B .... 1010 ........................ @branch +BL .... 1011 ........................ @branch + +# Coprocessor instructions + +# We decode MCR, MCR, MRRC and MCRR only, because for QEMU the +# other coprocessor instructions always UNDEF. +# The trans_ functions for these will ignore cp values 8..13 for v7 or +# earlier, and 0..13 for v8 and later, because those areas of the +# encoding space may be used for other things, such as VFP or Neon. + +@mcr ---- .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4 &mcr +@mcrr ---- .... .... rt2:4 rt:4 cp:4 opc1:4 crm:4 &mcrr + +MCRR .... 1100 0100 .... .... .... .... .... @mcrr +MRRC .... 1100 0101 .... .... .... .... .... @mcrr + +MCR .... 1110 ... 0 .... .... .... ... 1 .... @mcr +MRC .... 1110 ... 1 .... .... .... ... 1 .... @mcr + +# Supervisor call + +SVC ---- 1111 imm:24 &i diff --git a/target/arm/tcg/m-nocp.decode b/target/arm/tcg/m-nocp.decode new file mode 100644 index 0000000..b65c801 --- /dev/null +++ b/target/arm/tcg/m-nocp.decode @@ -0,0 +1,72 @@ +# M-profile UserFault.NOCP exception handling +# +# Copyright (c) 2020 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# +# For M-profile, the architecture specifies that NOCP UsageFaults +# should take precedence over UNDEF faults over the whole wide +# range of coprocessor-space encodings, with the exception of +# VLLDM and VLSTM. (Compare v8.1M IsCPInstruction() pseudocode and +# v8M Arm ARM rule R_QLGM.) This isn't mandatory for v8.0M but we choose +# to behave the same as v8.1M. +# This decode is handled before any others (and in particular before +# decoding FP instructions which are in the coprocessor space). 
+# If the coprocessor is not present or disabled then we will generate +# the NOCP exception; otherwise we let the insn through to the main decode. + +%vd_dp 22:1 12:4 +%vd_sp 12:4 22:1 + +&nocp cp + +# M-profile VLDR/VSTR to sysreg +%vldr_sysreg 22:1 13:3 +%imm7_0x4 0:7 !function=times_4 + +&vldr_sysreg rn reg imm a w p +@vldr_sysreg .... ... . a:1 . . . rn:4 ... . ... .. ....... \ + reg=%vldr_sysreg imm=%imm7_0x4 &vldr_sysreg + +{ + # Special cases which do not take an early NOCP: VLLDM and VLSTM + VLLDM_VLSTM 1110 1100 001 l:1 rn:4 0000 1010 op:1 000 0000 + # VSCCLRM (new in v8.1M) is similar: + VSCCLRM 1110 1100 1.01 1111 .... 1011 imm:7 0 vd=%vd_dp size=3 + VSCCLRM 1110 1100 1.01 1111 .... 1010 imm:8 vd=%vd_sp size=2 + + # FP system register accesses: these are a special case because accesses + # to FPCXT_NS succeed even if the FPU is disabled. We therefore need + # to handle them before the big NOCP blocks. Note that within these + # insns NOCP still has higher priority than UNDEFs; this is implemented + # by their returning 'false' for UNDEF so as to fall through into the + # NOCP check (in contrast to VLLDM etc, which call unallocated_encoding() + # for the UNDEFs there that must take precedence over NOCP.) + + VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 + + # P=0 W=0 is SEE "Related encodings", so split into two patterns + VLDR_sysreg ---- 110 1 . . w:1 1 .... ... 0 111 11 ....... @vldr_sysreg p=1 + VLDR_sysreg ---- 110 0 . . 1 1 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 + VSTR_sysreg ---- 110 1 . . w:1 0 .... ... 0 111 11 ....... @vldr_sysreg p=1 + VSTR_sysreg ---- 110 0 . . 1 0 .... ... 0 111 11 ....... @vldr_sysreg p=0 w=1 + + NOCP 111- 1110 ---- ---- ---- cp:4 ---- ---- &nocp + NOCP 111- 110- ---- ---- ---- cp:4 ---- ---- &nocp + # From v8.1M onwards this range will also NOCP: + NOCP_8_1 111- 1111 ---- ---- ---- ---- ---- ---- &nocp cp=10 +} diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build new file mode 100644 index 0000000..044561b --- /dev/null +++ b/target/arm/tcg/meson.build @@ -0,0 +1,32 @@ +gen = [ + decodetree.process('sve.decode', extra_args: '--decode=disas_sve'), + decodetree.process('sme.decode', extra_args: '--decode=disas_sme'), + decodetree.process('sme-fa64.decode', extra_args: '--static-decode=disas_sme_fa64'), + decodetree.process('neon-shared.decode', extra_args: '--decode=disas_neon_shared'), + decodetree.process('neon-dp.decode', extra_args: '--decode=disas_neon_dp'), + decodetree.process('neon-ls.decode', extra_args: '--decode=disas_neon_ls'), + decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'), + decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'), + decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'), + decodetree.process('mve.decode', extra_args: '--decode=disas_mve'), + decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'), + decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'), + decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'), + decodetree.process('t16.decode', extra_args: ['-w', '16', '--static-decode=disas_t16']), +] + +arm_ss.add(gen) + +arm_ss.add(files( + 'translate.c', + 'translate-m-nocp.c', + 'translate-mve.c', + 'translate-neon.c', + 'translate-vfp.c', +)) + +arm_ss.add(when: 'TARGET_AARCH64', if_true: files( + 'translate-a64.c', + 'translate-sve.c', + 'translate-sme.c', +)) diff --git a/target/arm/tcg/mve.decode b/target/arm/tcg/mve.decode new file mode 100644 
index 0000000..14a4f39 --- /dev/null +++ b/target/arm/tcg/mve.decode @@ -0,0 +1,832 @@ +# M-profile MVE instruction descriptions +# +# Copyright (c) 2021 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +%qd 22:1 13:3 +%qm 5:1 1:3 +%qn 7:1 17:3 + +# VQDMULL has size in bit 28: 0 for 16 bit, 1 for 32 bit +%size_28 28:1 !function=plus_1 + +# 2 operand fp insns have size in bit 20: 1 for 16 bit, 0 for 32 bit, +# like Neon FP insns. +%2op_fp_size 20:1 !function=neon_3same_fp_size +# VCADD is an exception, where bit 20 is 0 for 16 bit and 1 for 32 bit +%2op_fp_size_rev 20:1 !function=plus_1 +# FP scalars have size in bit 28, 1 for 16 bit, 0 for 32 bit +%2op_fp_scalar_size 28:1 !function=neon_3same_fp_size + +# 1imm format immediate +%imm_28_16_0 28:1 16:3 0:4 + +&vldr_vstr rn qd imm p a w size l u +&1op qd qm size +&2op qd qm qn size +&2scalar qd qn rm size +&1imm qd imm cmode op +&2shift qd qm shift size +&vidup qd rn size imm +&viwdup qd rn rm size imm +&vcmp qm qn size mask +&vcmp_scalar qn rm size mask +&shl_scalar qda rm size +&vmaxv qm rda size +&vabav qn qm rda size +&vldst_sg qd qm rn size msize os +&vldst_sg_imm qd qm a w imm +&vldst_il qd rn size pat w + +# scatter-gather memory size is in bits 6:4 +%sg_msize 6:1 4:1 + +@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0 +# Note that both Rn and Qd are 3 bits only (no D bit) +@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr + +@vldst_sg .... .... .... rn:4 .... ... size:2 ... ... os:1 &vldst_sg \ + qd=%qd qm=%qm msize=%sg_msize + +# Qm is in the fields usually labeled Qn +@vldst_sg_imm .... .... a:1 . w:1 . .... .... .... . imm:7 &vldst_sg_imm \ + qd=%qd qm=%qn + +# Deinterleaving load/interleaving store +@vldst_il .... .... .. w:1 . rn:4 .... ... size:2 pat:2 ..... &vldst_il \ + qd=%qd + +@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm +@1op_nosz .... .... .... .... .... .... .... .... &1op qd=%qd qm=%qm size=0 +@2op .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn +@2op_nosz .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn size=0 +@2op_sz28 .... .... .... .... .... .... .... .... &2op qd=%qd qm=%qm qn=%qn \ + size=%size_28 +@1imm .... .... .... .... .... cmode:4 .. op:1 . .... &1imm qd=%qd imm=%imm_28_16_0 + +# The _rev suffix indicates that Vn and Vm are reversed. This is +# the case for shifts. In the Arm ARM these insns are documented +# with the Vm and Vn fields in their usual places, but in the +# assembly the operands are listed "backwards", ie in the order +# Qd, Qm, Qn where other insns use Qd, Qn, Qm. For QEMU we choose +# to consider Vm and Vn as being in different fields in the insn. +# This gives us consistency with A64 and Neon. +@2op_rev .... .... .. size:2 .... .... .... .... .... &2op qd=%qd qm=%qn qn=%qm + +@2scalar .... .... .. size:2 .... .... .... .... 
rm:4 &2scalar qd=%qd qn=%qn +@2scalar_nosz .... .... .... .... .... .... .... rm:4 &2scalar qd=%qd qn=%qn + +@2_shl_b .... .... .. 001 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0 +@2_shl_h .... .... .. 01 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1 +@2_shl_w .... .... .. 1 shift:5 .... .... .... .... &2shift qd=%qd qm=%qm size=2 + +@2_shll_b .... .... ... 01 shift:3 .... .... .... .... &2shift qd=%qd qm=%qm size=0 +@2_shll_h .... .... ... 1 shift:4 .... .... .... .... &2shift qd=%qd qm=%qm size=1 +# VSHLL encoding T2 where shift == esize +@2_shll_esize_b .... .... .... 00 .. .... .... .... .... &2shift \ + qd=%qd qm=%qm size=0 shift=8 +@2_shll_esize_h .... .... .... 01 .. .... .... .... .... &2shift \ + qd=%qd qm=%qm size=1 shift=16 + +# Right shifts are encoded as N - shift, where N is the element size in bits. +%rshift_i5 16:5 !function=rsub_32 +%rshift_i4 16:4 !function=rsub_16 +%rshift_i3 16:3 !function=rsub_8 + +@2_shr_b .... .... .. 001 ... .... .... .... .... &2shift qd=%qd qm=%qm \ + size=0 shift=%rshift_i3 +@2_shr_h .... .... .. 01 .... .... .... .... .... &2shift qd=%qd qm=%qm \ + size=1 shift=%rshift_i4 +@2_shr_w .... .... .. 1 ..... .... .... .... .... &2shift qd=%qd qm=%qm \ + size=2 shift=%rshift_i5 + +@shl_scalar .... .... .... size:2 .. .... .... .... rm:4 &shl_scalar qda=%qd + +# Vector comparison; 4-bit Qm but 3-bit Qn +%mask_22_13 22:1 13:3 +@vcmp .... .... .. size:2 qn:3 . .... .... .... .... &vcmp qm=%qm mask=%mask_22_13 +@vcmp_scalar .... .... .. size:2 qn:3 . .... .... .... rm:4 &vcmp_scalar \ + mask=%mask_22_13 + +@vcmp_fp .... .... .... qn:3 . .... .... .... .... &vcmp \ + qm=%qm size=%2op_fp_scalar_size mask=%mask_22_13 + +# Bit 28 is a 2op_fp_scalar_size bit, but we do not decode it in this +# format to avoid complicated overlapping-instruction-groups +@vcmp_fp_scalar .... .... .... qn:3 . .... .... .... rm:4 &vcmp_scalar \ + mask=%mask_22_13 + +@vmaxv .... .... .... size:2 .. rda:4 .... .... .... &vmaxv qm=%qm + +@2op_fp .... .... .... .... .... .... .... .... &2op \ + qd=%qd qn=%qn qm=%qm size=%2op_fp_size + +@2op_fp_size_rev .... .... .... .... .... .... .... .... &2op \ + qd=%qd qn=%qn qm=%qm size=%2op_fp_size_rev + +# 2-operand, but Qd and Qn share a field. Size is in bit 28, but we +# don't decode it in this format +@vmaxnma .... .... .... .... .... .... .... .... &2op \ + qd=%qd qn=%qd qm=%qm + +# Here also we don't decode the bit 28 size in the format to avoid +# awkward nested overlap groups +@vmaxnmv .... .... .... .... rda:4 .... .... .... &vmaxv qm=%qm + +@2op_fp_scalar .... .... .... .... .... .... .... rm:4 &2scalar \ + qd=%qd qn=%qn size=%2op_fp_scalar_size + +# Vector loads and stores + +# Widening loads and narrowing stores: +# for these P=0 W=0 is 'related encoding'; sz=11 is 'related encoding' +# This means we need to expand out to multiple patterns for P, W, SZ. +# For stores the U bit must be 0 but we catch that in the trans_ function. +# The naming scheme here is "VLDSTB_H == in-memory byte load/store to/from +# signed halfword element in register", etc. +VLDSTB_H 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=0 w=1 size=1 +VLDSTB_H 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 01 ....... @vldst_wn \ + p=1 size=1 +VLDSTB_W 111 . 110 0 a:1 0 1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTB_W 111 . 110 1 a:1 0 w:1 . 0 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 +VLDSTH_W 111 . 110 0 a:1 0 1 . 1 ... ... 0 111 10 ....... @vldst_wn \ + p=0 w=1 size=2 +VLDSTH_W 111 . 110 1 a:1 0 w:1 . 
1 ... ... 0 111 10 ....... @vldst_wn \ + p=1 size=2 + +# Non-widening loads/stores (P=0 W=0 is 'related encoding') +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=0 w=1 +VLDR_VSTR 1110110 0 a:1 . 1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=0 w=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111100 ....... @vldr_vstr \ + size=0 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \ + size=1 p=1 +VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \ + size=2 p=1 + +# gather loads/scatter stores +VLDR_S_sg 111 0 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg +VLDR_U_sg 111 1 1100 1 . 01 .... ... 0 111 . .... .... @vldst_sg +VSTR_sg 111 0 1100 1 . 00 .... ... 0 111 . .... .... @vldst_sg + +VLDRW_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1110 .... .... @vldst_sg_imm +VLDRD_sg_imm 111 1 1101 ... 1 ... 0 ... 1 1111 .... .... @vldst_sg_imm +VSTRW_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1110 .... .... @vldst_sg_imm +VSTRD_sg_imm 111 1 1101 ... 0 ... 0 ... 1 1111 .... .... @vldst_sg_imm + +# deinterleaving loads/interleaving stores +VLD2 1111 1100 1 .. 1 .... ... 1 111 .. .. 00000 @vldst_il +VLD4 1111 1100 1 .. 1 .... ... 1 111 .. .. 00001 @vldst_il +VST2 1111 1100 1 .. 0 .... ... 1 111 .. .. 00000 @vldst_il +VST4 1111 1100 1 .. 0 .... ... 1 111 .. .. 00001 @vldst_il + +# Moves between 2 32-bit vector lanes and 2 general purpose registers +VMOV_to_2gp 1110 1100 0 . 00 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd +VMOV_from_2gp 1110 1100 0 . 01 rt2:4 ... 0 1111 000 idx:1 rt:4 qd=%qd + +# Vector 2-op +VAND 1110 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VBIC 1110 1111 0 . 01 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORR 1110 1111 0 . 10 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VORN 1110 1111 0 . 11 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz +VEOR 1111 1111 0 . 00 ... 0 ... 0 0001 . 1 . 1 ... 0 @2op_nosz + +VADD 1110 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VSUB 1111 1111 0 . .. ... 0 ... 0 1000 . 1 . 0 ... 0 @2op +VMUL 1110 1111 0 . .. ... 0 ... 0 1001 . 1 . 1 ... 0 @2op + +# The VSHLL T2 encoding is not a @2op pattern, but is here because it +# overlaps what would be size=0b11 VMULH/VRMULH +{ + VCVTB_SH 111 0 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz + + VMAXNMA 111 0 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=2 + + VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_BS 111 0 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VQMOVUNB 111 0 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op + VQMOVN_BS 111 0 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op + + VMAXA 111 0 1110 0 . 11 .. 11 ... 0 1110 1 0 . 0 ... 1 @1op + + VMULH_S 111 0 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +} + +{ + VCVTB_HS 111 1 1110 0 . 11 1111 ... 0 1110 0 0 . 0 ... 1 @1op_nosz + + VMAXNMA 111 1 1110 0 . 11 1111 ... 0 1110 1 0 . 0 ... 1 @vmaxnma size=1 + + VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_BU 111 1 1110 0 . 11 .. 01 ... 0 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VMOVNB 111 1 1110 0 . 11 .. 01 ... 0 1110 1 0 . 0 ... 1 @1op + VQMOVN_BU 111 1 1110 0 . 11 .. 11 ... 0 1110 0 0 . 0 ... 1 @1op + + VMULH_U 111 1 1110 0 . .. ...1 ... 0 1110 . 0 . 0 ... 1 @2op +} + +{ + VCVTT_SH 111 0 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz + + VMINNMA 111 0 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=2 + VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 
1 @2_shll_esize_b + VSHLL_TS 111 0 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VQMOVUNT 111 0 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op + VQMOVN_TS 111 0 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op + + VMINA 111 0 1110 0 . 11 .. 11 ... 1 1110 1 0 . 0 ... 1 @1op + + VRMULH_S 111 0 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +} + +{ + VCVTT_HS 111 1 1110 0 . 11 1111 ... 1 1110 0 0 . 0 ... 1 @1op_nosz + + VMINNMA 111 1 1110 0 . 11 1111 ... 1 1110 1 0 . 0 ... 1 @vmaxnma size=1 + VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_b + VSHLL_TU 111 1 1110 0 . 11 .. 01 ... 1 1110 0 0 . 0 ... 1 @2_shll_esize_h + + VMOVNT 111 1 1110 0 . 11 .. 01 ... 1 1110 1 0 . 0 ... 1 @1op + VQMOVN_TU 111 1 1110 0 . 11 .. 11 ... 1 1110 0 0 . 0 ... 1 @1op + + VRMULH_U 111 1 1110 0 . .. ...1 ... 1 1110 . 0 . 0 ... 1 @2op +} + +VMAX_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMAX_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 0 ... 0 @2op +VMIN_S 111 0 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op +VMIN_U 111 1 1111 0 . .. ... 0 ... 0 0110 . 1 . 1 ... 0 @2op + +VABD_S 111 0 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op +VABD_U 111 1 1111 0 . .. ... 0 ... 0 0111 . 1 . 0 ... 0 @2op + +VHADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 0 ... 0 @2op +VHSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op +VHSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 0 ... 0 @2op + +{ + VMULLP_B 111 . 1110 0 . 11 ... 1 ... 0 1110 . 0 . 0 ... 0 @2op_sz28 + VMULL_BS 111 0 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op + VMULL_BU 111 1 1110 0 . .. ... 1 ... 0 1110 . 0 . 0 ... 0 @2op +} +{ + VMULLP_T 111 . 1110 0 . 11 ... 1 ... 1 1110 . 0 . 0 ... 0 @2op_sz28 + VMULL_TS 111 0 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op + VMULL_TU 111 1 1110 0 . .. ... 1 ... 1 1110 . 0 . 0 ... 0 @2op +} + +VQDMULH 1110 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op +VQRDMULH 1111 1111 0 . .. ... 0 ... 0 1011 . 1 . 0 ... 0 @2op + +VQADD_S 111 0 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQADD_U 111 1 1111 0 . .. ... 0 ... 0 0000 . 1 . 1 ... 0 @2op +VQSUB_S 111 0 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op +VQSUB_U 111 1 1111 0 . .. ... 0 ... 0 0010 . 1 . 1 ... 0 @2op + +VSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev +VSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 0 ... 0 @2op_rev + +VRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev +VRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 0 ... 0 @2op_rev + +VQSHL_S 111 0 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev +VQSHL_U 111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev + +VQRSHL_S 111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev +VQRSHL_U 111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev + +{ + VCMUL0 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 0 @2op_sz28 + VQDMLADH 1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op + VQDMLSDH 1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op +} + +{ + VCMUL180 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 0 @2op_sz28 + VQDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op + VQDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op +} + +{ + VCMUL90 111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 1 @2op_sz28 + VQRDMLADH 111 0 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op + VQRDMLSDH 111 1 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op +} + +{ + VCMUL270 111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 
1 @2op_sz28 + VQRDMLADHX 111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op + VQRDMLSDHX 111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op +} + +VQDMULLB 111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28 +VQDMULLT 111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28 + +VRHADD_S 111 0 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op +VRHADD_U 111 1 1111 0 . .. ... 0 ... 0 0001 . 1 . 0 ... 0 @2op + +{ + VADC 1110 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VADCI 1110 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VHCADD90 1110 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VHCADD270 1110 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} + +{ + VSBC 1111 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 0 @2op_nosz + VSBCI 1111 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 0 @2op_nosz + VCADD90 1111 1110 0 . .. ... 0 ... 0 1111 . 0 . 0 ... 0 @2op + VCADD270 1111 1110 0 . .. ... 0 ... 1 1111 . 0 . 0 ... 0 @2op +} + +# Vector miscellaneous + +VCLS 1111 1111 1 . 11 .. 00 ... 0 0100 01 . 0 ... 0 @1op +VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op + +VREV16 1111 1111 1 . 11 .. 00 ... 0 0001 01 . 0 ... 0 @1op +VREV32 1111 1111 1 . 11 .. 00 ... 0 0000 11 . 0 ... 0 @1op +VREV64 1111 1111 1 . 11 .. 00 ... 0 0000 01 . 0 ... 0 @1op + +VMVN 1111 1111 1 . 11 00 00 ... 0 0101 11 . 0 ... 0 @1op_nosz + +VABS 1111 1111 1 . 11 .. 01 ... 0 0011 01 . 0 ... 0 @1op +VABS_fp 1111 1111 1 . 11 .. 01 ... 0 0111 01 . 0 ... 0 @1op +VNEG 1111 1111 1 . 11 .. 01 ... 0 0011 11 . 0 ... 0 @1op +VNEG_fp 1111 1111 1 . 11 .. 01 ... 0 0111 11 . 0 ... 0 @1op + +VQABS 1111 1111 1 . 11 .. 00 ... 0 0111 01 . 0 ... 0 @1op +VQNEG 1111 1111 1 . 11 .. 00 ... 0 0111 11 . 0 ... 0 @1op + +&vdup qd rt size +# Qd is in the fields usually named Qn +@vdup .... .... . . .. ... . rt:4 .... . . . . .... qd=%qn &vdup + +# B and E bits encode size, which we decode here to the usual size values +VDUP 1110 1110 1 1 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=0 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 1 1 0000 @vdup size=1 +VDUP 1110 1110 1 0 10 ... 0 .... 1011 . 0 0 1 0000 @vdup size=2 + +# Incrementing and decrementing dup + +# VIDUP, VDDUP format immediate: 1 << (immh:imml) +%imm_vidup 7:1 0:1 !function=vidup_imm + +# VIDUP, VDDUP registers: Rm bits [3:1] from insn, bit 0 is 1; +# Rn bits [3:1] from insn, bit 0 is 0 +%vidup_rm 1:3 !function=times_2_plus_1 +%vidup_rn 17:3 !function=times_2 + +@vidup .... .... . . size:2 .... .... .... .... .... \ + qd=%qd imm=%imm_vidup rn=%vidup_rn &vidup +@viwdup .... .... . . size:2 .... .... .... .... .... \ + qd=%qd imm=%imm_vidup rm=%vidup_rm rn=%vidup_rn &viwdup +{ + VIDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 111 . @vidup + VIWDUP 1110 1110 0 . .. ... 1 ... 0 1111 . 110 ... . @viwdup +} +{ + VCMPGT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=2 + VCMPLE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=2 + VDDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 111 . @vidup + VDWDUP 1110 1110 0 . .. ... 1 ... 1 1111 . 110 ... . @viwdup +} + +# multiply-add long dual accumulate +# rdahi: bits [3:1] from insn, bit 0 is 1 +# rdalo: bits [3:1] from insn, bit 0 is 0 +%rdahi 20:3 !function=times_2_plus_1 +%rdalo 13:3 !function=times_2 +# size bit is 0 for 16 bit, 1 for 32 bit +%size_16 16:1 !function=plus_1 + +&vmlaldav rdahi rdalo size qn qm x a +&vmladav rda size qn qm x a + +@vmlaldav .... .... . ... ... . ... x:1 .... .. a:1 . qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=%size_16 &vmlaldav +@vmlaldav_nosz .... .... . 
... ... . ... x:1 .... .. a:1 . qm:3 . \ + qn=%qn rdahi=%rdahi rdalo=%rdalo size=0 &vmlaldav +@vmladav .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \ + qn=%qn rda=%rdalo size=%size_16 &vmladav +@vmladav_nosz .... .... .... ... . ... x:1 .... . . a:1 . qm:3 . \ + qn=%qn rda=%rdalo size=0 &vmladav + +{ + VMLADAV_S 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav + VMLALDAV_S 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav +} +{ + VMLADAV_U 1111 1110 1111 ... . ... . 1110 . 0 . 0 ... 0 @vmladav + VMLALDAV_U 1111 1110 1 ... ... . ... . 1110 . 0 . 0 ... 0 @vmlaldav +} + +{ + VMLSDAV 1110 1110 1111 ... . ... . 1110 . 0 . 0 ... 1 @vmladav + VMLSLDAV 1110 1110 1 ... ... . ... . 1110 . 0 . 0 ... 1 @vmlaldav +} + +{ + VMLSDAV 1111 1110 1111 ... 0 ... . 1110 . 0 . 0 ... 1 @vmladav_nosz + VRMLSLDAVH 1111 1110 1 ... ... 0 ... . 1110 . 0 . 0 ... 1 @vmlaldav_nosz +} + +VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz +VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 1 @vmladav_nosz + +{ + [ + VMAXNMAV 1110 1110 1110 11 00 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=2 + VMINNMAV 1110 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2 + VMAXNMV 1110 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=2 + VMINNMV 1110 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=2 + ] + [ + VMAXV_S 1110 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv + VMINV_S 1110 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv + VMAXAV 1110 1110 1110 .. 00 .... 1111 0 0 . 0 ... 0 @vmaxv + VMINAV 1110 1110 1110 .. 00 .... 1111 1 0 . 0 ... 0 @vmaxv + ] + VMLADAV_S 1110 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz + VRMLALDAVH_S 1110 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz +} + +{ + [ + VMAXNMAV 1111 1110 1110 11 00 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1 + VMINNMAV 1111 1110 1110 11 00 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1 + VMAXNMV 1111 1110 1110 11 10 .... 1111 0 0 . 0 ... 0 @vmaxnmv size=1 + VMINNMV 1111 1110 1110 11 10 .... 1111 1 0 . 0 ... 0 @vmaxnmv size=1 + ] + [ + VMAXV_U 1111 1110 1110 .. 10 .... 1111 0 0 . 0 ... 0 @vmaxv + VMINV_U 1111 1110 1110 .. 10 .... 1111 1 0 . 0 ... 0 @vmaxv + ] + VMLADAV_U 1111 1110 1111 ... 0 ... . 1111 . 0 . 0 ... 0 @vmladav_nosz + VRMLALDAVH_U 1111 1110 1 ... ... 0 ... . 1111 . 0 . 0 ... 0 @vmlaldav_nosz +} + +# Scalar operations + +{ + VCMPEQ_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=2 + VCMPNE_fp_scalar 1110 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=2 + VADD_scalar 1110 1110 0 . .. ... 1 ... 0 1111 . 100 .... @2scalar +} + +{ + VCMPLT_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=2 + VCMPGE_fp_scalar 1110 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=2 + VSUB_scalar 1110 1110 0 . .. ... 1 ... 1 1111 . 100 .... @2scalar +} + +{ + VSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar + VRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar + VQSHL_S_scalar 1110 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar + VQRSHL_S_scalar 1110 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar + VMUL_scalar 1110 1110 0 . .. ... 1 ... 1 1110 . 110 .... @2scalar +} + +{ + VSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 0110 .... @shl_scalar + VRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 0110 .... @shl_scalar + VQSHL_U_scalar 1111 1110 0 . 11 .. 01 ... 1 1110 1110 .... @shl_scalar + VQRSHL_U_scalar 1111 1110 0 . 11 .. 11 ... 1 1110 1110 .... @shl_scalar + VBRSR 1111 1110 0 . .. ... 1 ... 1 1110 . 110 .... 
@2scalar +} + +{ + VADD_fp_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 100 .... @2op_fp_scalar + VHADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar + VHADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 100 .... @2scalar +} + +{ + VSUB_fp_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 100 .... @2op_fp_scalar + VHSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar + VHSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 100 .... @2scalar +} + +{ + VQADD_S_scalar 1110 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQADD_U_scalar 1111 1110 0 . .. ... 0 ... 0 1111 . 110 .... @2scalar + VQDMULLB_scalar 111 . 1110 0 . 11 ... 0 ... 0 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + +{ + VQSUB_S_scalar 1110 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQSUB_U_scalar 1111 1110 0 . .. ... 0 ... 1 1111 . 110 .... @2scalar + VQDMULLT_scalar 111 . 1110 0 . 11 ... 0 ... 1 1111 . 110 .... @2scalar_nosz \ + size=%size_28 +} + +{ + VMUL_fp_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 110 .... @2op_fp_scalar + VQDMULH_scalar 1110 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar + VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar +} + +{ + VFMA_scalar 111 . 1110 0 . 11 ... 1 ... 0 1110 . 100 .... @2op_fp_scalar + # The U bit (28) is don't-care because it does not affect the result + VMLA 111 - 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar +} + +{ + VFMAS_scalar 111 . 1110 0 . 11 ... 1 ... 1 1110 . 100 .... @2op_fp_scalar + # The U bit (28) is don't-care because it does not affect the result + VMLAS 111 - 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar +} + +VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar +VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar +VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar +VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar + +# Vector add across vector +{ + VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo + VADDLV 111 u:1 1110 1 ... 1001 ... 0 1111 00 a:1 0 qm:3 0 \ + rdahi=%rdahi rdalo=%rdalo +} + +@vabav .... .... .. size:2 .... rda:4 .... .... .... &vabav qn=%qn qm=%qm + +VABAV_S 111 0 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav +VABAV_U 111 1 1110 10 .. ... 0 .... 1111 . 0 . 0 ... 1 @vabav + +# Logical immediate operations (1 reg and modified-immediate) + +# The cmode/op bits here decode VORR/VBIC/VMOV/VMVN, but +# not in a way we can conveniently represent in decodetree without +# a lot of repetition: +# VORR: op=0, (cmode & 1) && cmode < 12 +# VBIC: op=1, (cmode & 1) && cmode < 12 +# VMOV: everything else +# So we have a single decode line and check the cmode/op in the +# trans function. +Vimm_1r 111 . 1111 1 . 00 0 ... ... 0 .... 0 1 . 1 .... @1imm + +# Shifts by immediate + +VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b +VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h +VSHLI 111 0 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w + +VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b +VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h +VQSHLI_S 111 0 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w + +VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_b +VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_h +VQSHLI_U 111 1 1111 1 . ... ... ... 0 0111 0 1 . 1 ... 0 @2_shl_w + +VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_b +VQSHLUI 111 1 1111 1 . ... ... ... 0 0110 0 1 . 1 ... 0 @2_shl_h +VQSHLUI 111 1 1111 1 . ... ... ... 
0 0110 0 1 . 1 ... 0 @2_shl_w + +VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b +VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h +VSHRI_S 111 0 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w + +VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_b +VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_h +VSHRI_U 111 1 1111 1 . ... ... ... 0 0000 0 1 . 1 ... 0 @2_shr_w + +VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b +VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h +VRSHRI_S 111 0 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w + +VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_b +VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_h +VRSHRI_U 111 1 1111 1 . ... ... ... 0 0010 0 1 . 1 ... 0 @2_shr_w + +# VSHLL T1 encoding; the T2 VSHLL encoding is elsewhere in this file +# Note that VMOVL is encoded as "VSHLL with a zero shift count"; we +# implement it that way rather than special-casing it in the decode. +VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_BS 111 0 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h + +VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_BU 111 1 1110 1 . 1 .. ... ... 0 1111 0 1 . 0 ... 0 @2_shll_h + +VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_TS 111 0 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h + +VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_b +VSHLL_TU 111 1 1110 1 . 1 .. ... ... 1 1111 0 1 . 0 ... 0 @2_shll_h + +# Shift-and-insert +VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_b +VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_h +VSRI 111 1 1111 1 . ... ... ... 0 0100 0 1 . 1 ... 0 @2_shr_w + +VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_b +VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_h +VSLI 111 1 1111 1 . ... ... ... 0 0101 0 1 . 1 ... 0 @2_shl_w + +# Narrowing shifts (which only support b and h sizes) +VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b +VSHRNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h +VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b +VSHRNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h + +VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_b +VRSHRNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 1 @2_shr_h +VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_b +VRSHRNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 1 @2_shr_h + +VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h +VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h +VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 0 @2_shr_h +VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_b +VQSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 0 @2_shr_h + +VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b +VQSHRUNB 111 0 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h +VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b +VQSHRUNT 111 0 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h + +VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNB_S 111 0 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 
1 @2_shr_h +VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNT_S 111 0 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h +VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNB_U 111 1 1110 1 . ... ... ... 0 1111 0 1 . 0 ... 1 @2_shr_h +VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_b +VQRSHRNT_U 111 1 1110 1 . ... ... ... 1 1111 0 1 . 0 ... 1 @2_shr_h + +VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_b +VQRSHRUNB 111 1 1110 1 . ... ... ... 0 1111 1 1 . 0 ... 0 @2_shr_h +VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_b +VQRSHRUNT 111 1 1110 1 . ... ... ... 1 1111 1 1 . 0 ... 0 @2_shr_h + +VSHLC 111 0 1110 1 . 1 imm:5 ... 0 1111 1100 rdm:4 qd=%qd + +# Comparisons. We expand out the conditions which are split across +# encodings T1, T2, T3 and the fc bits. These include VPT, which is +# effectively "VCMP then VPST". A plain "VCMP" has a mask field of zero. +{ + VCMPEQ_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp_fp + VCMPEQ 111 1 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 0 @vcmp +} + +{ + VCMPNE_fp 111 . 1110 0 . 11 ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp_fp + VCMPNE 111 1 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 0 @vcmp +} + +{ + VCMPGE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp_fp + VCMPGE 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 0 @vcmp +} + +{ + VCMPLT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp_fp + VCMPLT 111 1 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 0 @vcmp +} + +{ + VCMPGT_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp_fp + VCMPGT 111 1 1110 0 . .. ... 1 ... 1 1111 0 0 . 0 ... 1 @vcmp +} + +{ + VCMPLE_fp 111 . 1110 0 . 11 ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp_fp + VCMPLE 1111 1110 0 . .. ... 1 ... 1 1111 1 0 . 0 ... 1 @vcmp +} + +{ + VPSEL 1111 1110 0 . 11 ... 1 ... 0 1111 . 0 . 0 ... 1 @2op_nosz + VCMPCS 1111 1110 0 . .. ... 1 ... 0 1111 0 0 . 0 ... 1 @vcmp + VCMPHI 1111 1110 0 . .. ... 1 ... 0 1111 1 0 . 0 ... 1 @vcmp +} + +{ + VPNOT 1111 1110 0 0 11 000 1 000 0 1111 0100 1101 + VPST 1111 1110 0 . 11 000 1 ... 0 1111 0100 1101 mask=%mask_22_13 + VCMPEQ_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 0100 .... @vcmp_fp_scalar size=1 + VCMPEQ_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0100 .... @vcmp_scalar +} + +{ + VCMPNE_fp_scalar 1111 1110 0 . 11 ... 1 ... 0 1111 1100 .... @vcmp_fp_scalar size=1 + VCMPNE_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1100 .... @vcmp_scalar +} + +{ + VCMPGT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0110 .... @vcmp_fp_scalar size=1 + VCMPGT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0110 .... @vcmp_scalar +} + +{ + VCMPLE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1110 .... @vcmp_fp_scalar size=1 + VCMPLE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1110 .... @vcmp_scalar +} + +{ + VCMPGE_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 0100 .... @vcmp_fp_scalar size=1 + VCMPGE_scalar 1111 1110 0 . .. ... 1 ... 1 1111 0100 .... @vcmp_scalar +} +{ + VCMPLT_fp_scalar 1111 1110 0 . 11 ... 1 ... 1 1111 1100 .... @vcmp_fp_scalar size=1 + VCMPLT_scalar 1111 1110 0 . .. ... 1 ... 1 1111 1100 .... @vcmp_scalar +} + +VCMPCS_scalar 1111 1110 0 . .. ... 1 ... 0 1111 0 1 1 0 .... @vcmp_scalar +VCMPHI_scalar 1111 1110 0 . .. ... 1 ... 0 1111 1 1 1 0 .... @vcmp_scalar + +# 2-operand FP +VADD_fp 1110 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp +VSUB_fp 1110 1111 0 . 1 . ... 0 ... 0 1101 . 1 . 0 ... 0 @2op_fp +VMUL_fp 1111 1111 0 . 0 . ... 0 ... 0 1101 . 1 . 1 ... 0 @2op_fp +VABD_fp 1111 1111 0 . 1 . ... 0 ... 
0 1101 . 1 . 0 ... 0 @2op_fp + +VMAXNM 1111 1111 0 . 0 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp +VMINNM 1111 1111 0 . 1 . ... 0 ... 0 1111 . 1 . 1 ... 0 @2op_fp + +VCADD90_fp 1111 1100 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCADD270_fp 1111 1101 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev + +VFMA 1110 1111 0 . 0 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp +VFMS 1110 1111 0 . 1 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp + +VCMLA0 1111 110 00 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCMLA90 1111 110 01 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCMLA180 1111 110 10 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev +VCMLA270 1111 110 11 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev + +# floating-point <-> fixed-point conversions. Naming convention: +# VCVT_, S = signed int, U = unsigned int, H = halfprec, F = singleprec +@vcvt .... .... .. 1 ..... .... .. 1 . .... .... &2shift \ + qd=%qd qm=%qm shift=%rshift_i5 size=2 +@vcvt_f16 .... .... .. 11 .... .... .. 0 . .... .... &2shift \ + qd=%qd qm=%qm shift=%rshift_i4 size=1 + +VCVT_SH_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16 +VCVT_UH_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt_f16 + +VCVT_HS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16 +VCVT_HU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt_f16 + +VCVT_SF_fixed 1110 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt +VCVT_UF_fixed 1111 1111 1 . ...... ... 0 11 . 0 01 . 1 ... 0 @vcvt + +VCVT_FS_fixed 1110 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt +VCVT_FU_fixed 1111 1111 1 . ...... ... 0 11 . 1 01 . 1 ... 0 @vcvt + +# VCVT between floating point and integer (halfprec and single); +# VCVT_, S = signed int, U = unsigned int, F = float +VCVT_SF 1111 1111 1 . 11 .. 11 ... 0 011 00 1 . 0 ... 0 @1op +VCVT_UF 1111 1111 1 . 11 .. 11 ... 0 011 01 1 . 0 ... 0 @1op +VCVT_FS 1111 1111 1 . 11 .. 11 ... 0 011 10 1 . 0 ... 0 @1op +VCVT_FU 1111 1111 1 . 11 .. 11 ... 0 011 11 1 . 0 ... 0 @1op + +# VCVT from floating point to integer with specified rounding mode +VCVTAS 1111 1111 1 . 11 .. 11 ... 000 00 0 1 . 0 ... 0 @1op +VCVTAU 1111 1111 1 . 11 .. 11 ... 000 00 1 1 . 0 ... 0 @1op +VCVTNS 1111 1111 1 . 11 .. 11 ... 000 01 0 1 . 0 ... 0 @1op +VCVTNU 1111 1111 1 . 11 .. 11 ... 000 01 1 1 . 0 ... 0 @1op +VCVTPS 1111 1111 1 . 11 .. 11 ... 000 10 0 1 . 0 ... 0 @1op +VCVTPU 1111 1111 1 . 11 .. 11 ... 000 10 1 1 . 0 ... 0 @1op +VCVTMS 1111 1111 1 . 11 .. 11 ... 000 11 0 1 . 0 ... 0 @1op +VCVTMU 1111 1111 1 . 11 .. 11 ... 000 11 1 1 . 0 ... 0 @1op + +VRINTN 1111 1111 1 . 11 .. 10 ... 001 000 1 . 0 ... 0 @1op +VRINTX 1111 1111 1 . 11 .. 10 ... 001 001 1 . 0 ... 0 @1op +VRINTA 1111 1111 1 . 11 .. 10 ... 001 010 1 . 0 ... 0 @1op +VRINTZ 1111 1111 1 . 11 .. 10 ... 001 011 1 . 0 ... 0 @1op +VRINTM 1111 1111 1 . 11 .. 10 ... 001 101 1 . 0 ... 0 @1op +VRINTP 1111 1111 1 . 11 .. 10 ... 001 111 1 . 0 ... 0 @1op diff --git a/target/arm/tcg/neon-dp.decode b/target/arm/tcg/neon-dp.decode new file mode 100644 index 0000000..fd3a01b --- /dev/null +++ b/target/arm/tcg/neon-dp.decode @@ -0,0 +1,646 @@ +# AArch32 Neon data-processing instruction descriptions +# +# Copyright (c) 2020 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# +# VFP/Neon register fields; same as vfp.decode +%vm_dp 5:1 0:4 +%vn_dp 7:1 16:4 +%vd_dp 22:1 12:4 + +# Encodings for Neon data processing instructions where the T32 encoding +# is a simple transformation of the A32 encoding. +# More specifically, this file covers instructions where the A32 encoding is +# 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq +# and the T32 encoding is +# 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq +# This file works on the A32 encoding only; calling code for T32 has to +# transform the insn into the A32 version first. + +###################################################################### +# 3-reg-same grouping: +# 1111 001 U 0 D sz:2 Vn:4 Vd:4 opc:4 N Q M op Vm:4 +###################################################################### + +&3same vm vn vd q size + +@3same .... ... . . . size:2 .... .... .... . q:1 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp + +@3same_q0 .... ... . . . size:2 .... .... .... . 0 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 + +# For FP insns the high bit of 'size' is used as part of opcode decode, +# and the 'size' bit is 0 for 32-bit float and 1 for 16-bit float. +# This converts this encoding to the same MO_8/16/32/64 values that the +# integer neon insns use. +%3same_fp_size 20:1 !function=neon_3same_fp_size + +@3same_fp .... ... . . . . . .... .... .... . q:1 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%3same_fp_size +@3same_fp_q0 .... ... . . . . . .... .... .... . 0 . . .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 size=%3same_fp_size + +VHADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 0 .... @3same +VHADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 0 .... @3same +VQADD_S_3s 1111 001 0 0 . .. .... .... 0000 . . . 1 .... @3same +VQADD_U_3s 1111 001 1 0 . .. .... .... 0000 . . . 1 .... @3same + +VRHADD_S_3s 1111 001 0 0 . .. .... .... 0001 . . . 0 .... @3same +VRHADD_U_3s 1111 001 1 0 . .. .... .... 0001 . . . 0 .... @3same + +@3same_logic .... ... . . . .. .... .... .... . q:1 .. .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 + +VAND_3s 1111 001 0 0 . 00 .... .... 0001 ... 1 .... @3same_logic +VBIC_3s 1111 001 0 0 . 01 .... .... 0001 ... 1 .... @3same_logic +VORR_3s 1111 001 0 0 . 10 .... .... 0001 ... 1 .... @3same_logic +VORN_3s 1111 001 0 0 . 11 .... .... 0001 ... 1 .... @3same_logic +VEOR_3s 1111 001 1 0 . 00 .... .... 0001 ... 1 .... @3same_logic +VBSL_3s 1111 001 1 0 . 01 .... .... 0001 ... 1 .... @3same_logic +VBIT_3s 1111 001 1 0 . 10 .... .... 0001 ... 1 .... @3same_logic +VBIF_3s 1111 001 1 0 . 11 .... .... 0001 ... 1 .... @3same_logic + +VHSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 0 .... @3same +VHSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 0 .... @3same + +VQSUB_S_3s 1111 001 0 0 . .. .... .... 0010 . . . 1 .... @3same +VQSUB_U_3s 1111 001 1 0 . .. .... .... 0010 . . . 1 .... @3same + +VCGT_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 0 .... @3same +VCGT_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 0 .... @3same +VCGE_S_3s 1111 001 0 0 . .. .... .... 0011 . . . 1 .... @3same +VCGE_U_3s 1111 001 1 0 . .. .... .... 0011 . . . 1 .... 
@3same + +# The _rev suffix indicates that Vn and Vm are reversed. This is +# the case for shifts. In the Arm ARM these insns are documented +# with the Vm and Vn fields in their usual places, but in the +# assembly the operands are listed "backwards", ie in the order +# Dd, Dm, Dn where other insns use Dd, Dn, Dm. For QEMU we choose +# to consider Vm and Vn as being in different fields in the insn, +# which allows us to avoid special-casing shifts in the trans_ +# function code. We would otherwise need to manually swap the operands +# over to call Neon helper functions that are shared with AArch64, +# which does not have this odd reversed-operand situation. +@3same_rev .... ... . . . size:2 .... .... .... . q:1 . . .... \ + &3same vn=%vm_dp vm=%vn_dp vd=%vd_dp + +VSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 0 .... @3same_rev +VSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 0 .... @3same_rev + +# Insns operating on 64-bit elements (size!=0b11 handled elsewhere) +# The _rev suffix indicates that Vn and Vm are reversed (as explained +# by the comment for the @3same_rev format). +@3same_64_rev .... ... . . . 11 .... .... .... . q:1 . . .... \ + &3same vm=%vn_dp vn=%vm_dp vd=%vd_dp size=3 + +{ + VQSHL_S64_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev + VQSHL_S_3s 1111 001 0 0 . .. .... .... 0100 . . . 1 .... @3same_rev +} +{ + VQSHL_U64_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_64_rev + VQSHL_U_3s 1111 001 1 0 . .. .... .... 0100 . . . 1 .... @3same_rev +} +{ + VRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev + VRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 0 .... @3same_rev +} +{ + VRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_64_rev + VRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 0 .... @3same_rev +} +{ + VQRSHL_S64_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev + VQRSHL_S_3s 1111 001 0 0 . .. .... .... 0101 . . . 1 .... @3same_rev +} +{ + VQRSHL_U64_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_64_rev + VQRSHL_U_3s 1111 001 1 0 . .. .... .... 0101 . . . 1 .... @3same_rev +} + +VMAX_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 0 .... @3same +VMAX_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 0 .... @3same +VMIN_S_3s 1111 001 0 0 . .. .... .... 0110 . . . 1 .... @3same +VMIN_U_3s 1111 001 1 0 . .. .... .... 0110 . . . 1 .... @3same + +VABD_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 0 .... @3same +VABD_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 0 .... @3same + +VABA_S_3s 1111 001 0 0 . .. .... .... 0111 . . . 1 .... @3same +VABA_U_3s 1111 001 1 0 . .. .... .... 0111 . . . 1 .... @3same + +VADD_3s 1111 001 0 0 . .. .... .... 1000 . . . 0 .... @3same +VSUB_3s 1111 001 1 0 . .. .... .... 1000 . . . 0 .... @3same + +VTST_3s 1111 001 0 0 . .. .... .... 1000 . . . 1 .... @3same +VCEQ_3s 1111 001 1 0 . .. .... .... 1000 . . . 1 .... @3same + +VMLA_3s 1111 001 0 0 . .. .... .... 1001 . . . 0 .... @3same +VMLS_3s 1111 001 1 0 . .. .... .... 1001 . . . 0 .... @3same + +VMUL_3s 1111 001 0 0 . .. .... .... 1001 . . . 1 .... @3same +VMUL_p_3s 1111 001 1 0 . .. .... .... 1001 . . . 1 .... @3same + +VPMAX_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 0 .... @3same_q0 +VPMAX_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 0 .... @3same_q0 + +VPMIN_S_3s 1111 001 0 0 . .. .... .... 1010 . . . 1 .... @3same_q0 +VPMIN_U_3s 1111 001 1 0 . .. .... .... 1010 . . . 1 .... @3same_q0 + +VQDMULH_3s 1111 001 0 0 . .. .... .... 1011 . . . 0 .... @3same +VQRDMULH_3s 1111 001 1 0 . .. .... .... 1011 . . . 0 .... 
@3same + +VPADD_3s 1111 001 0 0 . .. .... .... 1011 . . . 1 .... @3same_q0 + +VQRDMLAH_3s 1111 001 1 0 . .. .... .... 1011 ... 1 .... @3same + +@3same_crypto .... .... .... .... .... .... .... .... \ + &3same vm=%vm_dp vn=%vn_dp vd=%vd_dp size=0 q=1 + +SHA1C_3s 1111 001 0 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto +SHA1P_3s 1111 001 0 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto +SHA1M_3s 1111 001 0 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto +SHA1SU0_3s 1111 001 0 0 . 11 .... .... 1100 . 1 . 0 .... @3same_crypto +SHA256H_3s 1111 001 1 0 . 00 .... .... 1100 . 1 . 0 .... @3same_crypto +SHA256H2_3s 1111 001 1 0 . 01 .... .... 1100 . 1 . 0 .... @3same_crypto +SHA256SU1_3s 1111 001 1 0 . 10 .... .... 1100 . 1 . 0 .... @3same_crypto + +VFMA_fp_3s 1111 001 0 0 . 0 . .... .... 1100 ... 1 .... @3same_fp +VFMS_fp_3s 1111 001 0 0 . 1 . .... .... 1100 ... 1 .... @3same_fp + +VQRDMLSH_3s 1111 001 1 0 . .. .... .... 1100 ... 1 .... @3same + +VADD_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 0 .... @3same_fp +VSUB_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 0 .... @3same_fp +VPADD_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 0 .... @3same_fp_q0 +VABD_fp_3s 1111 001 1 0 . 1 . .... .... 1101 ... 0 .... @3same_fp +VMLA_fp_3s 1111 001 0 0 . 0 . .... .... 1101 ... 1 .... @3same_fp +VMLS_fp_3s 1111 001 0 0 . 1 . .... .... 1101 ... 1 .... @3same_fp +VMUL_fp_3s 1111 001 1 0 . 0 . .... .... 1101 ... 1 .... @3same_fp +VCEQ_fp_3s 1111 001 0 0 . 0 . .... .... 1110 ... 0 .... @3same_fp +VCGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 0 .... @3same_fp +VACGE_fp_3s 1111 001 1 0 . 0 . .... .... 1110 ... 1 .... @3same_fp +VCGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 0 .... @3same_fp +VACGT_fp_3s 1111 001 1 0 . 1 . .... .... 1110 ... 1 .... @3same_fp +VMAX_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 0 .... @3same_fp +VMIN_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 0 .... @3same_fp +VPMAX_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 0 .... @3same_fp_q0 +VPMIN_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 0 .... @3same_fp_q0 +VRECPS_fp_3s 1111 001 0 0 . 0 . .... .... 1111 ... 1 .... @3same_fp +VRSQRTS_fp_3s 1111 001 0 0 . 1 . .... .... 1111 ... 1 .... @3same_fp +VMAXNM_fp_3s 1111 001 1 0 . 0 . .... .... 1111 ... 1 .... @3same_fp +VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp + +###################################################################### +# 2-reg-and-shift grouping: +# 1111 001 U 1 D immH:3 immL:3 Vd:4 opc:4 L Q M 1 Vm:4 +###################################################################### +&2reg_shift vm vd q shift size + +# Right shifts are encoded as N - shift, where N is the element size in bits. +%neon_rshift_i6 16:6 !function=rsub_64 +%neon_rshift_i5 16:5 !function=rsub_32 +%neon_rshift_i4 16:4 !function=rsub_16 +%neon_rshift_i3 16:3 !function=rsub_8 + +@2reg_shr_d .... ... . . . ...... .... .... 1 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=3 shift=%neon_rshift_i6 +@2reg_shr_s .... ... . . . 1 ..... .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5 +@2reg_shr_h .... ... . . . 01 .... .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4 +@2reg_shr_b .... ... . . . 001 ... .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=0 shift=%neon_rshift_i3 + +@2reg_shl_d .... ... . . . shift:6 .... .... 1 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=3 +@2reg_shl_s .... ... . . . 1 shift:5 .... .... 0 q:1 . . .... 
\ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 +@2reg_shl_h .... ... . . . 01 shift:4 .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 +@2reg_shl_b .... ... . . . 001 shift:3 .... .... 0 q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=0 + +# Narrowing right shifts: here the Q bit is part of the opcode decode +@2reg_shrn_d .... ... . . . 1 ..... .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=3 q=0 \ + shift=%neon_rshift_i5 +@2reg_shrn_s .... ... . . . 01 .... .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 \ + shift=%neon_rshift_i4 +@2reg_shrn_h .... ... . . . 001 ... .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \ + shift=%neon_rshift_i3 + +# Long left shifts: again Q is part of opcode decode +@2reg_shll_s .... ... . . . 1 shift:5 .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0 +@2reg_shll_h .... ... . . . 01 shift:4 .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 +@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0 + +@2reg_vcvt .... ... . . . 1 ..... .... .... . q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=2 shift=%neon_rshift_i5 +@2reg_vcvt_f16 .... ... . . . 11 .... .... .... . q:1 . . .... \ + &2reg_shift vm=%vm_dp vd=%vd_dp size=1 shift=%neon_rshift_i4 + +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h +VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b + +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h +VSHR_U_2sh 1111 001 1 1 . ...... .... 0000 . . . 1 .... @2reg_shr_b + +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h +VSRA_S_2sh 1111 001 0 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b + +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_d +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_s +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_h +VSRA_U_2sh 1111 001 1 1 . ...... .... 0001 . . . 1 .... @2reg_shr_b + +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h +VRSHR_S_2sh 1111 001 0 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b + +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_d +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_s +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_h +VRSHR_U_2sh 1111 001 1 1 . ...... .... 0010 . . . 1 .... @2reg_shr_b + +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_h +VRSRA_S_2sh 1111 001 0 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b + +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_d +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_s +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... 
@2reg_shr_h +VRSRA_U_2sh 1111 001 1 1 . ...... .... 0011 . . . 1 .... @2reg_shr_b + +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_d +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_s +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_h +VSRI_2sh 1111 001 1 1 . ...... .... 0100 . . . 1 .... @2reg_shr_b + +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h +VSHL_2sh 1111 001 0 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b + +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_d +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h +VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b + +VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d +VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s +VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h +VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b + +VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d +VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s +VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h +VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b + +VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d +VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s +VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h +VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b + +VSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d +VSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s +VSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h + +VRSHRN_64_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d +VRSHRN_32_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s +VRSHRN_16_2sh 1111 001 0 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h + +VQSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_d +VQSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_s +VQSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 0 . 1 .... @2reg_shrn_h + +VQRSHRUN_64_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_d +VQRSHRUN_32_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_s +VQRSHRUN_16_2sh 1111 001 1 1 . ...... .... 1000 . 1 . 1 .... @2reg_shrn_h + +# VQSHRN with signed input +VQSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d +VQSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s +VQSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h + +# VQRSHRN with signed input +VQRSHRN_S64_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d +VQRSHRN_S32_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s +VQRSHRN_S16_2sh 1111 001 0 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h + +# VQSHRN with unsigned input +VQSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_d +VQSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_s +VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h + +# VQRSHRN with unsigned input +VQRSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d +VQRSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 
1 . 1 .... @2reg_shrn_s +VQRSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h + +VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s +VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h +VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b + +VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s +VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h +VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b + +# VCVT fixed<->float conversions +VCVT_SH_2sh 1111 001 0 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16 +VCVT_UH_2sh 1111 001 1 1 . ...... .... 1100 0 . . 1 .... @2reg_vcvt_f16 +VCVT_HS_2sh 1111 001 0 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16 +VCVT_HU_2sh 1111 001 1 1 . ...... .... 1101 0 . . 1 .... @2reg_vcvt_f16 + +VCVT_SF_2sh 1111 001 0 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt +VCVT_UF_2sh 1111 001 1 1 . ...... .... 1110 0 . . 1 .... @2reg_vcvt +VCVT_FS_2sh 1111 001 0 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt +VCVT_FU_2sh 1111 001 1 1 . ...... .... 1111 0 . . 1 .... @2reg_vcvt + +###################################################################### +# 1-reg-and-modified-immediate grouping: +# 1111 001 i 1 D 000 imm:3 Vd:4 cmode:4 0 Q op 1 Vm:4 +###################################################################### + +&1reg_imm vd q imm cmode op + +%asimd_imm_value 24:1 16:3 0:4 + +@1reg_imm .... ... . . . ... ... .... .... . q:1 . . .... \ + &1reg_imm imm=%asimd_imm_value vd=%vd_dp + +# The cmode/op bits here decode VORR/VBIC/VMOV/VMNV, but +# not in a way we can conveniently represent in decodetree without +# a lot of repetition: +# VORR: op=0, (cmode & 1) && cmode < 12 +# VBIC: op=1, (cmode & 1) && cmode < 12 +# VMOV: everything else +# So we have a single decode line and check the cmode/op in the +# trans function. +Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm + +###################################################################### +# Within the "two registers, or three registers of different lengths" +# grouping ([23,4]=0b10), bits [21:20] are either part of the opcode +# decode: 0b11 for VEXT, two-reg-misc, VTBL, and duplicate-scalar; +# or they are a size field for the three-reg-different-lengths and +# two-reg-and-scalar insn groups (where size cannot be 0b11). This +# is slightly awkward for decodetree: we handle it with this +# non-exclusive group which contains within it two exclusive groups: +# one for the size=0b11 patterns, and one for the size-not-0b11 +# patterns. This allows us to check that none of the insns within +# each subgroup accidentally overlap each other. Note that all the +# trans functions for the size-not-0b11 patterns must check and +# return false for size==3. +###################################################################### +{ + [ + ################################################################## + # Miscellaneous size=0b11 insns + ################################################################## + VEXT 1111 001 0 1 . 11 .... .... imm:4 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp + + VTBL 1111 001 1 1 . 11 .... .... 10 len:2 . op:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp + + VDUP_scalar 1111 001 1 1 . 11 index:3 1 .... 11 000 q:1 . 0 .... \ + vm=%vm_dp vd=%vd_dp size=0 + VDUP_scalar 1111 001 1 1 . 11 index:2 10 .... 11 000 q:1 . 0 .... \ + vm=%vm_dp vd=%vd_dp size=1 + VDUP_scalar 1111 001 1 1 . 11 index:1 100 .... 11 000 q:1 . 0 .... 
\ + vm=%vm_dp vd=%vd_dp size=2 + + ################################################################## + # 2-reg-misc grouping: + # 1111 001 11 D 11 size:2 opc1:2 Vd:4 0 opc2:4 q:1 M 0 Vm:4 + ################################################################## + + &2misc vd vm q size + + @2misc .... ... .. . .. size:2 .. .... . .... q:1 . . .... \ + &2misc vm=%vm_dp vd=%vd_dp + @2misc_q0 .... ... .. . .. size:2 .. .... . .... . . . .... \ + &2misc vm=%vm_dp vd=%vd_dp q=0 + @2misc_q1 .... ... .. . .. size:2 .. .... . .... . . . .... \ + &2misc vm=%vm_dp vd=%vd_dp q=1 + + VREV64 1111 001 11 . 11 .. 00 .... 0 0000 . . 0 .... @2misc + VREV32 1111 001 11 . 11 .. 00 .... 0 0001 . . 0 .... @2misc + VREV16 1111 001 11 . 11 .. 00 .... 0 0010 . . 0 .... @2misc + + VPADDL_S 1111 001 11 . 11 .. 00 .... 0 0100 . . 0 .... @2misc + VPADDL_U 1111 001 11 . 11 .. 00 .... 0 0101 . . 0 .... @2misc + + AESE 1111 001 11 . 11 .. 00 .... 0 0110 0 . 0 .... @2misc_q1 + AESD 1111 001 11 . 11 .. 00 .... 0 0110 1 . 0 .... @2misc_q1 + AESMC 1111 001 11 . 11 .. 00 .... 0 0111 0 . 0 .... @2misc_q1 + AESIMC 1111 001 11 . 11 .. 00 .... 0 0111 1 . 0 .... @2misc_q1 + + VCLS 1111 001 11 . 11 .. 00 .... 0 1000 . . 0 .... @2misc + VCLZ 1111 001 11 . 11 .. 00 .... 0 1001 . . 0 .... @2misc + VCNT 1111 001 11 . 11 .. 00 .... 0 1010 . . 0 .... @2misc + + VMVN 1111 001 11 . 11 .. 00 .... 0 1011 . . 0 .... @2misc + + VPADAL_S 1111 001 11 . 11 .. 00 .... 0 1100 . . 0 .... @2misc + VPADAL_U 1111 001 11 . 11 .. 00 .... 0 1101 . . 0 .... @2misc + + VQABS 1111 001 11 . 11 .. 00 .... 0 1110 . . 0 .... @2misc + VQNEG 1111 001 11 . 11 .. 00 .... 0 1111 . . 0 .... @2misc + + VCGT0 1111 001 11 . 11 .. 01 .... 0 0000 . . 0 .... @2misc + VCGE0 1111 001 11 . 11 .. 01 .... 0 0001 . . 0 .... @2misc + VCEQ0 1111 001 11 . 11 .. 01 .... 0 0010 . . 0 .... @2misc + VCLE0 1111 001 11 . 11 .. 01 .... 0 0011 . . 0 .... @2misc + VCLT0 1111 001 11 . 11 .. 01 .... 0 0100 . . 0 .... @2misc + + SHA1H 1111 001 11 . 11 .. 01 .... 0 0101 1 . 0 .... @2misc_q1 + + VABS 1111 001 11 . 11 .. 01 .... 0 0110 . . 0 .... @2misc + VNEG 1111 001 11 . 11 .. 01 .... 0 0111 . . 0 .... @2misc + + VCGT0_F 1111 001 11 . 11 .. 01 .... 0 1000 . . 0 .... @2misc + VCGE0_F 1111 001 11 . 11 .. 01 .... 0 1001 . . 0 .... @2misc + VCEQ0_F 1111 001 11 . 11 .. 01 .... 0 1010 . . 0 .... @2misc + VCLE0_F 1111 001 11 . 11 .. 01 .... 0 1011 . . 0 .... @2misc + VCLT0_F 1111 001 11 . 11 .. 01 .... 0 1100 . . 0 .... @2misc + + VABS_F 1111 001 11 . 11 .. 01 .... 0 1110 . . 0 .... @2misc + VNEG_F 1111 001 11 . 11 .. 01 .... 0 1111 . . 0 .... @2misc + + VSWP 1111 001 11 . 11 .. 10 .... 0 0000 . . 0 .... @2misc + VTRN 1111 001 11 . 11 .. 10 .... 0 0001 . . 0 .... @2misc + VUZP 1111 001 11 . 11 .. 10 .... 0 0010 . . 0 .... @2misc + VZIP 1111 001 11 . 11 .. 10 .... 0 0011 . . 0 .... @2misc + + VMOVN 1111 001 11 . 11 .. 10 .... 0 0100 0 . 0 .... @2misc_q0 + # VQMOVUN: unsigned result (source is always signed) + VQMOVUN 1111 001 11 . 11 .. 10 .... 0 0100 1 . 0 .... @2misc_q0 + # VQMOVN: signed result, source may be signed (_S) or unsigned (_U) + VQMOVN_S 1111 001 11 . 11 .. 10 .... 0 0101 0 . 0 .... @2misc_q0 + VQMOVN_U 1111 001 11 . 11 .. 10 .... 0 0101 1 . 0 .... @2misc_q0 + + VSHLL 1111 001 11 . 11 .. 10 .... 0 0110 0 . 0 .... @2misc_q0 + + SHA1SU1 1111 001 11 . 11 .. 10 .... 0 0111 0 . 0 .... @2misc_q1 + SHA256SU0 1111 001 11 . 11 .. 10 .... 0 0111 1 . 0 .... @2misc_q1 + + VRINTN 1111 001 11 . 11 .. 10 .... 0 1000 . . 0 .... @2misc + VRINTX 1111 001 11 . 11 .. 10 .... 0 1001 . . 0 .... 
@2misc + VRINTA 1111 001 11 . 11 .. 10 .... 0 1010 . . 0 .... @2misc + VRINTZ 1111 001 11 . 11 .. 10 .... 0 1011 . . 0 .... @2misc + + VCVT_F16_F32 1111 001 11 . 11 .. 10 .... 0 1100 0 . 0 .... @2misc_q0 + VCVT_B16_F32 1111 001 11 . 11 .. 10 .... 0 1100 1 . 0 .... @2misc_q0 + + VRINTM 1111 001 11 . 11 .. 10 .... 0 1101 . . 0 .... @2misc + + VCVT_F32_F16 1111 001 11 . 11 .. 10 .... 0 1110 0 . 0 .... @2misc_q0 + + VRINTP 1111 001 11 . 11 .. 10 .... 0 1111 . . 0 .... @2misc + + VCVTAS 1111 001 11 . 11 .. 11 .... 0 0000 . . 0 .... @2misc + VCVTAU 1111 001 11 . 11 .. 11 .... 0 0001 . . 0 .... @2misc + VCVTNS 1111 001 11 . 11 .. 11 .... 0 0010 . . 0 .... @2misc + VCVTNU 1111 001 11 . 11 .. 11 .... 0 0011 . . 0 .... @2misc + VCVTPS 1111 001 11 . 11 .. 11 .... 0 0100 . . 0 .... @2misc + VCVTPU 1111 001 11 . 11 .. 11 .... 0 0101 . . 0 .... @2misc + VCVTMS 1111 001 11 . 11 .. 11 .... 0 0110 . . 0 .... @2misc + VCVTMU 1111 001 11 . 11 .. 11 .... 0 0111 . . 0 .... @2misc + + VRECPE 1111 001 11 . 11 .. 11 .... 0 1000 . . 0 .... @2misc + VRSQRTE 1111 001 11 . 11 .. 11 .... 0 1001 . . 0 .... @2misc + VRECPE_F 1111 001 11 . 11 .. 11 .... 0 1010 . . 0 .... @2misc + VRSQRTE_F 1111 001 11 . 11 .. 11 .... 0 1011 . . 0 .... @2misc + VCVT_FS 1111 001 11 . 11 .. 11 .... 0 1100 . . 0 .... @2misc + VCVT_FU 1111 001 11 . 11 .. 11 .... 0 1101 . . 0 .... @2misc + VCVT_SF 1111 001 11 . 11 .. 11 .... 0 1110 . . 0 .... @2misc + VCVT_UF 1111 001 11 . 11 .. 11 .... 0 1111 . . 0 .... @2misc + ] + + # Subgroup for size != 0b11 + [ + ################################################################## + # 3-reg-different-length grouping: + # 1111 001 U 1 D sz!=11 Vn:4 Vd:4 opc:4 N 0 M 0 Vm:4 + ################################################################## + + &3diff vm vn vd size + + @3diff .... ... . . . size:2 .... .... .... . . . . .... \ + &3diff vm=%vm_dp vn=%vn_dp vd=%vd_dp + + VADDL_S_3d 1111 001 0 1 . .. .... .... 0000 . 0 . 0 .... @3diff + VADDL_U_3d 1111 001 1 1 . .. .... .... 0000 . 0 . 0 .... @3diff + + VADDW_S_3d 1111 001 0 1 . .. .... .... 0001 . 0 . 0 .... @3diff + VADDW_U_3d 1111 001 1 1 . .. .... .... 0001 . 0 . 0 .... @3diff + + VSUBL_S_3d 1111 001 0 1 . .. .... .... 0010 . 0 . 0 .... @3diff + VSUBL_U_3d 1111 001 1 1 . .. .... .... 0010 . 0 . 0 .... @3diff + + VSUBW_S_3d 1111 001 0 1 . .. .... .... 0011 . 0 . 0 .... @3diff + VSUBW_U_3d 1111 001 1 1 . .. .... .... 0011 . 0 . 0 .... @3diff + + VADDHN_3d 1111 001 0 1 . .. .... .... 0100 . 0 . 0 .... @3diff + VRADDHN_3d 1111 001 1 1 . .. .... .... 0100 . 0 . 0 .... @3diff + + VABAL_S_3d 1111 001 0 1 . .. .... .... 0101 . 0 . 0 .... @3diff + VABAL_U_3d 1111 001 1 1 . .. .... .... 0101 . 0 . 0 .... @3diff + + VSUBHN_3d 1111 001 0 1 . .. .... .... 0110 . 0 . 0 .... @3diff + VRSUBHN_3d 1111 001 1 1 . .. .... .... 0110 . 0 . 0 .... @3diff + + VABDL_S_3d 1111 001 0 1 . .. .... .... 0111 . 0 . 0 .... @3diff + VABDL_U_3d 1111 001 1 1 . .. .... .... 0111 . 0 . 0 .... @3diff + + VMLAL_S_3d 1111 001 0 1 . .. .... .... 1000 . 0 . 0 .... @3diff + VMLAL_U_3d 1111 001 1 1 . .. .... .... 1000 . 0 . 0 .... @3diff + + VQDMLAL_3d 1111 001 0 1 . .. .... .... 1001 . 0 . 0 .... @3diff + + VMLSL_S_3d 1111 001 0 1 . .. .... .... 1010 . 0 . 0 .... @3diff + VMLSL_U_3d 1111 001 1 1 . .. .... .... 1010 . 0 . 0 .... @3diff + + VQDMLSL_3d 1111 001 0 1 . .. .... .... 1011 . 0 . 0 .... @3diff + + VMULL_S_3d 1111 001 0 1 . .. .... .... 1100 . 0 . 0 .... @3diff + VMULL_U_3d 1111 001 1 1 . .. .... .... 1100 . 0 . 0 .... @3diff + + VQDMULL_3d 1111 001 0 1 . .. .... .... 1101 . 0 . 0 .... 
@3diff + + VMULL_P_3d 1111 001 0 1 . .. .... .... 1110 . 0 . 0 .... @3diff + + ################################################################## + # 2-regs-plus-scalar grouping: + # 1111 001 Q 1 D sz!=11 Vn:4 Vd:4 opc:4 N 1 M 0 Vm:4 + ################################################################## + &2scalar vm vn vd size q + + @2scalar .... ... q:1 . . size:2 .... .... .... . . . . .... \ + &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp + # For the 'long' ops the Q bit is part of insn decode + @2scalar_q0 .... ... . . . size:2 .... .... .... . . . . .... \ + &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0 + + VMLA_2sc 1111 001 . 1 . .. .... .... 0000 . 1 . 0 .... @2scalar + VMLA_F_2sc 1111 001 . 1 . .. .... .... 0001 . 1 . 0 .... @2scalar + + VMLAL_S_2sc 1111 001 0 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0 + VMLAL_U_2sc 1111 001 1 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0 + + VQDMLAL_2sc 1111 001 0 1 . .. .... .... 0011 . 1 . 0 .... @2scalar_q0 + + VMLS_2sc 1111 001 . 1 . .. .... .... 0100 . 1 . 0 .... @2scalar + VMLS_F_2sc 1111 001 . 1 . .. .... .... 0101 . 1 . 0 .... @2scalar + + VMLSL_S_2sc 1111 001 0 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0 + VMLSL_U_2sc 1111 001 1 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0 + + VQDMLSL_2sc 1111 001 0 1 . .. .... .... 0111 . 1 . 0 .... @2scalar_q0 + + VMUL_2sc 1111 001 . 1 . .. .... .... 1000 . 1 . 0 .... @2scalar + VMUL_F_2sc 1111 001 . 1 . .. .... .... 1001 . 1 . 0 .... @2scalar + + VMULL_S_2sc 1111 001 0 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0 + VMULL_U_2sc 1111 001 1 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0 + + VQDMULL_2sc 1111 001 0 1 . .. .... .... 1011 . 1 . 0 .... @2scalar_q0 + + VQDMULH_2sc 1111 001 . 1 . .. .... .... 1100 . 1 . 0 .... @2scalar + VQRDMULH_2sc 1111 001 . 1 . .. .... .... 1101 . 1 . 0 .... @2scalar + + VQRDMLAH_2sc 1111 001 . 1 . .. .... .... 1110 . 1 . 0 .... @2scalar + VQRDMLSH_2sc 1111 001 . 1 . .. .... .... 1111 . 1 . 0 .... @2scalar + ] +} diff --git a/target/arm/tcg/neon-ls.decode b/target/arm/tcg/neon-ls.decode new file mode 100644 index 0000000..c5f364c --- /dev/null +++ b/target/arm/tcg/neon-ls.decode @@ -0,0 +1,52 @@ +# AArch32 Neon load/store instruction descriptions +# +# Copyright (c) 2020 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +# Encodings for Neon load/store instructions where the T32 encoding +# is a simple transformation of the A32 encoding. +# More specifically, this file covers instructions where the A32 encoding is +# 0b1111_0100_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx +# and the T32 encoding is +# 0b1111_1001_xxx0_xxxx_xxxx_xxxx_xxxx_xxxx +# This file works on the A32 encoding only; calling code for T32 has to +# transform the insn into the A32 version first. + +%vd_dp 22:1 12:4 + +# Neon load/store multiple structures + +VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... 
itype:4 size:2 align:2 rm:4 \ + vd=%vd_dp + +# Neon load single element to all lanes + +VLD_all_lanes 1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \ + vd=%vd_dp + +# Neon load/store single structure to one lane +%imm1_5_p1 5:1 !function=plus_1 +%imm1_6_p1 6:1 !function=plus_1 + +VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 00 n:2 reg_idx:3 align:1 rm:4 \ + vd=%vd_dp size=0 stride=1 +VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 01 n:2 reg_idx:2 . align:1 rm:4 \ + vd=%vd_dp size=1 stride=%imm1_5_p1 +VLDST_single 1111 0100 1 . l:1 0 rn:4 .... 10 n:2 reg_idx:1 . align:2 rm:4 \ + vd=%vd_dp size=2 stride=%imm1_6_p1 diff --git a/target/arm/tcg/neon-shared.decode b/target/arm/tcg/neon-shared.decode new file mode 100644 index 0000000..8e6bd0b --- /dev/null +++ b/target/arm/tcg/neon-shared.decode @@ -0,0 +1,99 @@ +# AArch32 Neon instruction descriptions +# +# Copyright (c) 2020 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +# Encodings for Neon instructions whose encoding is the same for +# both A32 and T32. + +# More specifically, this covers: +# 2reg scalar ext: 0b1111_1110_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx +# 3same ext: 0b1111_110x_xxxx_xxxx_xxxx_1x0x_xxxx_xxxx + +# VFP/Neon register fields; same as vfp.decode +%vm_dp 5:1 0:4 +%vm_sp 0:4 5:1 +%vn_dp 7:1 16:4 +%vn_sp 16:4 7:1 +%vd_dp 22:1 12:4 +%vd_sp 12:4 22:1 + +# For VCMLA/VCADD insns, convert the single-bit size field +# which is 0 for fp16 and 1 for fp32 into a MO_* constant. +# (Note that this is the reverse of the sense of the 1-bit size +# field in the 3same_fp Neon insns.) +%vcadd_size 20:1 !function=plus_1 + +VCMLA 1111 110 rot:2 . 1 . .... .... 1000 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size + +VCADD 1111 110 rot:1 1 . 0 . .... .... 1000 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp size=%vcadd_size + +VSDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp +VUDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 1 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp +VUSDOT 1111 110 01 . 10 .... .... 1101 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp +VDOT_b16 1111 110 00 . 00 .... .... 1101 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp + +# VFM[AS]L +VFML 1111 110 0 s:1 . 10 .... .... 1000 . 0 . 1 .... \ + vm=%vm_sp vn=%vn_sp vd=%vd_dp q=0 +VFML 1111 110 0 s:1 . 10 .... .... 1000 . 1 . 1 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp q=1 + +VSMMLA 1111 1100 0.10 .... .... 1100 .1.0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp +VUMMLA 1111 1100 0.10 .... .... 1100 .1.1 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp +VUSMMLA 1111 1100 1.10 .... .... 1100 .1.0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp +VMMLA_b16 1111 1100 0.00 .... .... 1100 .1.0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp + +VFMA_b16 1111 110 0 0.11 .... .... 1000 . q:1 . 1 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp + +VCMLA_scalar 1111 1110 0 . rot:2 .... .... 1000 . 
q:1 index:1 0 vm:4 \ + vn=%vn_dp vd=%vd_dp size=1 +VCMLA_scalar 1111 1110 1 . rot:2 .... .... 1000 . q:1 . 0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp size=2 index=0 + +VSDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 0 vm:4 \ + vn=%vn_dp vd=%vd_dp +VUDOT_scalar 1111 1110 0 . 10 .... .... 1101 . q:1 index:1 1 vm:4 \ + vn=%vn_dp vd=%vd_dp +VUSDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \ + vn=%vn_dp vd=%vd_dp +VSUDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 1 vm:4 \ + vn=%vn_dp vd=%vd_dp +VDOT_b16_scal 1111 1110 0 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \ + vn=%vn_dp vd=%vd_dp + +%vfml_scalar_q0_rm 0:3 5:1 +%vfml_scalar_q1_index 5:1 3:1 +VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 0 . 1 index:1 ... \ + rm=%vfml_scalar_q0_rm vn=%vn_sp vd=%vd_dp q=0 +VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 1 . 1 . rm:3 \ + index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp q=1 +VFMA_b16_scal 1111 1110 0.11 .... .... 1000 . q:1 . 1 . vm:3 \ + index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp diff --git a/target/arm/tcg/sme-fa64.decode b/target/arm/tcg/sme-fa64.decode new file mode 100644 index 0000000..47708cc --- /dev/null +++ b/target/arm/tcg/sme-fa64.decode @@ -0,0 +1,60 @@ +# AArch64 SME allowed instruction decoding +# +# Copyright (c) 2022 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +# These patterns are taken from Appendix E1.1 of DDI0616 A.a, +# Arm Architecture Reference Manual Supplement, +# The Scalable Matrix Extension (SME), for Armv9-A + +{ + [ + OK 0-00 1110 0000 0001 0010 11-- ---- ---- # SMOV W|Xd,Vn.B[0] + OK 0-00 1110 0000 0010 0010 11-- ---- ---- # SMOV W|Xd,Vn.H[0] + OK 0100 1110 0000 0100 0010 11-- ---- ---- # SMOV Xd,Vn.S[0] + OK 0000 1110 0000 0001 0011 11-- ---- ---- # UMOV Wd,Vn.B[0] + OK 0000 1110 0000 0010 0011 11-- ---- ---- # UMOV Wd,Vn.H[0] + OK 0000 1110 0000 0100 0011 11-- ---- ---- # UMOV Wd,Vn.S[0] + OK 0100 1110 0000 1000 0011 11-- ---- ---- # UMOV Xd,Vn.D[0] + ] + FAIL 0--0 111- ---- ---- ---- ---- ---- ---- # Advanced SIMD vector operations +} + +{ + [ + OK 0101 1110 --1- ---- 11-1 11-- ---- ---- # FMULX/FRECPS/FRSQRTS (scalar) + OK 0101 1110 -10- ---- 00-1 11-- ---- ---- # FMULX/FRECPS/FRSQRTS (scalar, FP16) + OK 01-1 1110 1-10 0001 11-1 10-- ---- ---- # FRECPE/FRSQRTE/FRECPX (scalar) + OK 01-1 1110 1111 1001 11-1 10-- ---- ---- # FRECPE/FRSQRTE/FRECPX (scalar, FP16) + ] + FAIL 01-1 111- ---- ---- ---- ---- ---- ---- # Advanced SIMD single-element operations +} + +FAIL 0-00 110- ---- ---- ---- ---- ---- ---- # Advanced SIMD structure load/store +FAIL 1100 1110 ---- ---- ---- ---- ---- ---- # Advanced SIMD cryptography extensions +FAIL 0001 1110 0111 1110 0000 00-- ---- ---- # FJCVTZS + +# These are the "avoidance of doubt" final table of Illegal Advanced SIMD instructions +# We don't actually need to include these, as the default is OK. 
+# -001 111- ---- ---- ---- ---- ---- ---- # Scalar floating-point operations +# --10 110- ---- ---- ---- ---- ---- ---- # Load/store pair of FP registers +# --01 1100 ---- ---- ---- ---- ---- ---- # Load FP register (PC-relative literal) +# --11 1100 --0- ---- ---- ---- ---- ---- # Load/store FP register (unscaled imm) +# --11 1100 --1- ---- ---- ---- ---- --10 # Load/store FP register (register offset) +# --11 1101 ---- ---- ---- ---- ---- ---- # Load/store FP register (scaled imm) diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode new file mode 100644 index 0000000..628804e --- /dev/null +++ b/target/arm/tcg/sme.decode @@ -0,0 +1,88 @@ +# AArch64 SME instruction descriptions +# +# Copyright (c) 2022 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +### SME Misc + +ZERO 11000000 00 001 00000000000 imm:8 + +### SME Move into/from Array + +%mova_rs 13:2 !function=plus_12 +&mova esz rs pg zr za_imm v:bool to_vec:bool + +MOVA 11000000 esz:2 00000 0 v:1 .. pg:3 zr:5 0 za_imm:4 \ + &mova to_vec=0 rs=%mova_rs +MOVA 11000000 11 00000 1 v:1 .. pg:3 zr:5 0 za_imm:4 \ + &mova to_vec=0 rs=%mova_rs esz=4 + +MOVA 11000000 esz:2 00001 0 v:1 .. pg:3 0 za_imm:4 zr:5 \ + &mova to_vec=1 rs=%mova_rs +MOVA 11000000 11 00001 1 v:1 .. pg:3 0 za_imm:4 zr:5 \ + &mova to_vec=1 rs=%mova_rs esz=4 + +### SME Memory + +&ldst esz rs pg rn rm za_imm v:bool st:bool + +LDST1 1110000 0 esz:2 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ + &ldst rs=%mova_rs +LDST1 1110000 111 st:1 rm:5 v:1 .. pg:3 rn:5 0 za_imm:4 \ + &ldst esz=4 rs=%mova_rs + +&ldstr rv rn imm +@ldstr ....... ... . ...... .. ... rn:5 . imm:4 \ + &ldstr rv=%mova_rs + +LDR 1110000 100 0 000000 .. 000 ..... 0 .... @ldstr +STR 1110000 100 1 000000 .. 000 ..... 0 .... @ldstr + +### SME Add Vector to Array + +&adda zad zn pm pn +@adda_32 ........ .. ..... . pm:3 pn:3 zn:5 ... zad:2 &adda +@adda_64 ........ .. ..... . pm:3 pn:3 zn:5 .. zad:3 &adda + +ADDHA_s 11000000 10 01000 0 ... ... ..... 000 .. @adda_32 +ADDVA_s 11000000 10 01000 1 ... ... ..... 000 .. @adda_32 +ADDHA_d 11000000 11 01000 0 ... ... ..... 00 ... @adda_64 +ADDVA_d 11000000 11 01000 1 ... ... ..... 00 ... @adda_64 + +### SME Outer Product + +&op zad zn zm pm pn sub:bool +@op_32 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 .. zad:2 &op +@op_64 ........ ... zm:5 pm:3 pn:3 zn:5 sub:1 . zad:3 &op + +FMOPA_s 10000000 100 ..... ... ... ..... . 00 .. @op_32 +FMOPA_d 10000000 110 ..... ... ... ..... . 0 ... @op_64 + +BFMOPA 10000001 100 ..... ... ... ..... . 00 .. @op_32 +FMOPA_h 10000001 101 ..... ... ... ..... . 00 .. @op_32 + +SMOPA_s 1010000 0 10 0 ..... ... ... ..... . 00 .. @op_32 +SUMOPA_s 1010000 0 10 1 ..... ... ... ..... . 00 .. @op_32 +USMOPA_s 1010000 1 10 0 ..... ... ... ..... . 00 .. @op_32 +UMOPA_s 1010000 1 10 1 ..... ... ... ..... . 00 .. @op_32 + +SMOPA_d 1010000 0 11 0 ..... ... ... ..... . 0 ... 
@op_64 +SUMOPA_d 1010000 0 11 1 ..... ... ... ..... . 0 ... @op_64 +USMOPA_d 1010000 1 11 0 ..... ... ... ..... . 0 ... @op_64 +UMOPA_d 1010000 1 11 1 ..... ... ... ..... . 0 ... @op_64 diff --git a/target/arm/tcg/sve.decode b/target/arm/tcg/sve.decode new file mode 100644 index 0000000..14b3a69 --- /dev/null +++ b/target/arm/tcg/sve.decode @@ -0,0 +1,1702 @@ +# AArch64 SVE instruction descriptions +# +# Copyright (c) 2017 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +########################################################################### +# Named fields. These are primarily for disjoint fields. + +%imm4_16_p1 16:4 !function=plus_1 +%imm6_22_5 22:1 5:5 +%imm7_22_16 22:2 16:5 +%imm8_16_10 16:5 10:3 +%imm9_16_10 16:s6 10:3 +%size_23 23:2 +%dtype_23_13 23:2 13:2 +%index3_22_19 22:1 19:2 +%index3_19_11 19:2 11:1 +%index2_20_11 20:1 11:1 + +# A combination of tsz:imm3 -- extract esize. +%tszimm_esz 22:2 5:5 !function=tszimm_esz +# A combination of tsz:imm3 -- extract (2 * esize) - (tsz:imm3) +%tszimm_shr 22:2 5:5 !function=tszimm_shr +# A combination of tsz:imm3 -- extract (tsz:imm3) - esize +%tszimm_shl 22:2 5:5 !function=tszimm_shl + +# Similarly for the tszh/tszl pair at 22/16 for zzi +%tszimm16_esz 22:2 16:5 !function=tszimm_esz +%tszimm16_shr 22:2 16:5 !function=tszimm_shr +%tszimm16_shl 22:2 16:5 !function=tszimm_shl + +# Signed 8-bit immediate, optionally shifted left by 8. +%sh8_i8s 5:9 !function=expand_imm_sh8s +# Unsigned 8-bit immediate, optionally shifted left by 8. +%sh8_i8u 5:9 !function=expand_imm_sh8u + +# Unsigned load of msz into esz=2, represented as a dtype. +%msz_dtype 23:2 !function=msz_dtype + +# Either a copy of rd (at bit 0), or a different source +# as propagated via the MOVPRFX instruction. +%reg_movprfx 0:5 + +########################################################################### +# Named attribute sets. These are used to make nice(er) names +# when creating helpers common to those for the individual +# instruction patterns. 
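The !function= hooks in the named-field definitions above are resolved against small C extractors in the translator: decodetree hands each extracted field to a function of the shape "int fn(DisasContext *s, int x)" and uses the return value as the decoded operand. The following is a minimal illustrative sketch, written from the field comments above rather than copied from QEMU's translate code, so the exact bodies may differ:

    #include <stdint.h>

    typedef struct DisasContext DisasContext;  /* opaque stand-in for QEMU's real type */

    /* %imm4_16_p1: the four bits encode value-1, so add one back. */
    static int plus_1(DisasContext *s, int x)
    {
        return x + 1;
    }

    /* %sh8_i8s: bits [7:0] are a signed imm8; bit 8 requests the
     * optional left shift by 8. */
    static int expand_imm_sh8s(DisasContext *s, int x)
    {
        return (int8_t)x << (x & 0x100 ? 8 : 0);
    }

    /* %tszimm_esz: x is tsz:imm3; drop imm3 and take log2 of the
     * one-hot tsz field (tsz != 0 for valid encodings). */
    static int tszimm_esz(DisasContext *s, int x)
    {
        x >>= 3;
        return 31 - __builtin_clz(x);
    }

Each &-prefixed attribute set declared next becomes a generated arg_<name> struct that carries these decoded values into the hand-written trans_<INSN> functions.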
+ +&rr_esz rd rn esz +&rri rd rn imm +&rr_dbm rd rn dbm +&rrri rd rn rm imm +&rri_esz rd rn imm esz +&rrri_esz rd rn rm imm esz +&rrr_esz rd rn rm esz +&rrx_esz rd rn rm index esz +&rpr_esz rd pg rn esz +&rpr_s rd pg rn s +&rprr_s rd pg rn rm s +&rprr_esz rd pg rn rm esz +&rrrr_esz rd ra rn rm esz +&rrxr_esz rd rn rm ra index esz +&rprrr_esz rd pg rn rm ra esz +&rpri_esz rd pg rn imm esz +&ptrue rd esz pat s +&incdec_cnt rd pat esz imm d u +&incdec2_cnt rd rn pat esz imm d u +&incdec_pred rd pg esz d u +&incdec2_pred rd rn pg esz d u +&rprr_load rd pg rn rm dtype nreg +&rpri_load rd pg rn imm dtype nreg +&rprr_store rd pg rn rm msz esz nreg +&rpri_store rd pg rn imm msz esz nreg +&rprr_gather_load rd pg rn rm esz msz u ff xs scale +&rpri_gather_load rd pg rn imm esz msz u ff +&rprr_scatter_store rd pg rn rm esz msz xs scale +&rpri_scatter_store rd pg rn imm esz msz + +########################################################################### +# Named instruction formats. These are generally used to +# reduce the amount of duplication between instruction patterns. + +# Two operand with unused vector element size +@pd_pn_e0 ........ ........ ....... rn:4 . rd:4 &rr_esz esz=0 + +# Two operand +@pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz +@rd_rn ........ esz:2 ...... ...... rn:5 rd:5 &rr_esz + +# Two operand with governing predicate, flags setting +@pd_pg_pn_s ........ . s:1 ...... .. pg:4 . rn:4 . rd:4 &rpr_s +@pd_pg_pn_s0 ........ . . ...... .. pg:4 . rn:4 . rd:4 &rpr_s s=0 + +# Three operand with unused vector element size +@rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0 + +# Three predicate operand, with governing predicate, flag setting +@pd_pg_pn_pm_s ........ . s:1 .. rm:4 .. pg:4 . rn:4 . rd:4 &rprr_s + +# Three operand, vector element size +@rd_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 &rrr_esz +@pd_pn_pm ........ esz:2 .. rm:4 ....... rn:4 . rd:4 &rrr_esz +@rdn_rm ........ esz:2 ...... ...... rm:5 rd:5 \ + &rrr_esz rn=%reg_movprfx +@rdn_rm_e0 ........ .. ...... ...... rm:5 rd:5 \ + &rrr_esz rn=%reg_movprfx esz=0 +@rdn_sh_i8u ........ esz:2 ...... ...... ..... rd:5 \ + &rri_esz rn=%reg_movprfx imm=%sh8_i8u +@rdn_i8u ........ esz:2 ...... ... imm:8 rd:5 \ + &rri_esz rn=%reg_movprfx +@rdn_i8s ........ esz:2 ...... ... imm:s8 rd:5 \ + &rri_esz rn=%reg_movprfx + +# Four operand, vector element size +@rda_rn_rm ........ esz:2 . rm:5 ... ... rn:5 rd:5 \ + &rrrr_esz ra=%reg_movprfx + +# Four operand with unused vector element size +@rda_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 \ + &rrrr_esz esz=0 ra=%reg_movprfx +@rdn_ra_rm_e0 ........ ... rm:5 ... ... ra:5 rd:5 \ + &rrrr_esz esz=0 rn=%reg_movprfx + +# Three operand with "memory" size, aka immediate left shift +@rd_rn_msz_rm ........ ... rm:5 .... imm:2 rn:5 rd:5 &rrri + +# Two register operand, with governing predicate, vector element size +@rdn_pg_rm ........ esz:2 ... ... ... pg:3 rm:5 rd:5 \ + &rprr_esz rn=%reg_movprfx +@rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \ + &rprr_esz rm=%reg_movprfx +@rd_pg4_rn_rm ........ esz:2 . rm:5 .. pg:4 rn:5 rd:5 &rprr_esz +@pd_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 . rd:4 &rprr_esz + +# Three register operand, with governing predicate, vector element size +@rda_pg_rn_rm ........ esz:2 . rm:5 ... pg:3 rn:5 rd:5 \ + &rprrr_esz ra=%reg_movprfx +@rdn_pg_ra_rm ........ esz:2 . rm:5 ... pg:3 ra:5 rd:5 \ + &rprrr_esz rn=%reg_movprfx +@rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \ + &rprrr_esz rn=%reg_movprfx +@rd_pg_rn_rm ........ esz:2 . rm:5 ... 
pg:3 rn:5 rd:5 &rprr_esz + +# One register operand, with governing predicate, vector element size +@rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz +@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz +@pd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 . rd:4 &rpr_esz + +# One register operand, with governing predicate, no vector element size +@rd_pg_rn_e0 ........ .. ... ... ... pg:3 rn:5 rd:5 &rpr_esz esz=0 + +# Two register operands with a 6-bit signed immediate. +@rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri + +# Two register operand, one immediate operand, with predicate, +# element size encoded as TSZHL. +@rdn_pg_tszimm_shl ........ .. ... ... ... pg:3 ..... rd:5 \ + &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shl +@rdn_pg_tszimm_shr ........ .. ... ... ... pg:3 ..... rd:5 \ + &rpri_esz rn=%reg_movprfx esz=%tszimm_esz imm=%tszimm_shr + +# Similarly without predicate. +@rd_rn_tszimm_shl ........ .. ... ... ...... rn:5 rd:5 \ + &rri_esz esz=%tszimm16_esz imm=%tszimm16_shl +@rd_rn_tszimm_shr ........ .. ... ... ...... rn:5 rd:5 \ + &rri_esz esz=%tszimm16_esz imm=%tszimm16_shr + +# Two register operand, one immediate operand, with 4-bit predicate. +# User must fill in imm. +@rdn_pg4 ........ esz:2 .. pg:4 ... ........ rd:5 \ + &rpri_esz rn=%reg_movprfx + +# Two register operand, one one-bit floating-point operand. +@rdn_i1 ........ esz:2 ......... pg:3 .... imm:1 rd:5 \ + &rpri_esz rn=%reg_movprfx + +# Two register operand, one encoded bitmask. +@rdn_dbm ........ .. .... dbm:13 rd:5 \ + &rr_dbm rn=%reg_movprfx + +# Predicate output, vector and immediate input, +# controlling predicate, element size. +@pd_pg_rn_i7 ........ esz:2 . imm:7 . pg:3 rn:5 . rd:4 &rpri_esz +@pd_pg_rn_i5 ........ esz:2 . imm:s5 ... pg:3 rn:5 . rd:4 &rpri_esz + +# Basic Load/Store with 9-bit immediate offset +@pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \ + &rri imm=%imm9_16_10 +@rd_rn_i9 ........ ........ ...... rn:5 rd:5 \ + &rri imm=%imm9_16_10 + +# One register, pattern, and uint4+1. +# User must fill in U and D. +@incdec_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ + &incdec_cnt imm=%imm4_16_p1 +@incdec2_cnt ........ esz:2 .. .... ...... pat:5 rd:5 \ + &incdec2_cnt imm=%imm4_16_p1 rn=%reg_movprfx + +# One register, predicate. +# User must fill in U and D. +@incdec_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 &incdec_pred +@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \ + &incdec2_pred rn=%reg_movprfx + +# Loads; user must fill in NREG. +@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load +@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load + +@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_load dtype=%msz_dtype +@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \ + &rpri_load dtype=%msz_dtype + +# Gather Loads. +@rprr_g_load_u ....... .. . . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rprr_g_load_xs_u ....... .. xs:1 . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_xs_u_sc ....... .. xs:1 scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_xs_sc ....... .. xs:1 scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_u_sc ....... .. . scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rprr_g_load_sc ....... .. . scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rpri_g_load ....... msz:2 .. imm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rpri_gather_load + +# Stores; user must fill in ESZ, MSZ, NREG as needed. 
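Comments such as "user must fill in ESZ, MSZ, NREG" reflect how decodetree formats work: a @format line pre-sets some members of its &argument-set, the individual pattern lines supply the rest, and the generated decoder then fills one struct per argument set and calls a hand-written trans_* function. The C fragment here is a hypothetical sketch of that interface for the &rpri_store set declared earlier; the struct layout and the trans_ST_example name are invented for illustration and are not taken from the generated decoder:

    #include <stdbool.h>

    typedef struct DisasContext DisasContext;  /* stand-in for QEMU's real type */

    /* Hypothetical rendering of the generated argument struct for
     * &rpri_store (rd pg rn imm msz esz nreg). */
    typedef struct {
        int rd, pg, rn, imm, msz, esz, nreg;
    } arg_rpri_store;

    /* A pattern built on one of the store formats just below decodes
     * rd/pg/rn/imm (and usually msz) from the instruction and supplies
     * esz/nreg itself; the translator sees the combined result. */
    static bool trans_ST_example(DisasContext *s, arg_rpri_store *a)
    {
        if (a->esz < a->msz) {
            return false;  /* element size must cover the memory access size */
        }
        /* ... emit TCG ops for the predicated store here ... */
        return true;
    }
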
+@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store +@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store +@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_store nreg=0 +@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_scatter_store +@rpri_scatter_store ....... msz:2 .. imm:5 ... pg:3 rn:5 rd:5 \ + &rpri_scatter_store + +# Two registers and a scalar by N-bit index +@rrx_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \ + &rrx_esz index=%index3_22_19 +@rrx_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 &rrx_esz +@rrx_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 &rrx_esz + +# Two registers and a scalar by N-bit index, alternate +@rrx_3a ........ .. . .. rm:3 ...... rn:5 rd:5 \ + &rrx_esz index=%index3_19_11 +@rrx_2a ........ .. . . rm:4 ...... rn:5 rd:5 \ + &rrx_esz index=%index2_20_11 + +# Three registers and a scalar by N-bit index +@rrxr_3 ........ .. . .. rm:3 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx index=%index3_22_19 +@rrxr_2 ........ .. . index:2 rm:3 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx +@rrxr_1 ........ .. . index:1 rm:4 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx + +# Three registers and a scalar by N-bit index, alternate +@rrxr_3a ........ .. ... rm:3 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx index=%index3_19_11 +@rrxr_2a ........ .. .. rm:4 ...... rn:5 rd:5 \ + &rrxr_esz ra=%reg_movprfx index=%index2_20_11 + +########################################################################### +# Instruction patterns. Grouped according to the SVE encodingindex.xhtml. + +### SVE Integer Arithmetic - Binary Predicated Group + +# SVE bitwise logical vector operations (predicated) +ORR_zpzz 00000100 .. 011 000 000 ... ..... ..... @rdn_pg_rm +EOR_zpzz 00000100 .. 011 001 000 ... ..... ..... @rdn_pg_rm +AND_zpzz 00000100 .. 011 010 000 ... ..... ..... @rdn_pg_rm +BIC_zpzz 00000100 .. 011 011 000 ... ..... ..... @rdn_pg_rm + +# SVE integer add/subtract vectors (predicated) +ADD_zpzz 00000100 .. 000 000 000 ... ..... ..... @rdn_pg_rm +SUB_zpzz 00000100 .. 000 001 000 ... ..... ..... @rdn_pg_rm +SUB_zpzz 00000100 .. 000 011 000 ... ..... ..... @rdm_pg_rn # SUBR + +# SVE integer min/max/difference (predicated) +SMAX_zpzz 00000100 .. 001 000 000 ... ..... ..... @rdn_pg_rm +UMAX_zpzz 00000100 .. 001 001 000 ... ..... ..... @rdn_pg_rm +SMIN_zpzz 00000100 .. 001 010 000 ... ..... ..... @rdn_pg_rm +UMIN_zpzz 00000100 .. 001 011 000 ... ..... ..... @rdn_pg_rm +SABD_zpzz 00000100 .. 001 100 000 ... ..... ..... @rdn_pg_rm +UABD_zpzz 00000100 .. 001 101 000 ... ..... ..... @rdn_pg_rm + +# SVE integer multiply/divide (predicated) +MUL_zpzz 00000100 .. 010 000 000 ... ..... ..... @rdn_pg_rm +SMULH_zpzz 00000100 .. 010 010 000 ... ..... ..... @rdn_pg_rm +UMULH_zpzz 00000100 .. 010 011 000 ... ..... ..... @rdn_pg_rm +# Note that divide requires size >= 2; below 2 is unallocated. +SDIV_zpzz 00000100 .. 010 100 000 ... ..... ..... @rdn_pg_rm +UDIV_zpzz 00000100 .. 010 101 000 ... ..... ..... @rdn_pg_rm +SDIV_zpzz 00000100 .. 010 110 000 ... ..... ..... @rdm_pg_rn # SDIVR +UDIV_zpzz 00000100 .. 010 111 000 ... ..... ..... @rdm_pg_rn # UDIVR + +### SVE Integer Reduction Group + +# SVE bitwise logical reduction (predicated) +ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn +EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn +ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn + +# SVE constructive prefix (predicated) +MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn +MOVPRFX_m 00000100 .. 010 001 001 ... 
..... ..... @rd_pg_rn + +# SVE integer add reduction (predicated) +# Note that saddv requires size != 3. +UADDV 00000100 .. 000 001 001 ... ..... ..... @rd_pg_rn +SADDV 00000100 .. 000 000 001 ... ..... ..... @rd_pg_rn + +# SVE integer min/max reduction (predicated) +SMAXV 00000100 .. 001 000 001 ... ..... ..... @rd_pg_rn +UMAXV 00000100 .. 001 001 001 ... ..... ..... @rd_pg_rn +SMINV 00000100 .. 001 010 001 ... ..... ..... @rd_pg_rn +UMINV 00000100 .. 001 011 001 ... ..... ..... @rd_pg_rn + +### SVE Shift by Immediate - Predicated Group + +# SVE bitwise shift by immediate (predicated) +ASR_zpzi 00000100 .. 000 000 100 ... .. ... ..... @rdn_pg_tszimm_shr +LSR_zpzi 00000100 .. 000 001 100 ... .. ... ..... @rdn_pg_tszimm_shr +LSL_zpzi 00000100 .. 000 011 100 ... .. ... ..... @rdn_pg_tszimm_shl +ASRD 00000100 .. 000 100 100 ... .. ... ..... @rdn_pg_tszimm_shr +SQSHL_zpzi 00000100 .. 000 110 100 ... .. ... ..... @rdn_pg_tszimm_shl +UQSHL_zpzi 00000100 .. 000 111 100 ... .. ... ..... @rdn_pg_tszimm_shl +SRSHR 00000100 .. 001 100 100 ... .. ... ..... @rdn_pg_tszimm_shr +URSHR 00000100 .. 001 101 100 ... .. ... ..... @rdn_pg_tszimm_shr +SQSHLU 00000100 .. 001 111 100 ... .. ... ..... @rdn_pg_tszimm_shl + +# SVE bitwise shift by vector (predicated) +ASR_zpzz 00000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm +LSR_zpzz 00000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm +LSL_zpzz 00000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm +ASR_zpzz 00000100 .. 010 100 100 ... ..... ..... @rdm_pg_rn # ASRR +LSR_zpzz 00000100 .. 010 101 100 ... ..... ..... @rdm_pg_rn # LSRR +LSL_zpzz 00000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # LSLR + +# SVE bitwise shift by wide elements (predicated) +# Note these require size != 3. +ASR_zpzw 00000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm +LSR_zpzw 00000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm +LSL_zpzw 00000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm + +### SVE Integer Arithmetic - Unary Predicated Group + +# SVE unary bit operations (predicated) +# Note esz != 0 for FABS and FNEG. +CLS 00000100 .. 011 000 101 ... ..... ..... @rd_pg_rn +CLZ 00000100 .. 011 001 101 ... ..... ..... @rd_pg_rn +CNT_zpz 00000100 .. 011 010 101 ... ..... ..... @rd_pg_rn +CNOT 00000100 .. 011 011 101 ... ..... ..... @rd_pg_rn +NOT_zpz 00000100 .. 011 110 101 ... ..... ..... @rd_pg_rn +FABS 00000100 .. 011 100 101 ... ..... ..... @rd_pg_rn +FNEG 00000100 .. 011 101 101 ... ..... ..... @rd_pg_rn + +# SVE integer unary operations (predicated) +# Note esz > original size for extensions. +ABS 00000100 .. 010 110 101 ... ..... ..... @rd_pg_rn +NEG 00000100 .. 010 111 101 ... ..... ..... @rd_pg_rn +SXTB 00000100 .. 010 000 101 ... ..... ..... @rd_pg_rn +UXTB 00000100 .. 010 001 101 ... ..... ..... @rd_pg_rn +SXTH 00000100 .. 010 010 101 ... ..... ..... @rd_pg_rn +UXTH 00000100 .. 010 011 101 ... ..... ..... @rd_pg_rn +SXTW 00000100 .. 010 100 101 ... ..... ..... @rd_pg_rn +UXTW 00000100 .. 010 101 101 ... ..... ..... @rd_pg_rn + +### SVE Floating Point Compare - Vectors Group + +# SVE floating-point compare vectors +FCMGE_ppzz 01100101 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +FCMGT_ppzz 01100101 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +FCMEQ_ppzz 01100101 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +FCMNE_ppzz 01100101 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +FCMUO_ppzz 01100101 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +FACGE_ppzz 01100101 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +FACGT_ppzz 01100101 .. 0 ..... 111 ... ..... 1 .... 
@pd_pg_rn_rm + +### SVE Integer Multiply-Add Group + +# SVE integer multiply-add writing addend (predicated) +MLA 00000100 .. 0 ..... 010 ... ..... ..... @rda_pg_rn_rm +MLS 00000100 .. 0 ..... 011 ... ..... ..... @rda_pg_rn_rm + +# SVE integer multiply-add writing multiplicand (predicated) +MLA 00000100 .. 0 ..... 110 ... ..... ..... @rdn_pg_ra_rm # MAD +MLS 00000100 .. 0 ..... 111 ... ..... ..... @rdn_pg_ra_rm # MSB + +### SVE Integer Arithmetic - Unpredicated Group + +# SVE integer add/subtract vectors (unpredicated) +ADD_zzz 00000100 .. 1 ..... 000 000 ..... ..... @rd_rn_rm +SUB_zzz 00000100 .. 1 ..... 000 001 ..... ..... @rd_rn_rm +SQADD_zzz 00000100 .. 1 ..... 000 100 ..... ..... @rd_rn_rm +UQADD_zzz 00000100 .. 1 ..... 000 101 ..... ..... @rd_rn_rm +SQSUB_zzz 00000100 .. 1 ..... 000 110 ..... ..... @rd_rn_rm +UQSUB_zzz 00000100 .. 1 ..... 000 111 ..... ..... @rd_rn_rm + +### SVE Logical - Unpredicated Group + +# SVE bitwise logical operations (unpredicated) +AND_zzz 00000100 00 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 +ORR_zzz 00000100 01 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 +EOR_zzz 00000100 10 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 +BIC_zzz 00000100 11 1 ..... 001 100 ..... ..... @rd_rn_rm_e0 + +XAR 00000100 .. 1 ..... 001 101 rm:5 rd:5 &rrri_esz \ + rn=%reg_movprfx esz=%tszimm16_esz imm=%tszimm16_shr + +# SVE2 bitwise ternary operations +EOR3 00000100 00 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 +BSL 00000100 00 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +BCAX 00000100 01 1 ..... 001 110 ..... ..... @rdn_ra_rm_e0 +BSL1N 00000100 01 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +BSL2N 00000100 10 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 +NBSL 00000100 11 1 ..... 001 111 ..... ..... @rdn_ra_rm_e0 + +### SVE Index Generation Group + +# SVE index generation (immediate start, immediate increment) +INDEX_ii 00000100 esz:2 1 imm2:s5 010000 imm1:s5 rd:5 + +# SVE index generation (immediate start, register increment) +INDEX_ir 00000100 esz:2 1 rm:5 010010 imm:s5 rd:5 + +# SVE index generation (register start, immediate increment) +INDEX_ri 00000100 esz:2 1 imm:s5 010001 rn:5 rd:5 + +# SVE index generation (register start, register increment) +INDEX_rr 00000100 .. 1 ..... 010011 ..... ..... @rd_rn_rm + +### SVE / Streaming SVE Stack Allocation Group + +# SVE stack frame adjustment +ADDVL 00000100 001 ..... 01010 ...... ..... @rd_rn_i6 +ADDSVL 00000100 001 ..... 01011 ...... ..... @rd_rn_i6 +ADDPL 00000100 011 ..... 01010 ...... ..... @rd_rn_i6 +ADDSPL 00000100 011 ..... 01011 ...... ..... @rd_rn_i6 + +# SVE stack frame size +RDVL 00000100 101 11111 01010 imm:s6 rd:5 +RDSVL 00000100 101 11111 01011 imm:s6 rd:5 + +### SVE Bitwise Shift - Unpredicated Group + +# SVE bitwise shift by immediate (unpredicated) +ASR_zzi 00000100 .. 1 ..... 1001 00 ..... ..... @rd_rn_tszimm_shr +LSR_zzi 00000100 .. 1 ..... 1001 01 ..... ..... @rd_rn_tszimm_shr +LSL_zzi 00000100 .. 1 ..... 1001 11 ..... ..... @rd_rn_tszimm_shl + +# SVE bitwise shift by wide elements (unpredicated) +# Note esz != 3 +ASR_zzw 00000100 .. 1 ..... 1000 00 ..... ..... @rd_rn_rm +LSR_zzw 00000100 .. 1 ..... 1000 01 ..... ..... @rd_rn_rm +LSL_zzw 00000100 .. 1 ..... 1000 11 ..... ..... @rd_rn_rm + +### SVE Compute Vector Address Group + +# SVE vector address generation +ADR_s32 00000100 00 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm +ADR_u32 00000100 01 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm +ADR_p32 00000100 10 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm +ADR_p64 00000100 11 1 ..... 1010 .. ..... ..... 
@rd_rn_msz_rm + +### SVE Integer Misc - Unpredicated Group + +# SVE constructive prefix (unpredicated) +MOVPRFX 00000100 00 1 00000 101111 rn:5 rd:5 + +# SVE floating-point exponential accelerator +# Note esz != 0 +FEXPA 00000100 .. 1 00000 101110 ..... ..... @rd_rn + +# SVE floating-point trig select coefficient +# Note esz != 0 +FTSSEL 00000100 .. 1 ..... 101100 ..... ..... @rd_rn_rm + +### SVE Element Count Group + +# SVE element count +CNT_r 00000100 .. 10 .... 1110 0 0 ..... ..... @incdec_cnt d=0 u=1 + +# SVE inc/dec register by element count +INCDEC_r 00000100 .. 11 .... 1110 0 d:1 ..... ..... @incdec_cnt u=1 + +# SVE saturating inc/dec register by element count +SINCDEC_r_32 00000100 .. 10 .... 1111 d:1 u:1 ..... ..... @incdec_cnt +SINCDEC_r_64 00000100 .. 11 .... 1111 d:1 u:1 ..... ..... @incdec_cnt + +# SVE inc/dec vector by element count +# Note this requires esz != 0. +INCDEC_v 00000100 .. 1 1 .... 1100 0 d:1 ..... ..... @incdec2_cnt u=1 + +# SVE saturating inc/dec vector by element count +# Note these require esz != 0. +SINCDEC_v 00000100 .. 1 0 .... 1100 d:1 u:1 ..... ..... @incdec2_cnt + +### SVE Bitwise Immediate Group + +# SVE bitwise logical with immediate (unpredicated) +ORR_zzi 00000101 00 0000 ............. ..... @rdn_dbm +EOR_zzi 00000101 01 0000 ............. ..... @rdn_dbm +AND_zzi 00000101 10 0000 ............. ..... @rdn_dbm + +# SVE broadcast bitmask immediate +DUPM 00000101 11 0000 dbm:13 rd:5 + +### SVE Integer Wide Immediate - Predicated Group + +# SVE copy floating-point immediate (predicated) +FCPY 00000101 .. 01 .... 110 imm:8 ..... @rdn_pg4 + +# SVE copy integer immediate (predicated) +{ + INVALID 00000101 00 01 ---- 01 1 -------- ----- + CPY_m_i 00000101 .. 01 .... 01 . ........ ..... @rdn_pg4 imm=%sh8_i8s +} +{ + INVALID 00000101 00 01 ---- 00 1 -------- ----- + CPY_z_i 00000101 .. 01 .... 00 . ........ ..... @rdn_pg4 imm=%sh8_i8s +} + +### SVE Permute - Extract Group + +# SVE extract vector (destructive) +EXT 00000101 001 ..... 000 ... rm:5 rd:5 \ + &rrri rn=%reg_movprfx imm=%imm8_16_10 + +# SVE2 extract vector (constructive) +EXT_sve2 00000101 011 ..... 000 ... rn:5 rd:5 \ + &rri imm=%imm8_16_10 + +### SVE Permute - Unpredicated Group + +# SVE broadcast general register +DUP_s 00000101 .. 1 00000 001110 ..... ..... @rd_rn + +# SVE broadcast indexed element +DUP_x 00000101 .. 1 ..... 001000 rn:5 rd:5 \ + &rri imm=%imm7_22_16 + +# SVE insert SIMD&FP scalar register +INSR_f 00000101 .. 1 10100 001110 ..... ..... @rdn_rm + +# SVE insert general register +INSR_r 00000101 .. 1 00100 001110 ..... ..... @rdn_rm + +# SVE reverse vector elements +REV_v 00000101 .. 1 11000 001110 ..... ..... @rd_rn + +# SVE vector table lookup +TBL 00000101 .. 1 ..... 001100 ..... ..... @rd_rn_rm + +# SVE unpack vector elements +UNPK 00000101 esz:2 1100 u:1 h:1 001110 rn:5 rd:5 + +# SVE2 Table Lookup (three sources) + +TBL_sve2 00000101 .. 1 ..... 001010 ..... ..... @rd_rn_rm +TBX 00000101 .. 1 ..... 001011 ..... ..... @rd_rn_rm + +### SVE Permute - Predicates Group + +# SVE permute predicate elements +ZIP1_p 00000101 .. 10 .... 010 000 0 .... 0 .... @pd_pn_pm +ZIP2_p 00000101 .. 10 .... 010 001 0 .... 0 .... @pd_pn_pm +UZP1_p 00000101 .. 10 .... 010 010 0 .... 0 .... @pd_pn_pm +UZP2_p 00000101 .. 10 .... 010 011 0 .... 0 .... @pd_pn_pm +TRN1_p 00000101 .. 10 .... 010 100 0 .... 0 .... @pd_pn_pm +TRN2_p 00000101 .. 10 .... 010 101 0 .... 0 .... @pd_pn_pm + +# SVE reverse predicate elements +REV_p 00000101 .. 11 0100 010 000 0 .... 0 .... 
@pd_pn + +# SVE unpack predicate elements +PUNPKLO 00000101 00 11 0000 010 000 0 .... 0 .... @pd_pn_e0 +PUNPKHI 00000101 00 11 0001 010 000 0 .... 0 .... @pd_pn_e0 + +### SVE Permute - Interleaving Group + +# SVE permute vector elements +ZIP1_z 00000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm +ZIP2_z 00000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm +UZP1_z 00000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm +UZP2_z 00000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm +TRN1_z 00000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm +TRN2_z 00000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm + +# SVE2 permute vector segments +ZIP1_q 00000101 10 1 ..... 000 000 ..... ..... @rd_rn_rm_e0 +ZIP2_q 00000101 10 1 ..... 000 001 ..... ..... @rd_rn_rm_e0 +UZP1_q 00000101 10 1 ..... 000 010 ..... ..... @rd_rn_rm_e0 +UZP2_q 00000101 10 1 ..... 000 011 ..... ..... @rd_rn_rm_e0 +TRN1_q 00000101 10 1 ..... 000 110 ..... ..... @rd_rn_rm_e0 +TRN2_q 00000101 10 1 ..... 000 111 ..... ..... @rd_rn_rm_e0 + +### SVE Permute - Predicated Group + +# SVE compress active elements +# Note esz >= 2 +COMPACT 00000101 .. 100001 100 ... ..... ..... @rd_pg_rn + +# SVE conditionally broadcast element to vector +CLASTA_z 00000101 .. 10100 0 100 ... ..... ..... @rdn_pg_rm +CLASTB_z 00000101 .. 10100 1 100 ... ..... ..... @rdn_pg_rm + +# SVE conditionally copy element to SIMD&FP scalar +CLASTA_v 00000101 .. 10101 0 100 ... ..... ..... @rd_pg_rn +CLASTB_v 00000101 .. 10101 1 100 ... ..... ..... @rd_pg_rn + +# SVE conditionally copy element to general register +CLASTA_r 00000101 .. 11000 0 101 ... ..... ..... @rd_pg_rn +CLASTB_r 00000101 .. 11000 1 101 ... ..... ..... @rd_pg_rn + +# SVE copy element to SIMD&FP scalar register +LASTA_v 00000101 .. 10001 0 100 ... ..... ..... @rd_pg_rn +LASTB_v 00000101 .. 10001 1 100 ... ..... ..... @rd_pg_rn + +# SVE copy element to general register +LASTA_r 00000101 .. 10000 0 101 ... ..... ..... @rd_pg_rn +LASTB_r 00000101 .. 10000 1 101 ... ..... ..... @rd_pg_rn + +# SVE copy element from SIMD&FP scalar register +CPY_m_v 00000101 .. 100000 100 ... ..... ..... @rd_pg_rn + +# SVE copy element from general register to vector (predicated) +CPY_m_r 00000101 .. 101000 101 ... ..... ..... @rd_pg_rn + +# SVE reverse within elements +# Note esz >= operation size +REVB 00000101 .. 1001 00 100 ... ..... ..... @rd_pg_rn +REVH 00000101 .. 1001 01 100 ... ..... ..... @rd_pg_rn +REVW 00000101 .. 1001 10 100 ... ..... ..... @rd_pg_rn +RBIT 00000101 .. 1001 11 100 ... ..... ..... @rd_pg_rn +REVD 00000101 00 1011 10 100 ... ..... ..... @rd_pg_rn_e0 + +# SVE vector splice (predicated, destructive) +SPLICE 00000101 .. 101 100 100 ... ..... ..... @rdn_pg_rm + +# SVE2 vector splice (predicated, constructive) +SPLICE_sve2 00000101 .. 101 101 100 ... ..... ..... @rd_pg_rn + +### SVE Select Vectors Group + +# SVE select vector elements (predicated) +SEL_zpzz 00000101 .. 1 ..... 11 .... ..... ..... @rd_pg4_rn_rm + +### SVE Integer Compare - Vectors Group + +# SVE integer compare_vectors +CMPHS_ppzz 00100100 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzz 00100100 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzz 00100100 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzz 00100100 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_rm +CMPEQ_ppzz 00100100 .. 0 ..... 101 ... ..... 0 .... @pd_pg_rn_rm +CMPNE_ppzz 00100100 .. 0 ..... 101 ... ..... 1 .... @pd_pg_rn_rm + +# SVE integer compare with wide elements +# Note these require esz != 3. +CMPEQ_ppzw 00100100 .. 0 ..... 001 ... ..... 0 .... 
@pd_pg_rn_rm +CMPNE_ppzw 00100100 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_rm +CMPGE_ppzw 00100100 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +CMPGT_ppzw 00100100 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +CMPLT_ppzw 00100100 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +CMPLE_ppzw 00100100 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +CMPHS_ppzw 00100100 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +CMPHI_ppzw 00100100 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +CMPLO_ppzw 00100100 .. 0 ..... 111 ... ..... 0 .... @pd_pg_rn_rm +CMPLS_ppzw 00100100 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm + +### SVE Integer Compare - Unsigned Immediate Group + +# SVE integer compare with unsigned immediate +CMPHS_ppzi 00100100 .. 1 ....... 0 ... ..... 0 .... @pd_pg_rn_i7 +CMPHI_ppzi 00100100 .. 1 ....... 0 ... ..... 1 .... @pd_pg_rn_i7 +CMPLO_ppzi 00100100 .. 1 ....... 1 ... ..... 0 .... @pd_pg_rn_i7 +CMPLS_ppzi 00100100 .. 1 ....... 1 ... ..... 1 .... @pd_pg_rn_i7 + +### SVE Integer Compare - Signed Immediate Group + +# SVE integer compare with signed immediate +CMPGE_ppzi 00100101 .. 0 ..... 000 ... ..... 0 .... @pd_pg_rn_i5 +CMPGT_ppzi 00100101 .. 0 ..... 000 ... ..... 1 .... @pd_pg_rn_i5 +CMPLT_ppzi 00100101 .. 0 ..... 001 ... ..... 0 .... @pd_pg_rn_i5 +CMPLE_ppzi 00100101 .. 0 ..... 001 ... ..... 1 .... @pd_pg_rn_i5 +CMPEQ_ppzi 00100101 .. 0 ..... 100 ... ..... 0 .... @pd_pg_rn_i5 +CMPNE_ppzi 00100101 .. 0 ..... 100 ... ..... 1 .... @pd_pg_rn_i5 + +### SVE Predicate Logical Operations Group + +# SVE predicate logical operations +AND_pppp 00100101 0. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s +BIC_pppp 00100101 0. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s +EOR_pppp 00100101 0. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s +SEL_pppp 00100101 0. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s +ORR_pppp 00100101 1. 00 .... 01 .... 0 .... 0 .... @pd_pg_pn_pm_s +ORN_pppp 00100101 1. 00 .... 01 .... 0 .... 1 .... @pd_pg_pn_pm_s +NOR_pppp 00100101 1. 00 .... 01 .... 1 .... 0 .... @pd_pg_pn_pm_s +NAND_pppp 00100101 1. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s + +### SVE Predicate Misc Group + +# SVE predicate test +PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000 + +# SVE predicate initialize +PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4 + +# SVE initialize FFR +SETFFR 00100101 0010 1100 1001 0000 0000 0000 + +# SVE zero predicate register +PFALSE 00100101 0001 1000 1110 0100 0000 rd:4 + +# SVE predicate read from FFR (predicated) +RDFFR_p 00100101 0 s:1 0110001111000 pg:4 0 rd:4 + +# SVE predicate read from FFR (unpredicated) +RDFFR 00100101 0001 1001 1111 0000 0000 rd:4 + +# SVE FFR write from predicate (WRFFR) +WRFFR 00100101 0010 1000 1001 000 rn:4 00000 + +# SVE predicate first active +PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0 + +# SVE predicate next active +PNEXT 00100101 .. 011 001 11000 10 .... 0 .... @pd_pn + +### SVE Partition Break Group + +# SVE propagate break from previous partition +BRKPA 00100101 0. 00 .... 11 .... 0 .... 0 .... @pd_pg_pn_pm_s +BRKPB 00100101 0. 00 .... 11 .... 0 .... 1 .... @pd_pg_pn_pm_s + +# SVE partition break condition +BRKA_z 00100101 0. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKB_z 00100101 1. 01000001 .... 0 .... 0 .... @pd_pg_pn_s +BRKA_m 00100101 00 01000001 .... 0 .... 1 .... @pd_pg_pn_s0 +BRKB_m 00100101 10 01000001 .... 0 .... 1 .... @pd_pg_pn_s0 + +# SVE propagate break to next partition +BRKN 00100101 0. 01100001 .... 0 .... 0 .... @pd_pg_pn_s + +### SVE Predicate Count Group + +# SVE predicate count +CNTP 00100101 .. 
100 000 10 .... 0 .... ..... @rd_pg4_pn + +# SVE inc/dec register by predicate count +INCDECP_r 00100101 .. 10110 d:1 10001 00 .... ..... @incdec_pred u=1 + +# SVE inc/dec vector by predicate count +INCDECP_z 00100101 .. 10110 d:1 10000 00 .... ..... @incdec2_pred u=1 + +# SVE saturating inc/dec register by predicate count +SINCDECP_r_32 00100101 .. 1010 d:1 u:1 10001 00 .... ..... @incdec_pred +SINCDECP_r_64 00100101 .. 1010 d:1 u:1 10001 10 .... ..... @incdec_pred + +# SVE saturating inc/dec vector by predicate count +SINCDECP_z 00100101 .. 1010 d:1 u:1 10000 00 .... ..... @incdec2_pred + +### SVE Integer Compare - Scalars Group + +# SVE conditionally terminate scalars +CTERM 00100101 1 sf:1 1 rm:5 001000 rn:5 ne:1 0000 + +# SVE integer compare scalar count and limit +WHILE 00100101 esz:2 1 rm:5 000 sf:1 u:1 lt:1 rn:5 eq:1 rd:4 + +# SVE2 pointer conflict compare +WHILE_ptr 00100101 esz:2 1 rm:5 001 100 rn:5 rw:1 rd:4 + +### SVE Integer Wide Immediate - Unpredicated Group + +# SVE broadcast floating-point immediate (unpredicated) +FDUP 00100101 esz:2 111 00 1110 imm:8 rd:5 + +# SVE broadcast integer immediate (unpredicated) +{ + INVALID 00100101 00 111 00 011 1 -------- ----- + DUP_i 00100101 esz:2 111 00 011 . ........ rd:5 imm=%sh8_i8s +} + +# SVE integer add/subtract immediate (unpredicated) +{ + INVALID 00100101 00 100 000 11 1 -------- ----- + ADD_zzi 00100101 .. 100 000 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 001 11 1 -------- ----- + SUB_zzi 00100101 .. 100 001 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 011 11 1 -------- ----- + SUBR_zzi 00100101 .. 100 011 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 100 11 1 -------- ----- + SQADD_zzi 00100101 .. 100 100 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 101 11 1 -------- ----- + UQADD_zzi 00100101 .. 100 101 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 110 11 1 -------- ----- + SQSUB_zzi 00100101 .. 100 110 11 . ........ ..... @rdn_sh_i8u +} +{ + INVALID 00100101 00 100 111 11 1 -------- ----- + UQSUB_zzi 00100101 .. 100 111 11 . ........ ..... @rdn_sh_i8u +} + +# SVE integer min/max immediate (unpredicated) +SMAX_zzi 00100101 .. 101 000 110 ........ ..... @rdn_i8s +UMAX_zzi 00100101 .. 101 001 110 ........ ..... @rdn_i8u +SMIN_zzi 00100101 .. 101 010 110 ........ ..... @rdn_i8s +UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u + +# SVE integer multiply immediate (unpredicated) +MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s + +# SVE integer dot product (unpredicated) +DOT_zzzz 01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 complex dot product (vectors) +CDOT_zzzz 01000100 esz:2 0 rm:5 0001 rot:2 rn:5 rd:5 ra=%reg_movprfx + +#### SVE Multiply - Indexed + +# SVE integer dot product (indexed) +SDOT_zzxw_s 01000100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 +SDOT_zzxw_d 01000100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 +UDOT_zzxw_s 01000100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 +UDOT_zzxw_d 01000100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 + +# SVE2 integer multiply-add (indexed) +MLA_zzxz_h 01000100 0. 1 ..... 000010 ..... ..... @rrxr_3 esz=1 +MLA_zzxz_s 01000100 10 1 ..... 000010 ..... ..... @rrxr_2 esz=2 +MLA_zzxz_d 01000100 11 1 ..... 000010 ..... ..... @rrxr_1 esz=3 +MLS_zzxz_h 01000100 0. 1 ..... 000011 ..... ..... @rrxr_3 esz=1 +MLS_zzxz_s 01000100 10 1 ..... 000011 ..... ..... @rrxr_2 esz=2 +MLS_zzxz_d 01000100 11 1 ..... 000011 ..... ..... 
@rrxr_1 esz=3 + +# SVE2 saturating multiply-add high (indexed) +SQRDMLAH_zzxz_h 01000100 0. 1 ..... 000100 ..... ..... @rrxr_3 esz=1 +SQRDMLAH_zzxz_s 01000100 10 1 ..... 000100 ..... ..... @rrxr_2 esz=2 +SQRDMLAH_zzxz_d 01000100 11 1 ..... 000100 ..... ..... @rrxr_1 esz=3 +SQRDMLSH_zzxz_h 01000100 0. 1 ..... 000101 ..... ..... @rrxr_3 esz=1 +SQRDMLSH_zzxz_s 01000100 10 1 ..... 000101 ..... ..... @rrxr_2 esz=2 +SQRDMLSH_zzxz_d 01000100 11 1 ..... 000101 ..... ..... @rrxr_1 esz=3 + +# SVE mixed sign dot product (indexed) +USDOT_zzxw_s 01000100 10 1 ..... 000110 ..... ..... @rrxr_2 esz=2 +SUDOT_zzxw_s 01000100 10 1 ..... 000111 ..... ..... @rrxr_2 esz=2 + +# SVE2 saturating multiply-add (indexed) +SQDMLALB_zzxw_s 01000100 10 1 ..... 0010.0 ..... ..... @rrxr_3a esz=2 +SQDMLALB_zzxw_d 01000100 11 1 ..... 0010.0 ..... ..... @rrxr_2a esz=3 +SQDMLALT_zzxw_s 01000100 10 1 ..... 0010.1 ..... ..... @rrxr_3a esz=2 +SQDMLALT_zzxw_d 01000100 11 1 ..... 0010.1 ..... ..... @rrxr_2a esz=3 +SQDMLSLB_zzxw_s 01000100 10 1 ..... 0011.0 ..... ..... @rrxr_3a esz=2 +SQDMLSLB_zzxw_d 01000100 11 1 ..... 0011.0 ..... ..... @rrxr_2a esz=3 +SQDMLSLT_zzxw_s 01000100 10 1 ..... 0011.1 ..... ..... @rrxr_3a esz=2 +SQDMLSLT_zzxw_d 01000100 11 1 ..... 0011.1 ..... ..... @rrxr_2a esz=3 + +# SVE2 complex integer dot product (indexed) +CDOT_zzxw_s 01000100 10 1 index:2 rm:3 0100 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx +CDOT_zzxw_d 01000100 11 1 index:1 rm:4 0100 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 complex integer multiply-add (indexed) +CMLA_zzxz_h 01000100 10 1 index:2 rm:3 0110 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx +CMLA_zzxz_s 01000100 11 1 index:1 rm:4 0110 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 complex saturating integer multiply-add (indexed) +SQRDCMLAH_zzxz_h 01000100 10 1 index:2 rm:3 0111 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx +SQRDCMLAH_zzxz_s 01000100 11 1 index:1 rm:4 0111 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE2 multiply-add long (indexed) +SMLALB_zzxw_s 01000100 10 1 ..... 1000.0 ..... ..... @rrxr_3a esz=2 +SMLALB_zzxw_d 01000100 11 1 ..... 1000.0 ..... ..... @rrxr_2a esz=3 +SMLALT_zzxw_s 01000100 10 1 ..... 1000.1 ..... ..... @rrxr_3a esz=2 +SMLALT_zzxw_d 01000100 11 1 ..... 1000.1 ..... ..... @rrxr_2a esz=3 +UMLALB_zzxw_s 01000100 10 1 ..... 1001.0 ..... ..... @rrxr_3a esz=2 +UMLALB_zzxw_d 01000100 11 1 ..... 1001.0 ..... ..... @rrxr_2a esz=3 +UMLALT_zzxw_s 01000100 10 1 ..... 1001.1 ..... ..... @rrxr_3a esz=2 +UMLALT_zzxw_d 01000100 11 1 ..... 1001.1 ..... ..... @rrxr_2a esz=3 +SMLSLB_zzxw_s 01000100 10 1 ..... 1010.0 ..... ..... @rrxr_3a esz=2 +SMLSLB_zzxw_d 01000100 11 1 ..... 1010.0 ..... ..... @rrxr_2a esz=3 +SMLSLT_zzxw_s 01000100 10 1 ..... 1010.1 ..... ..... @rrxr_3a esz=2 +SMLSLT_zzxw_d 01000100 11 1 ..... 1010.1 ..... ..... @rrxr_2a esz=3 +UMLSLB_zzxw_s 01000100 10 1 ..... 1011.0 ..... ..... @rrxr_3a esz=2 +UMLSLB_zzxw_d 01000100 11 1 ..... 1011.0 ..... ..... @rrxr_2a esz=3 +UMLSLT_zzxw_s 01000100 10 1 ..... 1011.1 ..... ..... @rrxr_3a esz=2 +UMLSLT_zzxw_d 01000100 11 1 ..... 1011.1 ..... ..... @rrxr_2a esz=3 + +# SVE2 integer multiply long (indexed) +SMULLB_zzx_s 01000100 10 1 ..... 1100.0 ..... ..... @rrx_3a esz=2 +SMULLB_zzx_d 01000100 11 1 ..... 1100.0 ..... ..... @rrx_2a esz=3 +SMULLT_zzx_s 01000100 10 1 ..... 1100.1 ..... ..... @rrx_3a esz=2 +SMULLT_zzx_d 01000100 11 1 ..... 1100.1 ..... ..... @rrx_2a esz=3 +UMULLB_zzx_s 01000100 10 1 ..... 1101.0 ..... ..... @rrx_3a esz=2 +UMULLB_zzx_d 01000100 11 1 ..... 1101.0 ..... ..... @rrx_2a esz=3 +UMULLT_zzx_s 01000100 10 1 ..... 
1101.1 ..... ..... @rrx_3a esz=2 +UMULLT_zzx_d 01000100 11 1 ..... 1101.1 ..... ..... @rrx_2a esz=3 + +# SVE2 saturating multiply (indexed) +SQDMULLB_zzx_s 01000100 10 1 ..... 1110.0 ..... ..... @rrx_3a esz=2 +SQDMULLB_zzx_d 01000100 11 1 ..... 1110.0 ..... ..... @rrx_2a esz=3 +SQDMULLT_zzx_s 01000100 10 1 ..... 1110.1 ..... ..... @rrx_3a esz=2 +SQDMULLT_zzx_d 01000100 11 1 ..... 1110.1 ..... ..... @rrx_2a esz=3 + +# SVE2 saturating multiply high (indexed) +SQDMULH_zzx_h 01000100 0. 1 ..... 111100 ..... ..... @rrx_3 esz=1 +SQDMULH_zzx_s 01000100 10 1 ..... 111100 ..... ..... @rrx_2 esz=2 +SQDMULH_zzx_d 01000100 11 1 ..... 111100 ..... ..... @rrx_1 esz=3 +SQRDMULH_zzx_h 01000100 0. 1 ..... 111101 ..... ..... @rrx_3 esz=1 +SQRDMULH_zzx_s 01000100 10 1 ..... 111101 ..... ..... @rrx_2 esz=2 +SQRDMULH_zzx_d 01000100 11 1 ..... 111101 ..... ..... @rrx_1 esz=3 + +# SVE2 integer multiply (indexed) +MUL_zzx_h 01000100 0. 1 ..... 111110 ..... ..... @rrx_3 esz=1 +MUL_zzx_s 01000100 10 1 ..... 111110 ..... ..... @rrx_2 esz=2 +MUL_zzx_d 01000100 11 1 ..... 111110 ..... ..... @rrx_1 esz=3 + +# SVE floating-point complex add (predicated) +FCADD 01100100 esz:2 00000 rot:1 100 pg:3 rm:5 rd:5 \ + rn=%reg_movprfx + +# SVE floating-point complex multiply-add (predicated) +FCMLA_zpzzz 01100100 esz:2 0 rm:5 0 rot:2 pg:3 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE floating-point complex multiply-add (indexed) +FCMLA_zzxz 01100100 10 1 index:2 rm:3 0001 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx esz=1 +FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx esz=2 + +### SVE FP Multiply-Add Indexed Group + +# SVE floating-point multiply-add (indexed) +FMLA_zzxz 01100100 0. 1 ..... 000000 ..... ..... @rrxr_3 esz=1 +FMLA_zzxz 01100100 10 1 ..... 000000 ..... ..... @rrxr_2 esz=2 +FMLA_zzxz 01100100 11 1 ..... 000000 ..... ..... @rrxr_1 esz=3 +FMLS_zzxz 01100100 0. 1 ..... 000001 ..... ..... @rrxr_3 esz=1 +FMLS_zzxz 01100100 10 1 ..... 000001 ..... ..... @rrxr_2 esz=2 +FMLS_zzxz 01100100 11 1 ..... 000001 ..... ..... @rrxr_1 esz=3 + +### SVE FP Multiply Indexed Group + +# SVE floating-point multiply (indexed) +FMUL_zzx 01100100 0. 1 ..... 001000 ..... ..... @rrx_3 esz=1 +FMUL_zzx 01100100 10 1 ..... 001000 ..... ..... @rrx_2 esz=2 +FMUL_zzx 01100100 11 1 ..... 001000 ..... ..... @rrx_1 esz=3 + +### SVE FP Fast Reduction Group + +FADDV 01100101 .. 000 000 001 ... ..... ..... @rd_pg_rn +FMAXNMV 01100101 .. 000 100 001 ... ..... ..... @rd_pg_rn +FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn +FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn +FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn + +## SVE Floating Point Unary Operations - Unpredicated Group + +FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn +FRSQRTE 01100101 .. 001 111 001100 ..... ..... @rd_rn + +### SVE FP Compare with Zero Group + +FCMGE_ppz0 01100101 .. 0100 00 001 ... ..... 0 .... @pd_pg_rn +FCMGT_ppz0 01100101 .. 0100 00 001 ... ..... 1 .... @pd_pg_rn +FCMLT_ppz0 01100101 .. 0100 01 001 ... ..... 0 .... @pd_pg_rn +FCMLE_ppz0 01100101 .. 0100 01 001 ... ..... 1 .... @pd_pg_rn +FCMEQ_ppz0 01100101 .. 0100 10 001 ... ..... 0 .... @pd_pg_rn +FCMNE_ppz0 01100101 .. 0100 11 001 ... ..... 0 .... @pd_pg_rn + +### SVE FP Accumulating Reduction Group + +# SVE floating-point serial reduction (predicated) +FADDA 01100101 .. 011 000 001 ... ..... ..... @rdn_pg_rm + +### SVE Floating Point Arithmetic - Unpredicated Group + +# SVE floating-point arithmetic (unpredicated) +FADD_zzz 01100101 .. 0 ..... 000 000 ..... ..... 
@rd_rn_rm +FSUB_zzz 01100101 .. 0 ..... 000 001 ..... ..... @rd_rn_rm +FMUL_zzz 01100101 .. 0 ..... 000 010 ..... ..... @rd_rn_rm +FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm +FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm +FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm + +### SVE FP Arithmetic Predicated Group + +# SVE floating-point arithmetic (predicated) +FADD_zpzz 01100101 .. 00 0000 100 ... ..... ..... @rdn_pg_rm +FSUB_zpzz 01100101 .. 00 0001 100 ... ..... ..... @rdn_pg_rm +FMUL_zpzz 01100101 .. 00 0010 100 ... ..... ..... @rdn_pg_rm +FSUB_zpzz 01100101 .. 00 0011 100 ... ..... ..... @rdm_pg_rn # FSUBR +FMAXNM_zpzz 01100101 .. 00 0100 100 ... ..... ..... @rdn_pg_rm +FMINNM_zpzz 01100101 .. 00 0101 100 ... ..... ..... @rdn_pg_rm +FMAX_zpzz 01100101 .. 00 0110 100 ... ..... ..... @rdn_pg_rm +FMIN_zpzz 01100101 .. 00 0111 100 ... ..... ..... @rdn_pg_rm +FABD 01100101 .. 00 1000 100 ... ..... ..... @rdn_pg_rm +FSCALE 01100101 .. 00 1001 100 ... ..... ..... @rdn_pg_rm +FMULX 01100101 .. 00 1010 100 ... ..... ..... @rdn_pg_rm +FDIV 01100101 .. 00 1100 100 ... ..... ..... @rdm_pg_rn # FDIVR +FDIV 01100101 .. 00 1101 100 ... ..... ..... @rdn_pg_rm + +# SVE floating-point arithmetic with immediate (predicated) +FADD_zpzi 01100101 .. 011 000 100 ... 0000 . ..... @rdn_i1 +FSUB_zpzi 01100101 .. 011 001 100 ... 0000 . ..... @rdn_i1 +FMUL_zpzi 01100101 .. 011 010 100 ... 0000 . ..... @rdn_i1 +FSUBR_zpzi 01100101 .. 011 011 100 ... 0000 . ..... @rdn_i1 +FMAXNM_zpzi 01100101 .. 011 100 100 ... 0000 . ..... @rdn_i1 +FMINNM_zpzi 01100101 .. 011 101 100 ... 0000 . ..... @rdn_i1 +FMAX_zpzi 01100101 .. 011 110 100 ... 0000 . ..... @rdn_i1 +FMIN_zpzi 01100101 .. 011 111 100 ... 0000 . ..... @rdn_i1 + +# SVE floating-point trig multiply-add coefficient +FTMAD 01100101 esz:2 010 imm:3 100000 rm:5 rd:5 rn=%reg_movprfx + +### SVE FP Multiply-Add Group + +# SVE floating-point multiply-accumulate writing addend +FMLA_zpzzz 01100101 .. 1 ..... 000 ... ..... ..... @rda_pg_rn_rm +FMLS_zpzzz 01100101 .. 1 ..... 001 ... ..... ..... @rda_pg_rn_rm +FNMLA_zpzzz 01100101 .. 1 ..... 010 ... ..... ..... @rda_pg_rn_rm +FNMLS_zpzzz 01100101 .. 1 ..... 011 ... ..... ..... @rda_pg_rn_rm + +# SVE floating-point multiply-accumulate writing multiplicand +# Alter the operand extraction order and reuse the helpers from above. +# FMAD, FMSB, FNMAD, FNMS +FMLA_zpzzz 01100101 .. 1 ..... 100 ... ..... ..... @rdn_pg_rm_ra +FMLS_zpzzz 01100101 .. 1 ..... 101 ... ..... ..... @rdn_pg_rm_ra +FNMLA_zpzzz 01100101 .. 1 ..... 110 ... ..... ..... @rdn_pg_rm_ra +FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra + +### SVE FP Unary Operations Predicated Group + +# SVE floating-point convert precision +FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0 +BFCVT 01100101 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_sd 01100101 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 + +# SVE floating-point convert to integer +FCVTZS_hh 01100101 01 011 01 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_hh 01100101 01 011 01 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_hs 01100101 01 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_hs 01100101 01 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_hd 01100101 01 011 11 0 101 ... ..... ..... 
@rd_pg_rn_e0 +FCVTZU_hd 01100101 01 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_ss 01100101 10 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_ss 01100101 10 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_ds 01100101 11 011 00 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_ds 01100101 11 011 00 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_sd 01100101 11 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_sd 01100101 11 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_dd 01100101 11 011 11 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_dd 01100101 11 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 + +# SVE floating-point round to integral value +FRINTN 01100101 .. 000 000 101 ... ..... ..... @rd_pg_rn +FRINTP 01100101 .. 000 001 101 ... ..... ..... @rd_pg_rn +FRINTM 01100101 .. 000 010 101 ... ..... ..... @rd_pg_rn +FRINTZ 01100101 .. 000 011 101 ... ..... ..... @rd_pg_rn +FRINTA 01100101 .. 000 100 101 ... ..... ..... @rd_pg_rn +FRINTX 01100101 .. 000 110 101 ... ..... ..... @rd_pg_rn +FRINTI 01100101 .. 000 111 101 ... ..... ..... @rd_pg_rn + +# SVE floating-point unary operations +FRECPX 01100101 .. 001 100 101 ... ..... ..... @rd_pg_rn +FSQRT 01100101 .. 001 101 101 ... ..... ..... @rd_pg_rn + +# SVE integer convert to floating-point +SCVTF_hh 01100101 01 010 01 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_sh 01100101 01 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_dh 01100101 01 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_ss 01100101 10 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_sd 01100101 11 010 00 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_ds 01100101 11 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_dd 01100101 11 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 + +UCVTF_hh 01100101 01 010 01 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_sh 01100101 01 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_dh 01100101 01 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_ss 01100101 10 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_sd 01100101 11 010 00 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_ds 01100101 11 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_dd 01100101 11 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 + +### SVE Memory - 32-bit Gather and Unsized Contiguous Group + +# SVE load predicate register +LDR_pri 10000101 10 ...... 000 ... ..... 0 .... @pd_rn_i9 + +# SVE load vector register +LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9 + +# SVE load and broadcast element +LD1R_zpri 1000010 .. 1 imm:6 1.. pg:3 rn:5 rd:5 \ + &rpri_load dtype=%dtype_23_13 nreg=0 + +# SVE 32-bit gather load (scalar plus 32-bit unscaled offsets) +# SVE 32-bit gather load (scalar plus 32-bit scaled offsets) +LD1_zprz 1000010 00 .0 ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u esz=2 msz=0 scale=0 +LD1_zprz 1000010 01 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=2 msz=1 +LD1_zprz 1000010 10 .. ..... 01. ... ..... ..... \ + @rprr_g_load_xs_sc esz=2 msz=2 u=1 + +# SVE 32-bit gather load (vector plus immediate) +LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \ + @rpri_g_load esz=2 + +### SVE Memory Contiguous Load Group + +# SVE contiguous load (scalar plus scalar) +LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 + +# SVE contiguous first-fault load (scalar plus scalar) +LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 + +# SVE contiguous load (scalar plus immediate) +LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 + +# SVE contiguous non-fault load (scalar plus immediate) +LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... 
@rpri_load_dt nreg=0 + +# SVE contiguous non-temporal load (scalar plus scalar) +# LDNT1B, LDNT1H, LDNT1W, LDNT1D +# SVE load multiple structures (scalar plus scalar) +# LD2B, LD2H, LD2W, LD2D; etc. +LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz + +# SVE contiguous non-temporal load (scalar plus immediate) +# LDNT1B, LDNT1H, LDNT1W, LDNT1D +# SVE load multiple structures (scalar plus immediate) +# LD2B, LD2H, LD2W, LD2D; etc. +LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz + +# SVE load and broadcast quadword (scalar plus scalar) +LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \ + @rprr_load_msz nreg=0 +LD1RO_zprr 1010010 .. 01 ..... 000 ... ..... ..... \ + @rprr_load_msz nreg=0 + +# SVE load and broadcast quadword (scalar plus immediate) +# LD1RQB, LD1RQH, LD1RQS, LD1RQD +LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \ + @rpri_load_msz nreg=0 +LD1RO_zpri 1010010 .. 01 0.... 001 ... ..... ..... \ + @rpri_load_msz nreg=0 + +# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets) +PRF_ns 1000010 00 -1 ----- 0-- --- ----- 0 ---- + +# SVE 32-bit gather prefetch (vector plus immediate) +PRF_ns 1000010 -- 00 ----- 111 --- ----- 0 ---- + +# SVE contiguous prefetch (scalar plus immediate) +PRF 1000010 11 1- ----- 0-- --- ----- 0 ---- + +# SVE contiguous prefetch (scalar plus scalar) +PRF_rr 1000010 -- 00 rm:5 110 --- ----- 0 ---- + +### SVE Memory 64-bit Gather Group + +# SVE 64-bit gather load (scalar plus 32-bit unpacked unscaled offsets) +# SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) +LD1_zprz 1100010 00 .0 ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u esz=3 msz=0 scale=0 +LD1_zprz 1100010 01 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=3 msz=1 +LD1_zprz 1100010 10 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=3 msz=2 +LD1_zprz 1100010 11 .. ..... 01. ... ..... ..... \ + @rprr_g_load_xs_sc esz=3 msz=3 u=1 + +# SVE 64-bit gather load (scalar plus 64-bit unscaled offsets) +# SVE 64-bit gather load (scalar plus 64-bit scaled offsets) +LD1_zprz 1100010 00 10 ..... 1.. ... ..... ..... \ + @rprr_g_load_u esz=3 msz=0 scale=0 +LD1_zprz 1100010 01 1. ..... 1.. ... ..... ..... \ + @rprr_g_load_u_sc esz=3 msz=1 +LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \ + @rprr_g_load_u_sc esz=3 msz=2 +LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \ + @rprr_g_load_sc esz=3 msz=3 u=1 + +# SVE 64-bit gather load (vector plus immediate) +LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \ + @rpri_g_load esz=3 + +# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets) +PRF_ns 1100010 00 11 ----- 1-- --- ----- 0 ---- + +# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets) +PRF_ns 1100010 00 -1 ----- 0-- --- ----- 0 ---- + +# SVE 64-bit gather prefetch (vector plus immediate) +PRF_ns 1100010 -- 00 ----- 111 --- ----- 0 ---- + +### SVE Memory Store Group + +# SVE store predicate register +STR_pri 1110010 11 0. ..... 000 ... ..... 0 .... @pd_rn_i9 + +# SVE store vector register +STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9 + +# SVE contiguous store (scalar plus immediate) +# ST1B, ST1H, ST1W, ST1D; require msz <= esz +ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \ + @rpri_store_msz nreg=0 + +# SVE contiguous store (scalar plus scalar) +# ST1B, ST1H, ST1W, ST1D; require msz <= esz +# Enumerate msz lest we conflict with STR_zri. +ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=0 +ST_zprr 1110010 01 .. ..... 010 ... ..... ..... 
\ + @rprr_store_esz_n0 msz=1 +ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=2 +ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \ + @rprr_store msz=3 esz=3 nreg=0 + +# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0) +# SVE store multiple structures (scalar plus immediate) (nreg != 0) +ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \ + @rpri_store_msz esz=%size_23 + +# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0) +# SVE store multiple structures (scalar plus scalar) (nreg != 0) +ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \ + @rprr_store esz=%size_23 + +# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets) +# Require msz > 0 && msz <= esz. +ST1_zprz 1110010 .. 11 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=1 +ST1_zprz 1110010 .. 11 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=2 scale=1 + +# SVE 32-bit scatter store (scalar plus 32-bit unscaled offsets) +# Require msz <= esz. +ST1_zprz 1110010 .. 10 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=0 +ST1_zprz 1110010 .. 10 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=2 scale=0 + +# SVE 64-bit scatter store (scalar plus 64-bit scaled offset) +# Require msz > 0 +ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=1 + +# SVE 64-bit scatter store (scalar plus 64-bit unscaled offset) +ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=0 + +# SVE 64-bit scatter store (vector plus immediate) +ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \ + @rpri_scatter_store esz=3 + +# SVE 32-bit scatter store (vector plus immediate) +ST1_zpiz 1110010 .. 11 ..... 101 ... ..... ..... \ + @rpri_scatter_store esz=2 + +# SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offset) +# Require msz > 0 +ST1_zprz 1110010 .. 01 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=3 scale=1 +ST1_zprz 1110010 .. 01 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=3 scale=1 + +# SVE 64-bit scatter store (scalar plus unpacked 32-bit unscaled offset) +ST1_zprz 1110010 .. 00 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=3 scale=0 +ST1_zprz 1110010 .. 00 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=3 scale=0 + +#### SVE2 Support + +### SVE2 Integer Multiply - Unpredicated + +# SVE2 integer multiply vectors (unpredicated) +MUL_zzz 00000100 .. 1 ..... 0110 00 ..... ..... @rd_rn_rm +SMULH_zzz 00000100 .. 1 ..... 0110 10 ..... ..... @rd_rn_rm +UMULH_zzz 00000100 .. 1 ..... 0110 11 ..... ..... @rd_rn_rm +PMUL_zzz 00000100 00 1 ..... 0110 01 ..... ..... @rd_rn_rm_e0 + +# SVE2 signed saturating doubling multiply high (unpredicated) +SQDMULH_zzz 00000100 .. 1 ..... 0111 00 ..... ..... @rd_rn_rm +SQRDMULH_zzz 00000100 .. 1 ..... 0111 01 ..... ..... @rd_rn_rm + +### SVE2 Integer - Predicated + +SADALP_zpzz 01000100 .. 000 100 101 ... ..... ..... @rdm_pg_rn +UADALP_zpzz 01000100 .. 000 101 101 ... ..... ..... @rdm_pg_rn + +### SVE2 integer unary operations (predicated) + +URECPE 01000100 .. 000 000 101 ... ..... ..... @rd_pg_rn +URSQRTE 01000100 .. 000 001 101 ... ..... ..... @rd_pg_rn +SQABS 01000100 .. 001 000 101 ... ..... ..... @rd_pg_rn +SQNEG 01000100 .. 001 001 101 ... ..... ..... @rd_pg_rn + +### SVE2 saturating/rounding bitwise shift left (predicated) + +SRSHL 01000100 .. 000 010 100 ... ..... ..... @rdn_pg_rm +URSHL 01000100 .. 000 011 100 ... ..... ..... @rdn_pg_rm +SRSHL 01000100 .. 
000 110 100 ... ..... ..... @rdm_pg_rn # SRSHLR +URSHL 01000100 .. 000 111 100 ... ..... ..... @rdm_pg_rn # URSHLR + +SQSHL 01000100 .. 001 000 100 ... ..... ..... @rdn_pg_rm +UQSHL 01000100 .. 001 001 100 ... ..... ..... @rdn_pg_rm +SQSHL 01000100 .. 001 100 100 ... ..... ..... @rdm_pg_rn # SQSHLR +UQSHL 01000100 .. 001 101 100 ... ..... ..... @rdm_pg_rn # UQSHLR + +SQRSHL 01000100 .. 001 010 100 ... ..... ..... @rdn_pg_rm +UQRSHL 01000100 .. 001 011 100 ... ..... ..... @rdn_pg_rm +SQRSHL 01000100 .. 001 110 100 ... ..... ..... @rdm_pg_rn # SQRSHLR +UQRSHL 01000100 .. 001 111 100 ... ..... ..... @rdm_pg_rn # UQRSHLR + +### SVE2 integer halving add/subtract (predicated) + +SHADD 01000100 .. 010 000 100 ... ..... ..... @rdn_pg_rm +UHADD 01000100 .. 010 001 100 ... ..... ..... @rdn_pg_rm +SHSUB 01000100 .. 010 010 100 ... ..... ..... @rdn_pg_rm +UHSUB 01000100 .. 010 011 100 ... ..... ..... @rdn_pg_rm +SRHADD 01000100 .. 010 100 100 ... ..... ..... @rdn_pg_rm +URHADD 01000100 .. 010 101 100 ... ..... ..... @rdn_pg_rm +SHSUB 01000100 .. 010 110 100 ... ..... ..... @rdm_pg_rn # SHSUBR +UHSUB 01000100 .. 010 111 100 ... ..... ..... @rdm_pg_rn # UHSUBR + +### SVE2 integer pairwise arithmetic + +ADDP 01000100 .. 010 001 101 ... ..... ..... @rdn_pg_rm +SMAXP 01000100 .. 010 100 101 ... ..... ..... @rdn_pg_rm +UMAXP 01000100 .. 010 101 101 ... ..... ..... @rdn_pg_rm +SMINP 01000100 .. 010 110 101 ... ..... ..... @rdn_pg_rm +UMINP 01000100 .. 010 111 101 ... ..... ..... @rdn_pg_rm + +### SVE2 saturating add/subtract (predicated) + +SQADD_zpzz 01000100 .. 011 000 100 ... ..... ..... @rdn_pg_rm +UQADD_zpzz 01000100 .. 011 001 100 ... ..... ..... @rdn_pg_rm +SQSUB_zpzz 01000100 .. 011 010 100 ... ..... ..... @rdn_pg_rm +UQSUB_zpzz 01000100 .. 011 011 100 ... ..... ..... @rdn_pg_rm +SUQADD 01000100 .. 011 100 100 ... ..... ..... @rdn_pg_rm +USQADD 01000100 .. 011 101 100 ... ..... ..... @rdn_pg_rm +SQSUB_zpzz 01000100 .. 011 110 100 ... ..... ..... @rdm_pg_rn # SQSUBR +UQSUB_zpzz 01000100 .. 011 111 100 ... ..... ..... @rdm_pg_rn # UQSUBR + +#### SVE2 Widening Integer Arithmetic + +## SVE2 integer add/subtract long + +SADDLB 01000101 .. 0 ..... 00 0000 ..... ..... @rd_rn_rm +SADDLT 01000101 .. 0 ..... 00 0001 ..... ..... @rd_rn_rm +UADDLB 01000101 .. 0 ..... 00 0010 ..... ..... @rd_rn_rm +UADDLT 01000101 .. 0 ..... 00 0011 ..... ..... @rd_rn_rm + +SSUBLB 01000101 .. 0 ..... 00 0100 ..... ..... @rd_rn_rm +SSUBLT 01000101 .. 0 ..... 00 0101 ..... ..... @rd_rn_rm +USUBLB 01000101 .. 0 ..... 00 0110 ..... ..... @rd_rn_rm +USUBLT 01000101 .. 0 ..... 00 0111 ..... ..... @rd_rn_rm + +SABDLB 01000101 .. 0 ..... 00 1100 ..... ..... @rd_rn_rm +SABDLT 01000101 .. 0 ..... 00 1101 ..... ..... @rd_rn_rm +UABDLB 01000101 .. 0 ..... 00 1110 ..... ..... @rd_rn_rm +UABDLT 01000101 .. 0 ..... 00 1111 ..... ..... @rd_rn_rm + +## SVE2 integer add/subtract interleaved long + +SADDLBT 01000101 .. 0 ..... 1000 00 ..... ..... @rd_rn_rm +SSUBLBT 01000101 .. 0 ..... 1000 10 ..... ..... @rd_rn_rm +SSUBLTB 01000101 .. 0 ..... 1000 11 ..... ..... @rd_rn_rm + +## SVE2 integer add/subtract wide + +SADDWB 01000101 .. 0 ..... 010 000 ..... ..... @rd_rn_rm +SADDWT 01000101 .. 0 ..... 010 001 ..... ..... @rd_rn_rm +UADDWB 01000101 .. 0 ..... 010 010 ..... ..... @rd_rn_rm +UADDWT 01000101 .. 0 ..... 010 011 ..... ..... @rd_rn_rm + +SSUBWB 01000101 .. 0 ..... 010 100 ..... ..... @rd_rn_rm +SSUBWT 01000101 .. 0 ..... 010 101 ..... ..... @rd_rn_rm +USUBWB 01000101 .. 0 ..... 010 110 ..... ..... @rd_rn_rm +USUBWT 01000101 .. 0 ..... 
010 111 ..... ..... @rd_rn_rm + +## SVE2 integer multiply long + +SQDMULLB_zzz 01000101 .. 0 ..... 011 000 ..... ..... @rd_rn_rm +SQDMULLT_zzz 01000101 .. 0 ..... 011 001 ..... ..... @rd_rn_rm +PMULLB 01000101 .. 0 ..... 011 010 ..... ..... @rd_rn_rm +PMULLT 01000101 .. 0 ..... 011 011 ..... ..... @rd_rn_rm +SMULLB_zzz 01000101 .. 0 ..... 011 100 ..... ..... @rd_rn_rm +SMULLT_zzz 01000101 .. 0 ..... 011 101 ..... ..... @rd_rn_rm +UMULLB_zzz 01000101 .. 0 ..... 011 110 ..... ..... @rd_rn_rm +UMULLT_zzz 01000101 .. 0 ..... 011 111 ..... ..... @rd_rn_rm + +## SVE2 bitwise shift left long + +# Note bit23 == 0 is handled by esz > 0 in do_sve2_shll_tb. +SSHLLB 01000101 .. 0 ..... 1010 00 ..... ..... @rd_rn_tszimm_shl +SSHLLT 01000101 .. 0 ..... 1010 01 ..... ..... @rd_rn_tszimm_shl +USHLLB 01000101 .. 0 ..... 1010 10 ..... ..... @rd_rn_tszimm_shl +USHLLT 01000101 .. 0 ..... 1010 11 ..... ..... @rd_rn_tszimm_shl + +## SVE2 bitwise exclusive-or interleaved + +EORBT 01000101 .. 0 ..... 10010 0 ..... ..... @rd_rn_rm +EORTB 01000101 .. 0 ..... 10010 1 ..... ..... @rd_rn_rm + +## SVE integer matrix multiply accumulate + +SMMLA 01000101 00 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 +USMMLA 01000101 10 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 +UMMLA 01000101 11 0 ..... 10011 0 ..... ..... @rda_rn_rm_e0 + +## SVE2 bitwise permute + +BEXT 01000101 .. 0 ..... 1011 00 ..... ..... @rd_rn_rm +BDEP 01000101 .. 0 ..... 1011 01 ..... ..... @rd_rn_rm +BGRP 01000101 .. 0 ..... 1011 10 ..... ..... @rd_rn_rm + +#### SVE2 Accumulate + +## SVE2 complex integer add + +CADD_rot90 01000101 .. 00000 0 11011 0 ..... ..... @rdn_rm +CADD_rot270 01000101 .. 00000 0 11011 1 ..... ..... @rdn_rm +SQCADD_rot90 01000101 .. 00000 1 11011 0 ..... ..... @rdn_rm +SQCADD_rot270 01000101 .. 00000 1 11011 1 ..... ..... @rdn_rm + +## SVE2 integer absolute difference and accumulate long + +SABALB 01000101 .. 0 ..... 1100 00 ..... ..... @rda_rn_rm +SABALT 01000101 .. 0 ..... 1100 01 ..... ..... @rda_rn_rm +UABALB 01000101 .. 0 ..... 1100 10 ..... ..... @rda_rn_rm +UABALT 01000101 .. 0 ..... 1100 11 ..... ..... @rda_rn_rm + +## SVE2 integer add/subtract long with carry + +# ADC and SBC decoded via size in helper dispatch. +ADCLB 01000101 .. 0 ..... 11010 0 ..... ..... @rda_rn_rm +ADCLT 01000101 .. 0 ..... 11010 1 ..... ..... @rda_rn_rm + +## SVE2 bitwise shift right and accumulate + +# TODO: Use @rda and %reg_movprfx here. +SSRA 01000101 .. 0 ..... 1110 00 ..... ..... @rd_rn_tszimm_shr +USRA 01000101 .. 0 ..... 1110 01 ..... ..... @rd_rn_tszimm_shr +SRSRA 01000101 .. 0 ..... 1110 10 ..... ..... @rd_rn_tszimm_shr +URSRA 01000101 .. 0 ..... 1110 11 ..... ..... @rd_rn_tszimm_shr + +## SVE2 bitwise shift and insert + +SRI 01000101 .. 0 ..... 11110 0 ..... ..... @rd_rn_tszimm_shr +SLI 01000101 .. 0 ..... 11110 1 ..... ..... @rd_rn_tszimm_shl + +## SVE2 integer absolute difference and accumulate + +# TODO: Use @rda and %reg_movprfx here. +SABA 01000101 .. 0 ..... 11111 0 ..... ..... @rd_rn_rm +UABA 01000101 .. 0 ..... 11111 1 ..... ..... @rd_rn_rm + +#### SVE2 Narrowing + +## SVE2 saturating extract narrow + +# Bits 23, 18-16 are zero, limited in the translator via esz < 3 & imm == 0. +SQXTNB 01000101 .. 1 ..... 010 000 ..... ..... @rd_rn_tszimm_shl +SQXTNT 01000101 .. 1 ..... 010 001 ..... ..... @rd_rn_tszimm_shl +UQXTNB 01000101 .. 1 ..... 010 010 ..... ..... @rd_rn_tszimm_shl +UQXTNT 01000101 .. 1 ..... 010 011 ..... ..... @rd_rn_tszimm_shl +SQXTUNB 01000101 .. 1 ..... 010 100 ..... ..... @rd_rn_tszimm_shl +SQXTUNT 01000101 .. 1 ..... 
010 101 ..... ..... @rd_rn_tszimm_shl + +## SVE2 bitwise shift right narrow + +# Bit 23 == 0 is handled by esz > 0 in the translator. +SQSHRUNB 01000101 .. 1 ..... 00 0000 ..... ..... @rd_rn_tszimm_shr +SQSHRUNT 01000101 .. 1 ..... 00 0001 ..... ..... @rd_rn_tszimm_shr +SQRSHRUNB 01000101 .. 1 ..... 00 0010 ..... ..... @rd_rn_tszimm_shr +SQRSHRUNT 01000101 .. 1 ..... 00 0011 ..... ..... @rd_rn_tszimm_shr +SHRNB 01000101 .. 1 ..... 00 0100 ..... ..... @rd_rn_tszimm_shr +SHRNT 01000101 .. 1 ..... 00 0101 ..... ..... @rd_rn_tszimm_shr +RSHRNB 01000101 .. 1 ..... 00 0110 ..... ..... @rd_rn_tszimm_shr +RSHRNT 01000101 .. 1 ..... 00 0111 ..... ..... @rd_rn_tszimm_shr +SQSHRNB 01000101 .. 1 ..... 00 1000 ..... ..... @rd_rn_tszimm_shr +SQSHRNT 01000101 .. 1 ..... 00 1001 ..... ..... @rd_rn_tszimm_shr +SQRSHRNB 01000101 .. 1 ..... 00 1010 ..... ..... @rd_rn_tszimm_shr +SQRSHRNT 01000101 .. 1 ..... 00 1011 ..... ..... @rd_rn_tszimm_shr +UQSHRNB 01000101 .. 1 ..... 00 1100 ..... ..... @rd_rn_tszimm_shr +UQSHRNT 01000101 .. 1 ..... 00 1101 ..... ..... @rd_rn_tszimm_shr +UQRSHRNB 01000101 .. 1 ..... 00 1110 ..... ..... @rd_rn_tszimm_shr +UQRSHRNT 01000101 .. 1 ..... 00 1111 ..... ..... @rd_rn_tszimm_shr + +## SVE2 integer add/subtract narrow high part + +ADDHNB 01000101 .. 1 ..... 011 000 ..... ..... @rd_rn_rm +ADDHNT 01000101 .. 1 ..... 011 001 ..... ..... @rd_rn_rm +RADDHNB 01000101 .. 1 ..... 011 010 ..... ..... @rd_rn_rm +RADDHNT 01000101 .. 1 ..... 011 011 ..... ..... @rd_rn_rm +SUBHNB 01000101 .. 1 ..... 011 100 ..... ..... @rd_rn_rm +SUBHNT 01000101 .. 1 ..... 011 101 ..... ..... @rd_rn_rm +RSUBHNB 01000101 .. 1 ..... 011 110 ..... ..... @rd_rn_rm +RSUBHNT 01000101 .. 1 ..... 011 111 ..... ..... @rd_rn_rm + +### SVE2 Character Match + +MATCH 01000101 .. 1 ..... 100 ... ..... 0 .... @pd_pg_rn_rm +NMATCH 01000101 .. 1 ..... 100 ... ..... 1 .... @pd_pg_rn_rm + +### SVE2 Histogram Computation + +HISTCNT 01000101 .. 1 ..... 110 ... ..... ..... @rd_pg_rn_rm +HISTSEG 01000101 .. 1 ..... 101 000 ..... ..... @rd_rn_rm + +## SVE2 floating-point pairwise operations + +FADDP 01100100 .. 010 00 0 100 ... ..... ..... @rdn_pg_rm +FMAXNMP 01100100 .. 010 10 0 100 ... ..... ..... @rdn_pg_rm +FMINNMP 01100100 .. 010 10 1 100 ... ..... ..... @rdn_pg_rm +FMAXP 01100100 .. 010 11 0 100 ... ..... ..... @rdn_pg_rm +FMINP 01100100 .. 010 11 1 100 ... ..... ..... @rdn_pg_rm + +#### SVE Integer Multiply-Add (unpredicated) + +## SVE2 saturating multiply-add long + +SQDMLALB_zzzw 01000100 .. 0 ..... 0110 00 ..... ..... @rda_rn_rm +SQDMLALT_zzzw 01000100 .. 0 ..... 0110 01 ..... ..... @rda_rn_rm +SQDMLSLB_zzzw 01000100 .. 0 ..... 0110 10 ..... ..... @rda_rn_rm +SQDMLSLT_zzzw 01000100 .. 0 ..... 0110 11 ..... ..... @rda_rn_rm + +## SVE2 saturating multiply-add interleaved long + +SQDMLALBT 01000100 .. 0 ..... 00001 0 ..... ..... @rda_rn_rm +SQDMLSLBT 01000100 .. 0 ..... 00001 1 ..... ..... @rda_rn_rm + +## SVE2 saturating multiply-add high + +SQRDMLAH_zzzz 01000100 .. 0 ..... 01110 0 ..... ..... @rda_rn_rm +SQRDMLSH_zzzz 01000100 .. 0 ..... 01110 1 ..... ..... @rda_rn_rm + +## SVE2 integer multiply-add long + +SMLALB_zzzw 01000100 .. 0 ..... 010 000 ..... ..... @rda_rn_rm +SMLALT_zzzw 01000100 .. 0 ..... 010 001 ..... ..... @rda_rn_rm +UMLALB_zzzw 01000100 .. 0 ..... 010 010 ..... ..... @rda_rn_rm +UMLALT_zzzw 01000100 .. 0 ..... 010 011 ..... ..... @rda_rn_rm +SMLSLB_zzzw 01000100 .. 0 ..... 010 100 ..... ..... @rda_rn_rm +SMLSLT_zzzw 01000100 .. 0 ..... 010 101 ..... ..... @rda_rn_rm +UMLSLB_zzzw 01000100 .. 0 ..... 
010 110 ..... ..... @rda_rn_rm +UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm + +## SVE2 complex integer multiply-add + +CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx +SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx + +## SVE mixed sign dot product + +USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm + +### SVE2 floating point matrix multiply accumulate +BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 +FMMLA_s 01100100 10 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 +FMMLA_d 01100100 11 1 ..... 111 001 ..... ..... @rda_rn_rm_e0 + +### SVE2 Memory Gather Load Group + +# SVE2 64-bit gather non-temporal load (scalar plus 64-bit unscaled offsets) +LDNT1_zprz 1100010 msz:2 00 rm:5 1 u:1 0 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 esz=3 scale=0 ff=0 + +# SVE2 32-bit gather non-temporal load (scalar plus 32-bit unscaled offsets) +LDNT1_zprz 1000010 msz:2 00 rm:5 10 u:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=0 esz=2 scale=0 ff=0 + +### SVE2 Memory Store Group + +# SVE2 64-bit scatter non-temporal store (vector plus scalar) +STNT1_zprz 1110010 .. 00 ..... 001 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=0 + +# SVE2 32-bit scatter non-temporal store (vector plus scalar) +STNT1_zprz 1110010 .. 10 ..... 001 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=0 + +### SVE2 Crypto Extensions + +# SVE2 crypto unary operations +# AESMC and AESIMC +AESMC 01000101 00 10000011100 decrypt:1 00000 rd:5 + +# SVE2 crypto destructive binary operations +AESE 01000101 00 10001 0 11100 0 ..... ..... @rdn_rm_e0 +AESD 01000101 00 10001 0 11100 1 ..... ..... @rdn_rm_e0 +SM4E 01000101 00 10001 1 11100 0 ..... ..... @rdn_rm_e0 + +# SVE2 crypto constructive binary operations +SM4EKEY 01000101 00 1 ..... 11110 0 ..... ..... @rd_rn_rm_e0 +RAX1 01000101 00 1 ..... 11110 1 ..... ..... @rd_rn_rm_e0 + +### SVE2 floating-point convert precision odd elements +FCVTXNT_ds 01100100 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTX_ds 01100101 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTNT_sh 01100100 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0 +BFCVTNT 01100100 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTLT_hs 01100100 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0 +FCVTNT_ds 01100100 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 + +### SVE2 floating-point convert to integer +FLOGB 01100101 00 011 esz:2 0101 pg:3 rn:5 rd:5 &rpr_esz + +### SVE2 floating-point multiply-add long (vectors) +FMLALB_zzzw 01100100 10 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 +FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 +FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0 +FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0 + +BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 +BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0 + +### SVE2 floating-point bfloat16 dot-product +BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0 + +### SVE2 floating-point multiply-add long (indexed) +FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 +FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2 +FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2 +FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2 +BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2 +BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... 
@rrxr_3a esz=2 + +### SVE2 floating-point bfloat16 dot-product (indexed) +BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2 + +### SVE broadcast predicate element + +&psel esz pd pn pm rv imm +%psel_rv 16:2 !function=plus_12 +%psel_imm_b 22:2 19:2 +%psel_imm_h 22:2 20:1 +%psel_imm_s 22:2 +%psel_imm_d 23:1 +@psel ........ .. . ... .. .. pn:4 . pm:4 . pd:4 \ + &psel rv=%psel_rv + +PSEL 00100101 .. 1 ..1 .. 01 .... 0 .... 0 .... \ + @psel esz=0 imm=%psel_imm_b +PSEL 00100101 .. 1 .10 .. 01 .... 0 .... 0 .... \ + @psel esz=1 imm=%psel_imm_h +PSEL 00100101 .. 1 100 .. 01 .... 0 .... 0 .... \ + @psel esz=2 imm=%psel_imm_s +PSEL 00100101 .1 1 000 .. 01 .... 0 .... 0 .... \ + @psel esz=3 imm=%psel_imm_d + +### SVE clamp + +SCLAMP 01000100 .. 0 ..... 110000 ..... ..... @rda_rn_rm +UCLAMP 01000100 .. 0 ..... 110001 ..... ..... @rda_rn_rm diff --git a/target/arm/tcg/t16.decode b/target/arm/tcg/t16.decode new file mode 100644 index 0000000..646c749 --- /dev/null +++ b/target/arm/tcg/t16.decode @@ -0,0 +1,281 @@ +# Thumb1 instructions +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# + +&empty !extern +&s_rrr_shi !extern s rd rn rm shim shty +&s_rrr_shr !extern s rn rd rm rs shty +&s_rri_rot !extern s rn rd imm rot +&s_rrrr !extern s rd rn rm ra +&rrr_rot !extern rd rn rm rot +&rr !extern rd rm +&ri !extern rd imm +&r !extern rm +&i !extern imm +&ldst_rr !extern p w u rn rt rm shimm shtype +&ldst_ri !extern p w u rn rt imm +&ldst_block !extern rn i b u w list +&setend !extern E +&cps !extern mode imod M A I F +&ci !extern cond imm + +# Set S if the instruction is outside of an IT block. +%s !function=t16_setflags + +# Data-processing (two low registers) + +%reg_0 0:3 + +@lll_noshr ...... .... rm:3 rd:3 \ + &s_rrr_shi %s rn=%reg_0 shim=0 shty=0 +@xll_noshr ...... .... rm:3 rn:3 \ + &s_rrr_shi s=1 rd=0 shim=0 shty=0 +@lxl_shr ...... .... rs:3 rd:3 \ + &s_rrr_shr %s rm=%reg_0 rn=0 + +AND_rrri 010000 0000 ... ... @lll_noshr +EOR_rrri 010000 0001 ... ... @lll_noshr +MOV_rxrr 010000 0010 ... ... @lxl_shr shty=0 # LSL +MOV_rxrr 010000 0011 ... ... @lxl_shr shty=1 # LSR +MOV_rxrr 010000 0100 ... ... @lxl_shr shty=2 # ASR +ADC_rrri 010000 0101 ... ... @lll_noshr +SBC_rrri 010000 0110 ... ... @lll_noshr +MOV_rxrr 010000 0111 ... ... @lxl_shr shty=3 # ROR +TST_xrri 010000 1000 ... ... @xll_noshr +RSB_rri 010000 1001 rn:3 rd:3 &s_rri_rot %s imm=0 rot=0 +CMP_xrri 010000 1010 ... ... @xll_noshr +CMN_xrri 010000 1011 ... ... @xll_noshr +ORR_rrri 010000 1100 ... ... @lll_noshr +MUL 010000 1101 rn:3 rd:3 &s_rrrr %s rm=%reg_0 ra=0 +BIC_rrri 010000 1110 ... ... @lll_noshr +MVN_rxri 010000 1111 ... ... @lll_noshr + +# Load/store (register offset) + +@ldst_rr ....... rm:3 rn:3 rt:3 \ + &ldst_rr p=1 w=0 u=1 shimm=0 shtype=0 + +STR_rr 0101 000 ... ... ... @ldst_rr +STRH_rr 0101 001 ... ... ... @ldst_rr +STRB_rr 0101 010 ... ... ... 
@ldst_rr +LDRSB_rr 0101 011 ... ... ... @ldst_rr +LDR_rr 0101 100 ... ... ... @ldst_rr +LDRH_rr 0101 101 ... ... ... @ldst_rr +LDRB_rr 0101 110 ... ... ... @ldst_rr +LDRSH_rr 0101 111 ... ... ... @ldst_rr + +# Load/store word/byte (immediate offset) + +%imm5_6x4 6:5 !function=times_4 + +@ldst_ri_1 ..... imm:5 rn:3 rt:3 \ + &ldst_ri p=1 w=0 u=1 +@ldst_ri_4 ..... ..... rn:3 rt:3 \ + &ldst_ri p=1 w=0 u=1 imm=%imm5_6x4 + +STR_ri 01100 ..... ... ... @ldst_ri_4 +LDR_ri 01101 ..... ... ... @ldst_ri_4 +STRB_ri 01110 ..... ... ... @ldst_ri_1 +LDRB_ri 01111 ..... ... ... @ldst_ri_1 + +# Load/store halfword (immediate offset) + +%imm5_6x2 6:5 !function=times_2 +@ldst_ri_2 ..... ..... rn:3 rt:3 \ + &ldst_ri p=1 w=0 u=1 imm=%imm5_6x2 + +STRH_ri 10000 ..... ... ... @ldst_ri_2 +LDRH_ri 10001 ..... ... ... @ldst_ri_2 + +# Load/store (SP-relative) + +%imm8_0x4 0:8 !function=times_4 +@ldst_spec_i ..... rt:3 ........ \ + &ldst_ri p=1 w=0 u=1 imm=%imm8_0x4 + +STR_ri 10010 ... ........ @ldst_spec_i rn=13 +LDR_ri 10011 ... ........ @ldst_spec_i rn=13 + +# Load (PC-relative) + +LDR_ri 01001 ... ........ @ldst_spec_i rn=15 + +# Add PC/SP (immediate) + +ADR 10100 rd:3 ........ imm=%imm8_0x4 +ADD_rri 10101 rd:3 ........ \ + &s_rri_rot rn=13 s=0 rot=0 imm=%imm8_0x4 # SP + +# Load/store multiple + +@ldstm ..... rn:3 list:8 &ldst_block i=1 b=0 u=0 w=1 + +STM 11000 ... ........ @ldstm +LDM_t16 11001 ... ........ @ldstm + +# Shift (immediate) + +@shift_i ..... shim:5 rm:3 rd:3 &s_rrr_shi %s rn=%reg_0 + +MOV_rxri 000 00 ..... ... ... @shift_i shty=0 # LSL +MOV_rxri 000 01 ..... ... ... @shift_i shty=1 # LSR +MOV_rxri 000 10 ..... ... ... @shift_i shty=2 # ASR + +# Add/subtract (three low registers) + +@addsub_3 ....... rm:3 rn:3 rd:3 \ + &s_rrr_shi %s shim=0 shty=0 + +ADD_rrri 0001100 ... ... ... @addsub_3 +SUB_rrri 0001101 ... ... ... @addsub_3 + +# Add/subtract (two low registers and immediate) + +@addsub_2i ....... imm:3 rn:3 rd:3 \ + &s_rri_rot %s rot=0 + +ADD_rri 0001 110 ... ... ... @addsub_2i +SUB_rri 0001 111 ... ... ... @addsub_2i + +# Add, subtract, compare, move (one low register and immediate) + +%reg_8 8:3 +@arith_1i ..... rd:3 imm:8 \ + &s_rri_rot rot=0 rn=%reg_8 + +MOV_rxi 00100 ... ........ @arith_1i %s +CMP_xri 00101 ... ........ @arith_1i s=1 +ADD_rri 00110 ... ........ @arith_1i %s +SUB_rri 00111 ... ........ @arith_1i %s + +# Add, compare, move (two high registers) + +%reg_0_7 7:1 0:3 +@addsub_2h .... .... . rm:4 ... \ + &s_rrr_shi rd=%reg_0_7 rn=%reg_0_7 shim=0 shty=0 + +ADD_rrri 0100 0100 . .... ... @addsub_2h s=0 +CMP_xrri 0100 0101 . .... ... @addsub_2h s=1 +MOV_rxri 0100 0110 . .... ... @addsub_2h s=0 + +# Adjust SP (immediate) + +%imm7_0x4 0:7 !function=times_4 +@addsub_sp_i .... .... . ....... \ + &s_rri_rot s=0 rd=13 rn=13 rot=0 imm=%imm7_0x4 + +ADD_rri 1011 0000 0 ....... @addsub_sp_i +SUB_rri 1011 0000 1 ....... @addsub_sp_i + +# Branch and exchange + +@branchr .... .... . rm:4 ... &r + +BX 0100 0111 0 .... 000 @branchr +BLX_r 0100 0111 1 .... 000 @branchr +BXNS 0100 0111 0 .... 100 @branchr +BLXNS 0100 0111 1 .... 100 @branchr + +# Extend + +@extend .... .... .. rm:3 rd:3 &rrr_rot rn=15 rot=0 + +SXTAH 1011 0010 00 ... ... @extend +SXTAB 1011 0010 01 ... ... @extend +UXTAH 1011 0010 10 ... ... @extend +UXTAB 1011 0010 11 ... ... @extend + +# Change processor state + +%imod 4:1 !function=plus_2 + +SETEND 1011 0110 010 1 E:1 000 &setend +{ + CPS 1011 0110 011 . 0 A:1 I:1 F:1 &cps mode=0 M=0 %imod + CPS_v7m 1011 0110 011 im:1 00 I:1 F:1 +} + +# Reverse bytes + +@rdm .... .... .. 
rm:3 rd:3 &rr + +REV 1011 1010 00 ... ... @rdm +REV16 1011 1010 01 ... ... @rdm +REVSH 1011 1010 11 ... ... @rdm + +# Hints + +{ + { + YIELD 1011 1111 0001 0000 + WFE 1011 1111 0010 0000 + WFI 1011 1111 0011 0000 + + # TODO: Implement SEV, SEVL; may help SMP performance. + # SEV 1011 1111 0100 0000 + # SEVL 1011 1111 0101 0000 + + # The canonical nop has the second nibble as 0000, but the whole of the + # rest of the space is a reserved hint, behaves as nop. + NOP 1011 1111 ---- 0000 + } + IT 1011 1111 cond_mask:8 +} + +# Miscellaneous 16-bit instructions + +%imm6_9_3 9:1 3:5 !function=times_2 + +HLT 1011 1010 10 imm:6 &i +BKPT 1011 1110 imm:8 &i +CBZ 1011 nz:1 0.1 ..... rn:3 imm=%imm6_9_3 + +# Push and Pop + +%push_list 0:9 !function=t16_push_list +%pop_list 0:9 !function=t16_pop_list + +STM 1011 010 ......... \ + &ldst_block i=0 b=1 u=0 w=1 rn=13 list=%push_list +LDM_t16 1011 110 ......... \ + &ldst_block i=1 b=0 u=0 w=1 rn=13 list=%pop_list + +# Conditional branches, Supervisor call + +%imm8_0x2 0:s8 !function=times_2 + +{ + UDF 1101 1110 ---- ---- + SVC 1101 1111 imm:8 &i + B_cond_thumb 1101 cond:4 ........ &ci imm=%imm8_0x2 +} + +# Unconditional Branch + +%imm11_0x2 0:s11 !function=times_2 + +B 11100 ........... &i imm=%imm11_0x2 + +# thumb_insn_is_16bit() ensures we won't be decoding these as +# T16 instructions for a Thumb2 CPU, so these patterns must be +# a Thumb1 split BL/BLX. +BLX_suffix 11101 imm:11 &i +BL_BLX_prefix 11110 imm:s11 &i +BL_suffix 11111 imm:11 &i diff --git a/target/arm/tcg/t32.decode b/target/arm/tcg/t32.decode new file mode 100644 index 0000000..f21ad01 --- /dev/null +++ b/target/arm/tcg/t32.decode @@ -0,0 +1,753 @@ +# Thumb2 instructions +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . 
+ +# +# This file is processed by scripts/decodetree.py +# + +&empty !extern +&s_rrr_shi !extern s rd rn rm shim shty +&s_rrr_shr !extern s rn rd rm rs shty +&s_rri_rot !extern s rn rd imm rot +&s_rrrr !extern s rd rn rm ra +&rrrr !extern rd rn rm ra +&rrr_rot !extern rd rn rm rot +&rrr !extern rd rn rm +&rr !extern rd rm +&ri !extern rd imm +&r !extern rm +&i !extern imm +&msr_reg !extern rn r mask +&mrs_reg !extern rd r +&msr_bank !extern rn r sysm +&mrs_bank !extern rd r sysm +&ldst_rr !extern p w u rn rt rm shimm shtype +&ldst_ri !extern p w u rn rt imm +&ldst_block !extern rn i b u w list +&strex !extern rn rd rt rt2 imm +&ldrex !extern rn rt rt2 imm +&bfx !extern rd rn lsb widthm1 +&bfi !extern rd rn lsb msb +&sat !extern rd rn satimm imm sh +&pkh !extern rd rn rm imm tb +&cps !extern mode imod M A I F +&mcr !extern cp opc1 crn crm opc2 rt +&mcrr !extern cp opc1 crm rt rt2 + +&mve_shl_ri rdalo rdahi shim +&mve_shl_rr rdalo rdahi rm +&mve_sh_ri rda shim +&mve_sh_rr rda rm + +# rdahi: bits [3:1] from insn, bit 0 is 1 +# rdalo: bits [3:1] from insn, bit 0 is 0 +%rdahi_9 9:3 !function=times_2_plus_1 +%rdalo_17 17:3 !function=times_2 + +# Data-processing (register) + +%imm5_12_6 12:3 6:2 + +@s_rrr_shi ....... .... s:1 rn:4 .... rd:4 .. shty:2 rm:4 \ + &s_rrr_shi shim=%imm5_12_6 +@s_rxr_shi ....... .... s:1 .... .... rd:4 .. shty:2 rm:4 \ + &s_rrr_shi shim=%imm5_12_6 rn=0 +@S_xrr_shi ....... .... . rn:4 .... .... .. shty:2 rm:4 \ + &s_rrr_shi shim=%imm5_12_6 s=1 rd=0 + +@mve_shl_ri ....... .... . ... . . ... ... . .. .. .... \ + &mve_shl_ri shim=%imm5_12_6 rdalo=%rdalo_17 rdahi=%rdahi_9 +@mve_shl_rr ....... .... . ... . rm:4 ... . .. .. .... \ + &mve_shl_rr rdalo=%rdalo_17 rdahi=%rdahi_9 +@mve_sh_ri ....... .... . rda:4 . ... ... . .. .. .... \ + &mve_sh_ri shim=%imm5_12_6 +@mve_sh_rr ....... .... . rda:4 rm:4 .... .... .... &mve_sh_rr + +{ + TST_xrri 1110101 0000 1 .... 0 ... 1111 .... .... @S_xrr_shi + AND_rrri 1110101 0000 . .... 0 ... .... .... .... @s_rrr_shi +} +BIC_rrri 1110101 0001 . .... 0 ... .... .... .... @s_rrr_shi +{ + # The v8.1M MVE shift insns overlap in encoding with MOVS/ORRS + # and are distinguished by having Rm==13 or 15. Those are UNPREDICTABLE + # cases for MOVS/ORRS. We decode the MVE cases first, ensuring that + # they explicitly call unallocated_encoding() for cases that must UNDEF + # (eg "using a new shift insn on a v8.1M CPU without MVE"), and letting + # the rest fall through (where ORR_rrri and MOV_rxri will end up + # handling them as r13 and r15 accesses with the same semantics as A32). + [ + { + UQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 00 1111 @mve_sh_ri + LSLL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 00 1111 @mve_shl_ri + UQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 00 1111 @mve_shl_ri + } + + { + URSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 01 1111 @mve_sh_ri + LSRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 01 1111 @mve_shl_ri + URSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 01 1111 @mve_shl_ri + } + + { + SRSHR_ri 1110101 0010 1 .... 0 ... 1111 .. 10 1111 @mve_sh_ri + ASRL_ri 1110101 0010 1 ... 0 0 ... ... 1 .. 10 1111 @mve_shl_ri + SRSHRL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 10 1111 @mve_shl_ri + } + + { + SQSHL_ri 1110101 0010 1 .... 0 ... 1111 .. 11 1111 @mve_sh_ri + SQSHLL_ri 1110101 0010 1 ... 1 0 ... ... 1 .. 11 1111 @mve_shl_ri + } + + { + UQRSHL_rr 1110101 0010 1 .... .... 1111 0000 1101 @mve_sh_rr + LSLL_rr 1110101 0010 1 ... 0 .... ... 1 0000 1101 @mve_shl_rr + UQRSHLL64_rr 1110101 0010 1 ... 1 .... ... 
1 0000 1101 @mve_shl_rr + } + + { + SQRSHR_rr 1110101 0010 1 .... .... 1111 0010 1101 @mve_sh_rr + ASRL_rr 1110101 0010 1 ... 0 .... ... 1 0010 1101 @mve_shl_rr + SQRSHRL64_rr 1110101 0010 1 ... 1 .... ... 1 0010 1101 @mve_shl_rr + } + + UQRSHLL48_rr 1110101 0010 1 ... 1 .... ... 1 1000 1101 @mve_shl_rr + SQRSHRL48_rr 1110101 0010 1 ... 1 .... ... 1 1010 1101 @mve_shl_rr + ] + + MOV_rxri 1110101 0010 . 1111 0 ... .... .... .... @s_rxr_shi + ORR_rrri 1110101 0010 . .... 0 ... .... .... .... @s_rrr_shi + + # v8.1M CSEL and friends + CSEL 1110101 0010 1 rn:4 10 op:2 rd:4 fcond:4 rm:4 +} +{ + MVN_rxri 1110101 0011 . 1111 0 ... .... .... .... @s_rxr_shi + ORN_rrri 1110101 0011 . .... 0 ... .... .... .... @s_rrr_shi +} +{ + TEQ_xrri 1110101 0100 1 .... 0 ... 1111 .... .... @S_xrr_shi + EOR_rrri 1110101 0100 . .... 0 ... .... .... .... @s_rrr_shi +} +PKH 1110101 0110 0 rn:4 0 ... rd:4 .. tb:1 0 rm:4 \ + &pkh imm=%imm5_12_6 +{ + CMN_xrri 1110101 1000 1 .... 0 ... 1111 .... .... @S_xrr_shi + ADD_rrri 1110101 1000 . .... 0 ... .... .... .... @s_rrr_shi +} +ADC_rrri 1110101 1010 . .... 0 ... .... .... .... @s_rrr_shi +SBC_rrri 1110101 1011 . .... 0 ... .... .... .... @s_rrr_shi +{ + CMP_xrri 1110101 1101 1 .... 0 ... 1111 .... .... @S_xrr_shi + SUB_rrri 1110101 1101 . .... 0 ... .... .... .... @s_rrr_shi +} +RSB_rrri 1110101 1110 . .... 0 ... .... .... .... @s_rrr_shi + +# Data-processing (register-shifted register) + +MOV_rxrr 1111 1010 0 shty:2 s:1 rm:4 1111 rd:4 0000 rs:4 \ + &s_rrr_shr rn=0 + +# Data-processing (immediate) + +%t32extrot 26:1 12:3 0:8 !function=t32_expandimm_rot +%t32extimm 26:1 12:3 0:8 !function=t32_expandimm_imm + +@s_rri_rot ....... .... s:1 rn:4 . ... rd:4 ........ \ + &s_rri_rot imm=%t32extimm rot=%t32extrot +@s_rxi_rot ....... .... s:1 .... . ... rd:4 ........ \ + &s_rri_rot imm=%t32extimm rot=%t32extrot rn=0 +@S_xri_rot ....... .... . rn:4 . ... .... ........ \ + &s_rri_rot imm=%t32extimm rot=%t32extrot s=1 rd=0 + +{ + TST_xri 1111 0.0 0000 1 .... 0 ... 1111 ........ @S_xri_rot + AND_rri 1111 0.0 0000 . .... 0 ... .... ........ @s_rri_rot +} +BIC_rri 1111 0.0 0001 . .... 0 ... .... ........ @s_rri_rot +{ + MOV_rxi 1111 0.0 0010 . 1111 0 ... .... ........ @s_rxi_rot + ORR_rri 1111 0.0 0010 . .... 0 ... .... ........ @s_rri_rot +} +{ + MVN_rxi 1111 0.0 0011 . 1111 0 ... .... ........ @s_rxi_rot + ORN_rri 1111 0.0 0011 . .... 0 ... .... ........ @s_rri_rot +} +{ + TEQ_xri 1111 0.0 0100 1 .... 0 ... 1111 ........ @S_xri_rot + EOR_rri 1111 0.0 0100 . .... 0 ... .... ........ @s_rri_rot +} +{ + CMN_xri 1111 0.0 1000 1 .... 0 ... 1111 ........ @S_xri_rot + ADD_rri 1111 0.0 1000 . .... 0 ... .... ........ @s_rri_rot +} +ADC_rri 1111 0.0 1010 . .... 0 ... .... ........ @s_rri_rot +SBC_rri 1111 0.0 1011 . .... 0 ... .... ........ @s_rri_rot +{ + CMP_xri 1111 0.0 1101 1 .... 0 ... 1111 ........ @S_xri_rot + SUB_rri 1111 0.0 1101 . .... 0 ... .... ........ @s_rri_rot +} +RSB_rri 1111 0.0 1110 . .... 0 ... .... ........ @s_rri_rot + +# Data processing (plain binary immediate) + +%imm12_26_12_0 26:1 12:3 0:8 +%neg12_26_12_0 26:1 12:3 0:8 !function=negate +@s0_rri_12 .... ... .... . rn:4 . ... rd:4 ........ \ + &s_rri_rot imm=%imm12_26_12_0 rot=0 s=0 + +{ + ADR 1111 0.1 0000 0 1111 0 ... rd:4 ........ \ + &ri imm=%imm12_26_12_0 + ADD_rri 1111 0.1 0000 0 .... 0 ... .... ........ @s0_rri_12 +} +{ + ADR 1111 0.1 0101 0 1111 0 ... rd:4 ........ \ + &ri imm=%neg12_26_12_0 + SUB_rri 1111 0.1 0101 0 .... 0 ... .... ........ 
@s0_rri_12 +} + +# Move Wide + +%imm16_26_16_12_0 16:4 26:1 12:3 0:8 +@mov16 .... .... .... .... .... rd:4 .... .... \ + &ri imm=%imm16_26_16_12_0 + +MOVW 1111 0.10 0100 .... 0 ... .... ........ @mov16 +MOVT 1111 0.10 1100 .... 0 ... .... ........ @mov16 + +# Saturate, bitfield + +@sat .... .... .. sh:1 . rn:4 . ... rd:4 .. . satimm:5 \ + &sat imm=%imm5_12_6 +@sat16 .... .... .. . . rn:4 . ... rd:4 .. . satimm:5 \ + &sat sh=0 imm=0 + +{ + SSAT16 1111 0011 001 0 .... 0 000 .... 00 0 ..... @sat16 + SSAT 1111 0011 00. 0 .... 0 ... .... .. 0 ..... @sat +} +{ + USAT16 1111 0011 101 0 .... 0 000 .... 00 0 ..... @sat16 + USAT 1111 0011 10. 0 .... 0 ... .... .. 0 ..... @sat +} + +@bfx .... .... ... . rn:4 . ... rd:4 .. . widthm1:5 \ + &bfx lsb=%imm5_12_6 +@bfi .... .... ... . rn:4 . ... rd:4 .. . msb:5 \ + &bfi lsb=%imm5_12_6 + +SBFX 1111 0011 010 0 .... 0 ... .... ..0..... @bfx +UBFX 1111 0011 110 0 .... 0 ... .... ..0..... @bfx + +# bfc is bfi w/ rn=15 +BFCI 1111 0011 011 0 .... 0 ... .... ..0..... @bfi + +# Multiply and multiply accumulate + +@s0_rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &s_rrrr s=0 +@s0_rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &s_rrrr ra=0 s=0 +@rnadm .... .... .... rn:4 ra:4 rd:4 .... rm:4 &rrrr +@rn0dm .... .... .... rn:4 .... rd:4 .... rm:4 &rrrr ra=0 +@rndm .... .... .... rn:4 .... rd:4 .... rm:4 &rrr +@rdm .... .... .... .... .... rd:4 .... rm:4 &rr + +{ + MUL 1111 1011 0000 .... 1111 .... 0000 .... @s0_rn0dm + MLA 1111 1011 0000 .... .... .... 0000 .... @s0_rnadm +} +MLS 1111 1011 0000 .... .... .... 0001 .... @rnadm +SMULL 1111 1011 1000 .... .... .... 0000 .... @s0_rnadm +UMULL 1111 1011 1010 .... .... .... 0000 .... @s0_rnadm +SMLAL 1111 1011 1100 .... .... .... 0000 .... @s0_rnadm +UMLAL 1111 1011 1110 .... .... .... 0000 .... @s0_rnadm +UMAAL 1111 1011 1110 .... .... .... 0110 .... @rnadm +{ + SMULWB 1111 1011 0011 .... 1111 .... 0000 .... @rn0dm + SMLAWB 1111 1011 0011 .... .... .... 0000 .... @rnadm +} +{ + SMULWT 1111 1011 0011 .... 1111 .... 0001 .... @rn0dm + SMLAWT 1111 1011 0011 .... .... .... 0001 .... @rnadm +} +{ + SMULBB 1111 1011 0001 .... 1111 .... 0000 .... @rn0dm + SMLABB 1111 1011 0001 .... .... .... 0000 .... @rnadm +} +{ + SMULBT 1111 1011 0001 .... 1111 .... 0001 .... @rn0dm + SMLABT 1111 1011 0001 .... .... .... 0001 .... @rnadm +} +{ + SMULTB 1111 1011 0001 .... 1111 .... 0010 .... @rn0dm + SMLATB 1111 1011 0001 .... .... .... 0010 .... @rnadm +} +{ + SMULTT 1111 1011 0001 .... 1111 .... 0011 .... @rn0dm + SMLATT 1111 1011 0001 .... .... .... 0011 .... @rnadm +} +SMLALBB 1111 1011 1100 .... .... .... 1000 .... @rnadm +SMLALBT 1111 1011 1100 .... .... .... 1001 .... @rnadm +SMLALTB 1111 1011 1100 .... .... .... 1010 .... @rnadm +SMLALTT 1111 1011 1100 .... .... .... 1011 .... @rnadm + +# usad8 is usada8 w/ ra=15 +USADA8 1111 1011 0111 .... .... .... 0000 .... @rnadm + +SMLAD 1111 1011 0010 .... .... .... 0000 .... @rnadm +SMLADX 1111 1011 0010 .... .... .... 0001 .... @rnadm +SMLSD 1111 1011 0100 .... .... .... 0000 .... @rnadm +SMLSDX 1111 1011 0100 .... .... .... 0001 .... @rnadm + +SMLALD 1111 1011 1100 .... .... .... 1100 .... @rnadm +SMLALDX 1111 1011 1100 .... .... .... 1101 .... @rnadm +SMLSLD 1111 1011 1101 .... .... .... 1100 .... @rnadm +SMLSLDX 1111 1011 1101 .... .... .... 1101 .... @rnadm + +SMMLA 1111 1011 0101 .... .... .... 0000 .... @rnadm +SMMLAR 1111 1011 0101 .... .... .... 0001 .... @rnadm +SMMLS 1111 1011 0110 .... .... .... 0000 .... @rnadm +SMMLSR 1111 1011 0110 .... .... .... 0001 .... 
@rnadm + +SDIV 1111 1011 1001 .... 1111 .... 1111 .... @rndm +UDIV 1111 1011 1011 .... 1111 .... 1111 .... @rndm + +# Data-processing (two source registers) + +QADD 1111 1010 1000 .... 1111 .... 1000 .... @rndm +QSUB 1111 1010 1000 .... 1111 .... 1010 .... @rndm +QDADD 1111 1010 1000 .... 1111 .... 1001 .... @rndm +QDSUB 1111 1010 1000 .... 1111 .... 1011 .... @rndm + +CRC32B 1111 1010 1100 .... 1111 .... 1000 .... @rndm +CRC32H 1111 1010 1100 .... 1111 .... 1001 .... @rndm +CRC32W 1111 1010 1100 .... 1111 .... 1010 .... @rndm +CRC32CB 1111 1010 1101 .... 1111 .... 1000 .... @rndm +CRC32CH 1111 1010 1101 .... 1111 .... 1001 .... @rndm +CRC32CW 1111 1010 1101 .... 1111 .... 1010 .... @rndm + +SEL 1111 1010 1010 .... 1111 .... 1000 .... @rndm + +# Note rn != rm is CONSTRAINED UNPREDICTABLE; we choose to ignore rn. +REV 1111 1010 1001 ---- 1111 .... 1000 .... @rdm +REV16 1111 1010 1001 ---- 1111 .... 1001 .... @rdm +RBIT 1111 1010 1001 ---- 1111 .... 1010 .... @rdm +REVSH 1111 1010 1001 ---- 1111 .... 1011 .... @rdm +CLZ 1111 1010 1011 ---- 1111 .... 1000 .... @rdm + +# Branches and miscellaneous control + +%msr_sysm 4:1 8:4 +%mrs_sysm 4:1 16:4 +%imm16_16_0 16:4 0:12 +%imm21 26:s1 11:1 13:1 16:6 0:11 !function=times_2 +&ci cond imm + +{ + # Group insn[25:23] = 111, which is cond=111x for the branch below, + # or unconditional, which would be illegal for the branch. + [ + # Hints, and CPS + { + [ + YIELD 1111 0011 1010 1111 1000 0000 0000 0001 + WFE 1111 0011 1010 1111 1000 0000 0000 0010 + WFI 1111 0011 1010 1111 1000 0000 0000 0011 + + # TODO: Implement SEV, SEVL; may help SMP performance. + # SEV 1111 0011 1010 1111 1000 0000 0000 0100 + # SEVL 1111 0011 1010 1111 1000 0000 0000 0101 + + ESB 1111 0011 1010 1111 1000 0000 0001 0000 + ] + + # The canonical nop ends in 0000 0000, but the whole rest + # of the space is "reserved hint, behaves as nop". + NOP 1111 0011 1010 1111 1000 0000 ---- ---- + + # If imod == '00' && M == '0' then SEE "Hint instructions", above. + CPS 1111 0011 1010 1111 1000 0 imod:2 M:1 A:1 I:1 F:1 mode:5 \ + &cps + } + + # Miscellaneous control + CLREX 1111 0011 1011 1111 1000 1111 0010 1111 + DSB 1111 0011 1011 1111 1000 1111 0100 ---- + DMB 1111 0011 1011 1111 1000 1111 0101 ---- + ISB 1111 0011 1011 1111 1000 1111 0110 ---- + SB 1111 0011 1011 1111 1000 1111 0111 0000 + + # Note that the v7m insn overlaps both the normal and banked insn. + { + MRS_bank 1111 0011 111 r:1 .... 1000 rd:4 001. 0000 \ + &mrs_bank sysm=%mrs_sysm + MRS_reg 1111 0011 111 r:1 1111 1000 rd:4 0000 0000 &mrs_reg + MRS_v7m 1111 0011 111 0 1111 1000 rd:4 sysm:8 + } + { + MSR_bank 1111 0011 100 r:1 rn:4 1000 .... 001. 0000 \ + &msr_bank sysm=%msr_sysm + MSR_reg 1111 0011 100 r:1 rn:4 1000 mask:4 0000 0000 &msr_reg + MSR_v7m 1111 0011 100 0 rn:4 1000 mask:2 00 sysm:8 + } + BXJ 1111 0011 1100 rm:4 1000 1111 0000 0000 &r + { + # At v6T2, this is the T5 encoding of SUBS PC, LR, #IMM, and works as for + # every other encoding of SUBS. With v7VE, IMM=0 is redefined as ERET. + # The distinction between the two only matters for Hyp mode. + ERET 1111 0011 1101 1110 1000 1111 0000 0000 + SUB_rri 1111 0011 1101 1110 1000 1111 imm:8 \ + &s_rri_rot rot=0 s=1 rd=15 rn=14 + } + SMC 1111 0111 1111 imm:4 1000 0000 0000 0000 &i + HVC 1111 0111 1110 .... 1000 .... .... .... \ + &i imm=%imm16_16_0 + UDF 1111 0111 1111 ---- 1010 ---- ---- ---- + ] + B_cond_thumb 1111 0. cond:4 ...... 10.0 ............ &ci imm=%imm21 +} + +# Load/store (register, immediate, literal) + +@ldst_rr .... .... .... rn:4 rt:4 ...... 
shimm:2 rm:4 \ + &ldst_rr p=1 w=0 u=1 shtype=0 +@ldst_ri_idx .... .... .... rn:4 rt:4 . p:1 u:1 . imm:8 \ + &ldst_ri w=1 +@ldst_ri_neg .... .... .... rn:4 rt:4 .... imm:8 \ + &ldst_ri p=1 w=0 u=0 +@ldst_ri_unp .... .... .... rn:4 rt:4 .... imm:8 \ + &ldst_ri p=1 w=0 u=1 +@ldst_ri_pos .... .... .... rn:4 rt:4 imm:12 \ + &ldst_ri p=1 w=0 u=1 +@ldst_ri_lit .... .... u:1 ... .... rt:4 imm:12 \ + &ldst_ri p=1 w=0 rn=15 + +STRB_rr 1111 1000 0000 .... .... 000000 .. .... @ldst_rr +STRB_ri 1111 1000 0000 .... .... 1..1 ........ @ldst_ri_idx +STRB_ri 1111 1000 0000 .... .... 1100 ........ @ldst_ri_neg +STRBT_ri 1111 1000 0000 .... .... 1110 ........ @ldst_ri_unp +STRB_ri 1111 1000 1000 .... .... ............ @ldst_ri_pos + +STRH_rr 1111 1000 0010 .... .... 000000 .. .... @ldst_rr +STRH_ri 1111 1000 0010 .... .... 1..1 ........ @ldst_ri_idx +STRH_ri 1111 1000 0010 .... .... 1100 ........ @ldst_ri_neg +STRHT_ri 1111 1000 0010 .... .... 1110 ........ @ldst_ri_unp +STRH_ri 1111 1000 1010 .... .... ............ @ldst_ri_pos + +STR_rr 1111 1000 0100 .... .... 000000 .. .... @ldst_rr +STR_ri 1111 1000 0100 .... .... 1..1 ........ @ldst_ri_idx +STR_ri 1111 1000 0100 .... .... 1100 ........ @ldst_ri_neg +STRT_ri 1111 1000 0100 .... .... 1110 ........ @ldst_ri_unp +STR_ri 1111 1000 1100 .... .... ............ @ldst_ri_pos + +# Note that Load, unsigned (literal) overlaps all other load encodings. +{ + { + NOP 1111 1000 -001 1111 1111 ------------ # PLD + LDRB_ri 1111 1000 .001 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1000 1001 ---- 1111 ------------ # PLD + LDRB_ri 1111 1000 1001 .... .... ............ @ldst_ri_pos + } + LDRB_ri 1111 1000 0001 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1000 0001 ---- 1111 1100 -------- # PLD + LDRB_ri 1111 1000 0001 .... .... 1100 ........ @ldst_ri_neg + } + LDRBT_ri 1111 1000 0001 .... .... 1110 ........ @ldst_ri_unp + { + NOP 1111 1000 0001 ---- 1111 000000 -- ---- # PLD + LDRB_rr 1111 1000 0001 .... .... 000000 .. .... @ldst_rr + } +} +{ + { + NOP 1111 1000 -011 1111 1111 ------------ # PLD + LDRH_ri 1111 1000 .011 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1000 1011 ---- 1111 ------------ # PLDW + LDRH_ri 1111 1000 1011 .... .... ............ @ldst_ri_pos + } + LDRH_ri 1111 1000 0011 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1000 0011 ---- 1111 1100 -------- # PLDW + LDRH_ri 1111 1000 0011 .... .... 1100 ........ @ldst_ri_neg + } + LDRHT_ri 1111 1000 0011 .... .... 1110 ........ @ldst_ri_unp + { + NOP 1111 1000 0011 ---- 1111 000000 -- ---- # PLDW + LDRH_rr 1111 1000 0011 .... .... 000000 .. .... @ldst_rr + } +} +{ + LDR_ri 1111 1000 .101 1111 .... ............ @ldst_ri_lit + LDR_ri 1111 1000 1101 .... .... ............ @ldst_ri_pos + LDR_ri 1111 1000 0101 .... .... 1..1 ........ @ldst_ri_idx + LDR_ri 1111 1000 0101 .... .... 1100 ........ @ldst_ri_neg + LDRT_ri 1111 1000 0101 .... .... 1110 ........ @ldst_ri_unp + LDR_rr 1111 1000 0101 .... .... 000000 .. .... @ldst_rr +} +# NOPs here are PLI. +{ + { + NOP 1111 1001 -001 1111 1111 ------------ + LDRSB_ri 1111 1001 .001 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1001 1001 ---- 1111 ------------ + LDRSB_ri 1111 1001 1001 .... .... ............ @ldst_ri_pos + } + LDRSB_ri 1111 1001 0001 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1001 0001 ---- 1111 1100 -------- + LDRSB_ri 1111 1001 0001 .... .... 1100 ........ @ldst_ri_neg + } + LDRSBT_ri 1111 1001 0001 .... .... 1110 ........ 
@ldst_ri_unp + { + NOP 1111 1001 0001 ---- 1111 000000 -- ---- + LDRSB_rr 1111 1001 0001 .... .... 000000 .. .... @ldst_rr + } +} +# NOPs here are unallocated memory hints, treated as NOP. +{ + { + NOP 1111 1001 -011 1111 1111 ------------ + LDRSH_ri 1111 1001 .011 1111 .... ............ @ldst_ri_lit + } + { + NOP 1111 1001 1011 ---- 1111 ------------ + LDRSH_ri 1111 1001 1011 .... .... ............ @ldst_ri_pos + } + LDRSH_ri 1111 1001 0011 .... .... 1..1 ........ @ldst_ri_idx + { + NOP 1111 1001 0011 ---- 1111 1100 -------- + LDRSH_ri 1111 1001 0011 .... .... 1100 ........ @ldst_ri_neg + } + LDRSHT_ri 1111 1001 0011 .... .... 1110 ........ @ldst_ri_unp + { + NOP 1111 1001 0011 ---- 1111 000000 -- ---- + LDRSH_rr 1111 1001 0011 .... .... 000000 .. .... @ldst_rr + } +} + +%imm8x4 0:8 !function=times_4 +&ldst_ri2 p w u rn rt rt2 imm +@ldstd_ri8 .... .... u:1 ... rn:4 rt:4 rt2:4 ........ \ + &ldst_ri2 imm=%imm8x4 + +STRD_ri_t32 1110 1000 .110 .... .... .... ........ @ldstd_ri8 w=1 p=0 +LDRD_ri_t32 1110 1000 .111 .... .... .... ........ @ldstd_ri8 w=1 p=0 + +STRD_ri_t32 1110 1001 .100 .... .... .... ........ @ldstd_ri8 w=0 p=1 +LDRD_ri_t32 1110 1001 .101 .... .... .... ........ @ldstd_ri8 w=0 p=1 + +STRD_ri_t32 1110 1001 .110 .... .... .... ........ @ldstd_ri8 w=1 p=1 +{ + SG 1110 1001 0111 1111 1110 1001 01111111 + LDRD_ri_t32 1110 1001 .111 .... .... .... ........ @ldstd_ri8 w=1 p=1 +} + +# Load/Store Exclusive, Load-Acquire/Store-Release, and Table Branch + +@strex_i .... .... .... rn:4 rt:4 rd:4 .... .... \ + &strex rt2=15 imm=%imm8x4 +@strex_0 .... .... .... rn:4 rt:4 .... .... rd:4 \ + &strex rt2=15 imm=0 +@strex_d .... .... .... rn:4 rt:4 rt2:4 .... rd:4 \ + &strex imm=0 + +@ldrex_i .... .... .... rn:4 rt:4 .... .... .... \ + &ldrex rt2=15 imm=%imm8x4 +@ldrex_0 .... .... .... rn:4 rt:4 .... .... .... \ + &ldrex rt2=15 imm=0 +@ldrex_d .... .... .... rn:4 rt:4 rt2:4 .... .... \ + &ldrex imm=0 + +{ + TT 1110 1000 0100 rn:4 1111 rd:4 A:1 T:1 000000 + STREX 1110 1000 0100 .... .... .... .... .... @strex_i +} +STREXB 1110 1000 1100 .... .... 1111 0100 .... @strex_0 +STREXH 1110 1000 1100 .... .... 1111 0101 .... @strex_0 +STREXD_t32 1110 1000 1100 .... .... .... 0111 .... @strex_d + +STLEX 1110 1000 1100 .... .... 1111 1110 .... @strex_0 +STLEXB 1110 1000 1100 .... .... 1111 1100 .... @strex_0 +STLEXH 1110 1000 1100 .... .... 1111 1101 .... @strex_0 +STLEXD_t32 1110 1000 1100 .... .... .... 1111 .... @strex_d + +STL 1110 1000 1100 .... .... 1111 1010 1111 @ldrex_0 +STLB 1110 1000 1100 .... .... 1111 1000 1111 @ldrex_0 +STLH 1110 1000 1100 .... .... 1111 1001 1111 @ldrex_0 + +LDREX 1110 1000 0101 .... .... 1111 .... .... @ldrex_i +LDREXB 1110 1000 1101 .... .... 1111 0100 1111 @ldrex_0 +LDREXH 1110 1000 1101 .... .... 1111 0101 1111 @ldrex_0 +LDREXD_t32 1110 1000 1101 .... .... .... 0111 1111 @ldrex_d + +LDAEX 1110 1000 1101 .... .... 1111 1110 1111 @ldrex_0 +LDAEXB 1110 1000 1101 .... .... 1111 1100 1111 @ldrex_0 +LDAEXH 1110 1000 1101 .... .... 1111 1101 1111 @ldrex_0 +LDAEXD_t32 1110 1000 1101 .... .... .... 1111 1111 @ldrex_d + +LDA 1110 1000 1101 .... .... 1111 1010 1111 @ldrex_0 +LDAB 1110 1000 1101 .... .... 1111 1000 1111 @ldrex_0 +LDAH 1110 1000 1101 .... .... 1111 1001 1111 @ldrex_0 + +&tbranch rn rm +@tbranch .... .... .... rn:4 .... .... .... rm:4 &tbranch + +TBB 1110 1000 1101 .... 1111 0000 0000 .... @tbranch +TBH 1110 1000 1101 .... 1111 0000 0001 .... @tbranch + +# Parallel addition and subtraction + +SADD8 1111 1010 1000 .... 1111 .... 0000 .... 
@rndm +QADD8 1111 1010 1000 .... 1111 .... 0001 .... @rndm +SHADD8 1111 1010 1000 .... 1111 .... 0010 .... @rndm +UADD8 1111 1010 1000 .... 1111 .... 0100 .... @rndm +UQADD8 1111 1010 1000 .... 1111 .... 0101 .... @rndm +UHADD8 1111 1010 1000 .... 1111 .... 0110 .... @rndm + +SADD16 1111 1010 1001 .... 1111 .... 0000 .... @rndm +QADD16 1111 1010 1001 .... 1111 .... 0001 .... @rndm +SHADD16 1111 1010 1001 .... 1111 .... 0010 .... @rndm +UADD16 1111 1010 1001 .... 1111 .... 0100 .... @rndm +UQADD16 1111 1010 1001 .... 1111 .... 0101 .... @rndm +UHADD16 1111 1010 1001 .... 1111 .... 0110 .... @rndm + +SASX 1111 1010 1010 .... 1111 .... 0000 .... @rndm +QASX 1111 1010 1010 .... 1111 .... 0001 .... @rndm +SHASX 1111 1010 1010 .... 1111 .... 0010 .... @rndm +UASX 1111 1010 1010 .... 1111 .... 0100 .... @rndm +UQASX 1111 1010 1010 .... 1111 .... 0101 .... @rndm +UHASX 1111 1010 1010 .... 1111 .... 0110 .... @rndm + +SSUB8 1111 1010 1100 .... 1111 .... 0000 .... @rndm +QSUB8 1111 1010 1100 .... 1111 .... 0001 .... @rndm +SHSUB8 1111 1010 1100 .... 1111 .... 0010 .... @rndm +USUB8 1111 1010 1100 .... 1111 .... 0100 .... @rndm +UQSUB8 1111 1010 1100 .... 1111 .... 0101 .... @rndm +UHSUB8 1111 1010 1100 .... 1111 .... 0110 .... @rndm + +SSUB16 1111 1010 1101 .... 1111 .... 0000 .... @rndm +QSUB16 1111 1010 1101 .... 1111 .... 0001 .... @rndm +SHSUB16 1111 1010 1101 .... 1111 .... 0010 .... @rndm +USUB16 1111 1010 1101 .... 1111 .... 0100 .... @rndm +UQSUB16 1111 1010 1101 .... 1111 .... 0101 .... @rndm +UHSUB16 1111 1010 1101 .... 1111 .... 0110 .... @rndm + +SSAX 1111 1010 1110 .... 1111 .... 0000 .... @rndm +QSAX 1111 1010 1110 .... 1111 .... 0001 .... @rndm +SHSAX 1111 1010 1110 .... 1111 .... 0010 .... @rndm +USAX 1111 1010 1110 .... 1111 .... 0100 .... @rndm +UQSAX 1111 1010 1110 .... 1111 .... 0101 .... @rndm +UHSAX 1111 1010 1110 .... 1111 .... 0110 .... @rndm + +# Register extends + +@rrr_rot .... .... .... rn:4 .... rd:4 .. rot:2 rm:4 &rrr_rot + +SXTAH 1111 1010 0000 .... 1111 .... 10.. .... @rrr_rot +UXTAH 1111 1010 0001 .... 1111 .... 10.. .... @rrr_rot +SXTAB16 1111 1010 0010 .... 1111 .... 10.. .... @rrr_rot +UXTAB16 1111 1010 0011 .... 1111 .... 10.. .... @rrr_rot +SXTAB 1111 1010 0100 .... 1111 .... 10.. .... @rrr_rot +UXTAB 1111 1010 0101 .... 1111 .... 10.. .... @rrr_rot + +# Load/store multiple + +@ldstm .... .... .. w:1 . rn:4 list:16 &ldst_block u=0 + +STM_t32 1110 1000 10.0 .... ................ @ldstm i=1 b=0 +STM_t32 1110 1001 00.0 .... ................ @ldstm i=0 b=1 +{ + # Rn=15 UNDEFs for LDM; M-profile CLRM uses that encoding + CLRM 1110 1000 1001 1111 list:16 + LDM_t32 1110 1000 10.1 .... ................ @ldstm i=1 b=0 +} +LDM_t32 1110 1001 00.1 .... ................ @ldstm i=0 b=1 + +&rfe !extern rn w pu +@rfe .... .... .. w:1 . rn:4 ................ &rfe + +RFE 1110 1000 00.1 .... 1100000000000000 @rfe pu=2 +RFE 1110 1001 10.1 .... 1100000000000000 @rfe pu=1 + +&srs !extern mode w pu +@srs .... .... .. w:1 . .... ........... mode:5 &srs + +SRS 1110 1000 00.0 1101 1100 0000 000. .... @srs pu=2 +SRS 1110 1001 10.0 1101 1100 0000 000. .... @srs pu=1 + +# Coprocessor instructions + +# We decode MCR, MCR, MRRC and MCRR only, because for QEMU the +# other coprocessor instructions always UNDEF. +# The trans_ functions for these will ignore cp values 8..13 for v7 or +# earlier, and 0..13 for v8 and later, because those areas of the +# encoding space may be used for other things, such as VFP or Neon. + +@mcr .... .... opc1:3 . crn:4 rt:4 cp:4 opc2:3 . crm:4 +@mcrr .... .... .... 
rt2:4 rt:4 cp:4 opc1:4 crm:4 + +MCRR 1110 1100 0100 .... .... .... .... .... @mcrr +MRRC 1110 1100 0101 .... .... .... .... .... @mcrr + +MCR 1110 1110 ... 0 .... .... .... ... 1 .... @mcr +MRC 1110 1110 ... 1 .... .... .... ... 1 .... @mcr + +# Branches + +%imm24 26:s1 13:1 11:1 16:10 0:11 !function=t32_branch24 +@branch24 ................................ &i imm=%imm24 + +B 1111 0. .......... 10.1 ............ @branch24 +BL 1111 0. .......... 11.1 ............ @branch24 +{ + # BLX_i is non-M-profile only + BLX_i 1111 0. .......... 11.0 ............ @branch24 + # M-profile only: loop and branch insns + [ + # All these BF insns have boff != 0b0000; we NOP them all + BF 1111 0 boff:4 ------- 1100 - ---------- 1 # BFL + BF 1111 0 boff:4 0 ------ 1110 - ---------- 1 # BFCSEL + BF 1111 0 boff:4 10 ----- 1110 - ---------- 1 # BF + BF 1111 0 boff:4 11 ----- 1110 0 0000000000 1 # BFX, BFLX + ] + [ + # LE and WLS immediate + %lob_imm 1:10 11:1 !function=times_2 + + DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001 size=4 + WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm size=4 + { + LE 1111 0 0000 0 f:1 tp:1 1111 1100 . .......... 1 imm=%lob_imm + # This is WLSTP + WLS 1111 0 0000 0 size:2 rn:4 1100 . .......... 1 imm=%lob_imm + } + { + LCTP 1111 0 0000 000 1111 1110 0000 0000 0001 + # This is DLSTP + DLS 1111 0 0000 0 size:2 rn:4 1110 0000 0000 0001 + } + VCTP 1111 0 0000 0 size:2 rn:4 1110 1000 0000 0001 + ] +} diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c new file mode 100644 index 0000000..da9f877 --- /dev/null +++ b/target/arm/tcg/translate-a64.c @@ -0,0 +1,15054 @@ +/* + * AArch64 translation + * + * Copyright (c) 2013 Alexander Graf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/exec-all.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "qemu/log.h" +#include "arm_ldst.h" +#include "translate.h" +#include "internals.h" +#include "qemu/host-utils.h" +#include "semihosting/semihost.h" +#include "exec/gen-icount.h" +#include "exec/helper-proto.h" +#include "exec/helper-gen.h" +#include "exec/log.h" +#include "cpregs.h" +#include "translate-a64.h" +#include "qemu/atomic128.h" + +static TCGv_i64 cpu_X[32]; +static TCGv_i64 cpu_pc; + +/* Load/store exclusive handling */ +static TCGv_i64 cpu_exclusive_high; + +static const char *regnames[] = { + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28", "x29", "lr", "sp" +}; + +enum a64_shift_type { + A64_SHIFT_TYPE_LSL = 0, + A64_SHIFT_TYPE_LSR = 1, + A64_SHIFT_TYPE_ASR = 2, + A64_SHIFT_TYPE_ROR = 3 +}; + +/* Table based decoder typedefs - used when the relevant bits for decode + * are too awkwardly scattered across the instruction (eg SIMD). 
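+ * A consumer scans such a table in order and dispatches to the first
+ * entry whose masked bits match, roughly:
+ *
+ *     for (tptr = table; tptr->disas_fn; tptr++) {
+ *         if ((insn & tptr->mask) == tptr->pattern) {
+ *             return tptr->disas_fn;
+ *         }
+ *     }
+ *
+ * (tptr is only an illustrative local here; see the table lookup
+ * helper further down in this file.)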
+ */ +typedef void AArch64DecodeFn(DisasContext *s, uint32_t insn); + +typedef struct AArch64DecodeTable { + uint32_t pattern; + uint32_t mask; + AArch64DecodeFn *disas_fn; +} AArch64DecodeTable; + +/* initialize TCG globals. */ +void a64_translate_init(void) +{ + int i; + + cpu_pc = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, pc), + "pc"); + for (i = 0; i < 32; i++) { + cpu_X[i] = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, xregs[i]), + regnames[i]); + } + + cpu_exclusive_high = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, exclusive_high), "exclusive_high"); +} + +/* + * Return the core mmu_idx to use for A64 "unprivileged load/store" insns + */ +static int get_a64_user_mem_index(DisasContext *s) +{ + /* + * If AccType_UNPRIV is not used, the insn uses AccType_NORMAL, + * which is the usual mmu_idx for this cpu state. + */ + ARMMMUIdx useridx = s->mmu_idx; + + if (s->unpriv) { + /* + * We have pre-computed the condition for AccType_UNPRIV. + * Therefore we should never get here with a mmu_idx for + * which we do not know the corresponding user mmu_idx. + */ + switch (useridx) { + case ARMMMUIdx_E10_1: + case ARMMMUIdx_E10_1_PAN: + useridx = ARMMMUIdx_E10_0; + break; + case ARMMMUIdx_E20_2: + case ARMMMUIdx_E20_2_PAN: + useridx = ARMMMUIdx_E20_0; + break; + default: + g_assert_not_reached(); + } + } + return arm_to_core_mmu_idx(useridx); +} + +static void set_btype_raw(int val) +{ + tcg_gen_st_i32(tcg_constant_i32(val), cpu_env, + offsetof(CPUARMState, btype)); +} + +static void set_btype(DisasContext *s, int val) +{ + /* BTYPE is a 2-bit field, and 0 should be done with reset_btype. */ + tcg_debug_assert(val >= 1 && val <= 3); + set_btype_raw(val); + s->btype = -1; +} + +static void reset_btype(DisasContext *s) +{ + if (s->btype != 0) { + set_btype_raw(0); + s->btype = 0; + } +} + +static void gen_pc_plus_diff(DisasContext *s, TCGv_i64 dest, target_long diff) +{ + assert(s->pc_save != -1); + if (TARGET_TB_PCREL) { + tcg_gen_addi_i64(dest, cpu_pc, (s->pc_curr - s->pc_save) + diff); + } else { + tcg_gen_movi_i64(dest, s->pc_curr + diff); + } +} + +void gen_a64_update_pc(DisasContext *s, target_long diff) +{ + gen_pc_plus_diff(s, cpu_pc, diff); + s->pc_save = s->pc_curr + diff; +} + +/* + * Handle Top Byte Ignore (TBI) bits. + * + * If address tagging is enabled via the TCR TBI bits: + * + for EL2 and EL3 there is only one TBI bit, and if it is set + * then the address is zero-extended, clearing bits [63:56] + * + for EL0 and EL1, TBI0 controls addresses with bit 55 == 0 + * and TBI1 controls addressses with bit 55 == 1. + * If the appropriate TBI bit is set for the address then + * the address is sign-extended from bit 55 into bits [63:56] + * + * Here We have concatenated TBI{1,0} into tbi. + */ +static void gen_top_byte_ignore(DisasContext *s, TCGv_i64 dst, + TCGv_i64 src, int tbi) +{ + if (tbi == 0) { + /* Load unmodified address */ + tcg_gen_mov_i64(dst, src); + } else if (!regime_has_2_ranges(s->mmu_idx)) { + /* Force tag byte to all zero */ + tcg_gen_extract_i64(dst, src, 0, 56); + } else { + /* Sign-extend from bit 55. 
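+         * After the sextract below, bits [63:56] of dst are copies of
+         * bit 55, so the AND/OR in the tbi == 1 and tbi == 2 cases
+         * picks between the sign-extended and the original top byte.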
*/ + tcg_gen_sextract_i64(dst, src, 0, 56); + + switch (tbi) { + case 1: + /* tbi0 but !tbi1: only use the extension if positive */ + tcg_gen_and_i64(dst, dst, src); + break; + case 2: + /* !tbi0 but tbi1: only use the extension if negative */ + tcg_gen_or_i64(dst, dst, src); + break; + case 3: + /* tbi0 and tbi1: always use the extension */ + break; + default: + g_assert_not_reached(); + } + } +} + +static void gen_a64_set_pc(DisasContext *s, TCGv_i64 src) +{ + /* + * If address tagging is enabled for instructions via the TCR TBI bits, + * then loading an address into the PC will clear out any tag. + */ + gen_top_byte_ignore(s, cpu_pc, src, s->tbii); + s->pc_save = -1; +} + +/* + * Handle MTE and/or TBI. + * + * For TBI, ideally, we would do nothing. Proper behaviour on fault is + * for the tag to be present in the FAR_ELx register. But for user-only + * mode we do not have a TLB with which to implement this, so we must + * remove the top byte now. + * + * Always return a fresh temporary that we can increment independently + * of the write-back address. + */ + +TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr) +{ + TCGv_i64 clean = new_tmp_a64(s); +#ifdef CONFIG_USER_ONLY + gen_top_byte_ignore(s, clean, addr, s->tbid); +#else + tcg_gen_mov_i64(clean, addr); +#endif + return clean; +} + +/* Insert a zero tag into src, with the result at dst. */ +static void gen_address_with_allocation_tag0(TCGv_i64 dst, TCGv_i64 src) +{ + tcg_gen_andi_i64(dst, src, ~MAKE_64BIT_MASK(56, 4)); +} + +static void gen_probe_access(DisasContext *s, TCGv_i64 ptr, + MMUAccessType acc, int log2_size) +{ + gen_helper_probe_access(cpu_env, ptr, + tcg_constant_i32(acc), + tcg_constant_i32(get_mem_index(s)), + tcg_constant_i32(1 << log2_size)); +} + +/* + * For MTE, check a single logical or atomic access. This probes a single + * address, the exact one specified. The size and alignment of the access + * is not relevant to MTE, per se, but watchpoints do require the size, + * and we want to recognize those before making any other changes to state. + */ +static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, + bool is_write, bool tag_checked, + int log2_size, bool is_unpriv, + int core_idx) +{ + if (tag_checked && s->mte_active[is_unpriv]) { + TCGv_i64 ret; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, core_idx); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << log2_size) - 1); + + ret = new_tmp_a64(s); + gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); + + return ret; + } + return clean_data_tbi(s, addr); +} + +TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_size) +{ + return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, log2_size, + false, get_mem_index(s)); +} + +/* + * For MTE, check multiple logical sequential accesses. 
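+ * Here 'size' is the byte count of the whole sequence, e.g. a pair of
+ * 64-bit registers would be checked with size = 16.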
+ */ +TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int size) +{ + if (tag_checked && s->mte_active[0]) { + TCGv_i64 ret; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, size - 1); + + ret = new_tmp_a64(s); + gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); + + return ret; + } + return clean_data_tbi(s, addr); +} + +typedef struct DisasCompare64 { + TCGCond cond; + TCGv_i64 value; +} DisasCompare64; + +static void a64_test_cc(DisasCompare64 *c64, int cc) +{ + DisasCompare c32; + + arm_test_cc(&c32, cc); + + /* Sign-extend the 32-bit value so that the GE/LT comparisons work + * properly. The NE/EQ comparisons are also fine with this choice. */ + c64->cond = c32.cond; + c64->value = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(c64->value, c32.value); + + arm_free_cc(&c32); +} + +static void a64_free_cc(DisasCompare64 *c64) +{ + tcg_temp_free_i64(c64->value); +} + +static void gen_rebuild_hflags(DisasContext *s) +{ + gen_helper_rebuild_hflags_a64(cpu_env, tcg_constant_i32(s->current_el)); +} + +static void gen_exception_internal(int excp) +{ + assert(excp_is_internal(excp)); + gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp)); +} + +static void gen_exception_internal_insn(DisasContext *s, int excp) +{ + gen_a64_update_pc(s, 0); + gen_exception_internal(excp); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syndrome) +{ + gen_a64_update_pc(s, 0); + gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syndrome)); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_step_complete_exception(DisasContext *s) +{ + /* We just completed step of an insn. Move from Active-not-pending + * to Active-pending, and then also take the swstep exception. + * This corresponds to making the (IMPDEF) choice to prioritize + * swstep exceptions over asynchronous exceptions taken to an exception + * level where debug is disabled. This choice has the advantage that + * we do not need to maintain internal state corresponding to the + * ISV/EX syndrome bits between completion of the step and generation + * of the exception, and our syndrome information is always correct. + */ + gen_ss_advance(s); + gen_swstep_exception(s, 1, s->is_ldex); + s->base.is_jmp = DISAS_NORETURN; +} + +static inline bool use_goto_tb(DisasContext *s, uint64_t dest) +{ + if (s->ss_active) { + return false; + } + return translator_use_goto_tb(&s->base, dest); +} + +static void gen_goto_tb(DisasContext *s, int n, int64_t diff) +{ + if (use_goto_tb(s, s->pc_curr + diff)) { + /* + * For pcrel, the pc must always be up-to-date on entry to + * the linked TB, so that it can use simple additions for all + * further adjustments. For !pcrel, the linked TB is compiled + * to know its full virtual address, so we can delay the + * update to pc to the unlinked path. A long chain of links + * can thus avoid many updates to the PC. 
+ */ + if (TARGET_TB_PCREL) { + gen_a64_update_pc(s, diff); + tcg_gen_goto_tb(n); + } else { + tcg_gen_goto_tb(n); + gen_a64_update_pc(s, diff); + } + tcg_gen_exit_tb(s->base.tb, n); + s->base.is_jmp = DISAS_NORETURN; + } else { + gen_a64_update_pc(s, diff); + if (s->ss_active) { + gen_step_complete_exception(s); + } else { + tcg_gen_lookup_and_goto_ptr(); + s->base.is_jmp = DISAS_NORETURN; + } + } +} + +static void init_tmp_a64_array(DisasContext *s) +{ +#ifdef CONFIG_DEBUG_TCG + memset(s->tmp_a64, 0, sizeof(s->tmp_a64)); +#endif + s->tmp_a64_count = 0; +} + +static void free_tmp_a64(DisasContext *s) +{ + int i; + for (i = 0; i < s->tmp_a64_count; i++) { + tcg_temp_free_i64(s->tmp_a64[i]); + } + init_tmp_a64_array(s); +} + +TCGv_i64 new_tmp_a64(DisasContext *s) +{ + assert(s->tmp_a64_count < TMP_A64_MAX); + return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_new_i64(); +} + +TCGv_i64 new_tmp_a64_local(DisasContext *s) +{ + assert(s->tmp_a64_count < TMP_A64_MAX); + return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_local_new_i64(); +} + +TCGv_i64 new_tmp_a64_zero(DisasContext *s) +{ + TCGv_i64 t = new_tmp_a64(s); + tcg_gen_movi_i64(t, 0); + return t; +} + +/* + * Register access functions + * + * These functions are used for directly accessing a register in where + * changes to the final register value are likely to be made. If you + * need to use a register for temporary calculation (e.g. index type + * operations) use the read_* form. + * + * B1.2.1 Register mappings + * + * In instruction register encoding 31 can refer to ZR (zero register) or + * the SP (stack pointer) depending on context. In QEMU's case we map SP + * to cpu_X[31] and ZR accesses to a temporary which can be discarded. + * This is the point of the _sp forms. + */ +TCGv_i64 cpu_reg(DisasContext *s, int reg) +{ + if (reg == 31) { + return new_tmp_a64_zero(s); + } else { + return cpu_X[reg]; + } +} + +/* register access for when 31 == SP */ +TCGv_i64 cpu_reg_sp(DisasContext *s, int reg) +{ + return cpu_X[reg]; +} + +/* read a cpu register in 32bit/64bit mode. Returns a TCGv_i64 + * representing the register contents. This TCGv is an auto-freed + * temporary so it need not be explicitly freed, and may be modified. + */ +TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf) +{ + TCGv_i64 v = new_tmp_a64(s); + if (reg != 31) { + if (sf) { + tcg_gen_mov_i64(v, cpu_X[reg]); + } else { + tcg_gen_ext32u_i64(v, cpu_X[reg]); + } + } else { + tcg_gen_movi_i64(v, 0); + } + return v; +} + +TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf) +{ + TCGv_i64 v = new_tmp_a64(s); + if (sf) { + tcg_gen_mov_i64(v, cpu_X[reg]); + } else { + tcg_gen_ext32u_i64(v, cpu_X[reg]); + } + return v; +} + +/* Return the offset into CPUARMState of a slice (from + * the least significant end) of FP register Qn (ie + * Dn, Sn, Hn or Bn). + * (Note that this is not the same mapping as for A32; see cpu.h) + */ +static inline int fp_reg_offset(DisasContext *s, int regno, MemOp size) +{ + return vec_reg_offset(s, regno, 0, size); +} + +/* Offset of the high half of the 128 bit vector Qn */ +static inline int fp_reg_hi_offset(DisasContext *s, int regno) +{ + return vec_reg_offset(s, regno, 1, MO_64); +} + +/* Convenience accessors for reading and writing single and double + * FP registers. Writing clears the upper parts of the associated + * 128 bit vector register, as required by the architecture. + * Note that unlike the GP register accessors, the values returned + * by the read functions must be manually freed. 
+ */ +static TCGv_i64 read_fp_dreg(DisasContext *s, int reg) +{ + TCGv_i64 v = tcg_temp_new_i64(); + + tcg_gen_ld_i64(v, cpu_env, fp_reg_offset(s, reg, MO_64)); + return v; +} + +static TCGv_i32 read_fp_sreg(DisasContext *s, int reg) +{ + TCGv_i32 v = tcg_temp_new_i32(); + + tcg_gen_ld_i32(v, cpu_env, fp_reg_offset(s, reg, MO_32)); + return v; +} + +static TCGv_i32 read_fp_hreg(DisasContext *s, int reg) +{ + TCGv_i32 v = tcg_temp_new_i32(); + + tcg_gen_ld16u_i32(v, cpu_env, fp_reg_offset(s, reg, MO_16)); + return v; +} + +/* Clear the bits above an N-bit vector, for N = (is_q ? 128 : 64). + * If SVE is not enabled, then there are only 128 bits in the vector. + */ +static void clear_vec_high(DisasContext *s, bool is_q, int rd) +{ + unsigned ofs = fp_reg_offset(s, rd, MO_64); + unsigned vsz = vec_full_reg_size(s); + + /* Nop move, with side effect of clearing the tail. */ + tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz); +} + +void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v) +{ + unsigned ofs = fp_reg_offset(s, reg, MO_64); + + tcg_gen_st_i64(v, cpu_env, ofs); + clear_vec_high(s, false, reg); +} + +static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + + tcg_gen_extu_i32_i64(tmp, v); + write_fp_dreg(s, reg, tmp); + tcg_temp_free_i64(tmp); +} + +/* Expand a 2-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, + GVecGen2Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand + immediate AdvSIMD vector operation using + * an expander function. + */ +static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn, + int64_t imm, GVecGen2iFn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + imm, is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 3-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm, + GVecGen3Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 4-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn4(DisasContext *s, bool is_q, int rd, int rn, int rm, + int rx, GVecGen4Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), vec_full_reg_offset(s, rx), + is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand operation using an out-of-line helper. */ +static void gen_gvec_op2_ool(DisasContext *s, bool is_q, int rd, + int rn, int data, gen_helper_gvec_2 *fn) +{ + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* Expand a 3-operand operation using an out-of-line helper. */ +static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd, + int rn, int rm, int data, gen_helper_gvec_3 *fn) +{ + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* Expand a 3-operand + fpstatus pointer + simd data value operation using + * an out-of-line helper. 
+ */ +static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn, + int rm, bool is_fp16, int data, + gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); + tcg_temp_free_ptr(fpst); +} + +/* Expand a 3-operand + qc + operation using an out-of-line helper. */ +static void gen_gvec_op3_qc(DisasContext *s, bool is_q, int rd, int rn, + int rm, gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr qc_ptr = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc)); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), qc_ptr, + is_q ? 16 : 8, vec_full_reg_size(s), 0, fn); + tcg_temp_free_ptr(qc_ptr); +} + +/* Expand a 4-operand operation using an out-of-line helper. */ +static void gen_gvec_op4_ool(DisasContext *s, bool is_q, int rd, int rn, + int rm, int ra, int data, gen_helper_gvec_4 *fn) +{ + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + +/* + * Expand a 4-operand + fpstatus pointer + simd data value operation using + * an out-of-line helper. + */ +static void gen_gvec_op4_fpst(DisasContext *s, bool is_q, int rd, int rn, + int rm, int ra, bool is_fp16, int data, + gen_helper_gvec_4_ptr *fn) +{ + TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); + tcg_temp_free_ptr(fpst); +} + +/* Set ZF and NF based on a 64 bit result. This is alas fiddlier + * than the 32 bit equivalent. + */ +static inline void gen_set_NZ64(TCGv_i64 result) +{ + tcg_gen_extr_i64_i32(cpu_ZF, cpu_NF, result); + tcg_gen_or_i32(cpu_ZF, cpu_ZF, cpu_NF); +} + +/* Set NZCV as for a logical operation: NZ as per result, CV cleared. 
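+ * Used by the flag-setting logical forms (e.g. ANDS/BICS and the
+ * TST alias).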
*/ +static inline void gen_logic_CC(int sf, TCGv_i64 result) +{ + if (sf) { + gen_set_NZ64(result); + } else { + tcg_gen_extrl_i64_i32(cpu_ZF, result); + tcg_gen_mov_i32(cpu_NF, cpu_ZF); + } + tcg_gen_movi_i32(cpu_CF, 0); + tcg_gen_movi_i32(cpu_VF, 0); +} + +/* dest = T0 + T1; compute C, N, V and Z flags */ +static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + if (sf) { + TCGv_i64 result, flag, tmp; + result = tcg_temp_new_i64(); + flag = tcg_temp_new_i64(); + tmp = tcg_temp_new_i64(); + + tcg_gen_movi_i64(tmp, 0); + tcg_gen_add2_i64(result, flag, t0, tmp, t1, tmp); + + tcg_gen_extrl_i64_i32(cpu_CF, flag); + + gen_set_NZ64(result); + + tcg_gen_xor_i64(flag, result, t0); + tcg_gen_xor_i64(tmp, t0, t1); + tcg_gen_andc_i64(flag, flag, tmp); + tcg_temp_free_i64(tmp); + tcg_gen_extrh_i64_i32(cpu_VF, flag); + + tcg_gen_mov_i64(dest, result); + tcg_temp_free_i64(result); + tcg_temp_free_i64(flag); + } else { + /* 32 bit arithmetic */ + TCGv_i32 t0_32 = tcg_temp_new_i32(); + TCGv_i32 t1_32 = tcg_temp_new_i32(); + TCGv_i32 tmp = tcg_temp_new_i32(); + + tcg_gen_movi_i32(tmp, 0); + tcg_gen_extrl_i64_i32(t0_32, t0); + tcg_gen_extrl_i64_i32(t1_32, t1); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, tmp, t1_32, tmp); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); + tcg_gen_xor_i32(tmp, t0_32, t1_32); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_gen_extu_i32_i64(dest, cpu_NF); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(t0_32); + tcg_temp_free_i32(t1_32); + } +} + +/* dest = T0 - T1; compute C, N, V and Z flags */ +static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + if (sf) { + /* 64 bit arithmetic */ + TCGv_i64 result, flag, tmp; + + result = tcg_temp_new_i64(); + flag = tcg_temp_new_i64(); + tcg_gen_sub_i64(result, t0, t1); + + gen_set_NZ64(result); + + tcg_gen_setcond_i64(TCG_COND_GEU, flag, t0, t1); + tcg_gen_extrl_i64_i32(cpu_CF, flag); + + tcg_gen_xor_i64(flag, result, t0); + tmp = tcg_temp_new_i64(); + tcg_gen_xor_i64(tmp, t0, t1); + tcg_gen_and_i64(flag, flag, tmp); + tcg_temp_free_i64(tmp); + tcg_gen_extrh_i64_i32(cpu_VF, flag); + tcg_gen_mov_i64(dest, result); + tcg_temp_free_i64(flag); + tcg_temp_free_i64(result); + } else { + /* 32 bit arithmetic */ + TCGv_i32 t0_32 = tcg_temp_new_i32(); + TCGv_i32 t1_32 = tcg_temp_new_i32(); + TCGv_i32 tmp; + + tcg_gen_extrl_i64_i32(t0_32, t0); + tcg_gen_extrl_i64_i32(t1_32, t1); + tcg_gen_sub_i32(cpu_NF, t0_32, t1_32); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0_32, t1_32); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, t0_32, t1_32); + tcg_temp_free_i32(t0_32); + tcg_temp_free_i32(t1_32); + tcg_gen_and_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_extu_i32_i64(dest, cpu_NF); + } +} + +/* dest = T0 + T1 + CF; do not compute flags. */ +static void gen_adc(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + TCGv_i64 flag = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(flag, cpu_CF); + tcg_gen_add_i64(dest, t0, t1); + tcg_gen_add_i64(dest, dest, flag); + tcg_temp_free_i64(flag); + + if (!sf) { + tcg_gen_ext32u_i64(dest, dest); + } +} + +/* dest = T0 + T1 + CF; compute C, N, V and Z flags. 
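+ * In the 64-bit case below, two chained add2 ops accumulate the carry
+ * of T0 + CF + T1 into cf_64 before it is copied into CF.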
*/ +static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) +{ + if (sf) { + TCGv_i64 result = tcg_temp_new_i64(); + TCGv_i64 cf_64 = tcg_temp_new_i64(); + TCGv_i64 vf_64 = tcg_temp_new_i64(); + TCGv_i64 tmp = tcg_temp_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + + tcg_gen_extu_i32_i64(cf_64, cpu_CF); + tcg_gen_add2_i64(result, cf_64, t0, zero, cf_64, zero); + tcg_gen_add2_i64(result, cf_64, result, cf_64, t1, zero); + tcg_gen_extrl_i64_i32(cpu_CF, cf_64); + gen_set_NZ64(result); + + tcg_gen_xor_i64(vf_64, result, t0); + tcg_gen_xor_i64(tmp, t0, t1); + tcg_gen_andc_i64(vf_64, vf_64, tmp); + tcg_gen_extrh_i64_i32(cpu_VF, vf_64); + + tcg_gen_mov_i64(dest, result); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i64(vf_64); + tcg_temp_free_i64(cf_64); + tcg_temp_free_i64(result); + } else { + TCGv_i32 t0_32 = tcg_temp_new_i32(); + TCGv_i32 t1_32 = tcg_temp_new_i32(); + TCGv_i32 tmp = tcg_temp_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + + tcg_gen_extrl_i64_i32(t0_32, t0); + tcg_gen_extrl_i64_i32(t1_32, t1); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, zero, cpu_CF, zero); + tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1_32, zero); + + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); + tcg_gen_xor_i32(tmp, t0_32, t1_32); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_gen_extu_i32_i64(dest, cpu_NF); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(t1_32); + tcg_temp_free_i32(t0_32); + } +} + +/* + * Load/Store generators + */ + +/* + * Store from GPR register to memory. + */ +static void do_gpr_st_memidx(DisasContext *s, TCGv_i64 source, + TCGv_i64 tcg_addr, MemOp memop, int memidx, + bool iss_valid, + unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + memop = finalize_memop(s, memop); + tcg_gen_qemu_st_i64(source, tcg_addr, memidx, memop); + + if (iss_valid) { + uint32_t syn; + + syn = syn_data_abort_with_iss(0, + (memop & MO_SIZE), + false, + iss_srt, + iss_sf, + iss_ar, + 0, 0, 0, 0, 0, false); + disas_set_insn_syndrome(s, syn); + } +} + +static void do_gpr_st(DisasContext *s, TCGv_i64 source, + TCGv_i64 tcg_addr, MemOp memop, + bool iss_valid, + unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + do_gpr_st_memidx(s, source, tcg_addr, memop, get_mem_index(s), + iss_valid, iss_srt, iss_sf, iss_ar); +} + +/* + * Load from memory to GPR register + */ +static void do_gpr_ld_memidx(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, + MemOp memop, bool extend, int memidx, + bool iss_valid, unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + memop = finalize_memop(s, memop); + tcg_gen_qemu_ld_i64(dest, tcg_addr, memidx, memop); + + if (extend && (memop & MO_SIGN)) { + g_assert((memop & MO_SIZE) <= MO_32); + tcg_gen_ext32u_i64(dest, dest); + } + + if (iss_valid) { + uint32_t syn; + + syn = syn_data_abort_with_iss(0, + (memop & MO_SIZE), + (memop & MO_SIGN) != 0, + iss_srt, + iss_sf, + iss_ar, + 0, 0, 0, 0, 0, false); + disas_set_insn_syndrome(s, syn); + } +} + +static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, + MemOp memop, bool extend, + bool iss_valid, unsigned int iss_srt, + bool iss_sf, bool iss_ar) +{ + do_gpr_ld_memidx(s, dest, tcg_addr, memop, extend, get_mem_index(s), + iss_valid, iss_srt, iss_sf, iss_ar); +} + +/* + * Store from FP register to memory + */ +static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size) +{ + /* This writes the bottom N bits of a 128 bit wide vector to memory */ + TCGv_i64 tmplo = tcg_temp_new_i64(); + MemOp mop; + + tcg_gen_ld_i64(tmplo, cpu_env, 
fp_reg_offset(s, srcidx, MO_64)); + + if (size < 4) { + mop = finalize_memop(s, size); + tcg_gen_qemu_st_i64(tmplo, tcg_addr, get_mem_index(s), mop); + } else { + bool be = s->be_data == MO_BE; + TCGv_i64 tcg_hiaddr = tcg_temp_new_i64(); + TCGv_i64 tmphi = tcg_temp_new_i64(); + + tcg_gen_ld_i64(tmphi, cpu_env, fp_reg_hi_offset(s, srcidx)); + + mop = s->be_data | MO_UQ; + tcg_gen_qemu_st_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), + mop | (s->align_mem ? MO_ALIGN_16 : 0)); + tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); + tcg_gen_qemu_st_i64(be ? tmplo : tmphi, tcg_hiaddr, + get_mem_index(s), mop); + + tcg_temp_free_i64(tcg_hiaddr); + tcg_temp_free_i64(tmphi); + } + + tcg_temp_free_i64(tmplo); +} + +/* + * Load from memory to FP register + */ +static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size) +{ + /* This always zero-extends and writes to a full 128 bit wide vector */ + TCGv_i64 tmplo = tcg_temp_new_i64(); + TCGv_i64 tmphi = NULL; + MemOp mop; + + if (size < 4) { + mop = finalize_memop(s, size); + tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), mop); + } else { + bool be = s->be_data == MO_BE; + TCGv_i64 tcg_hiaddr; + + tmphi = tcg_temp_new_i64(); + tcg_hiaddr = tcg_temp_new_i64(); + + mop = s->be_data | MO_UQ; + tcg_gen_qemu_ld_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), + mop | (s->align_mem ? MO_ALIGN_16 : 0)); + tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); + tcg_gen_qemu_ld_i64(be ? tmplo : tmphi, tcg_hiaddr, + get_mem_index(s), mop); + tcg_temp_free_i64(tcg_hiaddr); + } + + tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64)); + tcg_temp_free_i64(tmplo); + + if (tmphi) { + tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx)); + tcg_temp_free_i64(tmphi); + } + clear_vec_high(s, tmphi != NULL, destidx); +} + +/* + * Vector load/store helpers. + * + * The principal difference between this and a FP load is that we don't + * zero extend as we are filling a partial chunk of the vector register. + * These functions don't support 128 bit loads/stores, which would be + * normal load/store operations. + * + * The _i32 versions are useful when operating on 32 bit quantities + * (eg for floating point single or using Neon helper functions). 
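+ * For example, read_vec_element() below with element == 1 and memop == MO_32
+ * fetches the second 32-bit element of the source register zero-extended to
+ * 64 bits; OR-ing MO_SIGN into memop selects the sign-extending load instead.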
+ */ + +/* Get value of an element within a vector register */ +static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx, + int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE); + switch ((unsigned)memop) { + case MO_8: + tcg_gen_ld8u_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_ld16u_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_32: + tcg_gen_ld32u_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_8|MO_SIGN: + tcg_gen_ld8s_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_16|MO_SIGN: + tcg_gen_ld16s_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_32|MO_SIGN: + tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off); + break; + case MO_64: + case MO_64|MO_SIGN: + tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +static void read_vec_element_i32(DisasContext *s, TCGv_i32 tcg_dest, int srcidx, + int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE); + switch (memop) { + case MO_8: + tcg_gen_ld8u_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_ld16u_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_8|MO_SIGN: + tcg_gen_ld8s_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_16|MO_SIGN: + tcg_gen_ld16s_i32(tcg_dest, cpu_env, vect_off); + break; + case MO_32: + case MO_32|MO_SIGN: + tcg_gen_ld_i32(tcg_dest, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +/* Set value of an element within a vector register */ +static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx, + int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE); + switch (memop) { + case MO_8: + tcg_gen_st8_i64(tcg_src, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_st16_i64(tcg_src, cpu_env, vect_off); + break; + case MO_32: + tcg_gen_st32_i64(tcg_src, cpu_env, vect_off); + break; + case MO_64: + tcg_gen_st_i64(tcg_src, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src, + int destidx, int element, MemOp memop) +{ + int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE); + switch (memop) { + case MO_8: + tcg_gen_st8_i32(tcg_src, cpu_env, vect_off); + break; + case MO_16: + tcg_gen_st16_i32(tcg_src, cpu_env, vect_off); + break; + case MO_32: + tcg_gen_st_i32(tcg_src, cpu_env, vect_off); + break; + default: + g_assert_not_reached(); + } +} + +/* Store from vector register to memory */ +static void do_vec_st(DisasContext *s, int srcidx, int element, + TCGv_i64 tcg_addr, MemOp mop) +{ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + read_vec_element(s, tcg_tmp, srcidx, element, mop & MO_SIZE); + tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop); + + tcg_temp_free_i64(tcg_tmp); +} + +/* Load from memory to vector register */ +static void do_vec_ld(DisasContext *s, int destidx, int element, + TCGv_i64 tcg_addr, MemOp mop) +{ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop); + write_vec_element(s, tcg_tmp, destidx, element, mop & MO_SIZE); + + tcg_temp_free_i64(tcg_tmp); +} + +/* Check that FP/Neon access is enabled. If it is, return + * true. If not, emit code to generate an appropriate exception, + * and return false; the caller should not emit any code for + * the instruction. 
Note that this check must happen after all + * unallocated-encoding checks (otherwise the syndrome information + * for the resulting exception will be incorrect). + */ +static bool fp_access_check_only(DisasContext *s) +{ + if (s->fp_excp_el) { + assert(!s->fp_access_checked); + s->fp_access_checked = true; + + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false, 0), + s->fp_excp_el); + return false; + } + s->fp_access_checked = true; + return true; +} + +static bool fp_access_check(DisasContext *s) +{ + if (!fp_access_check_only(s)) { + return false; + } + if (s->sme_trap_nonstreaming && s->is_nonstreaming) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_Streaming, false)); + return false; + } + return true; +} + +/* + * Check that SVE access is enabled. If it is, return true. + * If not, emit code to generate an appropriate exception and return false. + * This function corresponds to CheckSVEEnabled(). + */ +bool sve_access_check(DisasContext *s) +{ + if (s->pstate_sm || !dc_isar_feature(aa64_sve, s)) { + assert(dc_isar_feature(aa64_sme, s)); + if (!sme_sm_enabled_check(s)) { + goto fail_exit; + } + } else if (s->sve_excp_el) { + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_sve_access_trap(), s->sve_excp_el); + goto fail_exit; + } + s->sve_access_checked = true; + return fp_access_check(s); + + fail_exit: + /* Assert that we only raise one exception per instruction. */ + assert(!s->sve_access_checked); + s->sve_access_checked = true; + return false; +} + +/* + * Check that SME access is enabled, raise an exception if not. + * Note that this function corresponds to CheckSMEAccess and is + * only used directly for cpregs. + */ +static bool sme_access_check(DisasContext *s) +{ + if (s->sme_excp_el) { + gen_exception_insn_el(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_AccessTrap, false), + s->sme_excp_el); + return false; + } + return true; +} + +/* This function corresponds to CheckSMEEnabled. */ +bool sme_enabled_check(DisasContext *s) +{ + /* + * Note that unlike sve_excp_el, we have not constrained sme_excp_el + * to be zero when fp_excp_el has priority. This is because we need + * sme_excp_el by itself for cpregs access checks. + */ + if (!s->fp_excp_el || s->sme_excp_el < s->fp_excp_el) { + s->fp_access_checked = true; + return sme_access_check(s); + } + return fp_access_check_only(s); +} + +/* Common subroutine for CheckSMEAnd*Enabled. */ +bool sme_enabled_check_with_svcr(DisasContext *s, unsigned req) +{ + if (!sme_enabled_check(s)) { + return false; + } + if (FIELD_EX64(req, SVCR, SM) && !s->pstate_sm) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_NotStreaming, false)); + return false; + } + if (FIELD_EX64(req, SVCR, ZA) && !s->pstate_za) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_InactiveZA, false)); + return false; + } + return true; +} + +/* + * This utility function is for doing register extension with an + * optional shift. You will likely want to pass a temporary for the + * destination register. See DecodeRegExtend() in the ARM ARM. 
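+ * For example, option == 2 with shift == 2 (a UXTW extend scaled by 4)
+ * zero-extends the low 32 bits of tcg_in and shifts the result left by two.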
+ */ +static void ext_and_shift_reg(TCGv_i64 tcg_out, TCGv_i64 tcg_in, + int option, unsigned int shift) +{ + int extsize = extract32(option, 0, 2); + bool is_signed = extract32(option, 2, 1); + + if (is_signed) { + switch (extsize) { + case 0: + tcg_gen_ext8s_i64(tcg_out, tcg_in); + break; + case 1: + tcg_gen_ext16s_i64(tcg_out, tcg_in); + break; + case 2: + tcg_gen_ext32s_i64(tcg_out, tcg_in); + break; + case 3: + tcg_gen_mov_i64(tcg_out, tcg_in); + break; + } + } else { + switch (extsize) { + case 0: + tcg_gen_ext8u_i64(tcg_out, tcg_in); + break; + case 1: + tcg_gen_ext16u_i64(tcg_out, tcg_in); + break; + case 2: + tcg_gen_ext32u_i64(tcg_out, tcg_in); + break; + case 3: + tcg_gen_mov_i64(tcg_out, tcg_in); + break; + } + } + + if (shift) { + tcg_gen_shli_i64(tcg_out, tcg_out, shift); + } +} + +static inline void gen_check_sp_alignment(DisasContext *s) +{ + /* The AArch64 architecture mandates that (if enabled via PSTATE + * or SCTLR bits) there is a check that SP is 16-aligned on every + * SP-relative load or store (with an exception generated if it is not). + * In line with general QEMU practice regarding misaligned accesses, + * we omit these checks for the sake of guest program performance. + * This function is provided as a hook so we can more easily add these + * checks in future (possibly as a "favour catching guest program bugs + * over speed" user selectable option). + */ +} + +/* + * This provides a simple table based table lookup decoder. It is + * intended to be used when the relevant bits for decode are too + * awkwardly placed and switch/if based logic would be confusing and + * deeply nested. Since it's a linear search through the table, tables + * should be kept small. + * + * It returns the first handler where insn & mask == pattern, or + * NULL if there is no match. + * The table is terminated by an empty mask (i.e. 0) + */ +static inline AArch64DecodeFn *lookup_disas_fn(const AArch64DecodeTable *table, + uint32_t insn) +{ + const AArch64DecodeTable *tptr = table; + + while (tptr->mask) { + if ((insn & tptr->mask) == tptr->pattern) { + return tptr->disas_fn; + } + tptr++; + } + return NULL; +} + +/* + * The instruction disassembly implemented here matches + * the instruction encoding classifications in chapter C4 + * of the ARM Architecture Reference Manual (DDI0487B_a); + * classification names and decode diagrams here should generally + * match up with those in the manual. 
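+ * For instance, disas_b_exc_sys() below is keyed on insn[31:25] and covers
+ * the C4 "Branches, exception generating and system instructions" class.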
+ */ + +/* Unconditional branch (immediate) + * 31 30 26 25 0 + * +----+-----------+-------------------------------------+ + * | op | 0 0 1 0 1 | imm26 | + * +----+-----------+-------------------------------------+ + */ +static void disas_uncond_b_imm(DisasContext *s, uint32_t insn) +{ + int64_t diff = sextract32(insn, 0, 26) * 4; + + if (insn & (1U << 31)) { + /* BL Branch with link */ + gen_pc_plus_diff(s, cpu_reg(s, 30), curr_insn_len(s)); + } + + /* B Branch / BL Branch with link */ + reset_btype(s); + gen_goto_tb(s, 0, diff); +} + +/* Compare and branch (immediate) + * 31 30 25 24 23 5 4 0 + * +----+-------------+----+---------------------+--------+ + * | sf | 0 1 1 0 1 0 | op | imm19 | Rt | + * +----+-------------+----+---------------------+--------+ + */ +static void disas_comp_b_imm(DisasContext *s, uint32_t insn) +{ + unsigned int sf, op, rt; + int64_t diff; + DisasLabel match; + TCGv_i64 tcg_cmp; + + sf = extract32(insn, 31, 1); + op = extract32(insn, 24, 1); /* 0: CBZ; 1: CBNZ */ + rt = extract32(insn, 0, 5); + diff = sextract32(insn, 5, 19) * 4; + + tcg_cmp = read_cpu_reg(s, rt, sf); + reset_btype(s); + + match = gen_disas_label(s); + tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ, + tcg_cmp, 0, match.label); + gen_goto_tb(s, 0, 4); + set_disas_label(s, match); + gen_goto_tb(s, 1, diff); +} + +/* Test and branch (immediate) + * 31 30 25 24 23 19 18 5 4 0 + * +----+-------------+----+-------+-------------+------+ + * | b5 | 0 1 1 0 1 1 | op | b40 | imm14 | Rt | + * +----+-------------+----+-------+-------------+------+ + */ +static void disas_test_b_imm(DisasContext *s, uint32_t insn) +{ + unsigned int bit_pos, op, rt; + int64_t diff; + DisasLabel match; + TCGv_i64 tcg_cmp; + + bit_pos = (extract32(insn, 31, 1) << 5) | extract32(insn, 19, 5); + op = extract32(insn, 24, 1); /* 0: TBZ; 1: TBNZ */ + diff = sextract32(insn, 5, 14) * 4; + rt = extract32(insn, 0, 5); + + tcg_cmp = tcg_temp_new_i64(); + tcg_gen_andi_i64(tcg_cmp, cpu_reg(s, rt), (1ULL << bit_pos)); + + reset_btype(s); + + match = gen_disas_label(s); + tcg_gen_brcondi_i64(op ? 
TCG_COND_NE : TCG_COND_EQ, + tcg_cmp, 0, match.label); + tcg_temp_free_i64(tcg_cmp); + gen_goto_tb(s, 0, 4); + set_disas_label(s, match); + gen_goto_tb(s, 1, diff); +} + +/* Conditional branch (immediate) + * 31 25 24 23 5 4 3 0 + * +---------------+----+---------------------+----+------+ + * | 0 1 0 1 0 1 0 | o1 | imm19 | o0 | cond | + * +---------------+----+---------------------+----+------+ + */ +static void disas_cond_b_imm(DisasContext *s, uint32_t insn) +{ + unsigned int cond; + int64_t diff; + + if ((insn & (1 << 4)) || (insn & (1 << 24))) { + unallocated_encoding(s); + return; + } + diff = sextract32(insn, 5, 19) * 4; + cond = extract32(insn, 0, 4); + + reset_btype(s); + if (cond < 0x0e) { + /* genuinely conditional branches */ + DisasLabel match = gen_disas_label(s); + arm_gen_test_cc(cond, match.label); + gen_goto_tb(s, 0, 4); + set_disas_label(s, match); + gen_goto_tb(s, 1, diff); + } else { + /* 0xe and 0xf are both "always" conditions */ + gen_goto_tb(s, 0, diff); + } +} + +/* HINT instruction group, including various allocated HINTs */ +static void handle_hint(DisasContext *s, uint32_t insn, + unsigned int op1, unsigned int op2, unsigned int crm) +{ + unsigned int selector = crm << 3 | op2; + + if (op1 != 3) { + unallocated_encoding(s); + return; + } + + switch (selector) { + case 0b00000: /* NOP */ + break; + case 0b00011: /* WFI */ + s->base.is_jmp = DISAS_WFI; + break; + case 0b00001: /* YIELD */ + /* When running in MTTCG we don't generate jumps to the yield and + * WFE helpers as it won't affect the scheduling of other vCPUs. + * If we wanted to more completely model WFE/SEV so we don't busy + * spin unnecessarily we would need to do something more involved. + */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + s->base.is_jmp = DISAS_YIELD; + } + break; + case 0b00010: /* WFE */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + s->base.is_jmp = DISAS_WFE; + } + break; + case 0b00100: /* SEV */ + case 0b00101: /* SEVL */ + case 0b00110: /* DGH */ + /* we treat all as NOP at least for now */ + break; + case 0b00111: /* XPACLRI */ + if (s->pauth_active) { + gen_helper_xpaci(cpu_X[30], cpu_env, cpu_X[30]); + } + break; + case 0b01000: /* PACIA1716 */ + if (s->pauth_active) { + gen_helper_pacia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b01010: /* PACIB1716 */ + if (s->pauth_active) { + gen_helper_pacib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b01100: /* AUTIA1716 */ + if (s->pauth_active) { + gen_helper_autia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b01110: /* AUTIB1716 */ + if (s->pauth_active) { + gen_helper_autib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); + } + break; + case 0b10000: /* ESB */ + /* Without RAS, we must implement this as NOP. */ + if (dc_isar_feature(aa64_ras, s)) { + /* + * QEMU does not have a source of physical SErrors, + * so we are only concerned with virtual SErrors. + * The pseudocode in the ARM for this case is + * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then + * AArch64.vESBOperation(); + * Most of the condition can be evaluated at translation time. + * Test for EL2 present, and defer test for SEL2 to runtime. 
+ */ + if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) { + gen_helper_vesb(cpu_env); + } + } + break; + case 0b11000: /* PACIAZ */ + if (s->pauth_active) { + gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30], + new_tmp_a64_zero(s)); + } + break; + case 0b11001: /* PACIASP */ + if (s->pauth_active) { + gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); + } + break; + case 0b11010: /* PACIBZ */ + if (s->pauth_active) { + gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30], + new_tmp_a64_zero(s)); + } + break; + case 0b11011: /* PACIBSP */ + if (s->pauth_active) { + gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); + } + break; + case 0b11100: /* AUTIAZ */ + if (s->pauth_active) { + gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30], + new_tmp_a64_zero(s)); + } + break; + case 0b11101: /* AUTIASP */ + if (s->pauth_active) { + gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); + } + break; + case 0b11110: /* AUTIBZ */ + if (s->pauth_active) { + gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30], + new_tmp_a64_zero(s)); + } + break; + case 0b11111: /* AUTIBSP */ + if (s->pauth_active) { + gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); + } + break; + default: + /* default specified as NOP equivalent */ + break; + } +} + +static void gen_clrex(DisasContext *s, uint32_t insn) +{ + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +/* CLREX, DSB, DMB, ISB */ +static void handle_sync(DisasContext *s, uint32_t insn, + unsigned int op1, unsigned int op2, unsigned int crm) +{ + TCGBar bar; + + if (op1 != 3) { + unallocated_encoding(s); + return; + } + + switch (op2) { + case 2: /* CLREX */ + gen_clrex(s, insn); + return; + case 4: /* DSB */ + case 5: /* DMB */ + switch (crm & 3) { + case 1: /* MBReqTypes_Reads */ + bar = TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST; + break; + case 2: /* MBReqTypes_Writes */ + bar = TCG_BAR_SC | TCG_MO_ST_ST; + break; + default: /* MBReqTypes_All */ + bar = TCG_BAR_SC | TCG_MO_ALL; + break; + } + tcg_gen_mb(bar); + return; + case 6: /* ISB */ + /* We need to break the TB after this insn to execute + * a self-modified code correctly and also to take + * any pending interrupts immediately. + */ + reset_btype(s); + gen_goto_tb(s, 0, 4); + return; + + case 7: /* SB */ + if (crm != 0 || !dc_isar_feature(aa64_sb, s)) { + goto do_unallocated; + } + /* + * TODO: There is no speculation barrier opcode for TCG; + * MB and end the TB instead. + */ + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); + gen_goto_tb(s, 0, 4); + return; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } +} + +static void gen_xaflag(void) +{ + TCGv_i32 z = tcg_temp_new_i32(); + + tcg_gen_setcondi_i32(TCG_COND_EQ, z, cpu_ZF, 0); + + /* + * (!C & !Z) << 31 + * (!(C | Z)) << 31 + * ~((C | Z) << 31) + * ~-(C | Z) + * (C | Z) - 1 + */ + tcg_gen_or_i32(cpu_NF, cpu_CF, z); + tcg_gen_subi_i32(cpu_NF, cpu_NF, 1); + + /* !(Z & C) */ + tcg_gen_and_i32(cpu_ZF, z, cpu_CF); + tcg_gen_xori_i32(cpu_ZF, cpu_ZF, 1); + + /* (!C & Z) << 31 -> -(Z & ~C) */ + tcg_gen_andc_i32(cpu_VF, z, cpu_CF); + tcg_gen_neg_i32(cpu_VF, cpu_VF); + + /* C | Z */ + tcg_gen_or_i32(cpu_CF, cpu_CF, z); + + tcg_temp_free_i32(z); +} + +static void gen_axflag(void) +{ + tcg_gen_sari_i32(cpu_VF, cpu_VF, 31); /* V ? 
-1 : 0 */ + tcg_gen_andc_i32(cpu_CF, cpu_CF, cpu_VF); /* C & !V */ + + /* !(Z | V) -> !(!ZF | V) -> ZF & !V -> ZF & ~VF */ + tcg_gen_andc_i32(cpu_ZF, cpu_ZF, cpu_VF); + + tcg_gen_movi_i32(cpu_NF, 0); + tcg_gen_movi_i32(cpu_VF, 0); +} + +/* MSR (immediate) - move immediate to processor state field */ +static void handle_msr_i(DisasContext *s, uint32_t insn, + unsigned int op1, unsigned int op2, unsigned int crm) +{ + int op = op1 << 3 | op2; + + /* End the TB by default, chaining is ok. */ + s->base.is_jmp = DISAS_TOO_MANY; + + switch (op) { + case 0x00: /* CFINV */ + if (crm != 0 || !dc_isar_feature(aa64_condm_4, s)) { + goto do_unallocated; + } + tcg_gen_xori_i32(cpu_CF, cpu_CF, 1); + s->base.is_jmp = DISAS_NEXT; + break; + + case 0x01: /* XAFlag */ + if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) { + goto do_unallocated; + } + gen_xaflag(); + s->base.is_jmp = DISAS_NEXT; + break; + + case 0x02: /* AXFlag */ + if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) { + goto do_unallocated; + } + gen_axflag(); + s->base.is_jmp = DISAS_NEXT; + break; + + case 0x03: /* UAO */ + if (!dc_isar_feature(aa64_uao, s) || s->current_el == 0) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_UAO); + } else { + clear_pstate_bits(PSTATE_UAO); + } + gen_rebuild_hflags(s); + break; + + case 0x04: /* PAN */ + if (!dc_isar_feature(aa64_pan, s) || s->current_el == 0) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_PAN); + } else { + clear_pstate_bits(PSTATE_PAN); + } + gen_rebuild_hflags(s); + break; + + case 0x05: /* SPSel */ + if (s->current_el == 0) { + goto do_unallocated; + } + gen_helper_msr_i_spsel(cpu_env, tcg_constant_i32(crm & PSTATE_SP)); + break; + + case 0x19: /* SSBS */ + if (!dc_isar_feature(aa64_ssbs, s)) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_SSBS); + } else { + clear_pstate_bits(PSTATE_SSBS); + } + /* Don't need to rebuild hflags since SSBS is a nop */ + break; + + case 0x1a: /* DIT */ + if (!dc_isar_feature(aa64_dit, s)) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_DIT); + } else { + clear_pstate_bits(PSTATE_DIT); + } + /* There's no need to rebuild hflags because DIT is a nop */ + break; + + case 0x1e: /* DAIFSet */ + gen_helper_msr_i_daifset(cpu_env, tcg_constant_i32(crm)); + break; + + case 0x1f: /* DAIFClear */ + gen_helper_msr_i_daifclear(cpu_env, tcg_constant_i32(crm)); + /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */ + s->base.is_jmp = DISAS_UPDATE_EXIT; + break; + + case 0x1c: /* TCO */ + if (dc_isar_feature(aa64_mte, s)) { + /* Full MTE is enabled -- set the TCO bit as directed. */ + if (crm & 1) { + set_pstate_bits(PSTATE_TCO); + } else { + clear_pstate_bits(PSTATE_TCO); + } + gen_rebuild_hflags(s); + /* Many factors, including TCO, go into MTE_ACTIVE. */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } else if (dc_isar_feature(aa64_mte_insn_reg, s)) { + /* Only "instructions accessible at EL0" -- PSTATE.TCO is WI. */ + s->base.is_jmp = DISAS_NEXT; + } else { + goto do_unallocated; + } + break; + + case 0x1b: /* SVCR* */ + if (!dc_isar_feature(aa64_sme, s) || crm < 2 || crm > 7) { + goto do_unallocated; + } + if (sme_access_check(s)) { + int old = s->pstate_sm | (s->pstate_za << 1); + int new = (crm & 1) * 3; + int msk = (crm >> 1) & 3; + + if ((old ^ new) & msk) { + /* At least one bit changes. 
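+                 * A write that leaves PSTATE.SM/ZA unchanged is a no-op,
+                 * so the else branch below sets DISAS_NEXT rather than
+                 * ending the TB.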
*/ + gen_helper_set_svcr(cpu_env, tcg_constant_i32(new), + tcg_constant_i32(msk)); + } else { + s->base.is_jmp = DISAS_NEXT; + } + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } +} + +static void gen_get_nzcv(TCGv_i64 tcg_rt) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + TCGv_i32 nzcv = tcg_temp_new_i32(); + + /* build bit 31, N */ + tcg_gen_andi_i32(nzcv, cpu_NF, (1U << 31)); + /* build bit 30, Z */ + tcg_gen_setcondi_i32(TCG_COND_EQ, tmp, cpu_ZF, 0); + tcg_gen_deposit_i32(nzcv, nzcv, tmp, 30, 1); + /* build bit 29, C */ + tcg_gen_deposit_i32(nzcv, nzcv, cpu_CF, 29, 1); + /* build bit 28, V */ + tcg_gen_shri_i32(tmp, cpu_VF, 31); + tcg_gen_deposit_i32(nzcv, nzcv, tmp, 28, 1); + /* generate result */ + tcg_gen_extu_i32_i64(tcg_rt, nzcv); + + tcg_temp_free_i32(nzcv); + tcg_temp_free_i32(tmp); +} + +static void gen_set_nzcv(TCGv_i64 tcg_rt) +{ + TCGv_i32 nzcv = tcg_temp_new_i32(); + + /* take NZCV from R[t] */ + tcg_gen_extrl_i64_i32(nzcv, tcg_rt); + + /* bit 31, N */ + tcg_gen_andi_i32(cpu_NF, nzcv, (1U << 31)); + /* bit 30, Z */ + tcg_gen_andi_i32(cpu_ZF, nzcv, (1 << 30)); + tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_ZF, cpu_ZF, 0); + /* bit 29, C */ + tcg_gen_andi_i32(cpu_CF, nzcv, (1 << 29)); + tcg_gen_shri_i32(cpu_CF, cpu_CF, 29); + /* bit 28, V */ + tcg_gen_andi_i32(cpu_VF, nzcv, (1 << 28)); + tcg_gen_shli_i32(cpu_VF, cpu_VF, 3); + tcg_temp_free_i32(nzcv); +} + +static void gen_sysreg_undef(DisasContext *s, bool isread, + uint8_t op0, uint8_t op1, uint8_t op2, + uint8_t crn, uint8_t crm, uint8_t rt) +{ + /* + * Generate code to emit an UNDEF with correct syndrome + * information for a failed system register access. + * This is EC_UNCATEGORIZED (ie a standard UNDEF) in most cases, + * but if FEAT_IDST is implemented then read accesses to registers + * in the feature ID space are reported with the EC_SYSTEMREGISTERTRAP + * syndrome. + */ + uint32_t syndrome; + + if (isread && dc_isar_feature(aa64_ids, s) && + arm_cpreg_encoding_in_idspace(op0, op1, op2, crn, crm)) { + syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread); + } else { + syndrome = syn_uncategorized(); + } + gen_exception_insn(s, 0, EXCP_UDEF, syndrome); +} + +/* MRS - move from system register + * MSR (register) - move to system register + * SYS + * SYSL + * These are all essentially the same insn in 'read' and 'write' + * versions, with varying op0 fields. + */ +static void handle_sys(DisasContext *s, uint32_t insn, bool isread, + unsigned int op0, unsigned int op1, unsigned int op2, + unsigned int crn, unsigned int crm, unsigned int rt) +{ + uint32_t key = ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP, + crn, crm, op0, op1, op2); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key); + TCGv_ptr tcg_ri = NULL; + TCGv_i64 tcg_rt; + + if (!ri) { + /* Unknown register; this might be a guest error or a QEMU + * unimplemented feature. + */ + qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch64 " + "system register op0:%d op1:%d crn:%d crm:%d op2:%d\n", + isread ? "read" : "write", op0, op1, crn, crm, op2); + gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt); + return; + } + + /* Check access permissions */ + if (!cp_access_ok(s->current_el, ri, isread)) { + gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt); + return; + } + + if (ri->accessfn || (ri->fgt && s->fgt_active)) { + /* Emit code to perform further access permissions checks at + * runtime; this may result in an exception. 
+ */ + uint32_t syndrome; + + syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread); + gen_a64_update_pc(s, 0); + tcg_ri = tcg_temp_new_ptr(); + gen_helper_access_check_cp_reg(tcg_ri, cpu_env, + tcg_constant_i32(key), + tcg_constant_i32(syndrome), + tcg_constant_i32(isread)); + } else if (ri->type & ARM_CP_RAISES_EXC) { + /* + * The readfn or writefn might raise an exception; + * synchronize the CPU state in case it does. + */ + gen_a64_update_pc(s, 0); + } + + /* Handle special cases first */ + switch (ri->type & ARM_CP_SPECIAL_MASK) { + case 0: + break; + case ARM_CP_NOP: + goto exit; + case ARM_CP_NZCV: + tcg_rt = cpu_reg(s, rt); + if (isread) { + gen_get_nzcv(tcg_rt); + } else { + gen_set_nzcv(tcg_rt); + } + goto exit; + case ARM_CP_CURRENTEL: + /* Reads as current EL value from pstate, which is + * guaranteed to be constant by the tb flags. + */ + tcg_rt = cpu_reg(s, rt); + tcg_gen_movi_i64(tcg_rt, s->current_el << 2); + goto exit; + case ARM_CP_DC_ZVA: + /* Writes clear the aligned block of memory which rt points into. */ + if (s->mte_active[0]) { + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + + tcg_rt = new_tmp_a64(s); + gen_helper_mte_check_zva(tcg_rt, cpu_env, + tcg_constant_i32(desc), cpu_reg(s, rt)); + } else { + tcg_rt = clean_data_tbi(s, cpu_reg(s, rt)); + } + gen_helper_dc_zva(cpu_env, tcg_rt); + goto exit; + case ARM_CP_DC_GVA: + { + TCGv_i64 clean_addr, tag; + + /* + * DC_GVA, like DC_ZVA, requires that we supply the original + * pointer for an invalid page. Probe that address first. + */ + tcg_rt = cpu_reg(s, rt); + clean_addr = clean_data_tbi(s, tcg_rt); + gen_probe_access(s, clean_addr, MMU_DATA_STORE, MO_8); + + if (s->ata) { + /* Extract the tag from the register to match STZGM. */ + tag = tcg_temp_new_i64(); + tcg_gen_shri_i64(tag, tcg_rt, 56); + gen_helper_stzgm_tags(cpu_env, clean_addr, tag); + tcg_temp_free_i64(tag); + } + } + goto exit; + case ARM_CP_DC_GZVA: + { + TCGv_i64 clean_addr, tag; + + /* For DC_GZVA, we can rely on DC_ZVA for the proper fault. */ + tcg_rt = cpu_reg(s, rt); + clean_addr = clean_data_tbi(s, tcg_rt); + gen_helper_dc_zva(cpu_env, clean_addr); + + if (s->ata) { + /* Extract the tag from the register to match STZGM. 
*/ + tag = tcg_temp_new_i64(); + tcg_gen_shri_i64(tag, tcg_rt, 56); + gen_helper_stzgm_tags(cpu_env, clean_addr, tag); + tcg_temp_free_i64(tag); + } + } + goto exit; + default: + g_assert_not_reached(); + } + if ((ri->type & ARM_CP_FPU) && !fp_access_check_only(s)) { + goto exit; + } else if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) { + goto exit; + } else if ((ri->type & ARM_CP_SME) && !sme_access_check(s)) { + goto exit; + } + + if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { + gen_io_start(); + } + + tcg_rt = cpu_reg(s, rt); + + if (isread) { + if (ri->type & ARM_CP_CONST) { + tcg_gen_movi_i64(tcg_rt, ri->resetvalue); + } else if (ri->readfn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + gen_helper_get_cp_reg64(tcg_rt, cpu_env, tcg_ri); + } else { + tcg_gen_ld_i64(tcg_rt, cpu_env, ri->fieldoffset); + } + } else { + if (ri->type & ARM_CP_CONST) { + /* If not forbidden by access permissions, treat as WI */ + goto exit; + } else if (ri->writefn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + gen_helper_set_cp_reg64(cpu_env, tcg_ri, tcg_rt); + } else { + tcg_gen_st_i64(tcg_rt, cpu_env, ri->fieldoffset); + } + } + + if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { + /* I/O operations must end the TB here (whether read or write) */ + s->base.is_jmp = DISAS_UPDATE_EXIT; + } + if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) { + /* + * A write to any coprocessor regiser that ends a TB + * must rebuild the hflags for the next TB. + */ + gen_rebuild_hflags(s); + /* + * We default to ending the TB on a coprocessor register write, + * but allow this to be suppressed by the register definition + * (usually only necessary to work around guest bugs). + */ + s->base.is_jmp = DISAS_UPDATE_EXIT; + } + + exit: + if (tcg_ri) { + tcg_temp_free_ptr(tcg_ri); + } +} + +/* System + * 31 22 21 20 19 18 16 15 12 11 8 7 5 4 0 + * +---------------------+---+-----+-----+-------+-------+-----+------+ + * | 1 1 0 1 0 1 0 1 0 0 | L | op0 | op1 | CRn | CRm | op2 | Rt | + * +---------------------+---+-----+-----+-------+-------+-----+------+ + */ +static void disas_system(DisasContext *s, uint32_t insn) +{ + unsigned int l, op0, op1, crn, crm, op2, rt; + l = extract32(insn, 21, 1); + op0 = extract32(insn, 19, 2); + op1 = extract32(insn, 16, 3); + crn = extract32(insn, 12, 4); + crm = extract32(insn, 8, 4); + op2 = extract32(insn, 5, 3); + rt = extract32(insn, 0, 5); + + if (op0 == 0) { + if (l || rt != 31) { + unallocated_encoding(s); + return; + } + switch (crn) { + case 2: /* HINT (including allocated hints like NOP, YIELD, etc) */ + handle_hint(s, insn, op1, op2, crm); + break; + case 3: /* CLREX, DSB, DMB, ISB */ + handle_sync(s, insn, op1, op2, crm); + break; + case 4: /* MSR (immediate) */ + handle_msr_i(s, insn, op1, op2, crm); + break; + default: + unallocated_encoding(s); + break; + } + return; + } + handle_sys(s, insn, l, op0, op1, op2, crn, crm, rt); +} + +/* Exception generation + * + * 31 24 23 21 20 5 4 2 1 0 + * +-----------------+-----+------------------------+-----+----+ + * | 1 1 0 1 0 1 0 0 | opc | imm16 | op2 | LL | + * +-----------------------+------------------------+----------+ + */ +static void disas_exc(DisasContext *s, uint32_t insn) +{ + int opc = extract32(insn, 21, 3); + int op2_ll = extract32(insn, 0, 5); + int imm16 = extract32(insn, 5, 16); + uint32_t syndrome; + + switch (opc) { + case 0: + /* For SVC, HVC and SMC we advance the single-step state + * machine before taking the exception. 
This is architecturally + * mandated, to ensure that single-stepping a system call + * instruction works properly. + */ + switch (op2_ll) { + case 1: /* SVC */ + syndrome = syn_aa64_svc(imm16); + if (s->fgt_svc) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); + break; + } + gen_ss_advance(s); + gen_exception_insn(s, 4, EXCP_SWI, syndrome); + break; + case 2: /* HVC */ + if (s->current_el == 0) { + unallocated_encoding(s); + break; + } + /* The pre HVC helper handles cases when HVC gets trapped + * as an undefined insn by runtime configuration. + */ + gen_a64_update_pc(s, 0); + gen_helper_pre_hvc(cpu_env); + gen_ss_advance(s); + gen_exception_insn_el(s, 4, EXCP_HVC, syn_aa64_hvc(imm16), 2); + break; + case 3: /* SMC */ + if (s->current_el == 0) { + unallocated_encoding(s); + break; + } + gen_a64_update_pc(s, 0); + gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa64_smc(imm16))); + gen_ss_advance(s); + gen_exception_insn_el(s, 4, EXCP_SMC, syn_aa64_smc(imm16), 3); + break; + default: + unallocated_encoding(s); + break; + } + break; + case 1: + if (op2_ll != 0) { + unallocated_encoding(s); + break; + } + /* BRK */ + gen_exception_bkpt_insn(s, syn_aa64_bkpt(imm16)); + break; + case 2: + if (op2_ll != 0) { + unallocated_encoding(s); + break; + } + /* HLT. This has two purposes. + * Architecturally, it is an external halting debug instruction. + * Since QEMU doesn't implement external debug, we treat this as + * it is required for halting debug disabled: it will UNDEF. + * Secondly, "HLT 0xf000" is the A64 semihosting syscall instruction. + */ + if (semihosting_enabled(s->current_el == 0) && imm16 == 0xf000) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + } else { + unallocated_encoding(s); + } + break; + case 5: + if (op2_ll < 1 || op2_ll > 3) { + unallocated_encoding(s); + break; + } + /* DCPS1, DCPS2, DCPS3 */ + unallocated_encoding(s); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* Unconditional branch (register) + * 31 25 24 21 20 16 15 10 9 5 4 0 + * +---------------+-------+-------+-------+------+-------+ + * | 1 1 0 1 0 1 1 | opc | op2 | op3 | Rn | op4 | + * +---------------+-------+-------+-------+------+-------+ + */ +static void disas_uncond_b_reg(DisasContext *s, uint32_t insn) +{ + unsigned int opc, op2, op3, rn, op4; + unsigned btype_mod = 2; /* 0: BR, 1: BLR, 2: other */ + TCGv_i64 dst; + TCGv_i64 modifier; + + opc = extract32(insn, 21, 4); + op2 = extract32(insn, 16, 5); + op3 = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + op4 = extract32(insn, 0, 5); + + if (op2 != 0x1f) { + goto do_unallocated; + } + + switch (opc) { + case 0: /* BR */ + case 1: /* BLR */ + case 2: /* RET */ + btype_mod = opc; + switch (op3) { + case 0: + /* BR, BLR, RET */ + if (op4 != 0) { + goto do_unallocated; + } + dst = cpu_reg(s, rn); + break; + + case 2: + case 3: + if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + if (opc == 2) { + /* RETAA, RETAB */ + if (rn != 0x1f || op4 != 0x1f) { + goto do_unallocated; + } + rn = 30; + modifier = cpu_X[31]; + } else { + /* BRAAZ, BRABZ, BLRAAZ, BLRABZ */ + if (op4 != 0x1f) { + goto do_unallocated; + } + modifier = new_tmp_a64_zero(s); + } + if (s->pauth_active) { + dst = new_tmp_a64(s); + if (op3 == 2) { + gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier); + } else { + gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier); + } + } else { + dst = cpu_reg(s, rn); + } + break; + + default: + goto do_unallocated; + } + /* BLR also needs to load return address */ + if (opc == 1) { + TCGv_i64 
lr = cpu_reg(s, 30); + if (dst == lr) { + TCGv_i64 tmp = new_tmp_a64(s); + tcg_gen_mov_i64(tmp, dst); + dst = tmp; + } + gen_pc_plus_diff(s, lr, curr_insn_len(s)); + } + gen_a64_set_pc(s, dst); + break; + + case 8: /* BRAA */ + case 9: /* BLRAA */ + if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + if ((op3 & ~1) != 2) { + goto do_unallocated; + } + btype_mod = opc & 1; + if (s->pauth_active) { + dst = new_tmp_a64(s); + modifier = cpu_reg_sp(s, op4); + if (op3 == 2) { + gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier); + } else { + gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier); + } + } else { + dst = cpu_reg(s, rn); + } + /* BLRAA also needs to load return address */ + if (opc == 9) { + TCGv_i64 lr = cpu_reg(s, 30); + if (dst == lr) { + TCGv_i64 tmp = new_tmp_a64(s); + tcg_gen_mov_i64(tmp, dst); + dst = tmp; + } + gen_pc_plus_diff(s, lr, curr_insn_len(s)); + } + gen_a64_set_pc(s, dst); + break; + + case 4: /* ERET */ + if (s->current_el == 0) { + goto do_unallocated; + } + switch (op3) { + case 0: /* ERET */ + if (op4 != 0) { + goto do_unallocated; + } + if (s->fgt_eret) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2); + return; + } + dst = tcg_temp_new_i64(); + tcg_gen_ld_i64(dst, cpu_env, + offsetof(CPUARMState, elr_el[s->current_el])); + break; + + case 2: /* ERETAA */ + case 3: /* ERETAB */ + if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + if (rn != 0x1f || op4 != 0x1f) { + goto do_unallocated; + } + /* The FGT trap takes precedence over an auth trap. */ + if (s->fgt_eret) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2); + return; + } + dst = tcg_temp_new_i64(); + tcg_gen_ld_i64(dst, cpu_env, + offsetof(CPUARMState, elr_el[s->current_el])); + if (s->pauth_active) { + modifier = cpu_X[31]; + if (op3 == 2) { + gen_helper_autia(dst, cpu_env, dst, modifier); + } else { + gen_helper_autib(dst, cpu_env, dst, modifier); + } + } + break; + + default: + goto do_unallocated; + } + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } + + gen_helper_exception_return(cpu_env, dst); + tcg_temp_free_i64(dst); + /* Must exit loop to check un-masked IRQs */ + s->base.is_jmp = DISAS_EXIT; + return; + + case 5: /* DRPS */ + if (op3 != 0 || op4 != 0 || rn != 0x1f) { + goto do_unallocated; + } else { + unallocated_encoding(s); + } + return; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + + switch (btype_mod) { + case 0: /* BR */ + if (dc_isar_feature(aa64_bti, s)) { + /* BR to {x16,x17} or !guard -> 1, else 3. */ + set_btype(s, rn == 16 || rn == 17 || !s->guarded_page ? 1 : 3); + } + break; + + case 1: /* BLR */ + if (dc_isar_feature(aa64_bti, s)) { + /* BLR sets BTYPE to 2, regardless of source guarded page. */ + set_btype(s, 2); + } + break; + + default: /* RET or none of the above. */ + /* BTYPE will be set to 0 by normal end-of-insn processing. 
*/ + break; + } + + s->base.is_jmp = DISAS_JUMP; +} + +/* Branches, exception generating and system instructions */ +static void disas_b_exc_sys(DisasContext *s, uint32_t insn) +{ + switch (extract32(insn, 25, 7)) { + case 0x0a: case 0x0b: + case 0x4a: case 0x4b: /* Unconditional branch (immediate) */ + disas_uncond_b_imm(s, insn); + break; + case 0x1a: case 0x5a: /* Compare & branch (immediate) */ + disas_comp_b_imm(s, insn); + break; + case 0x1b: case 0x5b: /* Test & branch (immediate) */ + disas_test_b_imm(s, insn); + break; + case 0x2a: /* Conditional branch (immediate) */ + disas_cond_b_imm(s, insn); + break; + case 0x6a: /* Exception generation / System */ + if (insn & (1 << 24)) { + if (extract32(insn, 22, 2) == 0) { + disas_system(s, insn); + } else { + unallocated_encoding(s); + } + } else { + disas_exc(s, insn); + } + break; + case 0x6b: /* Unconditional branch (register) */ + disas_uncond_b_reg(s, insn); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* + * Load/Store exclusive instructions are implemented by remembering + * the value/address loaded, and seeing if these are the same + * when the store is performed. This is not actually the architecturally + * mandated semantics, but it works for typical guest code sequences + * and avoids having to monitor regular stores. + * + * The store exclusive uses the atomic cmpxchg primitives to avoid + * races in multi-threaded linux-user and when MTTCG softmmu is + * enabled. + */ +static void gen_load_exclusive(DisasContext *s, int rt, int rt2, + TCGv_i64 addr, int size, bool is_pair) +{ + int idx = get_mem_index(s); + MemOp memop = s->be_data; + + g_assert(size <= 3); + if (is_pair) { + g_assert(size >= 2); + if (size == 2) { + /* The pair must be single-copy atomic for the doubleword. */ + memop |= MO_64 | MO_ALIGN; + tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); + if (s->be_data == MO_LE) { + tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 0, 32); + tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 32, 32); + } else { + tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 32, 32); + tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 0, 32); + } + } else { + /* The pair must be single-copy atomic for *each* doubleword, not + the entire quadword, however it must be quadword aligned. 
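+               Hence the MO_ALIGN_16 on the first 8-byte load below, which
+               provides the 16-byte alignment check, while the second
+               doubleword is loaded with a plain 8-byte access.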
*/ + memop |= MO_64; + tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, + memop | MO_ALIGN_16); + + TCGv_i64 addr2 = tcg_temp_new_i64(); + tcg_gen_addi_i64(addr2, addr, 8); + tcg_gen_qemu_ld_i64(cpu_exclusive_high, addr2, idx, memop); + tcg_temp_free_i64(addr2); + + tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); + tcg_gen_mov_i64(cpu_reg(s, rt2), cpu_exclusive_high); + } + } else { + memop |= size | MO_ALIGN; + tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); + tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); + } + tcg_gen_mov_i64(cpu_exclusive_addr, addr); +} + +static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, + TCGv_i64 addr, int size, int is_pair) +{ + /* if (env->exclusive_addr == addr && env->exclusive_val == [addr] + * && (!is_pair || env->exclusive_high == [addr + datasize])) { + * [addr] = {Rt}; + * if (is_pair) { + * [addr + datasize] = {Rt2}; + * } + * {Rd} = 0; + * } else { + * {Rd} = 1; + * } + * env->exclusive_addr = -1; + */ + TCGLabel *fail_label = gen_new_label(); + TCGLabel *done_label = gen_new_label(); + TCGv_i64 tmp; + + tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label); + + tmp = tcg_temp_new_i64(); + if (is_pair) { + if (size == 2) { + if (s->be_data == MO_LE) { + tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2)); + } else { + tcg_gen_concat32_i64(tmp, cpu_reg(s, rt2), cpu_reg(s, rt)); + } + tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, + cpu_exclusive_val, tmp, + get_mem_index(s), + MO_64 | MO_ALIGN | s->be_data); + tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); + } else { + TCGv_i128 t16 = tcg_temp_new_i128(); + TCGv_i128 c16 = tcg_temp_new_i128(); + TCGv_i64 a, b; + + if (s->be_data == MO_LE) { + tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt), cpu_reg(s, rt2)); + tcg_gen_concat_i64_i128(c16, cpu_exclusive_val, + cpu_exclusive_high); + } else { + tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt2), cpu_reg(s, rt)); + tcg_gen_concat_i64_i128(c16, cpu_exclusive_high, + cpu_exclusive_val); + } + + tcg_gen_atomic_cmpxchg_i128(t16, cpu_exclusive_addr, c16, t16, + get_mem_index(s), + MO_128 | MO_ALIGN | s->be_data); + tcg_temp_free_i128(c16); + + a = tcg_temp_new_i64(); + b = tcg_temp_new_i64(); + if (s->be_data == MO_LE) { + tcg_gen_extr_i128_i64(a, b, t16); + } else { + tcg_gen_extr_i128_i64(b, a, t16); + } + + tcg_gen_xor_i64(a, a, cpu_exclusive_val); + tcg_gen_xor_i64(b, b, cpu_exclusive_high); + tcg_gen_or_i64(tmp, a, b); + tcg_temp_free_i64(a); + tcg_temp_free_i64(b); + tcg_temp_free_i128(t16); + + tcg_gen_setcondi_i64(TCG_COND_NE, tmp, tmp, 0); + } + } else { + tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val, + cpu_reg(s, rt), get_mem_index(s), + size | MO_ALIGN | s->be_data); + tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); + } + tcg_gen_mov_i64(cpu_reg(s, rd), tmp); + tcg_temp_free_i64(tmp); + tcg_gen_br(done_label); + + gen_set_label(fail_label); + tcg_gen_movi_i64(cpu_reg(s, rd), 1); + gen_set_label(done_label); + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +static void gen_compare_and_swap(DisasContext *s, int rs, int rt, + int rn, int size) +{ + TCGv_i64 tcg_rs = cpu_reg(s, rs); + TCGv_i64 tcg_rt = cpu_reg(s, rt); + int memidx = get_mem_index(s); + TCGv_i64 clean_addr; + + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size); + tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, memidx, + size | MO_ALIGN | s->be_data); +} + +static void 
gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, + int rn, int size) +{ + TCGv_i64 s1 = cpu_reg(s, rs); + TCGv_i64 s2 = cpu_reg(s, rs + 1); + TCGv_i64 t1 = cpu_reg(s, rt); + TCGv_i64 t2 = cpu_reg(s, rt + 1); + TCGv_i64 clean_addr; + int memidx = get_mem_index(s); + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + /* This is a single atomic access, despite the "pair". */ + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size + 1); + + if (size == 2) { + TCGv_i64 cmp = tcg_temp_new_i64(); + TCGv_i64 val = tcg_temp_new_i64(); + + if (s->be_data == MO_LE) { + tcg_gen_concat32_i64(val, t1, t2); + tcg_gen_concat32_i64(cmp, s1, s2); + } else { + tcg_gen_concat32_i64(val, t2, t1); + tcg_gen_concat32_i64(cmp, s2, s1); + } + + tcg_gen_atomic_cmpxchg_i64(cmp, clean_addr, cmp, val, memidx, + MO_64 | MO_ALIGN | s->be_data); + tcg_temp_free_i64(val); + + if (s->be_data == MO_LE) { + tcg_gen_extr32_i64(s1, s2, cmp); + } else { + tcg_gen_extr32_i64(s2, s1, cmp); + } + tcg_temp_free_i64(cmp); + } else { + TCGv_i128 cmp = tcg_temp_new_i128(); + TCGv_i128 val = tcg_temp_new_i128(); + + if (s->be_data == MO_LE) { + tcg_gen_concat_i64_i128(val, t1, t2); + tcg_gen_concat_i64_i128(cmp, s1, s2); + } else { + tcg_gen_concat_i64_i128(val, t2, t1); + tcg_gen_concat_i64_i128(cmp, s2, s1); + } + + tcg_gen_atomic_cmpxchg_i128(cmp, clean_addr, cmp, val, memidx, + MO_128 | MO_ALIGN | s->be_data); + tcg_temp_free_i128(val); + + if (s->be_data == MO_LE) { + tcg_gen_extr_i128_i64(s1, s2, cmp); + } else { + tcg_gen_extr_i128_i64(s2, s1, cmp); + } + tcg_temp_free_i128(cmp); + } +} + +/* Update the Sixty-Four bit (SF) registersize. This logic is derived + * from the ARMv8 specs for LDR (Shared decode for all encodings). + */ +static bool disas_ldst_compute_iss_sf(int size, bool is_signed, int opc) +{ + int opc0 = extract32(opc, 0, 1); + int regsize; + + if (is_signed) { + regsize = opc0 ? 32 : 64; + } else { + regsize = size == 3 ? 
64 : 32; + } + return regsize == 64; +} + +/* Load/store exclusive + * + * 31 30 29 24 23 22 21 20 16 15 14 10 9 5 4 0 + * +-----+-------------+----+---+----+------+----+-------+------+------+ + * | sz | 0 0 1 0 0 0 | o2 | L | o1 | Rs | o0 | Rt2 | Rn | Rt | + * +-----+-------------+----+---+----+------+----+-------+------+------+ + * + * sz: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64 bit + * L: 0 -> store, 1 -> load + * o2: 0 -> exclusive, 1 -> not + * o1: 0 -> single register, 1 -> register pair + * o0: 1 -> load-acquire/store-release, 0 -> not + */ +static void disas_ldst_excl(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rt2 = extract32(insn, 10, 5); + int rs = extract32(insn, 16, 5); + int is_lasr = extract32(insn, 15, 1); + int o2_L_o1_o0 = extract32(insn, 21, 3) * 2 | is_lasr; + int size = extract32(insn, 30, 2); + TCGv_i64 clean_addr; + + switch (o2_L_o1_o0) { + case 0x0: /* STXR */ + case 0x1: /* STLXR */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); + gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, false); + return; + + case 0x4: /* LDXR */ + case 0x5: /* LDAXR */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); + s->is_ldex = true; + gen_load_exclusive(s, rt, rt2, clean_addr, size, false); + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } + return; + + case 0x8: /* STLLR */ + if (!dc_isar_feature(aa64_lor, s)) { + break; + } + /* StoreLORelease is the same as Store-Release for QEMU. */ + /* fall through */ + case 0x9: /* STLR */ + /* Generate ISS for non-exclusive accesses including LASR. */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); + /* TODO: ARMv8.4-LSE SCTLR.nAA */ + do_gpr_st(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, true, rt, + disas_ldst_compute_iss_sf(size, false, 0), is_lasr); + return; + + case 0xc: /* LDLAR */ + if (!dc_isar_feature(aa64_lor, s)) { + break; + } + /* LoadLOAcquire is the same as Load-Acquire for QEMU. */ + /* fall through */ + case 0xd: /* LDAR */ + /* Generate ISS for non-exclusive accesses including LASR. 
*/ + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); + /* TODO: ARMv8.4-LSE SCTLR.nAA */ + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, false, true, + rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + return; + + case 0x2: case 0x3: /* CASP / STXP */ + if (size & 2) { /* STXP / STLXP */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); + gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, true); + return; + } + if (rt2 == 31 + && ((rt | rs) & 1) == 0 + && dc_isar_feature(aa64_atomics, s)) { + /* CASP / CASPL */ + gen_compare_and_swap_pair(s, rs, rt, rn, size | 2); + return; + } + break; + + case 0x6: case 0x7: /* CASPA / LDXP */ + if (size & 2) { /* LDXP / LDAXP */ + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); + s->is_ldex = true; + gen_load_exclusive(s, rt, rt2, clean_addr, size, true); + if (is_lasr) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } + return; + } + if (rt2 == 31 + && ((rt | rs) & 1) == 0 + && dc_isar_feature(aa64_atomics, s)) { + /* CASPA / CASPAL */ + gen_compare_and_swap_pair(s, rs, rt, rn, size | 2); + return; + } + break; + + case 0xa: /* CAS */ + case 0xb: /* CASL */ + case 0xe: /* CASA */ + case 0xf: /* CASAL */ + if (rt2 == 31 && dc_isar_feature(aa64_atomics, s)) { + gen_compare_and_swap(s, rs, rt, rn, size); + return; + } + break; + } + unallocated_encoding(s); +} + +/* + * Load register (literal) + * + * 31 30 29 27 26 25 24 23 5 4 0 + * +-----+-------+---+-----+-------------------+-------+ + * | opc | 0 1 1 | V | 0 0 | imm19 | Rt | + * +-----+-------+---+-----+-------------------+-------+ + * + * V: 1 -> vector (simd/fp) + * opc (non-vector): 00 -> 32 bit, 01 -> 64 bit, + * 10-> 32 bit signed, 11 -> prefetch + * opc (vector): 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit (11 unallocated) + */ +static void disas_ld_lit(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int64_t imm = sextract32(insn, 5, 19) << 2; + bool is_vector = extract32(insn, 26, 1); + int opc = extract32(insn, 30, 2); + bool is_signed = false; + int size = 2; + TCGv_i64 tcg_rt, clean_addr; + + if (is_vector) { + if (opc == 3) { + unallocated_encoding(s); + return; + } + size = 2 + opc; + if (!fp_access_check(s)) { + return; + } + } else { + if (opc == 3) { + /* PRFM (literal) : prefetch */ + return; + } + size = 2 + extract32(opc, 0, 1); + is_signed = extract32(opc, 1, 1); + } + + tcg_rt = cpu_reg(s, rt); + + clean_addr = new_tmp_a64(s); + gen_pc_plus_diff(s, clean_addr, imm); + if (is_vector) { + do_fp_ld(s, rt, clean_addr, size); + } else { + /* Only unsigned 32bit loads target 32bit registers. 
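+         * LDRSW (opc == 2) and the 64-bit load (opc == 1) both write a
+         * 64-bit register, hence iss_sf = (opc != 0) below.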
*/ + bool iss_sf = opc != 0; + + do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + false, true, rt, iss_sf, false); + } +} + +/* + * LDNP (Load Pair - non-temporal hint) + * LDP (Load Pair - non vector) + * LDPSW (Load Pair Signed Word - non vector) + * STNP (Store Pair - non-temporal hint) + * STP (Store Pair - non vector) + * LDNP (Load Pair of SIMD&FP - non-temporal hint) + * LDP (Load Pair of SIMD&FP) + * STNP (Store Pair of SIMD&FP - non-temporal hint) + * STP (Store Pair of SIMD&FP) + * + * 31 30 29 27 26 25 24 23 22 21 15 14 10 9 5 4 0 + * +-----+-------+---+---+-------+---+-----------------------------+ + * | opc | 1 0 1 | V | 0 | index | L | imm7 | Rt2 | Rn | Rt | + * +-----+-------+---+---+-------+---+-------+-------+------+------+ + * + * opc: LDP/STP/LDNP/STNP 00 -> 32 bit, 10 -> 64 bit + * LDPSW/STGP 01 + * LDP/STP/LDNP/STNP (SIMD) 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit + * V: 0 -> GPR, 1 -> Vector + * idx: 00 -> signed offset with non-temporal hint, 01 -> post-index, + * 10 -> signed offset, 11 -> pre-index + * L: 0 -> Store 1 -> Load + * + * Rt, Rt2 = GPR or SIMD registers to be stored + * Rn = general purpose register containing address + * imm7 = signed offset (multiple of 4 or 8 depending on size) + */ +static void disas_ldst_pair(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rt2 = extract32(insn, 10, 5); + uint64_t offset = sextract64(insn, 15, 7); + int index = extract32(insn, 23, 2); + bool is_vector = extract32(insn, 26, 1); + bool is_load = extract32(insn, 22, 1); + int opc = extract32(insn, 30, 2); + + bool is_signed = false; + bool postindex = false; + bool wback = false; + bool set_tag = false; + + TCGv_i64 clean_addr, dirty_addr; + + int size; + + if (opc == 3) { + unallocated_encoding(s); + return; + } + + if (is_vector) { + size = 2 + opc; + } else if (opc == 1 && !is_load) { + /* STGP */ + if (!dc_isar_feature(aa64_mte_insn_reg, s) || index == 0) { + unallocated_encoding(s); + return; + } + size = 3; + set_tag = true; + } else { + size = 2 + extract32(opc, 1, 1); + is_signed = extract32(opc, 0, 1); + if (!is_load && is_signed) { + unallocated_encoding(s); + return; + } + } + + switch (index) { + case 1: /* post-index */ + postindex = true; + wback = true; + break; + case 0: + /* signed offset with "non-temporal" hint. Since we don't emulate + * caches we don't care about hints to the cache system about + * data access patterns, and handle this identically to plain + * signed offset. + */ + if (is_signed) { + /* There is no non-temporal-hint version of LDPSW */ + unallocated_encoding(s); + return; + } + postindex = false; + break; + case 2: /* signed offset, rn not updated */ + postindex = false; + break; + case 3: /* pre-index */ + postindex = false; + wback = true; + break; + } + + if (is_vector && !fp_access_check(s)) { + return; + } + + offset <<= (set_tag ? LOG2_TAG_GRANULE : size); + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + dirty_addr = read_cpu_reg_sp(s, rn, 1); + if (!postindex) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + } + + if (set_tag) { + if (!s->ata) { + /* + * TODO: We could rely on the stores below, at least for + * system mode, if we arrange to add MO_ALIGN_16. 
+ */ + gen_helper_stg_stub(cpu_env, dirty_addr); + } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { + gen_helper_stg_parallel(cpu_env, dirty_addr, dirty_addr); + } else { + gen_helper_stg(cpu_env, dirty_addr, dirty_addr); + } + } + + clean_addr = gen_mte_checkN(s, dirty_addr, !is_load, + (wback || rn != 31) && !set_tag, 2 << size); + + if (is_vector) { + if (is_load) { + do_fp_ld(s, rt, clean_addr, size); + } else { + do_fp_st(s, rt, clean_addr, size); + } + tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); + if (is_load) { + do_fp_ld(s, rt2, clean_addr, size); + } else { + do_fp_st(s, rt2, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + TCGv_i64 tcg_rt2 = cpu_reg(s, rt2); + + if (is_load) { + TCGv_i64 tmp = tcg_temp_new_i64(); + + /* Do not modify tcg_rt before recognizing any exception + * from the second load. + */ + do_gpr_ld(s, tmp, clean_addr, size + is_signed * MO_SIGN, + false, false, 0, false, false); + tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); + do_gpr_ld(s, tcg_rt2, clean_addr, size + is_signed * MO_SIGN, + false, false, 0, false, false); + + tcg_gen_mov_i64(tcg_rt, tmp); + tcg_temp_free_i64(tmp); + } else { + do_gpr_st(s, tcg_rt, clean_addr, size, + false, 0, false, false); + tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); + do_gpr_st(s, tcg_rt2, clean_addr, size, + false, 0, false, false); + } + } + + if (wback) { + if (postindex) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + } + tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr); + } +} + +/* + * Load/store (immediate post-indexed) + * Load/store (immediate pre-indexed) + * Load/store (unscaled immediate) + * + * 31 30 29 27 26 25 24 23 22 21 20 12 11 10 9 5 4 0 + * +----+-------+---+-----+-----+---+--------+-----+------+------+ + * |size| 1 1 1 | V | 0 0 | opc | 0 | imm9 | idx | Rn | Rt | + * +----+-------+---+-----+-----+---+--------+-----+------+------+ + * + * idx = 01 -> post-indexed, 11 pre-indexed, 00 unscaled imm. 
(no writeback) + 10 -> unprivileged + * V = 0 -> non-vector + * size: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64bit + * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 + */ +static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, + int opc, + int size, + int rt, + bool is_vector) +{ + int rn = extract32(insn, 5, 5); + int imm9 = sextract32(insn, 12, 9); + int idx = extract32(insn, 10, 2); + bool is_signed = false; + bool is_store = false; + bool is_extended = false; + bool is_unpriv = (idx == 2); + bool iss_valid; + bool post_index; + bool writeback; + int memidx; + + TCGv_i64 clean_addr, dirty_addr; + + if (is_vector) { + size |= (opc & 2) << 1; + if (size > 4 || is_unpriv) { + unallocated_encoding(s); + return; + } + is_store = ((opc & 1) == 0); + if (!fp_access_check(s)) { + return; + } + } else { + if (size == 3 && opc == 2) { + /* PRFM - prefetch */ + if (idx != 0) { + unallocated_encoding(s); + return; + } + return; + } + if (opc == 3 && size > 1) { + unallocated_encoding(s); + return; + } + is_store = (opc == 0); + is_signed = extract32(opc, 1, 1); + is_extended = (size < 3) && extract32(opc, 0, 1); + } + + switch (idx) { + case 0: + case 2: + post_index = false; + writeback = false; + break; + case 1: + post_index = true; + writeback = true; + break; + case 3: + post_index = false; + writeback = true; + break; + default: + g_assert_not_reached(); + } + + iss_valid = !is_vector && !writeback; + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + dirty_addr = read_cpu_reg_sp(s, rn, 1); + if (!post_index) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); + } + + memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s); + clean_addr = gen_mte_check1_mmuidx(s, dirty_addr, is_store, + writeback || rn != 31, + size, is_unpriv, memidx); + + if (is_vector) { + if (is_store) { + do_fp_st(s, rt, clean_addr, size); + } else { + do_fp_ld(s, rt, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + + if (is_store) { + do_gpr_st_memidx(s, tcg_rt, clean_addr, size, memidx, + iss_valid, rt, iss_sf, false); + } else { + do_gpr_ld_memidx(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + is_extended, memidx, + iss_valid, rt, iss_sf, false); + } + } + + if (writeback) { + TCGv_i64 tcg_rn = cpu_reg_sp(s, rn); + if (post_index) { + tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); + } + tcg_gen_mov_i64(tcg_rn, dirty_addr); + } +} + +/* + * Load/store (register offset) + * + * 31 30 29 27 26 25 24 23 22 21 20 16 15 13 12 11 10 9 5 4 0 + * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+ + * |size| 1 1 1 | V | 0 0 | opc | 1 | Rm | opt | S| 1 0 | Rn | Rt | + * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+ + * + * For non-vector: + * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit + * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 + * For vector: + * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated + * opc<0>: 0 -> store, 1 -> load + * V: 1 -> vector/simd + * opt: extend encoding (see DecodeRegExtend) + * S: if S=1 then scale (essentially index by sizeof(size)) + * Rt: register to transfer into/out of + * Rn: address register or SP for base + * Rm: offset register or ZR for offset + */ +static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, + int opc, + int size, + int rt, + bool is_vector) +{ + int rn = extract32(insn, 5, 5); + int shift = extract32(insn, 12, 1); + int rm = extract32(insn, 
16, 5); + int opt = extract32(insn, 13, 3); + bool is_signed = false; + bool is_store = false; + bool is_extended = false; + + TCGv_i64 tcg_rm, clean_addr, dirty_addr; + + if (extract32(opt, 1, 1) == 0) { + unallocated_encoding(s); + return; + } + + if (is_vector) { + size |= (opc & 2) << 1; + if (size > 4) { + unallocated_encoding(s); + return; + } + is_store = !extract32(opc, 0, 1); + if (!fp_access_check(s)) { + return; + } + } else { + if (size == 3 && opc == 2) { + /* PRFM - prefetch */ + return; + } + if (opc == 3 && size > 1) { + unallocated_encoding(s); + return; + } + is_store = (opc == 0); + is_signed = extract32(opc, 1, 1); + is_extended = (size < 3) && extract32(opc, 0, 1); + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + dirty_addr = read_cpu_reg_sp(s, rn, 1); + + tcg_rm = read_cpu_reg(s, rm, 1); + ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0); + + tcg_gen_add_i64(dirty_addr, dirty_addr, tcg_rm); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, size); + + if (is_vector) { + if (is_store) { + do_fp_st(s, rt, clean_addr, size); + } else { + do_fp_ld(s, rt, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + if (is_store) { + do_gpr_st(s, tcg_rt, clean_addr, size, + true, rt, iss_sf, false); + } else { + do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + is_extended, true, rt, iss_sf, false); + } + } +} + +/* + * Load/store (unsigned immediate) + * + * 31 30 29 27 26 25 24 23 22 21 10 9 5 + * +----+-------+---+-----+-----+------------+-------+------+ + * |size| 1 1 1 | V | 0 1 | opc | imm12 | Rn | Rt | + * +----+-------+---+-----+-----+------------+-------+------+ + * + * For non-vector: + * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit + * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 + * For vector: + * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated + * opc<0>: 0 -> store, 1 -> load + * Rn: base address register (inc SP) + * Rt: target register + */ +static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, + int opc, + int size, + int rt, + bool is_vector) +{ + int rn = extract32(insn, 5, 5); + unsigned int imm12 = extract32(insn, 10, 12); + unsigned int offset; + + TCGv_i64 clean_addr, dirty_addr; + + bool is_store; + bool is_signed = false; + bool is_extended = false; + + if (is_vector) { + size |= (opc & 2) << 1; + if (size > 4) { + unallocated_encoding(s); + return; + } + is_store = !extract32(opc, 0, 1); + if (!fp_access_check(s)) { + return; + } + } else { + if (size == 3 && opc == 2) { + /* PRFM - prefetch */ + return; + } + if (opc == 3 && size > 1) { + unallocated_encoding(s); + return; + } + is_store = (opc == 0); + is_signed = extract32(opc, 1, 1); + is_extended = (size < 3) && extract32(opc, 0, 1); + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + dirty_addr = read_cpu_reg_sp(s, rn, 1); + offset = imm12 << size; + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, size); + + if (is_vector) { + if (is_store) { + do_fp_st(s, rt, clean_addr, size); + } else { + do_fp_ld(s, rt, clean_addr, size); + } + } else { + TCGv_i64 tcg_rt = cpu_reg(s, rt); + bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + if (is_store) { + do_gpr_st(s, tcg_rt, clean_addr, size, + true, rt, iss_sf, false); + } else { + do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + is_extended, true, rt, iss_sf, false); + 
} + } +} + +/* Atomic memory operations + * + * 31 30 27 26 24 22 21 16 15 12 10 5 0 + * +------+-------+---+-----+-----+---+----+----+-----+-----+----+-----+ + * | size | 1 1 1 | V | 0 0 | A R | 1 | Rs | o3 | opc | 0 0 | Rn | Rt | + * +------+-------+---+-----+-----+--------+----+-----+-----+----+-----+ + * + * Rt: the result register + * Rn: base address or SP + * Rs: the source register for the operation + * V: vector flag (always 0 as of v8.3) + * A: acquire flag + * R: release flag + */ +static void disas_ldst_atomic(DisasContext *s, uint32_t insn, + int size, int rt, bool is_vector) +{ + int rs = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int o3_opc = extract32(insn, 12, 4); + bool r = extract32(insn, 22, 1); + bool a = extract32(insn, 23, 1); + TCGv_i64 tcg_rs, tcg_rt, clean_addr; + AtomicThreeOpFn *fn = NULL; + MemOp mop = s->be_data | size | MO_ALIGN; + + if (is_vector || !dc_isar_feature(aa64_atomics, s)) { + unallocated_encoding(s); + return; + } + switch (o3_opc) { + case 000: /* LDADD */ + fn = tcg_gen_atomic_fetch_add_i64; + break; + case 001: /* LDCLR */ + fn = tcg_gen_atomic_fetch_and_i64; + break; + case 002: /* LDEOR */ + fn = tcg_gen_atomic_fetch_xor_i64; + break; + case 003: /* LDSET */ + fn = tcg_gen_atomic_fetch_or_i64; + break; + case 004: /* LDSMAX */ + fn = tcg_gen_atomic_fetch_smax_i64; + mop |= MO_SIGN; + break; + case 005: /* LDSMIN */ + fn = tcg_gen_atomic_fetch_smin_i64; + mop |= MO_SIGN; + break; + case 006: /* LDUMAX */ + fn = tcg_gen_atomic_fetch_umax_i64; + break; + case 007: /* LDUMIN */ + fn = tcg_gen_atomic_fetch_umin_i64; + break; + case 010: /* SWP */ + fn = tcg_gen_atomic_xchg_i64; + break; + case 014: /* LDAPR, LDAPRH, LDAPRB */ + if (!dc_isar_feature(aa64_rcpc_8_3, s) || + rs != 31 || a != 1 || r != 0) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, size); + + if (o3_opc == 014) { + /* + * LDAPR* are a special case because they are a simple load, not a + * fetch-and-do-something op. + * The architectural consistency requirements here are weaker than + * full load-acquire (we only need "load-acquire processor consistent"), + * but we choose to implement them as full LDAQ. + */ + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size, false, + true, rt, disas_ldst_compute_iss_sf(size, false, 0), true); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + return; + } + + tcg_rs = read_cpu_reg(s, rs, true); + tcg_rt = cpu_reg(s, rt); + + if (o3_opc == 1) { /* LDCLR */ + tcg_gen_not_i64(tcg_rs, tcg_rs); + } + + /* The tcg atomic primitives are all full barriers. Therefore we + * can ignore the Acquire and Release bits of this instruction. + */ + fn(tcg_rt, clean_addr, tcg_rs, get_mem_index(s), mop); + + if ((mop & MO_SIGN) && size != MO_64) { + tcg_gen_ext32u_i64(tcg_rt, tcg_rt); + } +} + +/* + * PAC memory operations + * + * 31 30 27 26 24 22 21 12 11 10 5 0 + * +------+-------+---+-----+-----+---+--------+---+---+----+-----+ + * | size | 1 1 1 | V | 0 0 | M S | 1 | imm9 | W | 1 | Rn | Rt | + * +------+-------+---+-----+-----+---+--------+---+---+----+-----+ + * + * Rt: the result register + * Rn: base address or SP + * V: vector flag (always 0 as of v8.3) + * M: clear for key DA, set for key DB + * W: pre-indexing flag + * S: sign for imm9. 
+ */ +static void disas_ldst_pac(DisasContext *s, uint32_t insn, + int size, int rt, bool is_vector) +{ + int rn = extract32(insn, 5, 5); + bool is_wback = extract32(insn, 11, 1); + bool use_key_a = !extract32(insn, 23, 1); + int offset; + TCGv_i64 clean_addr, dirty_addr, tcg_rt; + + if (size != 3 || is_vector || !dc_isar_feature(aa64_pauth, s)) { + unallocated_encoding(s); + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + dirty_addr = read_cpu_reg_sp(s, rn, 1); + + if (s->pauth_active) { + if (use_key_a) { + gen_helper_autda(dirty_addr, cpu_env, dirty_addr, + new_tmp_a64_zero(s)); + } else { + gen_helper_autdb(dirty_addr, cpu_env, dirty_addr, + new_tmp_a64_zero(s)); + } + } + + /* Form the 10-bit signed, scaled offset. */ + offset = (extract32(insn, 22, 1) << 9) | extract32(insn, 12, 9); + offset = sextract32(offset << size, 0, 10 + size); + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + + /* Note that "clean" and "dirty" here refer to TBI not PAC. */ + clean_addr = gen_mte_check1(s, dirty_addr, false, + is_wback || rn != 31, size); + + tcg_rt = cpu_reg(s, rt); + do_gpr_ld(s, tcg_rt, clean_addr, size, + /* extend */ false, /* iss_valid */ !is_wback, + /* iss_srt */ rt, /* iss_sf */ true, /* iss_ar */ false); + + if (is_wback) { + tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr); + } +} + +/* + * LDAPR/STLR (unscaled immediate) + * + * 31 30 24 22 21 12 10 5 0 + * +------+-------------+-----+---+--------+-----+----+-----+ + * | size | 0 1 1 0 0 1 | opc | 0 | imm9 | 0 0 | Rn | Rt | + * +------+-------------+-----+---+--------+-----+----+-----+ + * + * Rt: source or destination register + * Rn: base register + * imm9: unscaled immediate offset + * opc: 00: STLUR*, 01/10/11: various LDAPUR* + * size: size of load/store + */ +static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int offset = sextract32(insn, 12, 9); + int opc = extract32(insn, 22, 2); + int size = extract32(insn, 30, 2); + TCGv_i64 clean_addr, dirty_addr; + bool is_store = false; + bool extend = false; + bool iss_sf; + MemOp mop; + + if (!dc_isar_feature(aa64_rcpc_8_4, s)) { + unallocated_encoding(s); + return; + } + + /* TODO: ARMv8.4-LSE SCTLR.nAA */ + mop = size | MO_ALIGN; + + switch (opc) { + case 0: /* STLURB */ + is_store = true; + break; + case 1: /* LDAPUR* */ + break; + case 2: /* LDAPURS* 64-bit variant */ + if (size == 3) { + unallocated_encoding(s); + return; + } + mop |= MO_SIGN; + break; + case 3: /* LDAPURS* 32-bit variant */ + if (size > 1) { + unallocated_encoding(s); + return; + } + mop |= MO_SIGN; + extend = true; /* zero-extend 32->64 after signed load */ + break; + default: + g_assert_not_reached(); + } + + iss_sf = disas_ldst_compute_iss_sf(size, (mop & MO_SIGN) != 0, opc); + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + dirty_addr = read_cpu_reg_sp(s, rn, 1); + tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + clean_addr = clean_data_tbi(s, dirty_addr); + + if (is_store) { + /* Store-Release semantics */ + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + do_gpr_st(s, cpu_reg(s, rt), clean_addr, mop, true, rt, iss_sf, true); + } else { + /* + * Load-AcquirePC semantics; we implement as the slightly more + * restrictive Load-Acquire. 
+ */ + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, mop, + extend, true, rt, iss_sf, true); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } +} + +/* Load/store register (all forms) */ +static void disas_ldst_reg(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int opc = extract32(insn, 22, 2); + bool is_vector = extract32(insn, 26, 1); + int size = extract32(insn, 30, 2); + + switch (extract32(insn, 24, 2)) { + case 0: + if (extract32(insn, 21, 1) == 0) { + /* Load/store register (unscaled immediate) + * Load/store immediate pre/post-indexed + * Load/store register unprivileged + */ + disas_ldst_reg_imm9(s, insn, opc, size, rt, is_vector); + return; + } + switch (extract32(insn, 10, 2)) { + case 0: + disas_ldst_atomic(s, insn, size, rt, is_vector); + return; + case 2: + disas_ldst_reg_roffset(s, insn, opc, size, rt, is_vector); + return; + default: + disas_ldst_pac(s, insn, size, rt, is_vector); + return; + } + break; + case 1: + disas_ldst_reg_unsigned_imm(s, insn, opc, size, rt, is_vector); + return; + } + unallocated_encoding(s); +} + +/* AdvSIMD load/store multiple structures + * + * 31 30 29 23 22 21 16 15 12 11 10 9 5 4 0 + * +---+---+---------------+---+-------------+--------+------+------+------+ + * | 0 | Q | 0 0 1 1 0 0 0 | L | 0 0 0 0 0 0 | opcode | size | Rn | Rt | + * +---+---+---------------+---+-------------+--------+------+------+------+ + * + * AdvSIMD load/store multiple structures (post-indexed) + * + * 31 30 29 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---------------+---+---+---------+--------+------+------+------+ + * | 0 | Q | 0 0 1 1 0 0 1 | L | 0 | Rm | opcode | size | Rn | Rt | + * +---+---+---------------+---+---+---------+--------+------+------+------+ + * + * Rt: first (or only) SIMD&FP register to be transferred + * Rn: base address or SP + * Rm (post-index only): post-index register (when !31) or size dependent #imm + */ +static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 10, 2); + int opcode = extract32(insn, 12, 4); + bool is_store = !extract32(insn, 22, 1); + bool is_postidx = extract32(insn, 23, 1); + bool is_q = extract32(insn, 30, 1); + TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; + MemOp endian, align, mop; + + int total; /* total bytes */ + int elements; /* elements per vector */ + int rpt; /* num iterations */ + int selem; /* structure elements */ + int r; + + if (extract32(insn, 31, 1) || extract32(insn, 21, 1)) { + unallocated_encoding(s); + return; + } + + if (!is_postidx && rm != 0) { + unallocated_encoding(s); + return; + } + + /* From the shared decode logic */ + switch (opcode) { + case 0x0: + rpt = 1; + selem = 4; + break; + case 0x2: + rpt = 4; + selem = 1; + break; + case 0x4: + rpt = 1; + selem = 3; + break; + case 0x6: + rpt = 3; + selem = 1; + break; + case 0x7: + rpt = 1; + selem = 1; + break; + case 0x8: + rpt = 1; + selem = 2; + break; + case 0xa: + rpt = 2; + selem = 1; + break; + default: + unallocated_encoding(s); + return; + } + + if (size == 3 && !is_q && selem != 1) { + /* reserved */ + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + /* For our purposes, bytes are always little-endian. */ + endian = s->be_data; + if (size == 0) { + endian = MO_LE; + } + + total = rpt * selem * (is_q ? 
16 : 8); + tcg_rn = cpu_reg_sp(s, rn); + + /* + * Issue the MTE check vs the logical repeat count, before we + * promote consecutive little-endian elements below. + */ + clean_addr = gen_mte_checkN(s, tcg_rn, is_store, is_postidx || rn != 31, + total); + + /* + * Consecutive little-endian elements from a single register + * can be promoted to a larger little-endian operation. + */ + align = MO_ALIGN; + if (selem == 1 && endian == MO_LE) { + align = pow2_align(size); + size = 3; + } + if (!s->align_mem) { + align = 0; + } + mop = endian | size | align; + + elements = (is_q ? 16 : 8) >> size; + tcg_ebytes = tcg_constant_i64(1 << size); + for (r = 0; r < rpt; r++) { + int e; + for (e = 0; e < elements; e++) { + int xs; + for (xs = 0; xs < selem; xs++) { + int tt = (rt + r + xs) % 32; + if (is_store) { + do_vec_st(s, tt, e, clean_addr, mop); + } else { + do_vec_ld(s, tt, e, clean_addr, mop); + } + tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes); + } + } + } + + if (!is_store) { + /* For non-quad operations, setting a slice of the low + * 64 bits of the register clears the high 64 bits (in + * the ARM ARM pseudocode this is implicit in the fact + * that 'rval' is a 64 bit wide variable). + * For quad operations, we might still need to zero the + * high bits of SVE. + */ + for (r = 0; r < rpt * selem; r++) { + int tt = (rt + r) % 32; + clear_vec_high(s, is_q, tt); + } + } + + if (is_postidx) { + if (rm == 31) { + tcg_gen_addi_i64(tcg_rn, tcg_rn, total); + } else { + tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); + } + } +} + +/* AdvSIMD load/store single structure + * + * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0 + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * | 0 | Q | 0 0 1 1 0 1 0 | L R | 0 0 0 0 0 | opc | S | size | Rn | Rt | + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * + * AdvSIMD load/store single structure (post-indexed) + * + * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0 + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * | 0 | Q | 0 0 1 1 0 1 1 | L R | Rm | opc | S | size | Rn | Rt | + * +---+---+---------------+-----+-----------+-----+---+------+------+------+ + * + * Rt: first (or only) SIMD&FP register to be transferred + * Rn: base address or SP + * Rm (post-index only): post-index register (when !31) or size dependent #imm + * index = encoded in Q:S:size dependent on size + * + * lane_size = encoded in R, opc + * transfer width = encoded in opc, S, size + */ +static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 10, 2); + int S = extract32(insn, 12, 1); + int opc = extract32(insn, 13, 3); + int R = extract32(insn, 21, 1); + int is_load = extract32(insn, 22, 1); + int is_postidx = extract32(insn, 23, 1); + int is_q = extract32(insn, 30, 1); + + int scale = extract32(opc, 1, 2); + int selem = (extract32(opc, 0, 1) << 1 | R) + 1; + bool replicate = false; + int index = is_q << 3 | S << 2 | size; + int xs, total; + TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; + MemOp mop; + + if (extract32(insn, 31, 1)) { + unallocated_encoding(s); + return; + } + if (!is_postidx && rm != 0) { + unallocated_encoding(s); + return; + } + + switch (scale) { + case 3: + if (!is_load || S) { + unallocated_encoding(s); + return; + } + scale = size; + replicate = true; + break; + case 0: + break; + case 1: + if (extract32(size, 0, 1)) { + 
unallocated_encoding(s); + return; + } + index >>= 1; + break; + case 2: + if (extract32(size, 1, 1)) { + unallocated_encoding(s); + return; + } + if (!extract32(size, 0, 1)) { + index >>= 2; + } else { + if (S) { + unallocated_encoding(s); + return; + } + index >>= 3; + scale = 3; + } + break; + default: + g_assert_not_reached(); + } + + if (!fp_access_check(s)) { + return; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + total = selem << scale; + tcg_rn = cpu_reg_sp(s, rn); + + clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31, + total); + mop = finalize_memop(s, scale); + + tcg_ebytes = tcg_constant_i64(1 << scale); + for (xs = 0; xs < selem; xs++) { + if (replicate) { + /* Load and replicate to all elements */ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + tcg_gen_qemu_ld_i64(tcg_tmp, clean_addr, get_mem_index(s), mop); + tcg_gen_gvec_dup_i64(scale, vec_full_reg_offset(s, rt), + (is_q + 1) * 8, vec_full_reg_size(s), + tcg_tmp); + tcg_temp_free_i64(tcg_tmp); + } else { + /* Load/store one element per register */ + if (is_load) { + do_vec_ld(s, rt, index, clean_addr, mop); + } else { + do_vec_st(s, rt, index, clean_addr, mop); + } + } + tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes); + rt = (rt + 1) % 32; + } + + if (is_postidx) { + if (rm == 31) { + tcg_gen_addi_i64(tcg_rn, tcg_rn, total); + } else { + tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); + } + } +} + +/* + * Load/Store memory tags + * + * 31 30 29 24 22 21 12 10 5 0 + * +-----+-------------+-----+---+------+-----+------+------+ + * | 1 1 | 0 1 1 0 0 1 | op1 | 1 | imm9 | op2 | Rn | Rt | + * +-----+-------------+-----+---+------+-----+------+------+ + */ +static void disas_ldst_tag(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + uint64_t offset = sextract64(insn, 12, 9) << LOG2_TAG_GRANULE; + int op2 = extract32(insn, 10, 2); + int op1 = extract32(insn, 22, 2); + bool is_load = false, is_pair = false, is_zero = false, is_mult = false; + int index = 0; + TCGv_i64 addr, clean_addr, tcg_rt; + + /* We checked insn bits [29:24,21] in the caller. */ + if (extract32(insn, 30, 2) != 3) { + goto do_unallocated; + } + + /* + * @index is a tri-state variable which has 3 states: + * < 0 : post-index, writeback + * = 0 : signed offset + * > 0 : pre-index, writeback + */ + switch (op1) { + case 0: + if (op2 != 0) { + /* STG */ + index = op2 - 2; + } else { + /* STZGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = is_zero = true; + } + break; + case 1: + if (op2 != 0) { + /* STZG */ + is_zero = true; + index = op2 - 2; + } else { + /* LDG */ + is_load = true; + } + break; + case 2: + if (op2 != 0) { + /* ST2G */ + is_pair = true; + index = op2 - 2; + } else { + /* STGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = true; + } + break; + case 3: + if (op2 != 0) { + /* STZ2G */ + is_pair = is_zero = true; + index = op2 - 2; + } else { + /* LDGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = is_load = true; + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + + if (is_mult + ? 
!dc_isar_feature(aa64_mte, s) + : !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + addr = read_cpu_reg_sp(s, rn, true); + if (index >= 0) { + /* pre-index or signed offset */ + tcg_gen_addi_i64(addr, addr, offset); + } + + if (is_mult) { + tcg_rt = cpu_reg(s, rt); + + if (is_zero) { + int size = 4 << s->dcz_blocksize; + + if (s->ata) { + gen_helper_stzgm_tags(cpu_env, addr, tcg_rt); + } + /* + * The non-tags portion of STZGM is mostly like DC_ZVA, + * except the alignment happens before the access. + */ + clean_addr = clean_data_tbi(s, addr); + tcg_gen_andi_i64(clean_addr, clean_addr, -size); + gen_helper_dc_zva(cpu_env, clean_addr); + } else if (s->ata) { + if (is_load) { + gen_helper_ldgm(tcg_rt, cpu_env, addr); + } else { + gen_helper_stgm(cpu_env, addr, tcg_rt); + } + } else { + MMUAccessType acc = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE; + int size = 4 << GMID_EL1_BS; + + clean_addr = clean_data_tbi(s, addr); + tcg_gen_andi_i64(clean_addr, clean_addr, -size); + gen_probe_access(s, clean_addr, acc, size); + + if (is_load) { + /* The result tags are zeros. */ + tcg_gen_movi_i64(tcg_rt, 0); + } + } + return; + } + + if (is_load) { + tcg_gen_andi_i64(addr, addr, -TAG_GRANULE); + tcg_rt = cpu_reg(s, rt); + if (s->ata) { + gen_helper_ldg(tcg_rt, cpu_env, addr, tcg_rt); + } else { + clean_addr = clean_data_tbi(s, addr); + gen_probe_access(s, clean_addr, MMU_DATA_LOAD, MO_8); + gen_address_with_allocation_tag0(tcg_rt, addr); + } + } else { + tcg_rt = cpu_reg_sp(s, rt); + if (!s->ata) { + /* + * For STG and ST2G, we need to check alignment and probe memory. + * TODO: For STZG and STZ2G, we could rely on the stores below, + * at least for system mode; user-only won't enforce alignment. 
+ */ + if (is_pair) { + gen_helper_st2g_stub(cpu_env, addr); + } else { + gen_helper_stg_stub(cpu_env, addr); + } + } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { + if (is_pair) { + gen_helper_st2g_parallel(cpu_env, addr, tcg_rt); + } else { + gen_helper_stg_parallel(cpu_env, addr, tcg_rt); + } + } else { + if (is_pair) { + gen_helper_st2g(cpu_env, addr, tcg_rt); + } else { + gen_helper_stg(cpu_env, addr, tcg_rt); + } + } + } + + if (is_zero) { + TCGv_i64 clean_addr = clean_data_tbi(s, addr); + TCGv_i64 tcg_zero = tcg_constant_i64(0); + int mem_index = get_mem_index(s); + int i, n = (1 + is_pair) << LOG2_TAG_GRANULE; + + tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, + MO_UQ | MO_ALIGN_16); + for (i = 8; i < n; i += 8) { + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_UQ); + } + } + + if (index != 0) { + /* pre-index or post-index */ + if (index < 0) { + /* post-index */ + tcg_gen_addi_i64(addr, addr, offset); + } + tcg_gen_mov_i64(cpu_reg_sp(s, rn), addr); + } +} + +/* Loads and stores */ +static void disas_ldst(DisasContext *s, uint32_t insn) +{ + switch (extract32(insn, 24, 6)) { + case 0x08: /* Load/store exclusive */ + disas_ldst_excl(s, insn); + break; + case 0x18: case 0x1c: /* Load register (literal) */ + disas_ld_lit(s, insn); + break; + case 0x28: case 0x29: + case 0x2c: case 0x2d: /* Load/store pair (all forms) */ + disas_ldst_pair(s, insn); + break; + case 0x38: case 0x39: + case 0x3c: case 0x3d: /* Load/store register (all forms) */ + disas_ldst_reg(s, insn); + break; + case 0x0c: /* AdvSIMD load/store multiple structures */ + disas_ldst_multiple_struct(s, insn); + break; + case 0x0d: /* AdvSIMD load/store single structure */ + disas_ldst_single_struct(s, insn); + break; + case 0x19: + if (extract32(insn, 21, 1) != 0) { + disas_ldst_tag(s, insn); + } else if (extract32(insn, 10, 2) == 0) { + disas_ldst_ldapr_stlr(s, insn); + } else { + unallocated_encoding(s); + } + break; + default: + unallocated_encoding(s); + break; + } +} + +/* PC-rel. addressing + * 31 30 29 28 24 23 5 4 0 + * +----+-------+-----------+-------------------+------+ + * | op | immlo | 1 0 0 0 0 | immhi | Rd | + * +----+-------+-----------+-------------------+------+ + */ +static void disas_pc_rel_adr(DisasContext *s, uint32_t insn) +{ + unsigned int page, rd; + int64_t offset; + + page = extract32(insn, 31, 1); + /* SignExtend(immhi:immlo) -> offset */ + offset = sextract64(insn, 5, 19); + offset = offset << 2 | extract32(insn, 29, 2); + rd = extract32(insn, 0, 5); + + if (page) { + /* ADRP (page based) */ + offset <<= 12; + /* The page offset is ok for TARGET_TB_PCREL. */ + offset -= s->pc_curr & 0xfff; + } + + gen_pc_plus_diff(s, cpu_reg(s, rd), offset); +} + +/* + * Add/subtract (immediate) + * + * 31 30 29 28 23 22 21 10 9 5 4 0 + * +--+--+--+-------------+--+-------------+-----+-----+ + * |sf|op| S| 1 0 0 0 1 0 |sh| imm12 | Rn | Rd | + * +--+--+--+-------------+--+-------------+-----+-----+ + * + * sf: 0 -> 32bit, 1 -> 64bit + * op: 0 -> add , 1 -> sub + * S: 1 -> set flags + * sh: 1 -> LSL imm by 12 + */ +static void disas_add_sub_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + uint64_t imm = extract32(insn, 10, 12); + bool shift = extract32(insn, 22, 1); + bool setflags = extract32(insn, 29, 1); + bool sub_op = extract32(insn, 30, 1); + bool is_64bit = extract32(insn, 31, 1); + + TCGv_i64 tcg_rn = cpu_reg_sp(s, rn); + TCGv_i64 tcg_rd = setflags ? 
cpu_reg(s, rd) : cpu_reg_sp(s, rd); + TCGv_i64 tcg_result; + + if (shift) { + imm <<= 12; + } + + tcg_result = tcg_temp_new_i64(); + if (!setflags) { + if (sub_op) { + tcg_gen_subi_i64(tcg_result, tcg_rn, imm); + } else { + tcg_gen_addi_i64(tcg_result, tcg_rn, imm); + } + } else { + TCGv_i64 tcg_imm = tcg_constant_i64(imm); + if (sub_op) { + gen_sub_CC(is_64bit, tcg_result, tcg_rn, tcg_imm); + } else { + gen_add_CC(is_64bit, tcg_result, tcg_rn, tcg_imm); + } + } + + if (is_64bit) { + tcg_gen_mov_i64(tcg_rd, tcg_result); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_result); + } + + tcg_temp_free_i64(tcg_result); +} + +/* + * Add/subtract (immediate, with tags) + * + * 31 30 29 28 23 22 21 16 14 10 9 5 4 0 + * +--+--+--+-------------+--+---------+--+-------+-----+-----+ + * |sf|op| S| 1 0 0 0 1 1 |o2| uimm6 |o3| uimm4 | Rn | Rd | + * +--+--+--+-------------+--+---------+--+-------+-----+-----+ + * + * op: 0 -> add, 1 -> sub + */ +static void disas_add_sub_imm_with_tags(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int uimm4 = extract32(insn, 10, 4); + int uimm6 = extract32(insn, 16, 6); + bool sub_op = extract32(insn, 30, 1); + TCGv_i64 tcg_rn, tcg_rd; + int imm; + + /* Test all of sf=1, S=0, o2=0, o3=0. */ + if ((insn & 0xa040c000u) != 0x80000000u || + !dc_isar_feature(aa64_mte_insn_reg, s)) { + unallocated_encoding(s); + return; + } + + imm = uimm6 << LOG2_TAG_GRANULE; + if (sub_op) { + imm = -imm; + } + + tcg_rn = cpu_reg_sp(s, rn); + tcg_rd = cpu_reg_sp(s, rd); + + if (s->ata) { + gen_helper_addsubg(tcg_rd, cpu_env, tcg_rn, + tcg_constant_i32(imm), + tcg_constant_i32(uimm4)); + } else { + tcg_gen_addi_i64(tcg_rd, tcg_rn, imm); + gen_address_with_allocation_tag0(tcg_rd, tcg_rd); + } +} + +/* The input should be a value in the bottom e bits (with higher + * bits zero); returns that value replicated into every element + * of size e in a 64 bit integer. + */ +static uint64_t bitfield_replicate(uint64_t mask, unsigned int e) +{ + assert(e != 0); + while (e < 64) { + mask |= mask << e; + e *= 2; + } + return mask; +} + +/* Return a value with the bottom len bits set (where 0 < len <= 64) */ +static inline uint64_t bitmask64(unsigned int length) +{ + assert(length > 0 && length <= 64); + return ~0ULL >> (64 - length); +} + +/* Simplified variant of pseudocode DecodeBitMasks() for the case where we + * only require the wmask. Returns false if the imms/immr/immn are a reserved + * value (ie should cause a guest UNDEF exception), and true if they are + * valid, in which case the decoded bit pattern is written to result. + */ +bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, + unsigned int imms, unsigned int immr) +{ + uint64_t mask; + unsigned e, levels, s, r; + int len; + + assert(immn < 2 && imms < 64 && immr < 64); + + /* The bit patterns we create here are 64 bit patterns which + * are vectors of identical elements of size e = 2, 4, 8, 16, 32 or + * 64 bits each. Each element contains the same value: a run + * of between 1 and e-1 non-zero bits, rotated within the + * element by between 0 and e-1 bits. 
+ * + * The element size and run length are encoded into immn (1 bit) + * and imms (6 bits) as follows: + * 64 bit elements: immn = 1, imms = + * 32 bit elements: immn = 0, imms = 0 : + * 16 bit elements: immn = 0, imms = 10 : + * 8 bit elements: immn = 0, imms = 110 : + * 4 bit elements: immn = 0, imms = 1110 : + * 2 bit elements: immn = 0, imms = 11110 : + * Notice that immn = 0, imms = 11111x is the only combination + * not covered by one of the above options; this is reserved. + * Further, all-ones is a reserved pattern. + * + * In all cases the rotation is by immr % e (and immr is 6 bits). + */ + + /* First determine the element size */ + len = 31 - clz32((immn << 6) | (~imms & 0x3f)); + if (len < 1) { + /* This is the immn == 0, imms == 0x11111x case */ + return false; + } + e = 1 << len; + + levels = e - 1; + s = imms & levels; + r = immr & levels; + + if (s == levels) { + /* mustn't be all-ones. */ + return false; + } + + /* Create the value of one element: s+1 set bits rotated + * by r within the element (which is e bits wide)... + */ + mask = bitmask64(s + 1); + if (r) { + mask = (mask >> r) | (mask << (e - r)); + mask &= bitmask64(e); + } + /* ...then replicate the element over the whole 64 bit value */ + mask = bitfield_replicate(mask, e); + *result = mask; + return true; +} + +/* Logical (immediate) + * 31 30 29 28 23 22 21 16 15 10 9 5 4 0 + * +----+-----+-------------+---+------+------+------+------+ + * | sf | opc | 1 0 0 1 0 0 | N | immr | imms | Rn | Rd | + * +----+-----+-------------+---+------+------+------+------+ + */ +static void disas_logic_imm(DisasContext *s, uint32_t insn) +{ + unsigned int sf, opc, is_n, immr, imms, rn, rd; + TCGv_i64 tcg_rd, tcg_rn; + uint64_t wmask; + bool is_and = false; + + sf = extract32(insn, 31, 1); + opc = extract32(insn, 29, 2); + is_n = extract32(insn, 22, 1); + immr = extract32(insn, 16, 6); + imms = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (!sf && is_n) { + unallocated_encoding(s); + return; + } + + if (opc == 0x3) { /* ANDS */ + tcg_rd = cpu_reg(s, rd); + } else { + tcg_rd = cpu_reg_sp(s, rd); + } + tcg_rn = cpu_reg(s, rn); + + if (!logic_imm_decode_wmask(&wmask, is_n, imms, immr)) { + /* some immediate field values are reserved */ + unallocated_encoding(s); + return; + } + + if (!sf) { + wmask &= 0xffffffff; + } + + switch (opc) { + case 0x3: /* ANDS */ + case 0x0: /* AND */ + tcg_gen_andi_i64(tcg_rd, tcg_rn, wmask); + is_and = true; + break; + case 0x1: /* ORR */ + tcg_gen_ori_i64(tcg_rd, tcg_rn, wmask); + break; + case 0x2: /* EOR */ + tcg_gen_xori_i64(tcg_rd, tcg_rn, wmask); + break; + default: + assert(FALSE); /* must handle all above */ + break; + } + + if (!sf && !is_and) { + /* zero extend final result; we know we can skip this for AND + * since the immediate had the high 32 bits clear. 
+ */ + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + + if (opc == 3) { /* ANDS */ + gen_logic_CC(sf, tcg_rd); + } +} + +/* + * Move wide (immediate) + * + * 31 30 29 28 23 22 21 20 5 4 0 + * +--+-----+-------------+-----+----------------+------+ + * |sf| opc | 1 0 0 1 0 1 | hw | imm16 | Rd | + * +--+-----+-------------+-----+----------------+------+ + * + * sf: 0 -> 32 bit, 1 -> 64 bit + * opc: 00 -> N, 10 -> Z, 11 -> K + * hw: shift/16 (0,16, and sf only 32, 48) + */ +static void disas_movw_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + uint64_t imm = extract32(insn, 5, 16); + int sf = extract32(insn, 31, 1); + int opc = extract32(insn, 29, 2); + int pos = extract32(insn, 21, 2) << 4; + TCGv_i64 tcg_rd = cpu_reg(s, rd); + + if (!sf && (pos >= 32)) { + unallocated_encoding(s); + return; + } + + switch (opc) { + case 0: /* MOVN */ + case 2: /* MOVZ */ + imm <<= pos; + if (opc == 0) { + imm = ~imm; + } + if (!sf) { + imm &= 0xffffffffu; + } + tcg_gen_movi_i64(tcg_rd, imm); + break; + case 3: /* MOVK */ + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_constant_i64(imm), pos, 16); + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + break; + default: + unallocated_encoding(s); + break; + } +} + +/* Bitfield + * 31 30 29 28 23 22 21 16 15 10 9 5 4 0 + * +----+-----+-------------+---+------+------+------+------+ + * | sf | opc | 1 0 0 1 1 0 | N | immr | imms | Rn | Rd | + * +----+-----+-------------+---+------+------+------+------+ + */ +static void disas_bitfield(DisasContext *s, uint32_t insn) +{ + unsigned int sf, n, opc, ri, si, rn, rd, bitsize, pos, len; + TCGv_i64 tcg_rd, tcg_tmp; + + sf = extract32(insn, 31, 1); + opc = extract32(insn, 29, 2); + n = extract32(insn, 22, 1); + ri = extract32(insn, 16, 6); + si = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + bitsize = sf ? 64 : 32; + + if (sf != n || ri >= bitsize || si >= bitsize || opc > 2) { + unallocated_encoding(s); + return; + } + + tcg_rd = cpu_reg(s, rd); + + /* Suppress the zero-extend for !sf. Since RI and SI are constrained + to be smaller than bitsize, we'll never reference data outside the + low 32-bits anyway. */ + tcg_tmp = read_cpu_reg(s, rn, 1); + + /* Recognize simple(r) extractions. */ + if (si >= ri) { + /* Wd = Wn */ + len = (si - ri) + 1; + if (opc == 0) { /* SBFM: ASR, SBFX, SXTB, SXTH, SXTW */ + tcg_gen_sextract_i64(tcg_rd, tcg_tmp, ri, len); + goto done; + } else if (opc == 2) { /* UBFM: UBFX, LSR, UXTB, UXTH */ + tcg_gen_extract_i64(tcg_rd, tcg_tmp, ri, len); + return; + } + /* opc == 1, BFXIL fall through to deposit */ + tcg_gen_shri_i64(tcg_tmp, tcg_tmp, ri); + pos = 0; + } else { + /* Handle the ri > si case with a deposit + * Wd<32+s-r,32-r> = Wn + */ + len = si + 1; + pos = (bitsize - ri) & (bitsize - 1); + } + + if (opc == 0 && len < ri) { + /* SBFM: sign extend the destination field from len to fill + the balance of the word. Let the deposit below insert all + of those sign bits. */ + tcg_gen_sextract_i64(tcg_tmp, tcg_tmp, 0, len); + len = ri; + } + + if (opc == 1) { /* BFM, BFXIL */ + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, pos, len); + } else { + /* SBFM or UBFM: We start with zero, and we haven't modified + any bits outside bitsize, therefore the zero-extension + below is unneeded. 
*/ + tcg_gen_deposit_z_i64(tcg_rd, tcg_tmp, pos, len); + return; + } + + done: + if (!sf) { /* zero extend final result */ + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +/* Extract + * 31 30 29 28 23 22 21 20 16 15 10 9 5 4 0 + * +----+------+-------------+---+----+------+--------+------+------+ + * | sf | op21 | 1 0 0 1 1 1 | N | o0 | Rm | imms | Rn | Rd | + * +----+------+-------------+---+----+------+--------+------+------+ + */ +static void disas_extract(DisasContext *s, uint32_t insn) +{ + unsigned int sf, n, rm, imm, rn, rd, bitsize, op21, op0; + + sf = extract32(insn, 31, 1); + n = extract32(insn, 22, 1); + rm = extract32(insn, 16, 5); + imm = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + op21 = extract32(insn, 29, 2); + op0 = extract32(insn, 21, 1); + bitsize = sf ? 64 : 32; + + if (sf != n || op21 || op0 || imm >= bitsize) { + unallocated_encoding(s); + } else { + TCGv_i64 tcg_rd, tcg_rm, tcg_rn; + + tcg_rd = cpu_reg(s, rd); + + if (unlikely(imm == 0)) { + /* tcg shl_i32/shl_i64 is undefined for 32/64 bit shifts, + * so an extract from bit 0 is a special case. + */ + if (sf) { + tcg_gen_mov_i64(tcg_rd, cpu_reg(s, rm)); + } else { + tcg_gen_ext32u_i64(tcg_rd, cpu_reg(s, rm)); + } + } else { + tcg_rm = cpu_reg(s, rm); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + /* Specialization to ROR happens in EXTRACT2. */ + tcg_gen_extract2_i64(tcg_rd, tcg_rm, tcg_rn, imm); + } else { + TCGv_i32 t0 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(t0, tcg_rm); + if (rm == rn) { + tcg_gen_rotri_i32(t0, t0, imm); + } else { + TCGv_i32 t1 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t1, tcg_rn); + tcg_gen_extract2_i32(t0, t0, t1, imm); + tcg_temp_free_i32(t1); + } + tcg_gen_extu_i32_i64(tcg_rd, t0); + tcg_temp_free_i32(t0); + } + } + } +} + +/* Data processing - immediate */ +static void disas_data_proc_imm(DisasContext *s, uint32_t insn) +{ + switch (extract32(insn, 23, 6)) { + case 0x20: case 0x21: /* PC-rel. addressing */ + disas_pc_rel_adr(s, insn); + break; + case 0x22: /* Add/subtract (immediate) */ + disas_add_sub_imm(s, insn); + break; + case 0x23: /* Add/subtract (immediate, with tags) */ + disas_add_sub_imm_with_tags(s, insn); + break; + case 0x24: /* Logical (immediate) */ + disas_logic_imm(s, insn); + break; + case 0x25: /* Move wide (immediate) */ + disas_movw_imm(s, insn); + break; + case 0x26: /* Bitfield */ + disas_bitfield(s, insn); + break; + case 0x27: /* Extract */ + disas_extract(s, insn); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* Shift a TCGv src by TCGv shift_amount, put result in dst. + * Note that it is the caller's responsibility to ensure that the + * shift amount is in range (ie 0..31 or 0..63) and provide the ARM + * mandated semantics for out of range shifts. + */ +static void shift_reg(TCGv_i64 dst, TCGv_i64 src, int sf, + enum a64_shift_type shift_type, TCGv_i64 shift_amount) +{ + switch (shift_type) { + case A64_SHIFT_TYPE_LSL: + tcg_gen_shl_i64(dst, src, shift_amount); + break; + case A64_SHIFT_TYPE_LSR: + tcg_gen_shr_i64(dst, src, shift_amount); + break; + case A64_SHIFT_TYPE_ASR: + if (!sf) { + tcg_gen_ext32s_i64(dst, src); + } + tcg_gen_sar_i64(dst, sf ? 
src : dst, shift_amount); + break; + case A64_SHIFT_TYPE_ROR: + if (sf) { + tcg_gen_rotr_i64(dst, src, shift_amount); + } else { + TCGv_i32 t0, t1; + t0 = tcg_temp_new_i32(); + t1 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t0, src); + tcg_gen_extrl_i64_i32(t1, shift_amount); + tcg_gen_rotr_i32(t0, t0, t1); + tcg_gen_extu_i32_i64(dst, t0); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); + } + break; + default: + assert(FALSE); /* all shift types should be handled */ + break; + } + + if (!sf) { /* zero extend final result */ + tcg_gen_ext32u_i64(dst, dst); + } +} + +/* Shift a TCGv src by immediate, put result in dst. + * The shift amount must be in range (this should always be true as the + * relevant instructions will UNDEF on bad shift immediates). + */ +static void shift_reg_imm(TCGv_i64 dst, TCGv_i64 src, int sf, + enum a64_shift_type shift_type, unsigned int shift_i) +{ + assert(shift_i < (sf ? 64 : 32)); + + if (shift_i == 0) { + tcg_gen_mov_i64(dst, src); + } else { + shift_reg(dst, src, sf, shift_type, tcg_constant_i64(shift_i)); + } +} + +/* Logical (shifted register) + * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0 + * +----+-----+-----------+-------+---+------+--------+------+------+ + * | sf | opc | 0 1 0 1 0 | shift | N | Rm | imm6 | Rn | Rd | + * +----+-----+-----------+-------+---+------+--------+------+------+ + */ +static void disas_logic_reg(DisasContext *s, uint32_t insn) +{ + TCGv_i64 tcg_rd, tcg_rn, tcg_rm; + unsigned int sf, opc, shift_type, invert, rm, shift_amount, rn, rd; + + sf = extract32(insn, 31, 1); + opc = extract32(insn, 29, 2); + shift_type = extract32(insn, 22, 2); + invert = extract32(insn, 21, 1); + rm = extract32(insn, 16, 5); + shift_amount = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (!sf && (shift_amount & (1 << 5))) { + unallocated_encoding(s); + return; + } + + tcg_rd = cpu_reg(s, rd); + + if (opc == 1 && shift_amount == 0 && shift_type == 0 && rn == 31) { + /* Unshifted ORR and ORN with WZR/XZR is the standard encoding for + * register-register MOV and MVN, so it is worth special casing. 
+ */ + tcg_rm = cpu_reg(s, rm); + if (invert) { + tcg_gen_not_i64(tcg_rd, tcg_rm); + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + } else { + if (sf) { + tcg_gen_mov_i64(tcg_rd, tcg_rm); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_rm); + } + } + return; + } + + tcg_rm = read_cpu_reg(s, rm, sf); + + if (shift_amount) { + shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, shift_amount); + } + + tcg_rn = cpu_reg(s, rn); + + switch (opc | (invert << 2)) { + case 0: /* AND */ + case 3: /* ANDS */ + tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 1: /* ORR */ + tcg_gen_or_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 2: /* EOR */ + tcg_gen_xor_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 4: /* BIC */ + case 7: /* BICS */ + tcg_gen_andc_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 5: /* ORN */ + tcg_gen_orc_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 6: /* EON */ + tcg_gen_eqv_i64(tcg_rd, tcg_rn, tcg_rm); + break; + default: + assert(FALSE); + break; + } + + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } + + if (opc == 3) { + gen_logic_CC(sf, tcg_rd); + } +} + +/* + * Add/subtract (extended register) + * + * 31|30|29|28 24|23 22|21|20 16|15 13|12 10|9 5|4 0| + * +--+--+--+-----------+-----+--+-------+------+------+----+----+ + * |sf|op| S| 0 1 0 1 1 | opt | 1| Rm |option| imm3 | Rn | Rd | + * +--+--+--+-----------+-----+--+-------+------+------+----+----+ + * + * sf: 0 -> 32bit, 1 -> 64bit + * op: 0 -> add , 1 -> sub + * S: 1 -> set flags + * opt: 00 + * option: extension type (see DecodeRegExtend) + * imm3: optional shift to Rm + * + * Rd = Rn + LSL(extend(Rm), amount) + */ +static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm3 = extract32(insn, 10, 3); + int option = extract32(insn, 13, 3); + int rm = extract32(insn, 16, 5); + int opt = extract32(insn, 22, 2); + bool setflags = extract32(insn, 29, 1); + bool sub_op = extract32(insn, 30, 1); + bool sf = extract32(insn, 31, 1); + + TCGv_i64 tcg_rm, tcg_rn; /* temps */ + TCGv_i64 tcg_rd; + TCGv_i64 tcg_result; + + if (imm3 > 4 || opt != 0) { + unallocated_encoding(s); + return; + } + + /* non-flag setting ops may use SP */ + if (!setflags) { + tcg_rd = cpu_reg_sp(s, rd); + } else { + tcg_rd = cpu_reg(s, rd); + } + tcg_rn = read_cpu_reg_sp(s, rn, sf); + + tcg_rm = read_cpu_reg(s, rm, sf); + ext_and_shift_reg(tcg_rm, tcg_rm, option, imm3); + + tcg_result = tcg_temp_new_i64(); + + if (!setflags) { + if (sub_op) { + tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm); + } else { + tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm); + } + } else { + if (sub_op) { + gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm); + } else { + gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm); + } + } + + if (sf) { + tcg_gen_mov_i64(tcg_rd, tcg_result); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_result); + } + + tcg_temp_free_i64(tcg_result); +} + +/* + * Add/subtract (shifted register) + * + * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0 + * +--+--+--+-----------+-----+--+-------+---------+------+------+ + * |sf|op| S| 0 1 0 1 1 |shift| 0| Rm | imm6 | Rn | Rd | + * +--+--+--+-----------+-----+--+-------+---------+------+------+ + * + * sf: 0 -> 32bit, 1 -> 64bit + * op: 0 -> add , 1 -> sub + * S: 1 -> set flags + * shift: 00 -> LSL, 01 -> LSR, 10 -> ASR, 11 -> RESERVED + * imm6: Shift amount to apply to Rm before the add/sub + */ +static void disas_add_sub_reg(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm6 = 
extract32(insn, 10, 6); + int rm = extract32(insn, 16, 5); + int shift_type = extract32(insn, 22, 2); + bool setflags = extract32(insn, 29, 1); + bool sub_op = extract32(insn, 30, 1); + bool sf = extract32(insn, 31, 1); + + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn, tcg_rm; + TCGv_i64 tcg_result; + + if ((shift_type == 3) || (!sf && (imm6 > 31))) { + unallocated_encoding(s); + return; + } + + tcg_rn = read_cpu_reg(s, rn, sf); + tcg_rm = read_cpu_reg(s, rm, sf); + + shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, imm6); + + tcg_result = tcg_temp_new_i64(); + + if (!setflags) { + if (sub_op) { + tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm); + } else { + tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm); + } + } else { + if (sub_op) { + gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm); + } else { + gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm); + } + } + + if (sf) { + tcg_gen_mov_i64(tcg_rd, tcg_result); + } else { + tcg_gen_ext32u_i64(tcg_rd, tcg_result); + } + + tcg_temp_free_i64(tcg_result); +} + +/* Data-processing (3 source) + * + * 31 30 29 28 24 23 21 20 16 15 14 10 9 5 4 0 + * +--+------+-----------+------+------+----+------+------+------+ + * |sf| op54 | 1 1 0 1 1 | op31 | Rm | o0 | Ra | Rn | Rd | + * +--+------+-----------+------+------+----+------+------+------+ + */ +static void disas_data_proc_3src(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int ra = extract32(insn, 10, 5); + int rm = extract32(insn, 16, 5); + int op_id = (extract32(insn, 29, 3) << 4) | + (extract32(insn, 21, 3) << 1) | + extract32(insn, 15, 1); + bool sf = extract32(insn, 31, 1); + bool is_sub = extract32(op_id, 0, 1); + bool is_high = extract32(op_id, 2, 1); + bool is_signed = false; + TCGv_i64 tcg_op1; + TCGv_i64 tcg_op2; + TCGv_i64 tcg_tmp; + + /* Note that op_id is sf:op54:op31:o0 so it includes the 32/64 size flag */ + switch (op_id) { + case 0x42: /* SMADDL */ + case 0x43: /* SMSUBL */ + case 0x44: /* SMULH */ + is_signed = true; + break; + case 0x0: /* MADD (32bit) */ + case 0x1: /* MSUB (32bit) */ + case 0x40: /* MADD (64bit) */ + case 0x41: /* MSUB (64bit) */ + case 0x4a: /* UMADDL */ + case 0x4b: /* UMSUBL */ + case 0x4c: /* UMULH */ + break; + default: + unallocated_encoding(s); + return; + } + + if (is_high) { + TCGv_i64 low_bits = tcg_temp_new_i64(); /* low bits discarded */ + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn = cpu_reg(s, rn); + TCGv_i64 tcg_rm = cpu_reg(s, rm); + + if (is_signed) { + tcg_gen_muls2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm); + } else { + tcg_gen_mulu2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm); + } + + tcg_temp_free_i64(low_bits); + return; + } + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_tmp = tcg_temp_new_i64(); + + if (op_id < 0x42) { + tcg_gen_mov_i64(tcg_op1, cpu_reg(s, rn)); + tcg_gen_mov_i64(tcg_op2, cpu_reg(s, rm)); + } else { + if (is_signed) { + tcg_gen_ext32s_i64(tcg_op1, cpu_reg(s, rn)); + tcg_gen_ext32s_i64(tcg_op2, cpu_reg(s, rm)); + } else { + tcg_gen_ext32u_i64(tcg_op1, cpu_reg(s, rn)); + tcg_gen_ext32u_i64(tcg_op2, cpu_reg(s, rm)); + } + } + + if (ra == 31 && !is_sub) { + /* Special-case MADD with rA == XZR; it is the standard MUL alias */ + tcg_gen_mul_i64(cpu_reg(s, rd), tcg_op1, tcg_op2); + } else { + tcg_gen_mul_i64(tcg_tmp, tcg_op1, tcg_op2); + if (is_sub) { + tcg_gen_sub_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp); + } else { + tcg_gen_add_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp); + } + } + + if (!sf) { + tcg_gen_ext32u_i64(cpu_reg(s, rd), cpu_reg(s, rd)); + } + + 
tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_tmp); +} + +/* Add/subtract (with carry) + * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 10 9 5 4 0 + * +--+--+--+------------------------+------+-------------+------+-----+ + * |sf|op| S| 1 1 0 1 0 0 0 0 | rm | 0 0 0 0 0 0 | Rn | Rd | + * +--+--+--+------------------------+------+-------------+------+-----+ + */ + +static void disas_adc_sbc(DisasContext *s, uint32_t insn) +{ + unsigned int sf, op, setflags, rm, rn, rd; + TCGv_i64 tcg_y, tcg_rn, tcg_rd; + + sf = extract32(insn, 31, 1); + op = extract32(insn, 30, 1); + setflags = extract32(insn, 29, 1); + rm = extract32(insn, 16, 5); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (op) { + tcg_y = new_tmp_a64(s); + tcg_gen_not_i64(tcg_y, cpu_reg(s, rm)); + } else { + tcg_y = cpu_reg(s, rm); + } + + if (setflags) { + gen_adc_CC(sf, tcg_rd, tcg_rn, tcg_y); + } else { + gen_adc(sf, tcg_rd, tcg_rn, tcg_y); + } +} + +/* + * Rotate right into flags + * 31 30 29 21 15 10 5 4 0 + * +--+--+--+-----------------+--------+-----------+------+--+------+ + * |sf|op| S| 1 1 0 1 0 0 0 0 | imm6 | 0 0 0 0 1 | Rn |o2| mask | + * +--+--+--+-----------------+--------+-----------+------+--+------+ + */ +static void disas_rotate_right_into_flags(DisasContext *s, uint32_t insn) +{ + int mask = extract32(insn, 0, 4); + int o2 = extract32(insn, 4, 1); + int rn = extract32(insn, 5, 5); + int imm6 = extract32(insn, 15, 6); + int sf_op_s = extract32(insn, 29, 3); + TCGv_i64 tcg_rn; + TCGv_i32 nzcv; + + if (sf_op_s != 5 || o2 != 0 || !dc_isar_feature(aa64_condm_4, s)) { + unallocated_encoding(s); + return; + } + + tcg_rn = read_cpu_reg(s, rn, 1); + tcg_gen_rotri_i64(tcg_rn, tcg_rn, imm6); + + nzcv = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(nzcv, tcg_rn); + + if (mask & 8) { /* N */ + tcg_gen_shli_i32(cpu_NF, nzcv, 31 - 3); + } + if (mask & 4) { /* Z */ + tcg_gen_not_i32(cpu_ZF, nzcv); + tcg_gen_andi_i32(cpu_ZF, cpu_ZF, 4); + } + if (mask & 2) { /* C */ + tcg_gen_extract_i32(cpu_CF, nzcv, 1, 1); + } + if (mask & 1) { /* V */ + tcg_gen_shli_i32(cpu_VF, nzcv, 31 - 0); + } + + tcg_temp_free_i32(nzcv); +} + +/* + * Evaluate into flags + * 31 30 29 21 15 14 10 5 4 0 + * +--+--+--+-----------------+---------+----+---------+------+--+------+ + * |sf|op| S| 1 1 0 1 0 0 0 0 | opcode2 | sz | 0 0 1 0 | Rn |o3| mask | + * +--+--+--+-----------------+---------+----+---------+------+--+------+ + */ +static void disas_evaluate_into_flags(DisasContext *s, uint32_t insn) +{ + int o3_mask = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int o2 = extract32(insn, 15, 6); + int sz = extract32(insn, 14, 1); + int sf_op_s = extract32(insn, 29, 3); + TCGv_i32 tmp; + int shift; + + if (sf_op_s != 1 || o2 != 0 || o3_mask != 0xd || + !dc_isar_feature(aa64_condm_4, s)) { + unallocated_encoding(s); + return; + } + shift = sz ? 
16 : 24; /* SETF16 or SETF8 */ + + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, cpu_reg(s, rn)); + tcg_gen_shli_i32(cpu_NF, tmp, shift); + tcg_gen_shli_i32(cpu_VF, tmp, shift - 1); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_VF, cpu_NF); + tcg_temp_free_i32(tmp); +} + +/* Conditional compare (immediate / register) + * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0 + * +--+--+--+------------------------+--------+------+----+--+------+--+-----+ + * |sf|op| S| 1 1 0 1 0 0 1 0 |imm5/rm | cond |i/r |o2| Rn |o3|nzcv | + * +--+--+--+------------------------+--------+------+----+--+------+--+-----+ + * [1] y [0] [0] + */ +static void disas_cc(DisasContext *s, uint32_t insn) +{ + unsigned int sf, op, y, cond, rn, nzcv, is_imm; + TCGv_i32 tcg_t0, tcg_t1, tcg_t2; + TCGv_i64 tcg_tmp, tcg_y, tcg_rn; + DisasCompare c; + + if (!extract32(insn, 29, 1)) { + unallocated_encoding(s); + return; + } + if (insn & (1 << 10 | 1 << 4)) { + unallocated_encoding(s); + return; + } + sf = extract32(insn, 31, 1); + op = extract32(insn, 30, 1); + is_imm = extract32(insn, 11, 1); + y = extract32(insn, 16, 5); /* y = rm (reg) or imm5 (imm) */ + cond = extract32(insn, 12, 4); + rn = extract32(insn, 5, 5); + nzcv = extract32(insn, 0, 4); + + /* Set T0 = !COND. */ + tcg_t0 = tcg_temp_new_i32(); + arm_test_cc(&c, cond); + tcg_gen_setcondi_i32(tcg_invert_cond(c.cond), tcg_t0, c.value, 0); + arm_free_cc(&c); + + /* Load the arguments for the new comparison. */ + if (is_imm) { + tcg_y = new_tmp_a64(s); + tcg_gen_movi_i64(tcg_y, y); + } else { + tcg_y = cpu_reg(s, y); + } + tcg_rn = cpu_reg(s, rn); + + /* Set the flags for the new comparison. */ + tcg_tmp = tcg_temp_new_i64(); + if (op) { + gen_sub_CC(sf, tcg_tmp, tcg_rn, tcg_y); + } else { + gen_add_CC(sf, tcg_tmp, tcg_rn, tcg_y); + } + tcg_temp_free_i64(tcg_tmp); + + /* If COND was false, force the flags to #nzcv. Compute two masks + * to help with this: T1 = (COND ? 0 : -1), T2 = (COND ? -1 : 0). + * For tcg hosts that support ANDC, we can make do with just T1. + * In either case, allow the tcg optimizer to delete any unused mask. 
+ */ + tcg_t1 = tcg_temp_new_i32(); + tcg_t2 = tcg_temp_new_i32(); + tcg_gen_neg_i32(tcg_t1, tcg_t0); + tcg_gen_subi_i32(tcg_t2, tcg_t0, 1); + + if (nzcv & 8) { /* N */ + tcg_gen_or_i32(cpu_NF, cpu_NF, tcg_t1); + } else { + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_NF, cpu_NF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_NF, cpu_NF, tcg_t2); + } + } + if (nzcv & 4) { /* Z */ + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_ZF, cpu_ZF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_ZF, cpu_ZF, tcg_t2); + } + } else { + tcg_gen_or_i32(cpu_ZF, cpu_ZF, tcg_t0); + } + if (nzcv & 2) { /* C */ + tcg_gen_or_i32(cpu_CF, cpu_CF, tcg_t0); + } else { + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_CF, cpu_CF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_CF, cpu_CF, tcg_t2); + } + } + if (nzcv & 1) { /* V */ + tcg_gen_or_i32(cpu_VF, cpu_VF, tcg_t1); + } else { + if (TCG_TARGET_HAS_andc_i32) { + tcg_gen_andc_i32(cpu_VF, cpu_VF, tcg_t1); + } else { + tcg_gen_and_i32(cpu_VF, cpu_VF, tcg_t2); + } + } + tcg_temp_free_i32(tcg_t0); + tcg_temp_free_i32(tcg_t1); + tcg_temp_free_i32(tcg_t2); +} + +/* Conditional select + * 31 30 29 28 21 20 16 15 12 11 10 9 5 4 0 + * +----+----+---+-----------------+------+------+-----+------+------+ + * | sf | op | S | 1 1 0 1 0 1 0 0 | Rm | cond | op2 | Rn | Rd | + * +----+----+---+-----------------+------+------+-----+------+------+ + */ +static void disas_cond_select(DisasContext *s, uint32_t insn) +{ + unsigned int sf, else_inv, rm, cond, else_inc, rn, rd; + TCGv_i64 tcg_rd, zero; + DisasCompare64 c; + + if (extract32(insn, 29, 1) || extract32(insn, 11, 1)) { + /* S == 1 or op2<1> == 1 */ + unallocated_encoding(s); + return; + } + sf = extract32(insn, 31, 1); + else_inv = extract32(insn, 30, 1); + rm = extract32(insn, 16, 5); + cond = extract32(insn, 12, 4); + else_inc = extract32(insn, 10, 1); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + tcg_rd = cpu_reg(s, rd); + + a64_test_cc(&c, cond); + zero = tcg_constant_i64(0); + + if (rn == 31 && rm == 31 && (else_inc ^ else_inv)) { + /* CSET & CSETM. 
*/ + tcg_gen_setcond_i64(tcg_invert_cond(c.cond), tcg_rd, c.value, zero); + if (else_inv) { + tcg_gen_neg_i64(tcg_rd, tcg_rd); + } + } else { + TCGv_i64 t_true = cpu_reg(s, rn); + TCGv_i64 t_false = read_cpu_reg(s, rm, 1); + if (else_inv && else_inc) { + tcg_gen_neg_i64(t_false, t_false); + } else if (else_inv) { + tcg_gen_not_i64(t_false, t_false); + } else if (else_inc) { + tcg_gen_addi_i64(t_false, t_false, 1); + } + tcg_gen_movcond_i64(c.cond, tcg_rd, c.value, zero, t_true, t_false); + } + + a64_free_cc(&c); + + if (!sf) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +static void handle_clz(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd, tcg_rn; + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64); + } else { + TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); + tcg_gen_clzi_i32(tcg_tmp32, tcg_tmp32, 32); + tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); + tcg_temp_free_i32(tcg_tmp32); + } +} + +static void handle_cls(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd, tcg_rn; + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + tcg_gen_clrsb_i64(tcg_rd, tcg_rn); + } else { + TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); + tcg_gen_clrsb_i32(tcg_tmp32, tcg_tmp32); + tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); + tcg_temp_free_i32(tcg_tmp32); + } +} + +static void handle_rbit(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd, tcg_rn; + tcg_rd = cpu_reg(s, rd); + tcg_rn = cpu_reg(s, rn); + + if (sf) { + gen_helper_rbit64(tcg_rd, tcg_rn); + } else { + TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); + gen_helper_rbit(tcg_tmp32, tcg_tmp32); + tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); + tcg_temp_free_i32(tcg_tmp32); + } +} + +/* REV with sf==1, opcode==3 ("REV64") */ +static void handle_rev64(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + if (!sf) { + unallocated_encoding(s); + return; + } + tcg_gen_bswap64_i64(cpu_reg(s, rd), cpu_reg(s, rn)); +} + +/* REV with sf==0, opcode==2 + * REV32 (sf==1, opcode==2) + */ +static void handle_rev32(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn = cpu_reg(s, rn); + + if (sf) { + tcg_gen_bswap64_i64(tcg_rd, tcg_rn); + tcg_gen_rotri_i64(tcg_rd, tcg_rd, 32); + } else { + tcg_gen_bswap32_i64(tcg_rd, tcg_rn, TCG_BSWAP_OZ); + } +} + +/* REV16 (opcode==1) */ +static void handle_rev16(DisasContext *s, unsigned int sf, + unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf); + TCGv_i64 mask = tcg_constant_i64(sf ? 
0x00ff00ff00ff00ffull : 0x00ff00ff); + + tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8); + tcg_gen_and_i64(tcg_rd, tcg_rn, mask); + tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask); + tcg_gen_shli_i64(tcg_rd, tcg_rd, 8); + tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_tmp); + + tcg_temp_free_i64(tcg_tmp); +} + +/* Data-processing (1 source) + * 31 30 29 28 21 20 16 15 10 9 5 4 0 + * +----+---+---+-----------------+---------+--------+------+------+ + * | sf | 1 | S | 1 1 0 1 0 1 1 0 | opcode2 | opcode | Rn | Rd | + * +----+---+---+-----------------+---------+--------+------+------+ + */ +static void disas_data_proc_1src(DisasContext *s, uint32_t insn) +{ + unsigned int sf, opcode, opcode2, rn, rd; + TCGv_i64 tcg_rd; + + if (extract32(insn, 29, 1)) { + unallocated_encoding(s); + return; + } + + sf = extract32(insn, 31, 1); + opcode = extract32(insn, 10, 6); + opcode2 = extract32(insn, 16, 5); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + +#define MAP(SF, O2, O1) ((SF) | (O1 << 1) | (O2 << 7)) + + switch (MAP(sf, opcode2, opcode)) { + case MAP(0, 0x00, 0x00): /* RBIT */ + case MAP(1, 0x00, 0x00): + handle_rbit(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x01): /* REV16 */ + case MAP(1, 0x00, 0x01): + handle_rev16(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x02): /* REV/REV32 */ + case MAP(1, 0x00, 0x02): + handle_rev32(s, sf, rn, rd); + break; + case MAP(1, 0x00, 0x03): /* REV64 */ + handle_rev64(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x04): /* CLZ */ + case MAP(1, 0x00, 0x04): + handle_clz(s, sf, rn, rd); + break; + case MAP(0, 0x00, 0x05): /* CLS */ + case MAP(1, 0x00, 0x05): + handle_cls(s, sf, rn, rd); + break; + case MAP(1, 0x01, 0x00): /* PACIA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x01): /* PACIB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x02): /* PACDA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x03): /* PACDB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x04): /* AUTIA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x05): /* AUTIB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x06): /* AUTDA */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case MAP(1, 0x01, 0x07): /* AUTDB */ + if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); + } else if (!dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + break; + case 
MAP(1, 0x01, 0x08): /* PACIZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x09): /* PACIZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0a): /* PACDZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0b): /* PACDZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0c): /* AUTIZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0d): /* AUTIZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0e): /* AUTDZA */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x0f): /* AUTDZB */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); + } + break; + case MAP(1, 0x01, 0x10): /* XPACI */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_xpaci(tcg_rd, cpu_env, tcg_rd); + } + break; + case MAP(1, 0x01, 0x11): /* XPACD */ + if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { + goto do_unallocated; + } else if (s->pauth_active) { + tcg_rd = cpu_reg(s, rd); + gen_helper_xpacd(tcg_rd, cpu_env, tcg_rd); + } + break; + default: + do_unallocated: + unallocated_encoding(s); + break; + } + +#undef MAP +} + +static void handle_div(DisasContext *s, bool is_signed, unsigned int sf, + unsigned int rm, unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_n, tcg_m, tcg_rd; + tcg_rd = cpu_reg(s, rd); + + if (!sf && is_signed) { + tcg_n = new_tmp_a64(s); + tcg_m = new_tmp_a64(s); + tcg_gen_ext32s_i64(tcg_n, cpu_reg(s, rn)); + tcg_gen_ext32s_i64(tcg_m, cpu_reg(s, rm)); + } else { + tcg_n = read_cpu_reg(s, rn, sf); + tcg_m = read_cpu_reg(s, rm, sf); + } + + if (is_signed) { + gen_helper_sdiv64(tcg_rd, tcg_n, tcg_m); + } else { + gen_helper_udiv64(tcg_rd, tcg_n, tcg_m); + } + + if (!sf) { /* zero extend final result */ + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +/* LSLV, LSRV, ASRV, RORV */ +static void handle_shift_reg(DisasContext *s, + enum a64_shift_type shift_type, unsigned int sf, + unsigned int rm, unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_shift = tcg_temp_new_i64(); + TCGv_i64 tcg_rd = cpu_reg(s, rd); + TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf); + + 
tcg_gen_andi_i64(tcg_shift, cpu_reg(s, rm), sf ? 63 : 31); + shift_reg(tcg_rd, tcg_rn, sf, shift_type, tcg_shift); + tcg_temp_free_i64(tcg_shift); +} + +/* CRC32[BHWX], CRC32C[BHWX] */ +static void handle_crc32(DisasContext *s, + unsigned int sf, unsigned int sz, bool crc32c, + unsigned int rm, unsigned int rn, unsigned int rd) +{ + TCGv_i64 tcg_acc, tcg_val; + TCGv_i32 tcg_bytes; + + if (!dc_isar_feature(aa64_crc32, s) + || (sf == 1 && sz != 3) + || (sf == 0 && sz == 3)) { + unallocated_encoding(s); + return; + } + + if (sz == 3) { + tcg_val = cpu_reg(s, rm); + } else { + uint64_t mask; + switch (sz) { + case 0: + mask = 0xFF; + break; + case 1: + mask = 0xFFFF; + break; + case 2: + mask = 0xFFFFFFFF; + break; + default: + g_assert_not_reached(); + } + tcg_val = new_tmp_a64(s); + tcg_gen_andi_i64(tcg_val, cpu_reg(s, rm), mask); + } + + tcg_acc = cpu_reg(s, rn); + tcg_bytes = tcg_constant_i32(1 << sz); + + if (crc32c) { + gen_helper_crc32c_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes); + } else { + gen_helper_crc32_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes); + } +} + +/* Data-processing (2 source) + * 31 30 29 28 21 20 16 15 10 9 5 4 0 + * +----+---+---+-----------------+------+--------+------+------+ + * | sf | 0 | S | 1 1 0 1 0 1 1 0 | Rm | opcode | Rn | Rd | + * +----+---+---+-----------------+------+--------+------+------+ + */ +static void disas_data_proc_2src(DisasContext *s, uint32_t insn) +{ + unsigned int sf, rm, opcode, rn, rd, setflag; + sf = extract32(insn, 31, 1); + setflag = extract32(insn, 29, 1); + rm = extract32(insn, 16, 5); + opcode = extract32(insn, 10, 6); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (setflag && opcode != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0: /* SUBP(S) */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } else { + TCGv_i64 tcg_n, tcg_m, tcg_d; + + tcg_n = read_cpu_reg_sp(s, rn, true); + tcg_m = read_cpu_reg_sp(s, rm, true); + tcg_gen_sextract_i64(tcg_n, tcg_n, 0, 56); + tcg_gen_sextract_i64(tcg_m, tcg_m, 0, 56); + tcg_d = cpu_reg(s, rd); + + if (setflag) { + gen_sub_CC(true, tcg_d, tcg_n, tcg_m); + } else { + tcg_gen_sub_i64(tcg_d, tcg_n, tcg_m); + } + } + break; + case 2: /* UDIV */ + handle_div(s, false, sf, rm, rn, rd); + break; + case 3: /* SDIV */ + handle_div(s, true, sf, rm, rn, rd); + break; + case 4: /* IRG */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } + if (s->ata) { + gen_helper_irg(cpu_reg_sp(s, rd), cpu_env, + cpu_reg_sp(s, rn), cpu_reg(s, rm)); + } else { + gen_address_with_allocation_tag0(cpu_reg_sp(s, rd), + cpu_reg_sp(s, rn)); + } + break; + case 5: /* GMI */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } else { + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t, cpu_reg_sp(s, rn), 56, 4); + tcg_gen_shl_i64(t, tcg_constant_i64(1), t); + tcg_gen_or_i64(cpu_reg(s, rd), cpu_reg(s, rm), t); + + tcg_temp_free_i64(t); + } + break; + case 8: /* LSLV */ + handle_shift_reg(s, A64_SHIFT_TYPE_LSL, sf, rm, rn, rd); + break; + case 9: /* LSRV */ + handle_shift_reg(s, A64_SHIFT_TYPE_LSR, sf, rm, rn, rd); + break; + case 10: /* ASRV */ + handle_shift_reg(s, A64_SHIFT_TYPE_ASR, sf, rm, rn, rd); + break; + case 11: /* RORV */ + handle_shift_reg(s, A64_SHIFT_TYPE_ROR, sf, rm, rn, rd); + break; + case 12: /* PACGA */ + if (sf == 0 || !dc_isar_feature(aa64_pauth, s)) { + goto do_unallocated; + } + gen_helper_pacga(cpu_reg(s, rd), cpu_env, + cpu_reg(s, rn), 
cpu_reg_sp(s, rm)); + break; + case 16: + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: /* CRC32 */ + { + int sz = extract32(opcode, 0, 2); + bool crc32c = extract32(opcode, 2, 1); + handle_crc32(s, sf, sz, crc32c, rm, rn, rd); + break; + } + default: + do_unallocated: + unallocated_encoding(s); + break; + } +} + +/* + * Data processing - register + * 31 30 29 28 25 21 20 16 10 0 + * +--+---+--+---+-------+-----+-------+-------+---------+ + * | |op0| |op1| 1 0 1 | op2 | | op3 | | + * +--+---+--+---+-------+-----+-------+-------+---------+ + */ +static void disas_data_proc_reg(DisasContext *s, uint32_t insn) +{ + int op0 = extract32(insn, 30, 1); + int op1 = extract32(insn, 28, 1); + int op2 = extract32(insn, 21, 4); + int op3 = extract32(insn, 10, 6); + + if (!op1) { + if (op2 & 8) { + if (op2 & 1) { + /* Add/sub (extended register) */ + disas_add_sub_ext_reg(s, insn); + } else { + /* Add/sub (shifted register) */ + disas_add_sub_reg(s, insn); + } + } else { + /* Logical (shifted register) */ + disas_logic_reg(s, insn); + } + return; + } + + switch (op2) { + case 0x0: + switch (op3) { + case 0x00: /* Add/subtract (with carry) */ + disas_adc_sbc(s, insn); + break; + + case 0x01: /* Rotate right into flags */ + case 0x21: + disas_rotate_right_into_flags(s, insn); + break; + + case 0x02: /* Evaluate into flags */ + case 0x12: + case 0x22: + case 0x32: + disas_evaluate_into_flags(s, insn); + break; + + default: + goto do_unallocated; + } + break; + + case 0x2: /* Conditional compare */ + disas_cc(s, insn); /* both imm and reg forms */ + break; + + case 0x4: /* Conditional select */ + disas_cond_select(s, insn); + break; + + case 0x6: /* Data-processing */ + if (op0) { /* (1 source) */ + disas_data_proc_1src(s, insn); + } else { /* (2 source) */ + disas_data_proc_2src(s, insn); + } + break; + case 0x8 ... 0xf: /* (3 source) */ + disas_data_proc_3src(s, insn); + break; + + default: + do_unallocated: + unallocated_encoding(s); + break; + } +} + +static void handle_fp_compare(DisasContext *s, int size, + unsigned int rn, unsigned int rm, + bool cmp_with_zero, bool signal_all_nans) +{ + TCGv_i64 tcg_flags = tcg_temp_new_i64(); + TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + + if (size == MO_64) { + TCGv_i64 tcg_vn, tcg_vm; + + tcg_vn = read_fp_dreg(s, rn); + if (cmp_with_zero) { + tcg_vm = tcg_constant_i64(0); + } else { + tcg_vm = read_fp_dreg(s, rm); + } + if (signal_all_nans) { + gen_helper_vfp_cmped_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } else { + gen_helper_vfp_cmpd_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } + tcg_temp_free_i64(tcg_vn); + tcg_temp_free_i64(tcg_vm); + } else { + TCGv_i32 tcg_vn = tcg_temp_new_i32(); + TCGv_i32 tcg_vm = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_vn, rn, 0, size); + if (cmp_with_zero) { + tcg_gen_movi_i32(tcg_vm, 0); + } else { + read_vec_element_i32(s, tcg_vm, rm, 0, size); + } + + switch (size) { + case MO_32: + if (signal_all_nans) { + gen_helper_vfp_cmpes_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } else { + gen_helper_vfp_cmps_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } + break; + case MO_16: + if (signal_all_nans) { + gen_helper_vfp_cmpeh_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } else { + gen_helper_vfp_cmph_a64(tcg_flags, tcg_vn, tcg_vm, fpst); + } + break; + default: + g_assert_not_reached(); + } + + tcg_temp_free_i32(tcg_vn); + tcg_temp_free_i32(tcg_vm); + } + + tcg_temp_free_ptr(fpst); + + gen_set_nzcv(tcg_flags); + + tcg_temp_free_i64(tcg_flags); +} + +/* Floating point compare + * 31 30 29 28 24 23 22 21 20 16 15 14 13 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+-----+---------+------+-------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | op | 1 0 0 0 | Rn | op2 | + * +---+---+---+-----------+------+---+------+-----+---------+------+-------+ + */ +static void disas_fp_compare(DisasContext *s, uint32_t insn) +{ + unsigned int mos, type, rm, op, rn, opc, op2r; + int size; + + mos = extract32(insn, 29, 3); + type = extract32(insn, 22, 2); + rm = extract32(insn, 16, 5); + op = extract32(insn, 14, 2); + rn = extract32(insn, 5, 5); + opc = extract32(insn, 3, 2); + op2r = extract32(insn, 0, 3); + + if (mos || op || op2r) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + size = MO_32; + break; + case 1: + size = MO_64; + break; + case 3: + size = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + handle_fp_compare(s, size, rn, rm, opc & 1, opc & 2); +} + +/* Floating point conditional compare + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0 + * +---+---+---+-----------+------+---+------+------+-----+------+----+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 0 1 | Rn | op | nzcv | + * +---+---+---+-----------+------+---+------+------+-----+------+----+------+ + */ +static void disas_fp_ccomp(DisasContext *s, uint32_t insn) +{ + unsigned int mos, type, rm, cond, rn, op, nzcv; + TCGLabel *label_continue = NULL; + int size; + + mos = extract32(insn, 29, 3); + type = extract32(insn, 22, 2); + rm = extract32(insn, 16, 5); + cond = extract32(insn, 12, 4); + rn = extract32(insn, 5, 5); + op = extract32(insn, 4, 1); + nzcv = extract32(insn, 0, 4); + + if (mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + size = MO_32; + break; + case 1: + size = MO_64; + break; + case 3: + size = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (cond < 0x0e) { /* not always */ + TCGLabel *label_match = gen_new_label(); + label_continue = gen_new_label(); + 
arm_gen_test_cc(cond, label_match); + /* nomatch: */ + gen_set_nzcv(tcg_constant_i64(nzcv << 28)); + tcg_gen_br(label_continue); + gen_set_label(label_match); + } + + handle_fp_compare(s, size, rn, rm, false, op); + + if (cond < 0x0e) { + gen_set_label(label_continue); + } +} + +/* Floating point conditional select + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+------+-----+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 1 1 | Rn | Rd | + * +---+---+---+-----------+------+---+------+------+-----+------+------+ + */ +static void disas_fp_csel(DisasContext *s, uint32_t insn) +{ + unsigned int mos, type, rm, cond, rn, rd; + TCGv_i64 t_true, t_false; + DisasCompare64 c; + MemOp sz; + + mos = extract32(insn, 29, 3); + type = extract32(insn, 22, 2); + rm = extract32(insn, 16, 5); + cond = extract32(insn, 12, 4); + rn = extract32(insn, 5, 5); + rd = extract32(insn, 0, 5); + + if (mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + sz = MO_32; + break; + case 1: + sz = MO_64; + break; + case 3: + sz = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + /* Zero extend sreg & hreg inputs to 64 bits now. */ + t_true = tcg_temp_new_i64(); + t_false = tcg_temp_new_i64(); + read_vec_element(s, t_true, rn, 0, sz); + read_vec_element(s, t_false, rm, 0, sz); + + a64_test_cc(&c, cond); + tcg_gen_movcond_i64(c.cond, t_true, c.value, tcg_constant_i64(0), + t_true, t_false); + tcg_temp_free_i64(t_false); + a64_free_cc(&c); + + /* Note that sregs & hregs write back zeros to the high bits, + and we've already done the zero-extension. */ + write_fp_dreg(s, rd, t_true); + tcg_temp_free_i64(t_true); +} + +/* Floating-point data-processing (1 source) - half precision */ +static void handle_fp_1src_half(DisasContext *s, int opcode, int rd, int rn) +{ + TCGv_ptr fpst = NULL; + TCGv_i32 tcg_op = read_fp_hreg(s, rn); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + switch (opcode) { + case 0x0: /* FMOV */ + tcg_gen_mov_i32(tcg_res, tcg_op); + break; + case 0x1: /* FABS */ + tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff); + break; + case 0x2: /* FNEG */ + tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); + break; + case 0x3: /* FSQRT */ + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_sqrt_f16(tcg_res, tcg_op, fpst); + break; + case 0x8: /* FRINTN */ + case 0x9: /* FRINTP */ + case 0xa: /* FRINTM */ + case 0xb: /* FRINTZ */ + case 0xc: /* FRINTA */ + { + TCGv_i32 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(opcode & 7)); + fpst = fpstatus_ptr(FPST_FPCR_F16); + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst); + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + break; + } + case 0xe: /* FRINTX */ + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, fpst); + break; + case 0xf: /* FRINTI */ + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_res); + + if (fpst) { + tcg_temp_free_ptr(fpst); + } + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (1 source) - single precision */ +static void handle_fp_1src_single(DisasContext *s, int opcode, int rd, int rn) +{ + void (*gen_fpst)(TCGv_i32, TCGv_i32, TCGv_ptr); + TCGv_i32 tcg_op, tcg_res; + 
TCGv_ptr fpst; + int rmode = -1; + + tcg_op = read_fp_sreg(s, rn); + tcg_res = tcg_temp_new_i32(); + + switch (opcode) { + case 0x0: /* FMOV */ + tcg_gen_mov_i32(tcg_res, tcg_op); + goto done; + case 0x1: /* FABS */ + gen_helper_vfp_abss(tcg_res, tcg_op); + goto done; + case 0x2: /* FNEG */ + gen_helper_vfp_negs(tcg_res, tcg_op); + goto done; + case 0x3: /* FSQRT */ + gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); + goto done; + case 0x6: /* BFCVT */ + gen_fpst = gen_helper_bfcvt; + break; + case 0x8: /* FRINTN */ + case 0x9: /* FRINTP */ + case 0xa: /* FRINTM */ + case 0xb: /* FRINTZ */ + case 0xc: /* FRINTA */ + rmode = arm_rmode_to_sf(opcode & 7); + gen_fpst = gen_helper_rints; + break; + case 0xe: /* FRINTX */ + gen_fpst = gen_helper_rints_exact; + break; + case 0xf: /* FRINTI */ + gen_fpst = gen_helper_rints; + break; + case 0x10: /* FRINT32Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint32_s; + break; + case 0x11: /* FRINT32X */ + gen_fpst = gen_helper_frint32_s; + break; + case 0x12: /* FRINT64Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint64_s; + break; + case 0x13: /* FRINT64X */ + gen_fpst = gen_helper_frint64_s; + break; + default: + g_assert_not_reached(); + } + + fpst = fpstatus_ptr(FPST_FPCR); + if (rmode >= 0) { + TCGv_i32 tcg_rmode = tcg_const_i32(rmode); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_fpst(tcg_res, tcg_op, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + } else { + gen_fpst(tcg_res, tcg_op, fpst); + } + tcg_temp_free_ptr(fpst); + + done: + write_fp_sreg(s, rd, tcg_res); + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (1 source) - double precision */ +static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn) +{ + void (*gen_fpst)(TCGv_i64, TCGv_i64, TCGv_ptr); + TCGv_i64 tcg_op, tcg_res; + TCGv_ptr fpst; + int rmode = -1; + + switch (opcode) { + case 0x0: /* FMOV */ + gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0); + return; + } + + tcg_op = read_fp_dreg(s, rn); + tcg_res = tcg_temp_new_i64(); + + switch (opcode) { + case 0x1: /* FABS */ + gen_helper_vfp_absd(tcg_res, tcg_op); + goto done; + case 0x2: /* FNEG */ + gen_helper_vfp_negd(tcg_res, tcg_op); + goto done; + case 0x3: /* FSQRT */ + gen_helper_vfp_sqrtd(tcg_res, tcg_op, cpu_env); + goto done; + case 0x8: /* FRINTN */ + case 0x9: /* FRINTP */ + case 0xa: /* FRINTM */ + case 0xb: /* FRINTZ */ + case 0xc: /* FRINTA */ + rmode = arm_rmode_to_sf(opcode & 7); + gen_fpst = gen_helper_rintd; + break; + case 0xe: /* FRINTX */ + gen_fpst = gen_helper_rintd_exact; + break; + case 0xf: /* FRINTI */ + gen_fpst = gen_helper_rintd; + break; + case 0x10: /* FRINT32Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint32_d; + break; + case 0x11: /* FRINT32X */ + gen_fpst = gen_helper_frint32_d; + break; + case 0x12: /* FRINT64Z */ + rmode = float_round_to_zero; + gen_fpst = gen_helper_frint64_d; + break; + case 0x13: /* FRINT64X */ + gen_fpst = gen_helper_frint64_d; + break; + default: + g_assert_not_reached(); + } + + fpst = fpstatus_ptr(FPST_FPCR); + if (rmode >= 0) { + TCGv_i32 tcg_rmode = tcg_const_i32(rmode); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_fpst(tcg_res, tcg_op, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + } else { + gen_fpst(tcg_res, tcg_op, fpst); + } + tcg_temp_free_ptr(fpst); + + done: + write_fp_dreg(s, rd, tcg_res); + tcg_temp_free_i64(tcg_op); + 
tcg_temp_free_i64(tcg_res); +} + +static void handle_fp_fcvt(DisasContext *s, int opcode, + int rd, int rn, int dtype, int ntype) +{ + switch (ntype) { + case 0x0: + { + TCGv_i32 tcg_rn = read_fp_sreg(s, rn); + if (dtype == 1) { + /* Single to double */ + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + gen_helper_vfp_fcvtds(tcg_rd, tcg_rn, cpu_env); + write_fp_dreg(s, rd, tcg_rd); + tcg_temp_free_i64(tcg_rd); + } else { + /* Single to half */ + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + TCGv_i32 ahp = get_ahp_flag(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + gen_helper_vfp_fcvt_f32_to_f16(tcg_rd, tcg_rn, fpst, ahp); + /* write_fp_sreg is OK here because top half of tcg_rd is zero */ + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + } + tcg_temp_free_i32(tcg_rn); + break; + } + case 0x1: + { + TCGv_i64 tcg_rn = read_fp_dreg(s, rn); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + if (dtype == 0) { + /* Double to single */ + gen_helper_vfp_fcvtsd(tcg_rd, tcg_rn, cpu_env); + } else { + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 ahp = get_ahp_flag(); + /* Double to half */ + gen_helper_vfp_fcvt_f64_to_f16(tcg_rd, tcg_rn, fpst, ahp); + /* write_fp_sreg is OK here because top half of tcg_rd is zero */ + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(ahp); + } + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i64(tcg_rn); + break; + } + case 0x3: + { + TCGv_i32 tcg_rn = read_fp_sreg(s, rn); + TCGv_ptr tcg_fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 tcg_ahp = get_ahp_flag(); + tcg_gen_ext16u_i32(tcg_rn, tcg_rn); + if (dtype == 0) { + /* Half to single */ + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + gen_helper_vfp_fcvt_f16_to_f32(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + } else { + /* Half to double */ + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + gen_helper_vfp_fcvt_f16_to_f64(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); + write_fp_dreg(s, rd, tcg_rd); + tcg_temp_free_i64(tcg_rd); + } + tcg_temp_free_i32(tcg_rn); + tcg_temp_free_ptr(tcg_fpst); + tcg_temp_free_i32(tcg_ahp); + break; + } + default: + g_assert_not_reached(); + } +} + +/* Floating point data-processing (1 source) + * 31 30 29 28 24 23 22 21 20 15 14 10 9 5 4 0 + * +---+---+---+-----------+------+---+--------+-----------+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | opcode | 1 0 0 0 0 | Rn | Rd | + * +---+---+---+-----------+------+---+--------+-----------+------+------+ + */ +static void disas_fp_1src(DisasContext *s, uint32_t insn) +{ + int mos = extract32(insn, 29, 3); + int type = extract32(insn, 22, 2); + int opcode = extract32(insn, 15, 6); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + if (mos) { + goto do_unallocated; + } + + switch (opcode) { + case 0x4: case 0x5: case 0x7: + { + /* FCVT between half, single and double precision */ + int dtype = extract32(opcode, 0, 2); + if (type == 2 || dtype == type) { + goto do_unallocated; + } + if (!fp_access_check(s)) { + return; + } + + handle_fp_fcvt(s, opcode, rd, rn, dtype, type); + break; + } + + case 0x10 ... 0x13: /* FRINT{32,64}{X,Z} */ + if (type > 1 || !dc_isar_feature(aa64_frint, s)) { + goto do_unallocated; + } + /* fall through */ + case 0x0 ... 0x3: + case 0x8 ... 0xc: + case 0xe ... 
0xf: + /* 32-to-32 and 64-to-64 ops */ + switch (type) { + case 0: + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_single(s, opcode, rd, rn); + break; + case 1: + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_double(s, opcode, rd, rn); + break; + case 3: + if (!dc_isar_feature(aa64_fp16, s)) { + goto do_unallocated; + } + + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_half(s, opcode, rd, rn); + break; + default: + goto do_unallocated; + } + break; + + case 0x6: + switch (type) { + case 1: /* BFCVT */ + if (!dc_isar_feature(aa64_bf16, s)) { + goto do_unallocated; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_1src_single(s, opcode, rd, rn); + break; + default: + goto do_unallocated; + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + break; + } +} + +/* Floating-point data-processing (2 source) - single precision */ +static void handle_fp_2src_single(DisasContext *s, int opcode, + int rd, int rn, int rm) +{ + TCGv_i32 tcg_op1; + TCGv_i32 tcg_op2; + TCGv_i32 tcg_res; + TCGv_ptr fpst; + + tcg_res = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_op1 = read_fp_sreg(s, rn); + tcg_op2 = read_fp_sreg(s, rm); + + switch (opcode) { + case 0x0: /* FMUL */ + gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FDIV */ + gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2: /* FADD */ + gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FSUB */ + gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FMAX */ + gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5: /* FMIN */ + gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAXNM */ + gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FMINNM */ + gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FNMUL */ + gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_negs(tcg_res, tcg_res); + break; + } + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (2 source) - double precision */ +static void handle_fp_2src_double(DisasContext *s, int opcode, + int rd, int rn, int rm) +{ + TCGv_i64 tcg_op1; + TCGv_i64 tcg_op2; + TCGv_i64 tcg_res; + TCGv_ptr fpst; + + tcg_res = tcg_temp_new_i64(); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_op1 = read_fp_dreg(s, rn); + tcg_op2 = read_fp_dreg(s, rm); + + switch (opcode) { + case 0x0: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FDIV */ + gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2: /* FADD */ + gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FSUB */ + gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FMAX */ + gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5: /* FMIN */ + gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAXNM */ + gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FMINNM */ + gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FNMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_negd(tcg_res, tcg_res); + break; + } + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + 
tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); +} + +/* Floating-point data-processing (2 source) - half precision */ +static void handle_fp_2src_half(DisasContext *s, int opcode, + int rd, int rn, int rm) +{ + TCGv_i32 tcg_op1; + TCGv_i32 tcg_op2; + TCGv_i32 tcg_res; + TCGv_ptr fpst; + + tcg_res = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR_F16); + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + + switch (opcode) { + case 0x0: /* FMUL */ + gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FDIV */ + gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2: /* FADD */ + gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FSUB */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FMAX */ + gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5: /* FMIN */ + gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAXNM */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FMINNM */ + gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FNMUL */ + gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); + tcg_gen_xori_i32(tcg_res, tcg_res, 0x8000); + break; + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_res); +} + +/* Floating point data-processing (2 source) + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | opcode | 1 0 | Rn | Rd | + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + */ +static void disas_fp_2src(DisasContext *s, uint32_t insn) +{ + int mos = extract32(insn, 29, 3); + int type = extract32(insn, 22, 2); + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int opcode = extract32(insn, 12, 4); + + if (opcode > 8 || mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + if (!fp_access_check(s)) { + return; + } + handle_fp_2src_single(s, opcode, rd, rn, rm); + break; + case 1: + if (!fp_access_check(s)) { + return; + } + handle_fp_2src_double(s, opcode, rd, rn, rm); + break; + case 3: + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_2src_half(s, opcode, rd, rn, rm); + break; + default: + unallocated_encoding(s); + } +} + +/* Floating-point data-processing (3 source) - single precision */ +static void handle_fp_3src_single(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; + TCGv_i32 tcg_res = tcg_temp_new_i32(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + tcg_op1 = read_fp_sreg(s, rn); + tcg_op2 = read_fp_sreg(s, rm); + tcg_op3 = read_fp_sreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. 
+ */ + if (o1 == true) { + gen_helper_vfp_negs(tcg_op3, tcg_op3); + } + + if (o0 != o1) { + gen_helper_vfp_negs(tcg_op1, tcg_op1); + } + + gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); +} + +/* Floating-point data-processing (3 source) - double precision */ +static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i64 tcg_op1, tcg_op2, tcg_op3; + TCGv_i64 tcg_res = tcg_temp_new_i64(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + tcg_op1 = read_fp_dreg(s, rn); + tcg_op2 = read_fp_dreg(s, rm); + tcg_op3 = read_fp_dreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. + */ + if (o1 == true) { + gen_helper_vfp_negd(tcg_op3, tcg_op3); + } + + if (o0 != o1) { + gen_helper_vfp_negd(tcg_op1, tcg_op1); + } + + gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_op3); + tcg_temp_free_i64(tcg_res); +} + +/* Floating-point data-processing (3 source) - half precision */ +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; + TCGv_i32 tcg_res = tcg_temp_new_i32(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR_F16); + + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + tcg_op3 = read_fp_hreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. 
+ */ + if (o1 == true) { + tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000); + } + + if (o0 != o1) { + tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); + } + + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); +} + +/* Floating point data-processing (3 source) + * 31 30 29 28 24 23 22 21 20 16 15 14 10 9 5 4 0 + * +---+---+---+-----------+------+----+------+----+------+------+------+ + * | M | 0 | S | 1 1 1 1 1 | type | o1 | Rm | o0 | Ra | Rn | Rd | + * +---+---+---+-----------+------+----+------+----+------+------+------+ + */ +static void disas_fp_3src(DisasContext *s, uint32_t insn) +{ + int mos = extract32(insn, 29, 3); + int type = extract32(insn, 22, 2); + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int ra = extract32(insn, 10, 5); + int rm = extract32(insn, 16, 5); + bool o0 = extract32(insn, 15, 1); + bool o1 = extract32(insn, 21, 1); + + if (mos) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_single(s, o0, o1, rd, rn, rm, ra); + break; + case 1: + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra); + break; + case 3: + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra); + break; + default: + unallocated_encoding(s); + } +} + +/* Floating point immediate + * 31 30 29 28 24 23 22 21 20 13 12 10 9 5 4 0 + * +---+---+---+-----------+------+---+------------+-------+------+------+ + * | M | 0 | S | 1 1 1 1 0 | type | 1 | imm8 | 1 0 0 | imm5 | Rd | + * +---+---+---+-----------+------+---+------------+-------+------+------+ + */ +static void disas_fp_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int imm5 = extract32(insn, 5, 5); + int imm8 = extract32(insn, 13, 8); + int type = extract32(insn, 22, 2); + int mos = extract32(insn, 29, 3); + uint64_t imm; + MemOp sz; + + if (mos || imm5) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: + sz = MO_32; + break; + case 1: + sz = MO_64; + break; + case 3: + sz = MO_16; + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + imm = vfp_expand_imm(sz, imm8); + write_fp_dreg(s, rd, tcg_constant_i64(imm)); +} + +/* Handle floating point <=> fixed point conversions. Note that we can + * also deal with fp <=> integer conversions as a special case (scale == 64) + * OPTME: consider handling that special case specially or at least skipping + * the call to scalbn in the helpers for zero shifts. + */ +static void handle_fpfpcvt(DisasContext *s, int rd, int rn, int opcode, + bool itof, int rmode, int scale, int sf, int type) +{ + bool is_signed = !(opcode & 1); + TCGv_ptr tcg_fpstatus; + TCGv_i32 tcg_shift, tcg_single; + TCGv_i64 tcg_double; + + tcg_fpstatus = fpstatus_ptr(type == 3 ? 
FPST_FPCR_F16 : FPST_FPCR); + + tcg_shift = tcg_constant_i32(64 - scale); + + if (itof) { + TCGv_i64 tcg_int = cpu_reg(s, rn); + if (!sf) { + TCGv_i64 tcg_extend = new_tmp_a64(s); + + if (is_signed) { + tcg_gen_ext32s_i64(tcg_extend, tcg_int); + } else { + tcg_gen_ext32u_i64(tcg_extend, tcg_int); + } + + tcg_int = tcg_extend; + } + + switch (type) { + case 1: /* float64 */ + tcg_double = tcg_temp_new_i64(); + if (is_signed) { + gen_helper_vfp_sqtod(tcg_double, tcg_int, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_uqtod(tcg_double, tcg_int, + tcg_shift, tcg_fpstatus); + } + write_fp_dreg(s, rd, tcg_double); + tcg_temp_free_i64(tcg_double); + break; + + case 0: /* float32 */ + tcg_single = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_sqtos(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_uqtos(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } + write_fp_sreg(s, rd, tcg_single); + tcg_temp_free_i32(tcg_single); + break; + + case 3: /* float16 */ + tcg_single = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_sqtoh(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_uqtoh(tcg_single, tcg_int, + tcg_shift, tcg_fpstatus); + } + write_fp_sreg(s, rd, tcg_single); + tcg_temp_free_i32(tcg_single); + break; + + default: + g_assert_not_reached(); + } + } else { + TCGv_i64 tcg_int = cpu_reg(s, rd); + TCGv_i32 tcg_rmode; + + if (extract32(opcode, 2, 1)) { + /* There are too many rounding modes to all fit into rmode, + * so FCVTA[US] is a special case. + */ + rmode = FPROUNDING_TIEAWAY; + } + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + + switch (type) { + case 1: /* float64 */ + tcg_double = read_fp_dreg(s, rn); + if (is_signed) { + if (!sf) { + gen_helper_vfp_tosld(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_tosqd(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } + } else { + if (!sf) { + gen_helper_vfp_tould(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touqd(tcg_int, tcg_double, + tcg_shift, tcg_fpstatus); + } + } + if (!sf) { + tcg_gen_ext32u_i64(tcg_int, tcg_int); + } + tcg_temp_free_i64(tcg_double); + break; + + case 0: /* float32 */ + tcg_single = read_fp_sreg(s, rn); + if (sf) { + if (is_signed) { + gen_helper_vfp_tosqs(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touqs(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } + } else { + TCGv_i32 tcg_dest = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_tosls(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touls(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } + tcg_gen_extu_i32_i64(tcg_int, tcg_dest); + tcg_temp_free_i32(tcg_dest); + } + tcg_temp_free_i32(tcg_single); + break; + + case 3: /* float16 */ + tcg_single = read_fp_sreg(s, rn); + if (sf) { + if (is_signed) { + gen_helper_vfp_tosqh(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_touqh(tcg_int, tcg_single, + tcg_shift, tcg_fpstatus); + } + } else { + TCGv_i32 tcg_dest = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_toslh(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_toulh(tcg_dest, tcg_single, + tcg_shift, tcg_fpstatus); + } + tcg_gen_extu_i32_i64(tcg_int, tcg_dest); + tcg_temp_free_i32(tcg_dest); + } + tcg_temp_free_i32(tcg_single); + break; + + default: + g_assert_not_reached(); + } + + gen_helper_set_rmode(tcg_rmode, 
tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + } + + tcg_temp_free_ptr(tcg_fpstatus); +} + +/* Floating point <-> fixed point conversions + * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0 + * +----+---+---+-----------+------+---+-------+--------+-------+------+------+ + * | sf | 0 | S | 1 1 1 1 0 | type | 0 | rmode | opcode | scale | Rn | Rd | + * +----+---+---+-----------+------+---+-------+--------+-------+------+------+ + */ +static void disas_fp_fixed_conv(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int scale = extract32(insn, 10, 6); + int opcode = extract32(insn, 16, 3); + int rmode = extract32(insn, 19, 2); + int type = extract32(insn, 22, 2); + bool sbit = extract32(insn, 29, 1); + bool sf = extract32(insn, 31, 1); + bool itof; + + if (sbit || (!sf && scale < 32)) { + unallocated_encoding(s); + return; + } + + switch (type) { + case 0: /* float32 */ + case 1: /* float64 */ + break; + case 3: /* float16 */ + if (dc_isar_feature(aa64_fp16, s)) { + break; + } + /* fallthru */ + default: + unallocated_encoding(s); + return; + } + + switch ((rmode << 3) | opcode) { + case 0x2: /* SCVTF */ + case 0x3: /* UCVTF */ + itof = true; + break; + case 0x18: /* FCVTZS */ + case 0x19: /* FCVTZU */ + itof = false; + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + handle_fpfpcvt(s, rd, rn, opcode, itof, FPROUNDING_ZERO, scale, sf, type); +} + +static void handle_fmov(DisasContext *s, int rd, int rn, int type, bool itof) +{ + /* FMOV: gpr to or from float, double, or top half of quad fp reg, + * without conversion. + */ + + if (itof) { + TCGv_i64 tcg_rn = cpu_reg(s, rn); + TCGv_i64 tmp; + + switch (type) { + case 0: + /* 32 bit */ + tmp = tcg_temp_new_i64(); + tcg_gen_ext32u_i64(tmp, tcg_rn); + write_fp_dreg(s, rd, tmp); + tcg_temp_free_i64(tmp); + break; + case 1: + /* 64 bit */ + write_fp_dreg(s, rd, tcg_rn); + break; + case 2: + /* 64 bit to top half. 
*/ + tcg_gen_st_i64(tcg_rn, cpu_env, fp_reg_hi_offset(s, rd)); + clear_vec_high(s, true, rd); + break; + case 3: + /* 16 bit */ + tmp = tcg_temp_new_i64(); + tcg_gen_ext16u_i64(tmp, tcg_rn); + write_fp_dreg(s, rd, tmp); + tcg_temp_free_i64(tmp); + break; + default: + g_assert_not_reached(); + } + } else { + TCGv_i64 tcg_rd = cpu_reg(s, rd); + + switch (type) { + case 0: + /* 32 bit */ + tcg_gen_ld32u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_32)); + break; + case 1: + /* 64 bit */ + tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_64)); + break; + case 2: + /* 64 bits from top half */ + tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_hi_offset(s, rn)); + break; + case 3: + /* 16 bit */ + tcg_gen_ld16u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_16)); + break; + default: + g_assert_not_reached(); + } + } +} + +static void handle_fjcvtzs(DisasContext *s, int rd, int rn) +{ + TCGv_i64 t = read_fp_dreg(s, rn); + TCGv_ptr fpstatus = fpstatus_ptr(FPST_FPCR); + + gen_helper_fjcvtzs(t, t, fpstatus); + + tcg_temp_free_ptr(fpstatus); + + tcg_gen_ext32u_i64(cpu_reg(s, rd), t); + tcg_gen_extrh_i64_i32(cpu_ZF, t); + tcg_gen_movi_i32(cpu_CF, 0); + tcg_gen_movi_i32(cpu_NF, 0); + tcg_gen_movi_i32(cpu_VF, 0); + + tcg_temp_free_i64(t); +} + +/* Floating point <-> integer conversions + * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0 + * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+ + * | sf | 0 | S | 1 1 1 1 0 | type | 1 | rmode | opc | 0 0 0 0 0 0 | Rn | Rd | + * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+ + */ +static void disas_fp_int_conv(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 16, 3); + int rmode = extract32(insn, 19, 2); + int type = extract32(insn, 22, 2); + bool sbit = extract32(insn, 29, 1); + bool sf = extract32(insn, 31, 1); + bool itof = false; + + if (sbit) { + goto do_unallocated; + } + + switch (opcode) { + case 2: /* SCVTF */ + case 3: /* UCVTF */ + itof = true; + /* fallthru */ + case 4: /* FCVTAS */ + case 5: /* FCVTAU */ + if (rmode != 0) { + goto do_unallocated; + } + /* fallthru */ + case 0: /* FCVT[NPMZ]S */ + case 1: /* FCVT[NPMZ]U */ + switch (type) { + case 0: /* float32 */ + case 1: /* float64 */ + break; + case 3: /* float16 */ + if (!dc_isar_feature(aa64_fp16, s)) { + goto do_unallocated; + } + break; + default: + goto do_unallocated; + } + if (!fp_access_check(s)) { + return; + } + handle_fpfpcvt(s, rd, rn, opcode, itof, rmode, 64, sf, type); + break; + + default: + switch (sf << 7 | type << 5 | rmode << 3 | opcode) { + case 0b01100110: /* FMOV half <-> 32-bit int */ + case 0b01100111: + case 0b11100110: /* FMOV half <-> 64-bit int */ + case 0b11100111: + if (!dc_isar_feature(aa64_fp16, s)) { + goto do_unallocated; + } + /* fallthru */ + case 0b00000110: /* FMOV 32-bit */ + case 0b00000111: + case 0b10100110: /* FMOV 64-bit */ + case 0b10100111: + case 0b11001110: /* FMOV top half of 128-bit */ + case 0b11001111: + if (!fp_access_check(s)) { + return; + } + itof = opcode & 1; + handle_fmov(s, rd, rn, type, itof); + break; + + case 0b00111110: /* FJCVTZS */ + if (!dc_isar_feature(aa64_jscvt, s)) { + goto do_unallocated; + } else if (fp_access_check(s)) { + handle_fjcvtzs(s, rd, rn); + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + break; + } +} + +/* FP-specific subcases of table C3-6 (SIMD and FP data processing) + * 31 30 29 28 25 24 0 + * 
+---+---+---+---------+-----------------------------+ + * | | 0 | | 1 1 1 1 | | + * +---+---+---+---------+-----------------------------+ + */ +static void disas_data_proc_fp(DisasContext *s, uint32_t insn) +{ + if (extract32(insn, 24, 1)) { + /* Floating point data-processing (3 source) */ + disas_fp_3src(s, insn); + } else if (extract32(insn, 21, 1) == 0) { + /* Floating point to fixed point conversions */ + disas_fp_fixed_conv(s, insn); + } else { + switch (extract32(insn, 10, 2)) { + case 1: + /* Floating point conditional compare */ + disas_fp_ccomp(s, insn); + break; + case 2: + /* Floating point data-processing (2 source) */ + disas_fp_2src(s, insn); + break; + case 3: + /* Floating point conditional select */ + disas_fp_csel(s, insn); + break; + case 0: + switch (ctz32(extract32(insn, 12, 4))) { + case 0: /* [15:12] == xxx1 */ + /* Floating point immediate */ + disas_fp_imm(s, insn); + break; + case 1: /* [15:12] == xx10 */ + /* Floating point compare */ + disas_fp_compare(s, insn); + break; + case 2: /* [15:12] == x100 */ + /* Floating point data-processing (1 source) */ + disas_fp_1src(s, insn); + break; + case 3: /* [15:12] == 1000 */ + unallocated_encoding(s); + break; + default: /* [15:12] == 0000 */ + /* Floating point <-> integer conversions */ + disas_fp_int_conv(s, insn); + break; + } + break; + } + } +} + +static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right, + int pos) +{ + /* Extract 64 bits from the middle of two concatenated 64 bit + * vector register slices left:right. The extracted bits start + * at 'pos' bits into the right (least significant) side. + * We return the result in tcg_right, and guarantee not to + * trash tcg_left. + */ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + assert(pos > 0 && pos < 64); + + tcg_gen_shri_i64(tcg_right, tcg_right, pos); + tcg_gen_shli_i64(tcg_tmp, tcg_left, 64 - pos); + tcg_gen_or_i64(tcg_right, tcg_right, tcg_tmp); + + tcg_temp_free_i64(tcg_tmp); +} + +/* EXT + * 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+-------------+-----+---+------+---+------+---+------+------+ + * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 | Rm | 0 | imm4 | 0 | Rn | Rd | + * +---+---+-------------+-----+---+------+---+------+---+------+------+ + */ +static void disas_simd_ext(DisasContext *s, uint32_t insn) +{ + int is_q = extract32(insn, 30, 1); + int op2 = extract32(insn, 22, 2); + int imm4 = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int pos = imm4 << 3; + TCGv_i64 tcg_resl, tcg_resh; + + if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_resh = tcg_temp_new_i64(); + tcg_resl = tcg_temp_new_i64(); + + /* Vd gets bits starting at pos bits into Vm:Vn. This is + * either extracting 128 bits from a 128:128 concatenation, or + * extracting 64 bits from a 64:64 concatenation. 
+ */ + if (!is_q) { + read_vec_element(s, tcg_resl, rn, 0, MO_64); + if (pos != 0) { + read_vec_element(s, tcg_resh, rm, 0, MO_64); + do_ext64(s, tcg_resh, tcg_resl, pos); + } + } else { + TCGv_i64 tcg_hh; + typedef struct { + int reg; + int elt; + } EltPosns; + EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} }; + EltPosns *elt = eltposns; + + if (pos >= 64) { + elt++; + pos -= 64; + } + + read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64); + elt++; + read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64); + elt++; + if (pos != 0) { + do_ext64(s, tcg_resh, tcg_resl, pos); + tcg_hh = tcg_temp_new_i64(); + read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64); + do_ext64(s, tcg_hh, tcg_resh, pos); + tcg_temp_free_i64(tcg_hh); + } + } + + write_vec_element(s, tcg_resl, rd, 0, MO_64); + tcg_temp_free_i64(tcg_resl); + if (is_q) { + write_vec_element(s, tcg_resh, rd, 1, MO_64); + } + tcg_temp_free_i64(tcg_resh); + clear_vec_high(s, is_q, rd); +} + +/* TBL/TBX + * 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ + * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 | Rm | 0 | len | op | 0 0 | Rn | Rd | + * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ + */ +static void disas_simd_tb(DisasContext *s, uint32_t insn) +{ + int op2 = extract32(insn, 22, 2); + int is_q = extract32(insn, 30, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int is_tbx = extract32(insn, 12, 1); + int len = (extract32(insn, 13, 2) + 1) * 16; + + if (op2 != 0) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), + (len << 6) | (is_tbx << 5) | rn, + gen_helper_simd_tblx); +} + +/* ZIP/UZP/TRN + * 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 + * +---+---+-------------+------+---+------+---+------------------+------+ + * | 0 | Q | 0 0 1 1 1 0 | size | 0 | Rm | 0 | opc | 1 0 | Rn | Rd | + * +---+---+-------------+------+---+------+---+------------------+------+ + */ +static void disas_simd_zip_trn(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + /* opc field bits [1:0] indicate ZIP/UZP/TRN; + * bit 2 indicates 1 vs 2 variant of the insn. + */ + int opcode = extract32(insn, 12, 2); + bool part = extract32(insn, 14, 1); + bool is_q = extract32(insn, 30, 1); + int esize = 8 << size; + int i, ofs; + int datasize = is_q ? 128 : 64; + int elements = datasize / esize; + TCGv_i64 tcg_res, tcg_resl, tcg_resh; + + if (opcode == 0 || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_resl = tcg_const_i64(0); + tcg_resh = is_q ? 
tcg_const_i64(0) : NULL; + tcg_res = tcg_temp_new_i64(); + + for (i = 0; i < elements; i++) { + switch (opcode) { + case 1: /* UZP1/2 */ + { + int midpoint = elements / 2; + if (i < midpoint) { + read_vec_element(s, tcg_res, rn, 2 * i + part, size); + } else { + read_vec_element(s, tcg_res, rm, + 2 * (i - midpoint) + part, size); + } + break; + } + case 2: /* TRN1/2 */ + if (i & 1) { + read_vec_element(s, tcg_res, rm, (i & ~1) + part, size); + } else { + read_vec_element(s, tcg_res, rn, (i & ~1) + part, size); + } + break; + case 3: /* ZIP1/2 */ + { + int base = part * elements / 2; + if (i & 1) { + read_vec_element(s, tcg_res, rm, base + (i >> 1), size); + } else { + read_vec_element(s, tcg_res, rn, base + (i >> 1), size); + } + break; + } + default: + g_assert_not_reached(); + } + + ofs = i * esize; + if (ofs < 64) { + tcg_gen_shli_i64(tcg_res, tcg_res, ofs); + tcg_gen_or_i64(tcg_resl, tcg_resl, tcg_res); + } else { + tcg_gen_shli_i64(tcg_res, tcg_res, ofs - 64); + tcg_gen_or_i64(tcg_resh, tcg_resh, tcg_res); + } + } + + tcg_temp_free_i64(tcg_res); + + write_vec_element(s, tcg_resl, rd, 0, MO_64); + tcg_temp_free_i64(tcg_resl); + + if (is_q) { + write_vec_element(s, tcg_resh, rd, 1, MO_64); + tcg_temp_free_i64(tcg_resh); + } + clear_vec_high(s, is_q, rd); +} + +/* + * do_reduction_op helper + * + * This mirrors the Reduce() pseudocode in the ARM ARM. It is + * important for correct NaN propagation that we do these + * operations in exactly the order specified by the pseudocode. + * + * This is a recursive function, TCG temps should be freed by the + * calling function once it is done with the values. + */ +static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn, + int esize, int size, int vmap, TCGv_ptr fpst) +{ + if (esize == size) { + int element; + MemOp msize = esize == 16 ? 
MO_16 : MO_32; + TCGv_i32 tcg_elem; + + /* We should have one register left here */ + assert(ctpop8(vmap) == 1); + element = ctz32(vmap); + assert(element < 8); + + tcg_elem = tcg_temp_new_i32(); + read_vec_element_i32(s, tcg_elem, rn, element, msize); + return tcg_elem; + } else { + int bits = size / 2; + int shift = ctpop8(vmap) / 2; + int vmap_lo = (vmap >> shift) & vmap; + int vmap_hi = (vmap & ~vmap_lo); + TCGv_i32 tcg_hi, tcg_lo, tcg_res; + + tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst); + tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst); + tcg_res = tcg_temp_new_i32(); + + switch (fpopcode) { + case 0x0c: /* fmaxnmv half-precision */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x0f: /* fmaxv half-precision */ + gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x1c: /* fminnmv half-precision */ + gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x1f: /* fminv half-precision */ + gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x2c: /* fmaxnmv */ + gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x2f: /* fmaxv */ + gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x3c: /* fminnmv */ + gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst); + break; + case 0x3f: /* fminv */ + gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst); + break; + default: + g_assert_not_reached(); + } + + tcg_temp_free_i32(tcg_hi); + tcg_temp_free_i32(tcg_lo); + return tcg_res; + } +} + +/* AdvSIMD across lanes + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_across_lanes(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + bool is_q = extract32(insn, 30, 1); + bool is_u = extract32(insn, 29, 1); + bool is_fp = false; + bool is_min = false; + int esize; + int elements; + int i; + TCGv_i64 tcg_res, tcg_elt; + + switch (opcode) { + case 0x1b: /* ADDV */ + if (is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x3: /* SADDLV, UADDLV */ + case 0xa: /* SMAXV, UMAXV */ + case 0x1a: /* SMINV, UMINV */ + if (size == 3 || (size == 2 && !is_q)) { + unallocated_encoding(s); + return; + } + break; + case 0xc: /* FMAXNMV, FMINNMV */ + case 0xf: /* FMAXV, FMINV */ + /* Bit 1 of size field encodes min vs max and the actual size + * depends on the encoding of the U bit. If not set (and FP16 + * enabled) then we do half-precision float instead of single + * precision. + */ + is_min = extract32(size, 1, 1); + is_fp = true; + if (!is_u && dc_isar_feature(aa64_fp16, s)) { + size = 1; + } else if (!is_u || !is_q || extract32(size, 0, 1)) { + unallocated_encoding(s); + return; + } else { + size = 2; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + esize = 8 << size; + elements = (is_q ? 128 : 64) / esize; + + tcg_res = tcg_temp_new_i64(); + tcg_elt = tcg_temp_new_i64(); + + /* These instructions operate across all lanes of a vector + * to produce a single result. 
We can guarantee that a 64 + * bit intermediate is sufficient: + * + for [US]ADDLV the maximum element size is 32 bits, and + * the result type is 64 bits + * + for FMAX*V, FMIN*V, ADDV the intermediate type is the + * same as the element size, which is 32 bits at most + * For the integer operations we can choose to work at 64 + * or 32 bits and truncate at the end; for simplicity + * we use 64 bits always. The floating point + * ops do require 32 bit intermediates, though. + */ + if (!is_fp) { + read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN)); + + for (i = 1; i < elements; i++) { + read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN)); + + switch (opcode) { + case 0x03: /* SADDLV / UADDLV */ + case 0x1b: /* ADDV */ + tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt); + break; + case 0x0a: /* SMAXV / UMAXV */ + if (is_u) { + tcg_gen_umax_i64(tcg_res, tcg_res, tcg_elt); + } else { + tcg_gen_smax_i64(tcg_res, tcg_res, tcg_elt); + } + break; + case 0x1a: /* SMINV / UMINV */ + if (is_u) { + tcg_gen_umin_i64(tcg_res, tcg_res, tcg_elt); + } else { + tcg_gen_smin_i64(tcg_res, tcg_res, tcg_elt); + } + break; + default: + g_assert_not_reached(); + } + + } + } else { + /* Floating point vector reduction ops which work across 32 + * bit (single) or 16 bit (half-precision) intermediates. + * Note that correct NaN propagation requires that we do these + * operations in exactly the order specified by the pseudocode. + */ + TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + int fpopcode = opcode | is_min << 4 | is_u << 5; + int vmap = (1 << elements) - 1; + TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize, + (is_q ? 128 : 64), vmap, fpst); + tcg_gen_extu_i32_i64(tcg_res, tcg_res32); + tcg_temp_free_i32(tcg_res32); + tcg_temp_free_ptr(fpst); + } + + tcg_temp_free_i64(tcg_elt); + + /* Now truncate the result to the width required for the final output */ + if (opcode == 0x03) { + /* SADDLV, UADDLV: result is 2*esize */ + size++; + } + + switch (size) { + case 0: + tcg_gen_ext8u_i64(tcg_res, tcg_res); + break; + case 1: + tcg_gen_ext16u_i64(tcg_res, tcg_res); + break; + case 2: + tcg_gen_ext32u_i64(tcg_res, tcg_res); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + write_fp_dreg(s, rd, tcg_res); + tcg_temp_free_i64(tcg_res); +} + +/* DUP (Element, Vector) + * + * 31 30 29 21 20 16 15 10 9 5 4 0 + * +---+---+-------------------+--------+-------------+------+------+ + * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd | + * +---+---+-------------------+--------+-------------+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + */ +static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, + int imm5) +{ + int size = ctz32(imm5); + int index; + + if (size > 3 || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + index = imm5 >> (size + 1); + tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd), + vec_reg_offset(s, rn, index, size), + is_q ? 
16 : 8, vec_full_reg_size(s)); +} + +/* DUP (element, scalar) + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+--------+-------------+------+------+ + * | 0 1 0 1 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd | + * +-----------------------+--------+-------------+------+------+ + */ +static void handle_simd_dupes(DisasContext *s, int rd, int rn, + int imm5) +{ + int size = ctz32(imm5); + int index; + TCGv_i64 tmp; + + if (size > 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + index = imm5 >> (size + 1); + + /* This instruction just extracts the specified element and + * zero-extends it into the bottom of the destination register. + */ + tmp = tcg_temp_new_i64(); + read_vec_element(s, tmp, rn, index, size); + write_fp_dreg(s, rd, tmp); + tcg_temp_free_i64(tmp); +} + +/* DUP (General) + * + * 31 30 29 21 20 16 15 10 9 5 4 0 + * +---+---+-------------------+--------+-------------+------+------+ + * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 1 1 | Rn | Rd | + * +---+---+-------------------+--------+-------------+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + */ +static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, + int imm5) +{ + int size = ctz32(imm5); + uint32_t dofs, oprsz, maxsz; + + if (size > 3 || ((size == 3) && !is_q)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + dofs = vec_full_reg_offset(s, rd); + oprsz = is_q ? 16 : 8; + maxsz = vec_full_reg_size(s); + + tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn)); +} + +/* INS (Element) + * + * 31 21 20 16 15 14 11 10 9 5 4 0 + * +-----------------------+--------+------------+---+------+------+ + * | 0 1 1 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | + * +-----------------------+--------+------------+---+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + * index: encoded in imm5<4:size+1> + */ +static void handle_simd_inse(DisasContext *s, int rd, int rn, + int imm4, int imm5) +{ + int size = ctz32(imm5); + int src_index, dst_index; + TCGv_i64 tmp; + + if (size > 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + dst_index = extract32(imm5, 1+size, 5); + src_index = extract32(imm4, size, 4); + + tmp = tcg_temp_new_i64(); + + read_vec_element(s, tmp, rn, src_index, size); + write_vec_element(s, tmp, rd, dst_index, size); + + tcg_temp_free_i64(tmp); + + /* INS is considered a 128-bit write for SVE. */ + clear_vec_high(s, true, rd); +} + + +/* INS (General) + * + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+--------+-------------+------+------+ + * | 0 1 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 1 1 1 | Rn | Rd | + * +-----------------------+--------+-------------+------+------+ + * + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + * index: encoded in imm5<4:size+1> + */ +static void handle_simd_insg(DisasContext *s, int rd, int rn, int imm5) +{ + int size = ctz32(imm5); + int idx; + + if (size > 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + idx = extract32(imm5, 1 + size, 4 - size); + write_vec_element(s, cpu_reg(s, rn), rd, idx, size); + + /* INS is considered a 128-bit write for SVE. 
*/ + clear_vec_high(s, true, rd); +} + +/* + * UMOV (General) + * SMOV (General) + * + * 31 30 29 21 20 16 15 12 10 9 5 4 0 + * +---+---+-------------------+--------+-------------+------+------+ + * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 1 U 1 1 | Rn | Rd | + * +---+---+-------------------+--------+-------------+------+------+ + * + * U: unsigned when set + * size: encoded in imm5 (see ARM ARM LowestSetBit()) + */ +static void handle_simd_umov_smov(DisasContext *s, int is_q, int is_signed, + int rn, int rd, int imm5) +{ + int size = ctz32(imm5); + int element; + TCGv_i64 tcg_rd; + + /* Check for UnallocatedEncodings */ + if (is_signed) { + if (size > 2 || (size == 2 && !is_q)) { + unallocated_encoding(s); + return; + } + } else { + if (size > 3 + || (size < 3 && is_q) + || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + } + + if (!fp_access_check(s)) { + return; + } + + element = extract32(imm5, 1+size, 4); + + tcg_rd = cpu_reg(s, rd); + read_vec_element(s, tcg_rd, rn, element, size | (is_signed ? MO_SIGN : 0)); + if (is_signed && !is_q) { + tcg_gen_ext32u_i64(tcg_rd, tcg_rd); + } +} + +/* AdvSIMD copy + * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+----+-----------------+------+---+------+---+------+------+ + * | 0 | Q | op | 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | + * +---+---+----+-----------------+------+---+------+---+------+------+ + */ +static void disas_simd_copy(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm4 = extract32(insn, 11, 4); + int op = extract32(insn, 29, 1); + int is_q = extract32(insn, 30, 1); + int imm5 = extract32(insn, 16, 5); + + if (op) { + if (is_q) { + /* INS (element) */ + handle_simd_inse(s, rd, rn, imm4, imm5); + } else { + unallocated_encoding(s); + } + } else { + switch (imm4) { + case 0: + /* DUP (element - vector) */ + handle_simd_dupe(s, is_q, rd, rn, imm5); + break; + case 1: + /* DUP (general) */ + handle_simd_dupg(s, is_q, rd, rn, imm5); + break; + case 3: + if (is_q) { + /* INS (general) */ + handle_simd_insg(s, rd, rn, imm5); + } else { + unallocated_encoding(s); + } + break; + case 5: + case 7: + /* UMOV/SMOV (is_q indicates 32/64; imm4 indicates signedness) */ + handle_simd_umov_smov(s, is_q, (imm4 == 5), rn, rd, imm5); + break; + default: + unallocated_encoding(s); + break; + } + } +} + +/* AdvSIMD modified immediate + * 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0 + * +---+---+----+---------------------+-----+-------+----+---+-------+------+ + * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh | Rd | + * +---+---+----+---------------------+-----+-------+----+---+-------+------+ + * + * There are a number of operations that can be carried out here: + * MOVI - move (shifted) imm into register + * MVNI - move inverted (shifted) imm into register + * ORR - bitwise OR of (shifted) imm with register + * BIC - bitwise clear of (shifted) imm with register + * With ARMv8.2 we also have: + * FMOV half-precision + */ +static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int cmode = extract32(insn, 12, 4); + int o2 = extract32(insn, 11, 1); + uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5); + bool is_neg = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + uint64_t imm = 0; + + if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) { + /* Check for FMOV (vector, immediate) - half-precision */ + if (!(dc_isar_feature(aa64_fp16, s) && o2 && cmode == 0xf)) { 
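+ /* Not the half-precision FMOV encoding either, so this is unallocated. */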
+ unallocated_encoding(s); + return; + } + } + + if (!fp_access_check(s)) { + return; + } + + if (cmode == 15 && o2 && !is_neg) { + /* FMOV (vector, immediate) - half-precision */ + imm = vfp_expand_imm(MO_16, abcdefgh); + /* now duplicate across the lanes */ + imm = dup_const(MO_16, imm); + } else { + imm = asimd_imm_const(abcdefgh, cmode, is_neg); + } + + if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) { + /* MOVI or MVNI, with MVNI negation handled above. */ + tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8, + vec_full_reg_size(s), imm); + } else { + /* ORR or BIC, with BIC negation to AND handled above. */ + if (is_neg) { + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64); + } else { + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64); + } + } +} + +/* AdvSIMD scalar copy + * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0 + * +-----+----+-----------------+------+---+------+---+------+------+ + * | 0 1 | op | 1 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | + * +-----+----+-----------------+------+---+------+---+------+------+ + */ +static void disas_simd_scalar_copy(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int imm4 = extract32(insn, 11, 4); + int imm5 = extract32(insn, 16, 5); + int op = extract32(insn, 29, 1); + + if (op != 0 || imm4 != 0) { + unallocated_encoding(s); + return; + } + + /* DUP (element, scalar) */ + handle_simd_dupes(s, rd, rn, imm5); +} + +/* AdvSIMD scalar pairwise + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn) +{ + int u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + TCGv_ptr fpst; + + /* For some ops (the FP ones), size[1] is part of the encoding. + * For ADDP strictly it is not but size[1] is always 1 for valid + * encodings. + */ + opcode |= (extract32(size, 1, 1) << 5); + + switch (opcode) { + case 0x3b: /* ADDP */ + if (u || size != 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + fpst = NULL; + break; + case 0xc: /* FMAXNMP */ + case 0xd: /* FADDP */ + case 0xf: /* FMAXP */ + case 0x2c: /* FMINNMP */ + case 0x2f: /* FMINP */ + /* FP op, size[0] is 32 or 64 bit*/ + if (!u) { + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } else { + size = MO_16; + } + } else { + size = extract32(size, 0, 1) ? MO_64 : MO_32; + } + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(size == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + break; + default: + unallocated_encoding(s); + return; + } + + if (size == MO_64) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, 0, MO_64); + read_vec_element(s, tcg_op2, rn, 1, MO_64); + + switch (opcode) { + case 0x3b: /* ADDP */ + tcg_gen_add_i64(tcg_res, tcg_op1, tcg_op2); + break; + case 0xc: /* FMAXNMP */ + gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xd: /* FADDP */ + gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FMAXP */ + gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2c: /* FMINNMP */ + gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2f: /* FMINP */ + gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); + } else { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op1, rn, 0, size); + read_vec_element_i32(s, tcg_op2, rn, 1, size); + + if (size == MO_16) { + switch (opcode) { + case 0xc: /* FMAXNMP */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xd: /* FADDP */ + gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FMAXP */ + gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2c: /* FMINNMP */ + gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2f: /* FMINP */ + gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + } else { + switch (opcode) { + case 0xc: /* FMAXNMP */ + gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xd: /* FADDP */ + gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FMAXP */ + gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2c: /* FMINNMP */ + gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x2f: /* FMINP */ + gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + } + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_res); + } + + if (fpst) { + tcg_temp_free_ptr(fpst); + } +} + +/* + * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate) + * + * This code is handles the common shifting code and is used by both + * the vector and scalar code. 
+ */ +static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src, + TCGv_i64 tcg_rnd, bool accumulate, + bool is_u, int size, int shift) +{ + bool extended_result = false; + bool round = tcg_rnd != NULL; + int ext_lshift = 0; + TCGv_i64 tcg_src_hi; + + if (round && size == 3) { + extended_result = true; + ext_lshift = 64 - shift; + tcg_src_hi = tcg_temp_new_i64(); + } else if (shift == 64) { + if (!accumulate && is_u) { + /* result is zero */ + tcg_gen_movi_i64(tcg_res, 0); + return; + } + } + + /* Deal with the rounding step */ + if (round) { + if (extended_result) { + TCGv_i64 tcg_zero = tcg_constant_i64(0); + if (!is_u) { + /* take care of sign extending tcg_res */ + tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63); + tcg_gen_add2_i64(tcg_src, tcg_src_hi, + tcg_src, tcg_src_hi, + tcg_rnd, tcg_zero); + } else { + tcg_gen_add2_i64(tcg_src, tcg_src_hi, + tcg_src, tcg_zero, + tcg_rnd, tcg_zero); + } + } else { + tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd); + } + } + + /* Now do the shift right */ + if (round && extended_result) { + /* extended case, >64 bit precision required */ + if (ext_lshift == 0) { + /* special case, only high bits matter */ + tcg_gen_mov_i64(tcg_src, tcg_src_hi); + } else { + tcg_gen_shri_i64(tcg_src, tcg_src, shift); + tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift); + tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi); + } + } else { + if (is_u) { + if (shift == 64) { + /* essentially shifting in 64 zeros */ + tcg_gen_movi_i64(tcg_src, 0); + } else { + tcg_gen_shri_i64(tcg_src, tcg_src, shift); + } + } else { + if (shift == 64) { + /* effectively extending the sign-bit */ + tcg_gen_sari_i64(tcg_src, tcg_src, 63); + } else { + tcg_gen_sari_i64(tcg_src, tcg_src, shift); + } + } + } + + if (accumulate) { + tcg_gen_add_i64(tcg_res, tcg_res, tcg_src); + } else { + tcg_gen_mov_i64(tcg_res, tcg_src); + } + + if (extended_result) { + tcg_temp_free_i64(tcg_src_hi); + } +} + +/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ +static void handle_scalar_simd_shri(DisasContext *s, + bool is_u, int immh, int immb, + int opcode, int rn, int rd) +{ + const int size = 3; + int immhb = immh << 3 | immb; + int shift = 2 * (8 << size) - immhb; + bool accumulate = false; + bool round = false; + bool insert = false; + TCGv_i64 tcg_rn; + TCGv_i64 tcg_rd; + TCGv_i64 tcg_round; + + if (!extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x02: /* SSRA / USRA (accumulate) */ + accumulate = true; + break; + case 0x04: /* SRSHR / URSHR (rounding) */ + round = true; + break; + case 0x06: /* SRSRA / URSRA (accum + rounding) */ + accumulate = round = true; + break; + case 0x08: /* SRI */ + insert = true; + break; + } + + if (round) { + tcg_round = tcg_constant_i64(1ULL << (shift - 1)); + } else { + tcg_round = NULL; + } + + tcg_rn = read_fp_dreg(s, rn); + tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); + + if (insert) { + /* shift count same as element size is valid but does nothing; + * special case to avoid potential shift by 64. 
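+ * (With shift == esize the deposit below would be zero bits wide, so Rd is written back unchanged.)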
+ */ + int esize = 8 << size; + if (shift != esize) { + tcg_gen_shri_i64(tcg_rn, tcg_rn, shift); + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift); + } + } else { + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + accumulate, is_u, size, shift); + } + + write_fp_dreg(s, rd, tcg_rd); + + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); +} + +/* SHL/SLI - Scalar shift left */ +static void handle_scalar_simd_shli(DisasContext *s, bool insert, + int immh, int immb, int opcode, + int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = immhb - (8 << size); + TCGv_i64 tcg_rn; + TCGv_i64 tcg_rd; + + if (!extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rn = read_fp_dreg(s, rn); + tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); + + if (insert) { + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift); + } else { + tcg_gen_shli_i64(tcg_rd, tcg_rn, shift); + } + + write_fp_dreg(s, rd, tcg_rd); + + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); +} + +/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with + * (signed/unsigned) narrowing */ +static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, + bool is_u_shift, bool is_u_narrow, + int immh, int immb, int opcode, + int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int esize = 8 << size; + int shift = (2 * esize) - immhb; + int elements = is_scalar ? 1 : (64 / esize); + bool round = extract32(opcode, 0, 1); + MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN); + TCGv_i64 tcg_rn, tcg_rd, tcg_round; + TCGv_i32 tcg_rd_narrowed; + TCGv_i64 tcg_final; + + static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = { + { gen_helper_neon_narrow_sat_s8, + gen_helper_neon_unarrow_sat8 }, + { gen_helper_neon_narrow_sat_s16, + gen_helper_neon_unarrow_sat16 }, + { gen_helper_neon_narrow_sat_s32, + gen_helper_neon_unarrow_sat32 }, + { NULL, NULL }, + }; + static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = { + gen_helper_neon_narrow_sat_u8, + gen_helper_neon_narrow_sat_u16, + gen_helper_neon_narrow_sat_u32, + NULL + }; + NeonGenNarrowEnvFn *narrowfn; + + int i; + + assert(size < 4); + + if (extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (is_u_shift) { + narrowfn = unsigned_narrow_fns[size]; + } else { + narrowfn = signed_narrow_fns[size][is_u_narrow ? 
1 : 0]; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_rd_narrowed = tcg_temp_new_i32(); + tcg_final = tcg_const_i64(0); + + if (round) { + tcg_round = tcg_constant_i64(1ULL << (shift - 1)); + } else { + tcg_round = NULL; + } + + for (i = 0; i < elements; i++) { + read_vec_element(s, tcg_rn, rn, i, ldop); + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + false, is_u_shift, size+1, shift); + narrowfn(tcg_rd_narrowed, cpu_env, tcg_rd); + tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); + tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); + } + + if (!is_q) { + write_vec_element(s, tcg_final, rd, 0, MO_64); + } else { + write_vec_element(s, tcg_final, rd, 1, MO_64); + } + + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i32(tcg_rd_narrowed); + tcg_temp_free_i64(tcg_final); + + clear_vec_high(s, is_q, rd); +} + +/* SQSHLU, UQSHL, SQSHL: saturating left shifts */ +static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q, + bool src_unsigned, bool dst_unsigned, + int immh, int immb, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int shift = immhb - (8 << size); + int pass; + + assert(immh != 0); + assert(!(scalar && is_q)); + + if (!scalar) { + if (!is_q && extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + /* Since we use the variable-shift helpers we must + * replicate the shift count into each element of + * the tcg_shift value. + */ + switch (size) { + case 0: + shift |= shift << 8; + /* fall through */ + case 1: + shift |= shift << 16; + break; + case 2: + case 3: + break; + default: + g_assert_not_reached(); + } + } + + if (!fp_access_check(s)) { + return; + } + + if (size == 3) { + TCGv_i64 tcg_shift = tcg_constant_i64(shift); + static NeonGenTwo64OpEnvFn * const fns[2][2] = { + { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 }, + { NULL, gen_helper_neon_qshl_u64 }, + }; + NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned]; + int maxpass = is_q ? 2 : 1; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + genfn(tcg_op, cpu_env, tcg_op, tcg_shift); + write_vec_element(s, tcg_op, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_op); + } + clear_vec_high(s, is_q, rd); + } else { + TCGv_i32 tcg_shift = tcg_constant_i32(shift); + static NeonGenTwoOpEnvFn * const fns[2][2][3] = { + { + { gen_helper_neon_qshl_s8, + gen_helper_neon_qshl_s16, + gen_helper_neon_qshl_s32 }, + { gen_helper_neon_qshlu_s8, + gen_helper_neon_qshlu_s16, + gen_helper_neon_qshlu_s32 } + }, { + { NULL, NULL, NULL }, + { gen_helper_neon_qshl_u8, + gen_helper_neon_qshl_u16, + gen_helper_neon_qshl_u32 } + } + }; + NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size]; + MemOp memop = scalar ? size : MO_32; + int maxpass = scalar ? 1 : is_q ? 
4 : 2; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, memop); + genfn(tcg_op, cpu_env, tcg_op, tcg_shift); + if (scalar) { + switch (size) { + case 0: + tcg_gen_ext8u_i32(tcg_op, tcg_op); + break; + case 1: + tcg_gen_ext16u_i32(tcg_op, tcg_op); + break; + case 2: + break; + default: + g_assert_not_reached(); + } + write_fp_sreg(s, rd, tcg_op); + } else { + write_vec_element_i32(s, tcg_op, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_op); + } + + if (!scalar) { + clear_vec_high(s, is_q, rd); + } + } +} + +/* Common vector code for handling integer to FP conversion */ +static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn, + int elements, int is_signed, + int fracbits, int size) +{ + TCGv_ptr tcg_fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + TCGv_i32 tcg_shift = NULL; + + MemOp mop = size | (is_signed ? MO_SIGN : 0); + int pass; + + if (fracbits || size == MO_64) { + tcg_shift = tcg_constant_i32(fracbits); + } + + if (size == MO_64) { + TCGv_i64 tcg_int64 = tcg_temp_new_i64(); + TCGv_i64 tcg_double = tcg_temp_new_i64(); + + for (pass = 0; pass < elements; pass++) { + read_vec_element(s, tcg_int64, rn, pass, mop); + + if (is_signed) { + gen_helper_vfp_sqtod(tcg_double, tcg_int64, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_uqtod(tcg_double, tcg_int64, + tcg_shift, tcg_fpst); + } + if (elements == 1) { + write_fp_dreg(s, rd, tcg_double); + } else { + write_vec_element(s, tcg_double, rd, pass, MO_64); + } + } + + tcg_temp_free_i64(tcg_int64); + tcg_temp_free_i64(tcg_double); + + } else { + TCGv_i32 tcg_int32 = tcg_temp_new_i32(); + TCGv_i32 tcg_float = tcg_temp_new_i32(); + + for (pass = 0; pass < elements; pass++) { + read_vec_element_i32(s, tcg_int32, rn, pass, mop); + + switch (size) { + case MO_32: + if (fracbits) { + if (is_signed) { + gen_helper_vfp_sltos(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_ultos(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } + } else { + if (is_signed) { + gen_helper_vfp_sitos(tcg_float, tcg_int32, tcg_fpst); + } else { + gen_helper_vfp_uitos(tcg_float, tcg_int32, tcg_fpst); + } + } + break; + case MO_16: + if (fracbits) { + if (is_signed) { + gen_helper_vfp_sltoh(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_ultoh(tcg_float, tcg_int32, + tcg_shift, tcg_fpst); + } + } else { + if (is_signed) { + gen_helper_vfp_sitoh(tcg_float, tcg_int32, tcg_fpst); + } else { + gen_helper_vfp_uitoh(tcg_float, tcg_int32, tcg_fpst); + } + } + break; + default: + g_assert_not_reached(); + } + + if (elements == 1) { + write_fp_sreg(s, rd, tcg_float); + } else { + write_vec_element_i32(s, tcg_float, rd, pass, size); + } + } + + tcg_temp_free_i32(tcg_int32); + tcg_temp_free_i32(tcg_float); + } + + tcg_temp_free_ptr(tcg_fpst); + + clear_vec_high(s, elements << size == 16, rd); +} + +/* UCVTF/SCVTF - Integer to FP conversion */ +static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar, + bool is_q, bool is_u, + int immh, int immb, int opcode, + int rn, int rd) +{ + int size, elements, fracbits; + int immhb = immh << 3 | immb; + + if (immh & 8) { + size = MO_64; + if (!is_scalar && !is_q) { + unallocated_encoding(s); + return; + } + } else if (immh & 4) { + size = MO_32; + } else if (immh & 2) { + size = MO_16; + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + } else { + /* immh == 0 would be a failure of the decode logic */ + g_assert(immh == 1); + 
unallocated_encoding(s); + return; + } + + if (is_scalar) { + elements = 1; + } else { + elements = (8 << is_q) >> size; + } + fracbits = (16 << size) - immhb; + + if (!fp_access_check(s)) { + return; + } + + handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size); +} + +/* FCVTZS, FVCVTZU - FP to fixedpoint conversion */ +static void handle_simd_shift_fpint_conv(DisasContext *s, bool is_scalar, + bool is_q, bool is_u, + int immh, int immb, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int pass, size, fracbits; + TCGv_ptr tcg_fpstatus; + TCGv_i32 tcg_rmode, tcg_shift; + + if (immh & 0x8) { + size = MO_64; + if (!is_scalar && !is_q) { + unallocated_encoding(s); + return; + } + } else if (immh & 0x4) { + size = MO_32; + } else if (immh & 0x2) { + size = MO_16; + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + } else { + /* Should have split out AdvSIMD modified immediate earlier. */ + assert(immh == 1); + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + assert(!(is_scalar && is_q)); + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(FPROUNDING_ZERO)); + tcg_fpstatus = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + fracbits = (16 << size) - immhb; + tcg_shift = tcg_constant_i32(fracbits); + + if (size == MO_64) { + int maxpass = is_scalar ? 1 : 2; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + if (is_u) { + gen_helper_vfp_touqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_tosqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } + write_vec_element(s, tcg_op, rd, pass, MO_64); + tcg_temp_free_i64(tcg_op); + } + clear_vec_high(s, is_q, rd); + } else { + void (*fn)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); + int maxpass = is_scalar ? 
1 : ((8 << is_q) >> size); + + switch (size) { + case MO_16: + if (is_u) { + fn = gen_helper_vfp_touhh; + } else { + fn = gen_helper_vfp_toshh; + } + break; + case MO_32: + if (is_u) { + fn = gen_helper_vfp_touls; + } else { + fn = gen_helper_vfp_tosls; + } + break; + default: + g_assert_not_reached(); + } + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, size); + fn(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + if (is_scalar) { + write_fp_sreg(s, rd, tcg_op); + } else { + write_vec_element_i32(s, tcg_op, rd, pass, size); + } + tcg_temp_free_i32(tcg_op); + } + if (!is_scalar) { + clear_vec_high(s, is_q, rd); + } + } + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_ptr(tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); +} + +/* AdvSIMD scalar shift by immediate + * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 + * +-----+---+-------------+------+------+--------+---+------+------+ + * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd | + * +-----+---+-------------+------+------+--------+---+------+------+ + * + * This is the scalar version so it works on a fixed sized registers + */ +static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 5); + int immb = extract32(insn, 16, 3); + int immh = extract32(insn, 19, 4); + bool is_u = extract32(insn, 29, 1); + + if (immh == 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0x08: /* SRI */ + if (!is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x00: /* SSHR / USHR */ + case 0x02: /* SSRA / USRA */ + case 0x04: /* SRSHR / URSHR */ + case 0x06: /* SRSRA / URSRA */ + handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd); + break; + case 0x0a: /* SHL / SLI */ + handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd); + break; + case 0x1c: /* SCVTF, UCVTF */ + handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb, + opcode, rn, rd); + break; + case 0x10: /* SQSHRUN, SQSHRUN2 */ + case 0x11: /* SQRSHRUN, SQRSHRUN2 */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_vec_simd_sqshrn(s, true, false, false, true, + immh, immb, opcode, rn, rd); + break; + case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */ + case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */ + handle_vec_simd_sqshrn(s, true, false, is_u, is_u, + immh, immb, opcode, rn, rd); + break; + case 0xc: /* SQSHLU */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd); + break; + case 0xe: /* SQSHL, UQSHL */ + handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd); + break; + case 0x1f: /* FCVTZS, FCVTZU */ + handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd); + break; + default: + unallocated_encoding(s); + break; + } +} + +/* AdvSIMD scalar three different + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+--------+-----+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd | + * +-----+---+-----------+------+---+------+--------+-----+------+------+ + */ +static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn) +{ + bool is_u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 4); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + 
int rd = extract32(insn, 0, 5); + + if (is_u) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0x9: /* SQDMLAL, SQDMLAL2 */ + case 0xb: /* SQDMLSL, SQDMLSL2 */ + case 0xd: /* SQDMULL, SQDMULL2 */ + if (size == 0 || size == 3) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (size == 2) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, 0, MO_32 | MO_SIGN); + read_vec_element(s, tcg_op2, rm, 0, MO_32 | MO_SIGN); + + tcg_gen_mul_i64(tcg_res, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, tcg_res, tcg_res); + + switch (opcode) { + case 0xd: /* SQDMULL, SQDMULL2 */ + break; + case 0xb: /* SQDMLSL, SQDMLSL2 */ + tcg_gen_neg_i64(tcg_res, tcg_res); + /* fall through */ + case 0x9: /* SQDMLAL, SQDMLAL2 */ + read_vec_element(s, tcg_op1, rd, 0, MO_64); + gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, + tcg_res, tcg_op1); + break; + default: + g_assert_not_reached(); + } + + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); + } else { + TCGv_i32 tcg_op1 = read_fp_hreg(s, rn); + TCGv_i32 tcg_op2 = read_fp_hreg(s, rm); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + gen_helper_neon_mull_s16(tcg_res, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, tcg_res, tcg_res); + + switch (opcode) { + case 0xd: /* SQDMULL, SQDMULL2 */ + break; + case 0xb: /* SQDMLSL, SQDMLSL2 */ + gen_helper_neon_negl_u32(tcg_res, tcg_res); + /* fall through */ + case 0x9: /* SQDMLAL, SQDMLAL2 */ + { + TCGv_i64 tcg_op3 = tcg_temp_new_i64(); + read_vec_element(s, tcg_op3, rd, 0, MO_32); + gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, + tcg_res, tcg_op3); + tcg_temp_free_i64(tcg_op3); + break; + } + default: + g_assert_not_reached(); + } + + tcg_gen_ext32u_i64(tcg_res, tcg_res); + write_fp_dreg(s, rd, tcg_res); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i64(tcg_res); + } +} + +static void handle_3same_64(DisasContext *s, int opcode, bool u, + TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm) +{ + /* Handle 64x64->64 opcodes which are shared between the scalar + * and vector 3-same groups. We cover every opcode where size == 3 + * is valid in either the three-reg-same (integer, not pairwise) + * or scalar-three-reg-same groups. + */ + TCGCond cond; + + switch (opcode) { + case 0x1: /* SQADD */ + if (u) { + gen_helper_neon_qadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0x5: /* SQSUB */ + if (u) { + gen_helper_neon_qsub_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qsub_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0x6: /* CMGT, CMHI */ + /* 64 bit integer comparison, result = test ? (2^64 - 1) : 0. + * We implement this using setcond (test) and then negating. + */ + cond = u ? TCG_COND_GTU : TCG_COND_GT; + do_cmop: + tcg_gen_setcond_i64(cond, tcg_rd, tcg_rn, tcg_rm); + tcg_gen_neg_i64(tcg_rd, tcg_rd); + break; + case 0x7: /* CMGE, CMHS */ + cond = u ? 
TCG_COND_GEU : TCG_COND_GE; + goto do_cmop; + case 0x11: /* CMTST, CMEQ */ + if (u) { + cond = TCG_COND_EQ; + goto do_cmop; + } + gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm); + break; + case 0x8: /* SSHL, USHL */ + if (u) { + gen_ushl_i64(tcg_rd, tcg_rn, tcg_rm); + } else { + gen_sshl_i64(tcg_rd, tcg_rn, tcg_rm); + } + break; + case 0x9: /* SQSHL, UQSHL */ + if (u) { + gen_helper_neon_qshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0xa: /* SRSHL, URSHL */ + if (u) { + gen_helper_neon_rshl_u64(tcg_rd, tcg_rn, tcg_rm); + } else { + gen_helper_neon_rshl_s64(tcg_rd, tcg_rn, tcg_rm); + } + break; + case 0xb: /* SQRSHL, UQRSHL */ + if (u) { + gen_helper_neon_qrshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } else { + gen_helper_neon_qrshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); + } + break; + case 0x10: /* ADD, SUB */ + if (u) { + tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm); + } else { + tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm); + } + break; + default: + g_assert_not_reached(); + } +} + +/* Handle the 3-same-operands float operations; shared by the scalar + * and vector encodings. The caller must filter out any encodings + * not allocated for the encoding it is dealing with. + */ +static void handle_3same_float(DisasContext *s, int size, int elements, + int fpopcode, int rd, int rn, int rm) +{ + int pass; + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + for (pass = 0; pass < elements; pass++) { + if (size) { + /* Double */ + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + switch (fpopcode) { + case 0x39: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + gen_helper_vfp_negd(tcg_op1, tcg_op1); + /* fall through */ + case 0x19: /* FMLA */ + read_vec_element(s, tcg_res, rd, pass, MO_64); + gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, + tcg_res, fpst); + break; + case 0x18: /* FMAXNM */ + gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1a: /* FADD */ + gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1b: /* FMULX */ + gen_helper_vfp_mulxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1c: /* FCMEQ */ + gen_helper_neon_ceq_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1e: /* FMAX */ + gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1f: /* FRECPS */ + gen_helper_recpsf_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x38: /* FMINNM */ + gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3a: /* FSUB */ + gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3e: /* FMIN */ + gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3f: /* FRSQRTS */ + gen_helper_rsqrtsf_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5b: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5c: /* FCMGE */ + gen_helper_neon_cge_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5d: /* FACGE */ + gen_helper_neon_acge_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5f: /* FDIV */ + gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7a: /* FABD */ + gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_absd(tcg_res, tcg_res); + break; + case 0x7c: /* FCMGT */ + gen_helper_neon_cgt_f64(tcg_res, tcg_op1, tcg_op2, 
fpst); + break; + case 0x7d: /* FACGT */ + gen_helper_neon_acgt_f64(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_vec_element(s, tcg_res, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } else { + /* Single */ + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op1, rn, pass, MO_32); + read_vec_element_i32(s, tcg_op2, rm, pass, MO_32); + + switch (fpopcode) { + case 0x39: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + gen_helper_vfp_negs(tcg_op1, tcg_op1); + /* fall through */ + case 0x19: /* FMLA */ + read_vec_element_i32(s, tcg_res, rd, pass, MO_32); + gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, + tcg_res, fpst); + break; + case 0x1a: /* FADD */ + gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1b: /* FMULX */ + gen_helper_vfp_mulxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1c: /* FCMEQ */ + gen_helper_neon_ceq_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1e: /* FMAX */ + gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1f: /* FRECPS */ + gen_helper_recpsf_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x18: /* FMAXNM */ + gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x38: /* FMINNM */ + gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3a: /* FSUB */ + gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3e: /* FMIN */ + gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3f: /* FRSQRTS */ + gen_helper_rsqrtsf_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5b: /* FMUL */ + gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5c: /* FCMGE */ + gen_helper_neon_cge_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5d: /* FACGE */ + gen_helper_neon_acge_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x5f: /* FDIV */ + gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7a: /* FABD */ + gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); + gen_helper_vfp_abss(tcg_res, tcg_res); + break; + case 0x7c: /* FCMGT */ + gen_helper_neon_cgt_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7d: /* FACGT */ + gen_helper_neon_acgt_f32(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + if (elements == 1) { + /* scalar single so clear high part */ + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + tcg_gen_extu_i32_i64(tcg_tmp, tcg_res); + write_vec_element(s, tcg_tmp, rd, pass, MO_64); + tcg_temp_free_i64(tcg_tmp); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + } + + tcg_temp_free_ptr(fpst); + + clear_vec_high(s, elements * (size ? 
8 : 4) > 8, rd); +} + +/* AdvSIMD scalar three same + * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+--------+---+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 1 | Rn | Rd | + * +-----+---+-----------+------+---+------+--------+---+------+------+ + */ +static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + TCGv_i64 tcg_rd; + + if (opcode >= 0x18) { + /* Floating point: U, size[1] and opcode indicate operation */ + int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6); + switch (fpopcode) { + case 0x1b: /* FMULX */ + case 0x1f: /* FRECPS */ + case 0x3f: /* FRSQRTS */ + case 0x5d: /* FACGE */ + case 0x7d: /* FACGT */ + case 0x1c: /* FCMEQ */ + case 0x5c: /* FCMGE */ + case 0x7c: /* FCMGT */ + case 0x7a: /* FABD */ + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + handle_3same_float(s, extract32(size, 0, 1), 1, fpopcode, rd, rn, rm); + return; + } + + switch (opcode) { + case 0x1: /* SQADD, UQADD */ + case 0x5: /* SQSUB, UQSUB */ + case 0x9: /* SQSHL, UQSHL */ + case 0xb: /* SQRSHL, UQRSHL */ + break; + case 0x8: /* SSHL, USHL */ + case 0xa: /* SRSHL, URSHL */ + case 0x6: /* CMGT, CMHI */ + case 0x7: /* CMGE, CMHS */ + case 0x11: /* CMTST, CMEQ */ + case 0x10: /* ADD, SUB (vector) */ + if (size != 3) { + unallocated_encoding(s); + return; + } + break; + case 0x16: /* SQDMULH, SQRDMULH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rd = tcg_temp_new_i64(); + + if (size == 3) { + TCGv_i64 tcg_rn = read_fp_dreg(s, rn); + TCGv_i64 tcg_rm = read_fp_dreg(s, rm); + + handle_3same_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rm); + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rm); + } else { + /* Do a single operation on the lowest element in the vector. + * We use the standard Neon helpers and rely on 0 OP 0 == 0 with + * no side effects for all these operations. + * OPTME: special-purpose helpers would avoid doing some + * unnecessary work in the helper for the 8 and 16 bit cases. 
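+ * The 32-bit result is zero-extended to 64 bits below before the final write to Rd.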
+ */ + NeonGenTwoOpEnvFn *genenvfn; + TCGv_i32 tcg_rn = tcg_temp_new_i32(); + TCGv_i32 tcg_rm = tcg_temp_new_i32(); + TCGv_i32 tcg_rd32 = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_rn, rn, 0, size); + read_vec_element_i32(s, tcg_rm, rm, 0, size); + + switch (opcode) { + case 0x1: /* SQADD, UQADD */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qadd_s8, gen_helper_neon_qadd_u8 }, + { gen_helper_neon_qadd_s16, gen_helper_neon_qadd_u16 }, + { gen_helper_neon_qadd_s32, gen_helper_neon_qadd_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x5: /* SQSUB, UQSUB */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qsub_s8, gen_helper_neon_qsub_u8 }, + { gen_helper_neon_qsub_s16, gen_helper_neon_qsub_u16 }, + { gen_helper_neon_qsub_s32, gen_helper_neon_qsub_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x9: /* SQSHL, UQSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 }, + { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 }, + { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0xb: /* SQRSHL, UQRSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 }, + { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 }, + { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x16: /* SQDMULH, SQRDMULH */ + { + static NeonGenTwoOpEnvFn * const fns[2][2] = { + { gen_helper_neon_qdmulh_s16, gen_helper_neon_qrdmulh_s16 }, + { gen_helper_neon_qdmulh_s32, gen_helper_neon_qrdmulh_s32 }, + }; + assert(size == 1 || size == 2); + genenvfn = fns[size - 1][u]; + break; + } + default: + g_assert_not_reached(); + } + + genenvfn(tcg_rd32, cpu_env, tcg_rn, tcg_rm); + tcg_gen_extu_i32_i64(tcg_rd, tcg_rd32); + tcg_temp_free_i32(tcg_rd32); + tcg_temp_free_i32(tcg_rn); + tcg_temp_free_i32(tcg_rm); + } + + write_fp_dreg(s, rd, tcg_rd); + + tcg_temp_free_i64(tcg_rd); +} + +/* AdvSIMD scalar three same FP16 + * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0 + * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+ + * | 0 1 | U | 1 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd | + * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+ + * v: 0101 1110 0100 0000 0000 0100 0000 0000 => 5e400400 + * m: 1101 1111 0110 0000 1100 0100 0000 0000 => df60c400 + */ +static void disas_simd_scalar_three_reg_same_fp16(DisasContext *s, + uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 3); + int rm = extract32(insn, 16, 5); + bool u = extract32(insn, 29, 1); + bool a = extract32(insn, 23, 1); + int fpopcode = opcode | (a << 3) | (u << 4); + TCGv_ptr fpst; + TCGv_i32 tcg_op1; + TCGv_i32 tcg_op2; + TCGv_i32 tcg_res; + + switch (fpopcode) { + case 0x03: /* FMULX */ + case 0x04: /* FCMEQ (reg) */ + case 0x07: /* FRECPS */ + case 0x0f: /* FRSQRTS */ + case 0x14: /* FCMGE (reg) */ + case 0x15: /* FACGE */ + case 0x1a: /* FABD */ + case 0x1c: /* FCMGT (reg) */ + case 0x1d: /* FACGT */ + break; + default: + unallocated_encoding(s); + return; + } + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + } + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(FPST_FPCR_F16); + + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + tcg_res = tcg_temp_new_i32(); + + switch (fpopcode) { 
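+ /* fpopcode was validated by the switch above; any other value here is a bug. */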
+ case 0x03: /* FMULX */ + gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x04: /* FCMEQ (reg) */ + gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x07: /* FRECPS */ + gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x0f: /* FRSQRTS */ + gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x14: /* FCMGE (reg) */ + gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x15: /* FACGE */ + gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1a: /* FABD */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff); + break; + case 0x1c: /* FCMGT (reg) */ + gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1d: /* FACGT */ + gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_res); + + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_ptr(fpst); +} + +/* AdvSIMD scalar three same extra + * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+---+--------+---+----+----+ + * | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | + * +-----+---+-----------+------+---+------+---+--------+---+----+----+ + */ +static void disas_simd_scalar_three_reg_same_extra(DisasContext *s, + uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + TCGv_i32 ele1, ele2, ele3; + TCGv_i64 res; + bool feature; + + switch (u * 16 + opcode) { + case 0x10: /* SQRDMLAH (vector) */ + case 0x11: /* SQRDMLSH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_rdm, s); + break; + default: + unallocated_encoding(s); + return; + } + if (!feature) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + /* Do a single operation on the lowest element in the vector. + * We use the standard Neon helpers and rely on 0 OP 0 == 0 + * with no side effects for all these operations. + * OPTME: special-purpose helpers would avoid doing some + * unnecessary work in the helper for the 16 bit cases. 
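+ * The result is widened from 32 to 64 bits before being written back to Rd.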
+ */ + ele1 = tcg_temp_new_i32(); + ele2 = tcg_temp_new_i32(); + ele3 = tcg_temp_new_i32(); + + read_vec_element_i32(s, ele1, rn, 0, size); + read_vec_element_i32(s, ele2, rm, 0, size); + read_vec_element_i32(s, ele3, rd, 0, size); + + switch (opcode) { + case 0x0: /* SQRDMLAH */ + if (size == 1) { + gen_helper_neon_qrdmlah_s16(ele3, cpu_env, ele1, ele2, ele3); + } else { + gen_helper_neon_qrdmlah_s32(ele3, cpu_env, ele1, ele2, ele3); + } + break; + case 0x1: /* SQRDMLSH */ + if (size == 1) { + gen_helper_neon_qrdmlsh_s16(ele3, cpu_env, ele1, ele2, ele3); + } else { + gen_helper_neon_qrdmlsh_s32(ele3, cpu_env, ele1, ele2, ele3); + } + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(ele1); + tcg_temp_free_i32(ele2); + + res = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(res, ele3); + tcg_temp_free_i32(ele3); + + write_fp_dreg(s, rd, res); + tcg_temp_free_i64(res); +} + +static void handle_2misc_64(DisasContext *s, int opcode, bool u, + TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, + TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus) +{ + /* Handle 64->64 opcodes which are shared between the scalar and + * vector 2-reg-misc groups. We cover every integer opcode where size == 3 + * is valid in either group and also the double-precision fp ops. + * The caller only need provide tcg_rmode and tcg_fpstatus if the op + * requires them. + */ + TCGCond cond; + + switch (opcode) { + case 0x4: /* CLS, CLZ */ + if (u) { + tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64); + } else { + tcg_gen_clrsb_i64(tcg_rd, tcg_rn); + } + break; + case 0x5: /* NOT */ + /* This opcode is shared with CNT and RBIT but we have earlier + * enforced that size == 3 if and only if this is the NOT insn. + */ + tcg_gen_not_i64(tcg_rd, tcg_rn); + break; + case 0x7: /* SQABS, SQNEG */ + if (u) { + gen_helper_neon_qneg_s64(tcg_rd, cpu_env, tcg_rn); + } else { + gen_helper_neon_qabs_s64(tcg_rd, cpu_env, tcg_rn); + } + break; + case 0xa: /* CMLT */ + /* 64 bit integer comparison against zero, result is + * test ? (2^64 - 1) : 0. We implement via setcond(!test) and + * subtracting 1. + */ + cond = TCG_COND_LT; + do_cmop: + tcg_gen_setcondi_i64(cond, tcg_rd, tcg_rn, 0); + tcg_gen_neg_i64(tcg_rd, tcg_rd); + break; + case 0x8: /* CMGT, CMGE */ + cond = u ? TCG_COND_GE : TCG_COND_GT; + goto do_cmop; + case 0x9: /* CMEQ, CMLE */ + cond = u ? 
TCG_COND_LE : TCG_COND_EQ; + goto do_cmop; + case 0xb: /* ABS, NEG */ + if (u) { + tcg_gen_neg_i64(tcg_rd, tcg_rn); + } else { + tcg_gen_abs_i64(tcg_rd, tcg_rn); + } + break; + case 0x2f: /* FABS */ + gen_helper_vfp_absd(tcg_rd, tcg_rn); + break; + case 0x6f: /* FNEG */ + gen_helper_vfp_negd(tcg_rd, tcg_rn); + break; + case 0x7f: /* FSQRT */ + gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env); + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_vfp_tosqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_vfp_touqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_rintd(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_rintd_exact(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x1e: /* FRINT32Z */ + case 0x5e: /* FRINT32X */ + gen_helper_frint32_d(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x1f: /* FRINT64Z */ + case 0x5f: /* FRINT64X */ + gen_helper_frint64_d(tcg_rd, tcg_rn, tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } +} + +static void handle_2misc_fcmp_zero(DisasContext *s, int opcode, + bool is_scalar, bool is_u, bool is_q, + int size, int rn, int rd) +{ + bool is_double = (size == MO_64); + TCGv_ptr fpst; + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + + if (is_double) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_zero = tcg_constant_i64(0); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + NeonGenTwoDoubleOpFn *genfn; + bool swap = false; + int pass; + + switch (opcode) { + case 0x2e: /* FCMLT (zero) */ + swap = true; + /* fallthrough */ + case 0x2c: /* FCMGT (zero) */ + genfn = gen_helper_neon_cgt_f64; + break; + case 0x2d: /* FCMEQ (zero) */ + genfn = gen_helper_neon_ceq_f64; + break; + case 0x6d: /* FCMLE (zero) */ + swap = true; + /* fall through */ + case 0x6c: /* FCMGE (zero) */ + genfn = gen_helper_neon_cge_f64; + break; + default: + g_assert_not_reached(); + } + + for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { + read_vec_element(s, tcg_op, rn, pass, MO_64); + if (swap) { + genfn(tcg_res, tcg_zero, tcg_op, fpst); + } else { + genfn(tcg_res, tcg_op, tcg_zero, fpst); + } + write_vec_element(s, tcg_res, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_zero = tcg_constant_i32(0); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + NeonGenTwoSingleOpFn *genfn; + bool swap = false; + int pass, maxpasses; + + if (size == MO_16) { + switch (opcode) { + case 0x2e: /* FCMLT (zero) */ + swap = true; + /* fall through */ + case 0x2c: /* FCMGT (zero) */ + genfn = gen_helper_advsimd_cgt_f16; + break; + case 0x2d: /* FCMEQ (zero) */ + genfn = gen_helper_advsimd_ceq_f16; + break; + case 0x6d: /* FCMLE (zero) */ + swap = true; + /* fall through */ + case 0x6c: /* FCMGE (zero) */ + genfn = gen_helper_advsimd_cge_f16; + break; + default: + g_assert_not_reached(); + } + } else { + switch (opcode) { + case 0x2e: /* FCMLT (zero) */ + swap = true; + /* fall through */ + case 0x2c: /* FCMGT (zero) */ + genfn = gen_helper_neon_cgt_f32; + break; + case 0x2d: /* FCMEQ (zero) */ + genfn = gen_helper_neon_ceq_f32; + break; + case 0x6d: /* FCMLE (zero) */ + swap = true; + /* fall through */ + case 0x6c: /* FCMGE (zero) */ + genfn = gen_helper_neon_cge_f32; + break; + default: + g_assert_not_reached(); + } + } + + if (is_scalar) { + maxpasses = 1; + } else { + int vector_size = 8 << is_q; + maxpasses = vector_size >> size; + } + + for (pass = 0; pass < maxpasses; pass++) { + read_vec_element_i32(s, tcg_op, rn, pass, size); + if (swap) { + genfn(tcg_res, tcg_zero, tcg_op, fpst); + } else { + genfn(tcg_res, tcg_op, tcg_zero, fpst); + } + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, size); + } + } + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + if (!is_scalar) { + clear_vec_high(s, is_q, rd); + } + } + + tcg_temp_free_ptr(fpst); +} + +static void handle_2misc_reciprocal(DisasContext *s, int opcode, + bool is_scalar, bool is_u, bool is_q, + int size, int rn, int rd) +{ + bool is_double = (size == 3); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + + if (is_double) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + int pass; + + for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { + read_vec_element(s, tcg_op, rn, pass, MO_64); + switch (opcode) { + case 0x3d: /* FRECPE */ + gen_helper_recpe_f64(tcg_res, tcg_op, fpst); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f64(tcg_res, tcg_op, fpst); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + write_vec_element(s, tcg_res, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 
4 : 2; + } + + for (pass = 0; pass < maxpasses; pass++) { + read_vec_element_i32(s, tcg_op, rn, pass, MO_32); + + switch (opcode) { + case 0x3c: /* URECPE */ + gen_helper_recpe_u32(tcg_res, tcg_op); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f32(tcg_res, tcg_op, fpst); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f32(tcg_res, tcg_op, fpst); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + } + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + if (!is_scalar) { + clear_vec_high(s, is_q, rd); + } + } + tcg_temp_free_ptr(fpst); +} + +static void handle_2misc_narrow(DisasContext *s, bool scalar, + int opcode, bool u, bool is_q, + int size, int rn, int rd) +{ + /* Handle 2-reg-misc ops which are narrowing (so each 2*size element + * in the source becomes a size element in the destination). + */ + int pass; + TCGv_i32 tcg_res[2]; + int destelt = is_q ? 2 : 0; + int passes = scalar ? 1 : 2; + + if (scalar) { + tcg_res[1] = tcg_constant_i32(0); + } + + for (pass = 0; pass < passes; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + NeonGenNarrowFn *genfn = NULL; + NeonGenNarrowEnvFn *genenvfn = NULL; + + if (scalar) { + read_vec_element(s, tcg_op, rn, pass, size + 1); + } else { + read_vec_element(s, tcg_op, rn, pass, MO_64); + } + tcg_res[pass] = tcg_temp_new_i32(); + + switch (opcode) { + case 0x12: /* XTN, SQXTUN */ + { + static NeonGenNarrowFn * const xtnfns[3] = { + gen_helper_neon_narrow_u8, + gen_helper_neon_narrow_u16, + tcg_gen_extrl_i64_i32, + }; + static NeonGenNarrowEnvFn * const sqxtunfns[3] = { + gen_helper_neon_unarrow_sat8, + gen_helper_neon_unarrow_sat16, + gen_helper_neon_unarrow_sat32, + }; + if (u) { + genenvfn = sqxtunfns[size]; + } else { + genfn = xtnfns[size]; + } + break; + } + case 0x14: /* SQXTN, UQXTN */ + { + static NeonGenNarrowEnvFn * const fns[3][2] = { + { gen_helper_neon_narrow_sat_s8, + gen_helper_neon_narrow_sat_u8 }, + { gen_helper_neon_narrow_sat_s16, + gen_helper_neon_narrow_sat_u16 }, + { gen_helper_neon_narrow_sat_s32, + gen_helper_neon_narrow_sat_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x16: /* FCVTN, FCVTN2 */ + /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */ + if (size == 2) { + gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env); + } else { + TCGv_i32 tcg_lo = tcg_temp_new_i32(); + TCGv_i32 tcg_hi = tcg_temp_new_i32(); + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 ahp = get_ahp_flag(); + + tcg_gen_extr_i64_i32(tcg_lo, tcg_hi, tcg_op); + gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, fpst, ahp); + gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, fpst, ahp); + tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16); + tcg_temp_free_i32(tcg_lo); + tcg_temp_free_i32(tcg_hi); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(ahp); + } + break; + case 0x36: /* BFCVTN, BFCVTN2 */ + { + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst); + tcg_temp_free_ptr(fpst); + } + break; + case 0x56: /* FCVTXN, FCVTXN2 */ + /* 64 bit to 32 bit float conversion + * with von Neumann rounding (round to odd) + */ + assert(size == 2); + gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, cpu_env); + break; + default: + g_assert_not_reached(); + } + + if (genfn) { + genfn(tcg_res[pass], tcg_op); + } else if (genenvfn) { + genenvfn(tcg_res[pass], cpu_env, tcg_op); 
+ } + + tcg_temp_free_i64(tcg_op); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + clear_vec_high(s, is_q, rd); +} + +/* Remaining saturating accumulating ops */ +static void handle_2misc_satacc(DisasContext *s, bool is_scalar, bool is_u, + bool is_q, int size, int rn, int rd) +{ + bool is_double = (size == 3); + + if (is_double) { + TCGv_i64 tcg_rn = tcg_temp_new_i64(); + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + int pass; + + for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { + read_vec_element(s, tcg_rn, rn, pass, MO_64); + read_vec_element(s, tcg_rd, rd, pass, MO_64); + + if (is_u) { /* USQADD */ + gen_helper_neon_uqadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rd); + } else { /* SUQADD */ + gen_helper_neon_sqadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rd); + } + write_vec_element(s, tcg_rd, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_rn); + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_rn = tcg_temp_new_i32(); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 4 : 2; + } + + for (pass = 0; pass < maxpasses; pass++) { + if (is_scalar) { + read_vec_element_i32(s, tcg_rn, rn, pass, size); + read_vec_element_i32(s, tcg_rd, rd, pass, size); + } else { + read_vec_element_i32(s, tcg_rn, rn, pass, MO_32); + read_vec_element_i32(s, tcg_rd, rd, pass, MO_32); + } + + if (is_u) { /* USQADD */ + switch (size) { + case 0: + gen_helper_neon_uqadd_s8(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 1: + gen_helper_neon_uqadd_s16(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 2: + gen_helper_neon_uqadd_s32(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + default: + g_assert_not_reached(); + } + } else { /* SUQADD */ + switch (size) { + case 0: + gen_helper_neon_sqadd_u8(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 1: + gen_helper_neon_sqadd_u16(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + case 2: + gen_helper_neon_sqadd_u32(tcg_rd, cpu_env, tcg_rn, tcg_rd); + break; + default: + g_assert_not_reached(); + } + } + + if (is_scalar) { + write_vec_element(s, tcg_constant_i64(0), rd, 0, MO_64); + } + write_vec_element_i32(s, tcg_rd, rd, pass, MO_32); + } + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(tcg_rn); + clear_vec_high(s, is_q, rd); + } +} + +/* AdvSIMD scalar two reg misc + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 1 | U | 1 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd | + * +-----+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 12, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + bool is_fcvt = false; + int rmode; + TCGv_i32 tcg_rmode; + TCGv_ptr tcg_fpstatus; + + switch (opcode) { + case 0x3: /* USQADD / SUQADD*/ + if (!fp_access_check(s)) { + return; + } + handle_2misc_satacc(s, true, u, false, size, rn, rd); + return; + case 0x7: /* SQABS / SQNEG */ + break; + case 0xa: /* CMLT */ + if (u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x8: /* CMGT, CMGE */ + case 0x9: /* CMEQ, CMLE */ + case 0xb: /* ABS, NEG */ + if (size != 3) { + unallocated_encoding(s); + return; + } + break; + case 0x12: /* SQXTUN */ + if (!u) { + 
unallocated_encoding(s); + return; + } + /* fall through */ + case 0x14: /* SQXTN, UQXTN */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, true, opcode, u, false, size, rn, rd); + return; + case 0xc ... 0xf: + case 0x16 ... 0x1d: + case 0x1f: + /* Floating point: U, size[1] and opcode indicate operation; + * size[0] indicates single or double precision. + */ + opcode |= (extract32(size, 1, 1) << 5) | (u << 6); + size = extract32(size, 0, 1) ? 3 : 2; + switch (opcode) { + case 0x2c: /* FCMGT (zero) */ + case 0x2d: /* FCMEQ (zero) */ + case 0x2e: /* FCMLT (zero) */ + case 0x6c: /* FCMGE (zero) */ + case 0x6d: /* FCMLE (zero) */ + handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd); + return; + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + bool is_signed = (opcode == 0x1d); + if (!fp_access_check(s)) { + return; + } + handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size); + return; + } + case 0x3d: /* FRECPE */ + case 0x3f: /* FRECPX */ + case 0x7d: /* FRSQRTE */ + if (!fp_access_check(s)) { + return; + } + handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd); + return; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + is_fcvt = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + break; + case 0x1c: /* FCVTAS */ + case 0x5c: /* FCVTAU */ + /* TIEAWAY doesn't fit in the usual rounding mode encoding */ + is_fcvt = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x56: /* FCVTXN, FCVTXN2 */ + if (size == 2) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd); + return; + default: + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (is_fcvt) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + tcg_fpstatus = fpstatus_ptr(FPST_FPCR); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + } else { + tcg_rmode = NULL; + tcg_fpstatus = NULL; + } + + if (size == 3) { + TCGv_i64 tcg_rn = read_fp_dreg(s, rn); + TCGv_i64 tcg_rd = tcg_temp_new_i64(); + + handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rmode, tcg_fpstatus); + write_fp_dreg(s, rd, tcg_rd); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_rn); + } else { + TCGv_i32 tcg_rn = tcg_temp_new_i32(); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_rn, rn, 0, size); + + switch (opcode) { + case 0x7: /* SQABS, SQNEG */ + { + NeonGenOneOpEnvFn *genfn; + static NeonGenOneOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 }, + { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 }, + { gen_helper_neon_qabs_s32, gen_helper_neon_qneg_s32 }, + }; + genfn = fns[size][u]; + genfn(tcg_rd, cpu_env, tcg_rn); + break; + } + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_vfp_tosls(tcg_rd, tcg_rn, tcg_constant_i32(0), + tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_vfp_touls(tcg_rd, tcg_rn, tcg_constant_i32(0), + tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } + + 
write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(tcg_rn); + } + + if (is_fcvt) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_ptr(tcg_fpstatus); + } +} + +/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ +static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, + int immh, int immb, int opcode, int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = 2 * (8 << size) - immhb; + GVecGen2iFn *gvec_fn; + + if (extract32(immh, 3, 1) && !is_q) { + unallocated_encoding(s); + return; + } + tcg_debug_assert(size <= 3); + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x02: /* SSRA / USRA (accumulate) */ + gvec_fn = is_u ? gen_gvec_usra : gen_gvec_ssra; + break; + + case 0x08: /* SRI */ + gvec_fn = gen_gvec_sri; + break; + + case 0x00: /* SSHR / USHR */ + if (is_u) { + if (shift == 8 << size) { + /* Shift count the same size as element size produces zero. */ + tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd), + is_q ? 16 : 8, vec_full_reg_size(s), 0); + return; + } + gvec_fn = tcg_gen_gvec_shri; + } else { + /* Shift count the same size as element size produces all sign. */ + if (shift == 8 << size) { + shift -= 1; + } + gvec_fn = tcg_gen_gvec_sari; + } + break; + + case 0x04: /* SRSHR / URSHR (rounding) */ + gvec_fn = is_u ? gen_gvec_urshr : gen_gvec_srshr; + break; + + case 0x06: /* SRSRA / URSRA (accum + rounding) */ + gvec_fn = is_u ? gen_gvec_ursra : gen_gvec_srsra; + break; + + default: + g_assert_not_reached(); + } + + gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size); +} + +/* SHL/SLI - Vector shift left */ +static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, + int immh, int immb, int opcode, int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = immhb - (8 << size); + + /* Range of size is limited by decode: immh is a non-zero 4 bit field */ + assert(size >= 0 && size <= 3); + + if (extract32(immh, 3, 1) && !is_q) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (insert) { + gen_gvec_fn2i(s, is_q, rd, rn, shift, gen_gvec_sli, size); + } else { + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size); + } +} + +/* USHLL/SHLL - Vector shift left with widening */ +static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, + int immh, int immb, int opcode, int rn, int rd) +{ + int size = 32 - clz32(immh) - 1; + int immhb = immh << 3 | immb; + int shift = immhb - (8 << size); + int dsize = 64; + int esize = 8 << size; + int elements = dsize/esize; + TCGv_i64 tcg_rn = new_tmp_a64(s); + TCGv_i64 tcg_rd = new_tmp_a64(s); + int i; + + if (size >= 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + /* For the LL variants the store is larger than the load, + * so if rd == rn we would overwrite parts of our input. + * So load everything right now and use shifts in the main loop. + */ + read_vec_element(s, tcg_rn, rn, is_q ? 
1 : 0, MO_64); + + for (i = 0; i < elements; i++) { + tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize); + ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0); + tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); + write_vec_element(s, tcg_rd, rd, i, size + 1); + } +} + +/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ +static void handle_vec_simd_shrn(DisasContext *s, bool is_q, + int immh, int immb, int opcode, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int dsize = 64; + int esize = 8 << size; + int elements = dsize/esize; + int shift = (2 * esize) - immhb; + bool round = extract32(opcode, 0, 1); + TCGv_i64 tcg_rn, tcg_rd, tcg_final; + TCGv_i64 tcg_round; + int i; + + if (extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_final = tcg_temp_new_i64(); + read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64); + + if (round) { + tcg_round = tcg_constant_i64(1ULL << (shift - 1)); + } else { + tcg_round = NULL; + } + + for (i = 0; i < elements; i++) { + read_vec_element(s, tcg_rn, rn, i, size+1); + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + false, true, size+1, shift); + + tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); + } + + if (!is_q) { + write_vec_element(s, tcg_final, rd, 0, MO_64); + } else { + write_vec_element(s, tcg_final, rd, 1, MO_64); + } + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_final); + + clear_vec_high(s, is_q, rd); +} + + +/* AdvSIMD shift by immediate + * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 + * +---+---+---+-------------+------+------+--------+---+------+------+ + * | 0 | Q | U | 0 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd | + * +---+---+---+-------------+------+------+--------+---+------+------+ + */ +static void disas_simd_shift_imm(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 5); + int immb = extract32(insn, 16, 3); + int immh = extract32(insn, 19, 4); + bool is_u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + + /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. 
*/ + assert(immh != 0); + + switch (opcode) { + case 0x08: /* SRI */ + if (!is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x00: /* SSHR / USHR */ + case 0x02: /* SSRA / USRA (accumulate) */ + case 0x04: /* SRSHR / URSHR (rounding) */ + case 0x06: /* SRSRA / URSRA (accum + rounding) */ + handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd); + break; + case 0x0a: /* SHL / SLI */ + handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd); + break; + case 0x10: /* SHRN */ + case 0x11: /* RSHRN / SQRSHRUN */ + if (is_u) { + handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb, + opcode, rn, rd); + } else { + handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd); + } + break; + case 0x12: /* SQSHRN / UQSHRN */ + case 0x13: /* SQRSHRN / UQRSHRN */ + handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb, + opcode, rn, rd); + break; + case 0x14: /* SSHLL / USHLL */ + handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd); + break; + case 0x1c: /* SCVTF / UCVTF */ + handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb, + opcode, rn, rd); + break; + case 0xc: /* SQSHLU */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd); + break; + case 0xe: /* SQSHL, UQSHL */ + handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd); + break; + case 0x1f: /* FCVTZS/ FCVTZU */ + handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd); + return; + default: + unallocated_encoding(s); + return; + } +} + +/* Generate code to do a "long" addition or subtraction, ie one done in + * TCGv_i64 on vector lanes twice the width specified by size. + */ +static void gen_neon_addl(int size, bool is_sub, TCGv_i64 tcg_res, + TCGv_i64 tcg_op1, TCGv_i64 tcg_op2) +{ + static NeonGenTwo64OpFn * const fns[3][2] = { + { gen_helper_neon_addl_u16, gen_helper_neon_subl_u16 }, + { gen_helper_neon_addl_u32, gen_helper_neon_subl_u32 }, + { tcg_gen_add_i64, tcg_gen_sub_i64 }, + }; + NeonGenTwo64OpFn *genfn; + assert(size < 3); + + genfn = fns[size][is_sub]; + genfn(tcg_res, tcg_op1, tcg_op2); +} + +static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size, + int opcode, int rd, int rn, int rm) +{ + /* 3-reg-different widening insns: 64 x 64 -> 128 */ + TCGv_i64 tcg_res[2]; + int pass, accop; + + tcg_res[0] = tcg_temp_new_i64(); + tcg_res[1] = tcg_temp_new_i64(); + + /* Does this op do an adding accumulate, a subtracting accumulate, + * or no accumulate at all? + */ + switch (opcode) { + case 5: + case 8: + case 9: + accop = 1; + break; + case 10: + case 11: + accop = -1; + break; + default: + accop = 0; + break; + } + + if (accop != 0) { + read_vec_element(s, tcg_res[0], rd, 0, MO_64); + read_vec_element(s, tcg_res[1], rd, 1, MO_64); + } + + /* size == 2 means two 32x32->64 operations; this is worth special + * casing because we can generally handle it inline. + */ + if (size == 2) { + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_passres; + MemOp memop = MO_32 | (is_u ? 
0 : MO_SIGN); + + int elt = pass + is_q * 2; + + read_vec_element(s, tcg_op1, rn, elt, memop); + read_vec_element(s, tcg_op2, rm, elt, memop); + + if (accop == 0) { + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + switch (opcode) { + case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ + tcg_gen_add_i64(tcg_passres, tcg_op1, tcg_op2); + break; + case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ + tcg_gen_sub_i64(tcg_passres, tcg_op1, tcg_op2); + break; + case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ + case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ + { + TCGv_i64 tcg_tmp1 = tcg_temp_new_i64(); + TCGv_i64 tcg_tmp2 = tcg_temp_new_i64(); + + tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2); + tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1); + tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE, + tcg_passres, + tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2); + tcg_temp_free_i64(tcg_tmp1); + tcg_temp_free_i64(tcg_tmp2); + break; + } + case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + case 12: /* UMULL, UMULL2, SMULL, SMULL2 */ + tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2); + break; + case 9: /* SQDMLAL, SQDMLAL2 */ + case 11: /* SQDMLSL, SQDMLSL2 */ + case 13: /* SQDMULL, SQDMULL2 */ + tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + break; + default: + g_assert_not_reached(); + } + + if (opcode == 9 || opcode == 11) { + /* saturating accumulate ops */ + if (accop < 0) { + tcg_gen_neg_i64(tcg_passres, tcg_passres); + } + gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env, + tcg_res[pass], tcg_passres); + } else if (accop > 0) { + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + } else if (accop < 0) { + tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + } + + if (accop != 0) { + tcg_temp_free_i64(tcg_passres); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + /* size 0 or 1, generally helper functions */ + for (pass = 0; pass < 2; pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i64 tcg_passres; + int elt = pass + is_q * 2; + + read_vec_element_i32(s, tcg_op1, rn, elt, MO_32); + read_vec_element_i32(s, tcg_op2, rm, elt, MO_32); + + if (accop == 0) { + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + switch (opcode) { + case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ + case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ + { + TCGv_i64 tcg_op2_64 = tcg_temp_new_i64(); + static NeonGenWidenFn * const widenfns[2][2] = { + { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 }, + { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 }, + }; + NeonGenWidenFn *widenfn = widenfns[size][is_u]; + + widenfn(tcg_op2_64, tcg_op2); + widenfn(tcg_passres, tcg_op1); + gen_neon_addl(size, (opcode == 2), tcg_passres, + tcg_passres, tcg_op2_64); + tcg_temp_free_i64(tcg_op2_64); + break; + } + case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ + case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ + if (size == 0) { + if (is_u) { + gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2); + } + } else { + if (is_u) { + gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2); + } + } + break; + case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + case 12: /* UMULL, UMULL2, SMULL, SMULL2 */ + if (size 
== 0) { + if (is_u) { + gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2); + } + } else { + if (is_u) { + gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2); + } else { + gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2); + } + } + break; + case 9: /* SQDMLAL, SQDMLAL2 */ + case 11: /* SQDMLSL, SQDMLSL2 */ + case 13: /* SQDMULL, SQDMULL2 */ + assert(size == 1); + gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2); + gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + + if (accop != 0) { + if (opcode == 9 || opcode == 11) { + /* saturating accumulate ops */ + if (accop < 0) { + gen_helper_neon_negl_u32(tcg_passres, tcg_passres); + } + gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env, + tcg_res[pass], + tcg_passres); + } else { + gen_neon_addl(size, (accop < 0), tcg_res[pass], + tcg_res[pass], tcg_passres); + } + tcg_temp_free_i64(tcg_passres); + } + } + } + + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); +} + +static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size, + int opcode, int rd, int rn, int rm) +{ + TCGv_i64 tcg_res[2]; + int part = is_q ? 2 : 0; + int pass; + + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i64 tcg_op2_wide = tcg_temp_new_i64(); + static NeonGenWidenFn * const widenfns[3][2] = { + { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 }, + { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 }, + { tcg_gen_ext_i32_i64, tcg_gen_extu_i32_i64 }, + }; + NeonGenWidenFn *widenfn = widenfns[size][is_u]; + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element_i32(s, tcg_op2, rm, part + pass, MO_32); + widenfn(tcg_op2_wide, tcg_op2); + tcg_temp_free_i32(tcg_op2); + tcg_res[pass] = tcg_temp_new_i64(); + gen_neon_addl(size, (opcode == 3), + tcg_res[pass], tcg_op1, tcg_op2_wide); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2_wide); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in) +{ + tcg_gen_addi_i64(in, in, 1U << 31); + tcg_gen_extrh_i64_i32(res, in); +} + +static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, + int opcode, int rd, int rn, int rm) +{ + TCGv_i32 tcg_res[2]; + int part = is_q ? 
2 : 0; + int pass; + + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_wideres = tcg_temp_new_i64(); + static NeonGenNarrowFn * const narrowfns[3][2] = { + { gen_helper_neon_narrow_high_u8, + gen_helper_neon_narrow_round_high_u8 }, + { gen_helper_neon_narrow_high_u16, + gen_helper_neon_narrow_round_high_u16 }, + { tcg_gen_extrh_i64_i32, do_narrow_round_high_u32 }, + }; + NeonGenNarrowFn *gennarrow = narrowfns[size][is_u]; + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + gen_neon_addl(size, (opcode == 6), tcg_wideres, tcg_op1, tcg_op2); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + + tcg_res[pass] = tcg_temp_new_i32(); + gennarrow(tcg_res[pass], tcg_wideres); + tcg_temp_free_i64(tcg_wideres); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass + part, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + clear_vec_high(s, is_q, rd); +} + +/* AdvSIMD three different + * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd | + * +---+---+---+-----------+------+---+------+--------+-----+------+------+ + */ +static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) +{ + /* Instructions in this group fall into three basic classes + * (in each case with the operation working on each element in + * the input vectors): + * (1) widening 64 x 64 -> 128 (with possibly Vd as an extra + * 128 bit input) + * (2) wide 64 x 128 -> 128 + * (3) narrowing 128 x 128 -> 64 + * Here we do initial decode, catch unallocated cases and + * dispatch to separate functions for each class. + */ + int is_q = extract32(insn, 30, 1); + int is_u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 4); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + switch (opcode) { + case 1: /* SADDW, SADDW2, UADDW, UADDW2 */ + case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */ + /* 64 x 128 -> 128 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_3rd_wide(s, is_q, is_u, size, opcode, rd, rn, rm); + break; + case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */ + case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */ + /* 128 x 128 -> 64 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm); + break; + case 14: /* PMULL, PMULL2 */ + if (is_u) { + unallocated_encoding(s); + return; + } + switch (size) { + case 0: /* PMULL.P8 */ + if (!fp_access_check(s)) { + return; + } + /* The Q field specifies lo/hi half input for this insn. */ + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, + gen_helper_neon_pmull_h); + break; + + case 3: /* PMULL.P64 */ + if (!dc_isar_feature(aa64_pmull, s)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + /* The Q field specifies lo/hi half input for this insn. 
*/ + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, + gen_helper_gvec_pmull_q); + break; + + default: + unallocated_encoding(s); + break; + } + return; + case 9: /* SQDMLAL, SQDMLAL2 */ + case 11: /* SQDMLSL, SQDMLSL2 */ + case 13: /* SQDMULL, SQDMULL2 */ + if (is_u || size == 0) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ + case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ + case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ + case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ + case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + case 12: /* SMULL, SMULL2, UMULL, UMULL2 */ + /* 64 x 64 -> 128 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm); + break; + default: + /* opcode 15 not allocated */ + unallocated_encoding(s); + break; + } +} + +/* Logic op (opcode == 3) subgroup of C3.6.16. */ +static void disas_simd_3same_logic(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool is_u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + + if (!fp_access_check(s)) { + return; + } + + switch (size + 4 * is_u) { + case 0: /* AND */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0); + return; + case 1: /* BIC */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0); + return; + case 2: /* ORR */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0); + return; + case 3: /* ORN */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0); + return; + case 4: /* EOR */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0); + return; + + case 5: /* BSL bitwise select */ + gen_gvec_fn4(s, is_q, rd, rd, rn, rm, tcg_gen_gvec_bitsel, 0); + return; + case 6: /* BIT, bitwise insert if true */ + gen_gvec_fn4(s, is_q, rd, rm, rn, rd, tcg_gen_gvec_bitsel, 0); + return; + case 7: /* BIF, bitwise insert if false */ + gen_gvec_fn4(s, is_q, rd, rm, rd, rn, tcg_gen_gvec_bitsel, 0); + return; + + default: + g_assert_not_reached(); + } +} + +/* Pairwise op subgroup of C3.6.16. + * + * This is called directly or via the handle_3same_float for float pairwise + * operations where the opcode and size are calculated differently. + */ +static void handle_simd_3same_pair(DisasContext *s, int is_q, int u, int opcode, + int size, int rn, int rm, int rd) +{ + TCGv_ptr fpst; + int pass; + + /* Floating point operations need fpst */ + if (opcode >= 0x58) { + fpst = fpstatus_ptr(FPST_FPCR); + } else { + fpst = NULL; + } + + if (!fp_access_check(s)) { + return; + } + + /* These operations work on the concatenated rm:rn, with each pair of + * adjacent elements being operated on to produce an element in the result. + */ + if (size == 3) { + TCGv_i64 tcg_res[2]; + + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + int passreg = (pass == 0) ? 
rn : rm; + + read_vec_element(s, tcg_op1, passreg, 0, MO_64); + read_vec_element(s, tcg_op2, passreg, 1, MO_64); + tcg_res[pass] = tcg_temp_new_i64(); + + switch (opcode) { + case 0x17: /* ADDP */ + tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); + break; + case 0x58: /* FMAXNMP */ + gen_helper_vfp_maxnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5a: /* FADDP */ + gen_helper_vfp_addd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5e: /* FMAXP */ + gen_helper_vfp_maxd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x78: /* FMINNMP */ + gen_helper_vfp_minnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x7e: /* FMINP */ + gen_helper_vfp_mind(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } + } else { + int maxpass = is_q ? 4 : 2; + TCGv_i32 tcg_res[4]; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + NeonGenTwoOpFn *genfn = NULL; + int passreg = pass < (maxpass / 2) ? rn : rm; + int passelt = (is_q && (pass & 1)) ? 2 : 0; + + read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_32); + read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_32); + tcg_res[pass] = tcg_temp_new_i32(); + + switch (opcode) { + case 0x17: /* ADDP */ + { + static NeonGenTwoOpFn * const fns[3] = { + gen_helper_neon_padd_u8, + gen_helper_neon_padd_u16, + tcg_gen_add_i32, + }; + genfn = fns[size]; + break; + } + case 0x14: /* SMAXP, UMAXP */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_pmax_s8, gen_helper_neon_pmax_u8 }, + { gen_helper_neon_pmax_s16, gen_helper_neon_pmax_u16 }, + { tcg_gen_smax_i32, tcg_gen_umax_i32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x15: /* SMINP, UMINP */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_pmin_s8, gen_helper_neon_pmin_u8 }, + { gen_helper_neon_pmin_s16, gen_helper_neon_pmin_u16 }, + { tcg_gen_smin_i32, tcg_gen_umin_i32 }, + }; + genfn = fns[size][u]; + break; + } + /* The FP operations are all on single floats (32 bit) */ + case 0x58: /* FMAXNMP */ + gen_helper_vfp_maxnums(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5a: /* FADDP */ + gen_helper_vfp_adds(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x5e: /* FMAXP */ + gen_helper_vfp_maxs(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x78: /* FMINNMP */ + gen_helper_vfp_minnums(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x7e: /* FMINP */ + gen_helper_vfp_mins(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + /* FP ops called directly, otherwise call now */ + if (genfn) { + genfn(tcg_res[pass], tcg_op1, tcg_op2); + } + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + + for (pass = 0; pass < maxpass; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + clear_vec_high(s, is_q, rd); + } + + if (fpst) { + tcg_temp_free_ptr(fpst); + } +} + +/* Floating point op subgroup of C3.6.16. */ +static void disas_simd_3same_float(DisasContext *s, uint32_t insn) +{ + /* For floating point ops, the U, size[1] and opcode bits + * together indicate the operation. size[0] indicates single + * or double. 
+ */ + int fpopcode = extract32(insn, 11, 5) + | (extract32(insn, 23, 1) << 5) + | (extract32(insn, 29, 1) << 6); + int is_q = extract32(insn, 30, 1); + int size = extract32(insn, 22, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + int datasize = is_q ? 128 : 64; + int esize = 32 << size; + int elements = datasize / esize; + + if (size == 1 && !is_q) { + unallocated_encoding(s); + return; + } + + switch (fpopcode) { + case 0x58: /* FMAXNMP */ + case 0x5a: /* FADDP */ + case 0x5e: /* FMAXP */ + case 0x78: /* FMINNMP */ + case 0x7e: /* FMINP */ + if (size && !is_q) { + unallocated_encoding(s); + return; + } + handle_simd_3same_pair(s, is_q, 0, fpopcode, size ? MO_64 : MO_32, + rn, rm, rd); + return; + case 0x1b: /* FMULX */ + case 0x1f: /* FRECPS */ + case 0x3f: /* FRSQRTS */ + case 0x5d: /* FACGE */ + case 0x7d: /* FACGT */ + case 0x19: /* FMLA */ + case 0x39: /* FMLS */ + case 0x18: /* FMAXNM */ + case 0x1a: /* FADD */ + case 0x1c: /* FCMEQ */ + case 0x1e: /* FMAX */ + case 0x38: /* FMINNM */ + case 0x3a: /* FSUB */ + case 0x3e: /* FMIN */ + case 0x5b: /* FMUL */ + case 0x5c: /* FCMGE */ + case 0x5f: /* FDIV */ + case 0x7a: /* FABD */ + case 0x7c: /* FCMGT */ + if (!fp_access_check(s)) { + return; + } + handle_3same_float(s, size, elements, fpopcode, rd, rn, rm); + return; + + case 0x1d: /* FMLAL */ + case 0x3d: /* FMLSL */ + case 0x59: /* FMLAL2 */ + case 0x79: /* FMLSL2 */ + if (size & 1 || !dc_isar_feature(aa64_fhm, s)) { + unallocated_encoding(s); + return; + } + if (fp_access_check(s)) { + int is_s = extract32(insn, 23, 1); + int is_2 = extract32(insn, 29, 1); + int data = (is_2 << 1) | is_s; + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), + data, gen_helper_gvec_fmlal_a64); + } + return; + + default: + unallocated_encoding(s); + return; + } +} + +/* Integer op subgroup of C3.6.16. 
*/ +static void disas_simd_3same_int(DisasContext *s, uint32_t insn) +{ + int is_q = extract32(insn, 30, 1); + int u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 11, 5); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int pass; + TCGCond cond; + + switch (opcode) { + case 0x13: /* MUL, PMUL */ + if (u && size != 0) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x0: /* SHADD, UHADD */ + case 0x2: /* SRHADD, URHADD */ + case 0x4: /* SHSUB, UHSUB */ + case 0xc: /* SMAX, UMAX */ + case 0xd: /* SMIN, UMIN */ + case 0xe: /* SABD, UABD */ + case 0xf: /* SABA, UABA */ + case 0x12: /* MLA, MLS */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x16: /* SQDMULH, SQRDMULH */ + if (size == 0 || size == 3) { + unallocated_encoding(s); + return; + } + break; + default: + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + } + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x01: /* SQADD, UQADD */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqadd_qc, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqadd_qc, size); + } + return; + case 0x05: /* SQSUB, UQSUB */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqsub_qc, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqsub_qc, size); + } + return; + case 0x08: /* SSHL, USHL */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_ushl, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sshl, size); + } + return; + case 0x0c: /* SMAX, UMAX */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umax, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smax, size); + } + return; + case 0x0d: /* SMIN, UMIN */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umin, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smin, size); + } + return; + case 0xe: /* SABD, UABD */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uabd, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sabd, size); + } + return; + case 0xf: /* SABA, UABA */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uaba, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_saba, size); + } + return; + case 0x10: /* ADD, SUB */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size); + } + return; + case 0x13: /* MUL, PMUL */ + if (!u) { /* MUL */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size); + } else { /* PMUL */ + gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0, gen_helper_gvec_pmul_b); + } + return; + case 0x12: /* MLA, MLS */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mls, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mla, size); + } + return; + case 0x16: /* SQDMULH, SQRDMULH */ + { + static gen_helper_gvec_3_ptr * const fns[2][2] = { + { gen_helper_neon_sqdmulh_h, gen_helper_neon_sqrdmulh_h }, + { gen_helper_neon_sqdmulh_s, gen_helper_neon_sqrdmulh_s }, + }; + gen_gvec_op3_qc(s, is_q, rd, rn, rm, fns[size - 1][u]); + } + return; + case 0x11: + if (!u) { /* CMTST */ + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_cmtst, size); + return; + } + /* else CMEQ */ + cond = TCG_COND_EQ; + goto do_gvec_cmp; + case 0x06: /* CMGT, CMHI */ + cond = u ? 
TCG_COND_GTU : TCG_COND_GT; + goto do_gvec_cmp; + case 0x07: /* CMGE, CMHS */ + cond = u ? TCG_COND_GEU : TCG_COND_GE; + do_gvec_cmp: + tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s)); + return; + } + + if (size == 3) { + assert(is_q); + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2); + + write_vec_element(s, tcg_res, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + for (pass = 0; pass < (is_q ? 4 : 2); pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + NeonGenTwoOpFn *genfn = NULL; + NeonGenTwoOpEnvFn *genenvfn = NULL; + + read_vec_element_i32(s, tcg_op1, rn, pass, MO_32); + read_vec_element_i32(s, tcg_op2, rm, pass, MO_32); + + switch (opcode) { + case 0x0: /* SHADD, UHADD */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_hadd_s8, gen_helper_neon_hadd_u8 }, + { gen_helper_neon_hadd_s16, gen_helper_neon_hadd_u16 }, + { gen_helper_neon_hadd_s32, gen_helper_neon_hadd_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x2: /* SRHADD, URHADD */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_rhadd_s8, gen_helper_neon_rhadd_u8 }, + { gen_helper_neon_rhadd_s16, gen_helper_neon_rhadd_u16 }, + { gen_helper_neon_rhadd_s32, gen_helper_neon_rhadd_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x4: /* SHSUB, UHSUB */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_hsub_s8, gen_helper_neon_hsub_u8 }, + { gen_helper_neon_hsub_s16, gen_helper_neon_hsub_u16 }, + { gen_helper_neon_hsub_s32, gen_helper_neon_hsub_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0x9: /* SQSHL, UQSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 }, + { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 }, + { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0xa: /* SRSHL, URSHL */ + { + static NeonGenTwoOpFn * const fns[3][2] = { + { gen_helper_neon_rshl_s8, gen_helper_neon_rshl_u8 }, + { gen_helper_neon_rshl_s16, gen_helper_neon_rshl_u16 }, + { gen_helper_neon_rshl_s32, gen_helper_neon_rshl_u32 }, + }; + genfn = fns[size][u]; + break; + } + case 0xb: /* SQRSHL, UQRSHL */ + { + static NeonGenTwoOpEnvFn * const fns[3][2] = { + { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 }, + { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 }, + { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + default: + g_assert_not_reached(); + } + + if (genenvfn) { + genenvfn(tcg_res, cpu_env, tcg_op1, tcg_op2); + } else { + genfn(tcg_res, tcg_op1, tcg_op2); + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + } + clear_vec_high(s, is_q, rd); +} + +/* AdvSIMD three same + * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+--------+---+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 1 
| Rn | Rd | + * +---+---+---+-----------+------+---+------+--------+---+------+------+ + */ +static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 11, 5); + + switch (opcode) { + case 0x3: /* logic ops */ + disas_simd_3same_logic(s, insn); + break; + case 0x17: /* ADDP */ + case 0x14: /* SMAXP, UMAXP */ + case 0x15: /* SMINP, UMINP */ + { + /* Pairwise operations */ + int is_q = extract32(insn, 30, 1); + int u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + if (opcode == 0x17) { + if (u || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + } else { + if (size == 3) { + unallocated_encoding(s); + return; + } + } + handle_simd_3same_pair(s, is_q, u, opcode, size, rn, rm, rd); + break; + } + case 0x18 ... 0x31: + /* floating point ops, sz[1] and U are part of opcode */ + disas_simd_3same_float(s, insn); + break; + default: + disas_simd_3same_int(s, insn); + break; + } +} + +/* + * Advanced SIMD three same (ARMv8.2 FP16 variants) + * + * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0 + * +---+---+---+-----------+---------+------+-----+--------+---+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd | + * +---+---+---+-----------+---------+------+-----+--------+---+------+------+ + * + * This includes FMULX, FCMEQ (register), FRECPS, FRSQRTS, FCMGE + * (register), FACGE, FABD, FCMGT (register) and FACGT. + * + */ +static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 11, 3); + int u = extract32(insn, 29, 1); + int a = extract32(insn, 23, 1); + int is_q = extract32(insn, 30, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + /* + * For these floating point ops, the U, a and opcode bits + * together indicate the operation. + */ + int fpopcode = opcode | (a << 3) | (u << 4); + int datasize = is_q ? 128 : 64; + int elements = datasize / 16; + bool pairwise; + TCGv_ptr fpst; + int pass; + + switch (fpopcode) { + case 0x0: /* FMAXNM */ + case 0x1: /* FMLA */ + case 0x2: /* FADD */ + case 0x3: /* FMULX */ + case 0x4: /* FCMEQ */ + case 0x6: /* FMAX */ + case 0x7: /* FRECPS */ + case 0x8: /* FMINNM */ + case 0x9: /* FMLS */ + case 0xa: /* FSUB */ + case 0xe: /* FMIN */ + case 0xf: /* FRSQRTS */ + case 0x13: /* FMUL */ + case 0x14: /* FCMGE */ + case 0x15: /* FACGE */ + case 0x17: /* FDIV */ + case 0x1a: /* FABD */ + case 0x1c: /* FCMGT */ + case 0x1d: /* FACGT */ + pairwise = false; + break; + case 0x10: /* FMAXNMP */ + case 0x12: /* FADDP */ + case 0x16: /* FMAXP */ + case 0x18: /* FMINNMP */ + case 0x1e: /* FMINP */ + pairwise = true; + break; + default: + unallocated_encoding(s); + return; + } + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + fpst = fpstatus_ptr(FPST_FPCR_F16); + + if (pairwise) { + int maxpass = is_q ? 8 : 4; + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res[8]; + + for (pass = 0; pass < maxpass; pass++) { + int passreg = pass < (maxpass / 2) ? 
rn : rm; + int passelt = (pass << 1) & (maxpass - 1); + + read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_16); + read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_16); + tcg_res[pass] = tcg_temp_new_i32(); + + switch (fpopcode) { + case 0x10: /* FMAXNMP */ + gen_helper_advsimd_maxnumh(tcg_res[pass], tcg_op1, tcg_op2, + fpst); + break; + case 0x12: /* FADDP */ + gen_helper_advsimd_addh(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x16: /* FMAXP */ + gen_helper_advsimd_maxh(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + case 0x18: /* FMINNMP */ + gen_helper_advsimd_minnumh(tcg_res[pass], tcg_op1, tcg_op2, + fpst); + break; + case 0x1e: /* FMINP */ + gen_helper_advsimd_minh(tcg_res[pass], tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + } + + for (pass = 0; pass < maxpass; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_16); + tcg_temp_free_i32(tcg_res[pass]); + } + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + + } else { + for (pass = 0; pass < elements; pass++) { + TCGv_i32 tcg_op1 = tcg_temp_new_i32(); + TCGv_i32 tcg_op2 = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op1, rn, pass, MO_16); + read_vec_element_i32(s, tcg_op2, rm, pass, MO_16); + + switch (fpopcode) { + case 0x0: /* FMAXNM */ + gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1: /* FMLA */ + read_vec_element_i32(s, tcg_res, rd, pass, MO_16); + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res, + fpst); + break; + case 0x2: /* FADD */ + gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x3: /* FMULX */ + gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x4: /* FCMEQ */ + gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x6: /* FMAX */ + gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x7: /* FRECPS */ + gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x8: /* FMINNM */ + gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x9: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); + read_vec_element_i32(s, tcg_res, rd, pass, MO_16); + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res, + fpst); + break; + case 0xa: /* FSUB */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xe: /* FMIN */ + gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0xf: /* FRSQRTS */ + gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x13: /* FMUL */ + gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x14: /* FCMGE */ + gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x15: /* FACGE */ + gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x17: /* FDIV */ + gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1a: /* FABD */ + gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); + tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff); + break; + case 0x1c: /* FCMGT */ + gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + case 0x1d: /* FACGT */ + gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); + break; + default: + g_assert_not_reached(); + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_16); + tcg_temp_free_i32(tcg_res); + 
tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + } + } + + tcg_temp_free_ptr(fpst); + + clear_vec_high(s, is_q, rd); +} + +/* AdvSIMD three same extra + * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ + * | 0 | Q | U | 0 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | + * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ + */ +static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + bool feature; + int rot; + + switch (u * 16 + opcode) { + case 0x10: /* SQRDMLAH (vector) */ + case 0x11: /* SQRDMLSH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_rdm, s); + break; + case 0x02: /* SDOT (vector) */ + case 0x12: /* UDOT (vector) */ + if (size != MO_32) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_dp, s); + break; + case 0x03: /* USDOT */ + if (size != MO_32) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_i8mm, s); + break; + case 0x04: /* SMMLA */ + case 0x14: /* UMMLA */ + case 0x05: /* USMMLA */ + if (!is_q || size != MO_32) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_i8mm, s); + break; + case 0x18: /* FCMLA, #0 */ + case 0x19: /* FCMLA, #90 */ + case 0x1a: /* FCMLA, #180 */ + case 0x1b: /* FCMLA, #270 */ + case 0x1c: /* FCADD, #90 */ + case 0x1e: /* FCADD, #270 */ + if (size == 0 + || (size == 1 && !dc_isar_feature(aa64_fp16, s)) + || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_fcma, s); + break; + case 0x1d: /* BFMMLA */ + if (size != MO_16 || !is_q) { + unallocated_encoding(s); + return; + } + feature = dc_isar_feature(aa64_bf16, s); + break; + case 0x1f: + switch (size) { + case 1: /* BFDOT */ + case 3: /* BFMLAL{B,T} */ + feature = dc_isar_feature(aa64_bf16, s); + break; + default: + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + if (!feature) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x0: /* SQRDMLAH (vector) */ + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlah_qc, size); + return; + + case 0x1: /* SQRDMLSH (vector) */ + gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlsh_qc, size); + return; + + case 0x2: /* SDOT / UDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, + u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b); + return; + + case 0x3: /* USDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_usdot_b); + return; + + case 0x04: /* SMMLA, UMMLA */ + gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, + u ? 
gen_helper_gvec_ummla_b + : gen_helper_gvec_smmla_b); + return; + case 0x05: /* USMMLA */ + gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, gen_helper_gvec_usmmla_b); + return; + + case 0x8: /* FCMLA, #0 */ + case 0x9: /* FCMLA, #90 */ + case 0xa: /* FCMLA, #180 */ + case 0xb: /* FCMLA, #270 */ + rot = extract32(opcode, 0, 2); + switch (size) { + case 1: + gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, true, rot, + gen_helper_gvec_fcmlah); + break; + case 2: + gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot, + gen_helper_gvec_fcmlas); + break; + case 3: + gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot, + gen_helper_gvec_fcmlad); + break; + default: + g_assert_not_reached(); + } + return; + + case 0xc: /* FCADD, #90 */ + case 0xe: /* FCADD, #270 */ + rot = extract32(opcode, 1, 1); + switch (size) { + case 1: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcaddh); + break; + case 2: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcadds); + break; + case 3: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcaddd); + break; + default: + g_assert_not_reached(); + } + return; + + case 0xd: /* BFMMLA */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfmmla); + return; + case 0xf: + switch (size) { + case 1: /* BFDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot); + break; + case 3: /* BFMLAL{B,T} */ + gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, false, is_q, + gen_helper_gvec_bfmlal); + break; + default: + g_assert_not_reached(); + } + return; + + default: + g_assert_not_reached(); + } +} + +static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q, + int size, int rn, int rd) +{ + /* Handle 2-reg-misc ops which are widening (so each size element + * in the source becomes a 2*size element in the destination. + * The only instruction like this is FCVTL. + */ + int pass; + + if (size == 3) { + /* 32 -> 64 bit fp conversion */ + TCGv_i64 tcg_res[2]; + int srcelt = is_q ? 2 : 0; + + for (pass = 0; pass < 2; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element_i32(s, tcg_op, rn, srcelt + pass, MO_32); + gen_helper_vfp_fcvtds(tcg_res[pass], tcg_op, cpu_env); + tcg_temp_free_i32(tcg_op); + } + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } + } else { + /* 16 -> 32 bit fp conversion */ + int srcelt = is_q ? 4 : 0; + TCGv_i32 tcg_res[4]; + TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); + TCGv_i32 ahp = get_ahp_flag(); + + for (pass = 0; pass < 4; pass++) { + tcg_res[pass] = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_res[pass], rn, srcelt + pass, MO_16); + gen_helper_vfp_fcvt_f16_to_f32(tcg_res[pass], tcg_res[pass], + fpst, ahp); + } + for (pass = 0; pass < 4; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(ahp); + } +} + +static void handle_rev(DisasContext *s, int opcode, bool u, + bool is_q, int size, int rn, int rd) +{ + int op = (opcode << 1) | u; + int opsz = op + size; + int grp_size = 3 - opsz; + int dsize = is_q ? 
128 : 64; + int i; + + if (opsz >= 3) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (size == 0) { + /* Special case bytes, use bswap op on each group of elements */ + int groups = dsize / (8 << grp_size); + + for (i = 0; i < groups; i++) { + TCGv_i64 tcg_tmp = tcg_temp_new_i64(); + + read_vec_element(s, tcg_tmp, rn, i, grp_size); + switch (grp_size) { + case MO_16: + tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ); + break; + case MO_32: + tcg_gen_bswap32_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ); + break; + case MO_64: + tcg_gen_bswap64_i64(tcg_tmp, tcg_tmp); + break; + default: + g_assert_not_reached(); + } + write_vec_element(s, tcg_tmp, rd, i, grp_size); + tcg_temp_free_i64(tcg_tmp); + } + clear_vec_high(s, is_q, rd); + } else { + int revmask = (1 << grp_size) - 1; + int esize = 8 << size; + int elements = dsize / esize; + TCGv_i64 tcg_rn = tcg_temp_new_i64(); + TCGv_i64 tcg_rd = tcg_const_i64(0); + TCGv_i64 tcg_rd_hi = tcg_const_i64(0); + + for (i = 0; i < elements; i++) { + int e_rev = (i & 0xf) ^ revmask; + int off = e_rev * esize; + read_vec_element(s, tcg_rn, rn, i, size); + if (off >= 64) { + tcg_gen_deposit_i64(tcg_rd_hi, tcg_rd_hi, + tcg_rn, off - 64, esize); + } else { + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, off, esize); + } + } + write_vec_element(s, tcg_rd, rd, 0, MO_64); + write_vec_element(s, tcg_rd_hi, rd, 1, MO_64); + + tcg_temp_free_i64(tcg_rd_hi); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_rn); + } +} + +static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u, + bool is_q, int size, int rn, int rd) +{ + /* Implement the pairwise operations from 2-misc: + * SADDLP, UADDLP, SADALP, UADALP. + * These all add pairs of elements in the input to produce a + * double-width result element in the output (possibly accumulating). + */ + bool accum = (opcode == 0x6); + int maxpass = is_q ? 2 : 1; + int pass; + TCGv_i64 tcg_res[2]; + + if (size == 2) { + /* 32 + 32 -> 64 op */ + MemOp memop = size + (u ? 
0 : MO_SIGN); + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass * 2, memop); + read_vec_element(s, tcg_op2, rn, pass * 2 + 1, memop); + tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); + if (accum) { + read_vec_element(s, tcg_op1, rd, pass, MO_64); + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + NeonGenOne64OpFn *genfn; + static NeonGenOne64OpFn * const fns[2][2] = { + { gen_helper_neon_addlp_s8, gen_helper_neon_addlp_u8 }, + { gen_helper_neon_addlp_s16, gen_helper_neon_addlp_u16 }, + }; + + genfn = fns[size][u]; + + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + genfn(tcg_res[pass], tcg_op); + + if (accum) { + read_vec_element(s, tcg_op, rd, pass, MO_64); + if (size == 0) { + gen_helper_neon_addl_u16(tcg_res[pass], + tcg_res[pass], tcg_op); + } else { + gen_helper_neon_addl_u32(tcg_res[pass], + tcg_res[pass], tcg_op); + } + } + tcg_temp_free_i64(tcg_op); + } + } + if (!is_q) { + tcg_res[1] = tcg_constant_i64(0); + } + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +static void handle_shll(DisasContext *s, bool is_q, int size, int rn, int rd) +{ + /* Implement SHLL and SHLL2 */ + int pass; + int part = is_q ? 2 : 0; + TCGv_i64 tcg_res[2]; + + for (pass = 0; pass < 2; pass++) { + static NeonGenWidenFn * const widenfns[3] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + }; + NeonGenWidenFn *widenfn = widenfns[size]; + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, part + pass, MO_32); + tcg_res[pass] = tcg_temp_new_i64(); + widenfn(tcg_res[pass], tcg_op); + tcg_gen_shli_i64(tcg_res[pass], tcg_res[pass], 8 << size); + + tcg_temp_free_i32(tcg_op); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +/* AdvSIMD two reg misc + * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + * | 0 | Q | U | 0 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd | + * +---+---+---+-----------+------+-----------+--------+-----+------+------+ + */ +static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + bool u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool need_fpstatus = false; + bool need_rmode = false; + int rmode = -1; + TCGv_i32 tcg_rmode; + TCGv_ptr tcg_fpstatus; + + switch (opcode) { + case 0x0: /* REV64, REV32 */ + case 0x1: /* REV16 */ + handle_rev(s, opcode, u, is_q, size, rn, rd); + return; + case 0x5: /* CNT, NOT, RBIT */ + if (u && size == 0) { + /* NOT */ + break; + } else if (u && size == 1) { + /* RBIT */ + break; + } else if (!u && size == 0) { + /* CNT */ + break; + } + unallocated_encoding(s); + return; + case 0x12: /* XTN, XTN2, SQXTUN, SQXTUN2 */ + case 0x14: /* SQXTN, SQXTN2, UQXTN, UQXTN2 */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + handle_2misc_narrow(s, 
false, opcode, u, is_q, size, rn, rd); + return; + case 0x4: /* CLS, CLZ */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x2: /* SADDLP, UADDLP */ + case 0x6: /* SADALP, UADALP */ + if (size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_pairwise(s, opcode, u, is_q, size, rn, rd); + return; + case 0x13: /* SHLL, SHLL2 */ + if (u == 0 || size == 3) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_shll(s, is_q, size, rn, rd); + return; + case 0xa: /* CMLT */ + if (u == 1) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x8: /* CMGT, CMGE */ + case 0x9: /* CMEQ, CMLE */ + case 0xb: /* ABS, NEG */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x3: /* SUQADD, USQADD */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_satacc(s, false, u, is_q, size, rn, rd); + return; + case 0x7: /* SQABS, SQNEG */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0xc ... 0xf: + case 0x16 ... 0x1f: + { + /* Floating point: U, size[1] and opcode indicate operation; + * size[0] indicates single or double precision. + */ + int is_double = extract32(size, 0, 1); + opcode |= (extract32(size, 1, 1) << 5) | (u << 6); + size = is_double ? 3 : 2; + switch (opcode) { + case 0x2f: /* FABS */ + case 0x6f: /* FNEG */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + bool is_signed = (opcode == 0x1d) ? true : false; + int elements = is_double ? 2 : is_q ? 4 : 2; + if (is_double && !is_q) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_simd_intfp_conv(s, rd, rn, elements, is_signed, 0, size); + return; + } + case 0x2c: /* FCMGT (zero) */ + case 0x2d: /* FCMEQ (zero) */ + case 0x2e: /* FCMLT (zero) */ + case 0x6c: /* FCMGE (zero) */ + case 0x6d: /* FCMLE (zero) */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd); + return; + case 0x7f: /* FSQRT */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + need_fpstatus = true; + need_rmode = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x5c: /* FCVTAU */ + case 0x1c: /* FCVTAS */ + need_fpstatus = true; + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x3c: /* URECPE */ + if (size == 3) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x3d: /* FRECPE */ + case 0x7d: /* FRSQRTE */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd); + return; + case 0x56: /* FCVTXN, FCVTXN2 */ + if (size == 2) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x16: /* FCVTN, FCVTN2 */ + /* handle_2misc_narrow does a 2*size -> size operation, but these + * instructions encode the source 
size rather than dest size. + */ + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); + return; + case 0x36: /* BFCVTN, BFCVTN2 */ + if (!dc_isar_feature(aa64_bf16, s) || size != 2) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); + return; + case 0x17: /* FCVTL, FCVTL2 */ + if (!fp_access_check(s)) { + return; + } + handle_2misc_widening(s, opcode, is_q, size, rn, rd); + return; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + need_rmode = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + /* fall through */ + case 0x59: /* FRINTX */ + case 0x79: /* FRINTI */ + need_fpstatus = true; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x58: /* FRINTA */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + need_fpstatus = true; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x7c: /* URSQRTE */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x1e: /* FRINT32Z */ + case 0x1f: /* FRINT64Z */ + need_rmode = true; + rmode = FPROUNDING_ZERO; + /* fall through */ + case 0x5e: /* FRINT32X */ + case 0x5f: /* FRINT64X */ + need_fpstatus = true; + if ((size == 3 && !is_q) || !dc_isar_feature(aa64_frint, s)) { + unallocated_encoding(s); + return; + } + break; + default: + unallocated_encoding(s); + return; + } + break; + } + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (need_fpstatus || need_rmode) { + tcg_fpstatus = fpstatus_ptr(FPST_FPCR); + } else { + tcg_fpstatus = NULL; + } + if (need_rmode) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + } else { + tcg_rmode = NULL; + } + + switch (opcode) { + case 0x5: + if (u && size == 0) { /* NOT */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0); + return; + } + break; + case 0x8: /* CMGT, CMGE */ + if (u) { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cge0, size); + } else { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cgt0, size); + } + return; + case 0x9: /* CMEQ, CMLE */ + if (u) { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cle0, size); + } else { + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_ceq0, size); + } + return; + case 0xa: /* CMLT */ + gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_clt0, size); + return; + case 0xb: + if (u) { /* ABS, NEG */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size); + } else { + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_abs, size); + } + return; + } + + if (size == 3) { + /* All 64-bit element operations can be shared with scalar 2misc */ + int pass; + + /* Coverity claims (size == 3 && !is_q) has been eliminated + * from all paths leading to here. + */ + tcg_debug_assert(is_q); + for (pass = 0; pass < 2; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + + handle_2misc_64(s, opcode, u, tcg_res, tcg_op, + tcg_rmode, tcg_fpstatus); + + write_vec_element(s, tcg_res, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + } + } else { + int pass; + + for (pass = 0; pass < (is_q ? 
4 : 2); pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, MO_32); + + if (size == 2) { + /* Special cases for 32 bit elements */ + switch (opcode) { + case 0x4: /* CLS */ + if (u) { + tcg_gen_clzi_i32(tcg_res, tcg_op, 32); + } else { + tcg_gen_clrsb_i32(tcg_res, tcg_op); + } + break; + case 0x7: /* SQABS, SQNEG */ + if (u) { + gen_helper_neon_qneg_s32(tcg_res, cpu_env, tcg_op); + } else { + gen_helper_neon_qabs_s32(tcg_res, cpu_env, tcg_op); + } + break; + case 0x2f: /* FABS */ + gen_helper_vfp_abss(tcg_res, tcg_op); + break; + case 0x6f: /* FNEG */ + gen_helper_vfp_negs(tcg_res, tcg_op); + break; + case 0x7f: /* FSQRT */ + gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_vfp_tosls(tcg_res, tcg_op, + tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_vfp_touls(tcg_res, tcg_op, + tcg_constant_i32(0), tcg_fpstatus); + break; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_rints(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x7c: /* URSQRTE */ + gen_helper_rsqrte_u32(tcg_res, tcg_op); + break; + case 0x1e: /* FRINT32Z */ + case 0x5e: /* FRINT32X */ + gen_helper_frint32_s(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x1f: /* FRINT64Z */ + case 0x5f: /* FRINT64X */ + gen_helper_frint64_s(tcg_res, tcg_op, tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } + } else { + /* Use helpers for 8 and 16 bit elements */ + switch (opcode) { + case 0x5: /* CNT, RBIT */ + /* For these two insns size is part of the opcode specifier + * (handled earlier); they always operate on byte elements. 
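+                 * CNT counts the set bits in each byte; RBIT reverses the bit order within each byte.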
+ */ + if (u) { + gen_helper_neon_rbit_u8(tcg_res, tcg_op); + } else { + gen_helper_neon_cnt_u8(tcg_res, tcg_op); + } + break; + case 0x7: /* SQABS, SQNEG */ + { + NeonGenOneOpEnvFn *genfn; + static NeonGenOneOpEnvFn * const fns[2][2] = { + { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 }, + { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 }, + }; + genfn = fns[size][u]; + genfn(tcg_res, cpu_env, tcg_op); + break; + } + case 0x4: /* CLS, CLZ */ + if (u) { + if (size == 0) { + gen_helper_neon_clz_u8(tcg_res, tcg_op); + } else { + gen_helper_neon_clz_u16(tcg_res, tcg_op); + } + } else { + if (size == 0) { + gen_helper_neon_cls_s8(tcg_res, tcg_op); + } else { + gen_helper_neon_cls_s16(tcg_res, tcg_op); + } + } + break; + default: + g_assert_not_reached(); + } + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + } + } + clear_vec_high(s, is_q, rd); + + if (need_rmode) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + } + if (need_fpstatus) { + tcg_temp_free_ptr(tcg_fpstatus); + } +} + +/* AdvSIMD [scalar] two register miscellaneous (FP16) + * + * 31 30 29 28 27 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +---+---+---+---+---------+---+-------------+--------+-----+------+------+ + * | 0 | Q | U | S | 1 1 1 0 | a | 1 1 1 1 0 0 | opcode | 1 0 | Rn | Rd | + * +---+---+---+---+---------+---+-------------+--------+-----+------+------+ + * mask: 1000 1111 0111 1110 0000 1100 0000 0000 0x8f7e 0c00 + * val: 0000 1110 0111 1000 0000 1000 0000 0000 0x0e78 0800 + * + * This actually covers two groups where scalar access is governed by + * bit 28. A bunch of the instructions (float to integral) only exist + * in the vector form and are un-allocated for the scalar decode. Also + * in the scalar decode Q is always 1. + */ +static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn) +{ + int fpop, opcode, a, u; + int rn, rd; + bool is_q; + bool is_scalar; + bool only_in_vector = false; + + int pass; + TCGv_i32 tcg_rmode = NULL; + TCGv_ptr tcg_fpstatus = NULL; + bool need_rmode = false; + bool need_fpst = true; + int rmode; + + if (!dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + rd = extract32(insn, 0, 5); + rn = extract32(insn, 5, 5); + + a = extract32(insn, 23, 1); + u = extract32(insn, 29, 1); + is_scalar = extract32(insn, 28, 1); + is_q = extract32(insn, 30, 1); + + opcode = extract32(insn, 12, 5); + fpop = deposit32(opcode, 5, 1, a); + fpop = deposit32(fpop, 6, 1, u); + + switch (fpop) { + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + int elements; + + if (is_scalar) { + elements = 1; + } else { + elements = (is_q ? 
8 : 4); + } + + if (!fp_access_check(s)) { + return; + } + handle_simd_intfp_conv(s, rd, rn, elements, !u, 0, MO_16); + return; + } + break; + case 0x2c: /* FCMGT (zero) */ + case 0x2d: /* FCMEQ (zero) */ + case 0x2e: /* FCMLT (zero) */ + case 0x6c: /* FCMGE (zero) */ + case 0x6d: /* FCMLE (zero) */ + handle_2misc_fcmp_zero(s, fpop, is_scalar, 0, is_q, MO_16, rn, rd); + return; + case 0x3d: /* FRECPE */ + case 0x3f: /* FRECPX */ + break; + case 0x18: /* FRINTN */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_TIEEVEN; + break; + case 0x19: /* FRINTM */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_NEGINF; + break; + case 0x38: /* FRINTP */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_POSINF; + break; + case 0x39: /* FRINTZ */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_ZERO; + break; + case 0x58: /* FRINTA */ + need_rmode = true; + only_in_vector = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x59: /* FRINTX */ + case 0x79: /* FRINTI */ + only_in_vector = true; + /* current rounding mode */ + break; + case 0x1a: /* FCVTNS */ + need_rmode = true; + rmode = FPROUNDING_TIEEVEN; + break; + case 0x1b: /* FCVTMS */ + need_rmode = true; + rmode = FPROUNDING_NEGINF; + break; + case 0x1c: /* FCVTAS */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x3a: /* FCVTPS */ + need_rmode = true; + rmode = FPROUNDING_POSINF; + break; + case 0x3b: /* FCVTZS */ + need_rmode = true; + rmode = FPROUNDING_ZERO; + break; + case 0x5a: /* FCVTNU */ + need_rmode = true; + rmode = FPROUNDING_TIEEVEN; + break; + case 0x5b: /* FCVTMU */ + need_rmode = true; + rmode = FPROUNDING_NEGINF; + break; + case 0x5c: /* FCVTAU */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x7a: /* FCVTPU */ + need_rmode = true; + rmode = FPROUNDING_POSINF; + break; + case 0x7b: /* FCVTZU */ + need_rmode = true; + rmode = FPROUNDING_ZERO; + break; + case 0x2f: /* FABS */ + case 0x6f: /* FNEG */ + need_fpst = false; + break; + case 0x7d: /* FRSQRTE */ + case 0x7f: /* FSQRT (vector) */ + break; + default: + unallocated_encoding(s); + return; + } + + + /* Check additional constraints for the scalar encoding */ + if (is_scalar) { + if (!is_q) { + unallocated_encoding(s); + return; + } + /* FRINTxx is only in the vector form */ + if (only_in_vector) { + unallocated_encoding(s); + return; + } + } + + if (!fp_access_check(s)) { + return; + } + + if (need_rmode || need_fpst) { + tcg_fpstatus = fpstatus_ptr(FPST_FPCR_F16); + } + + if (need_rmode) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + } + + if (is_scalar) { + TCGv_i32 tcg_op = read_fp_hreg(s, rn); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + switch (fpop) { + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x6f: /* FNEG */ + tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus); + 
break; + default: + g_assert_not_reached(); + } + + /* limit any sign extension going on */ + tcg_gen_andi_i32(tcg_res, tcg_res, 0xffff); + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + } else { + for (pass = 0; pass < (is_q ? 8 : 4); pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, MO_16); + + switch (fpop) { + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_advsimd_rinth(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x2f: /* FABS */ + tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff); + break; + case 0x6f: /* FNEG */ + tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x7f: /* FSQRT */ + gen_helper_sqrt_f16(tcg_res, tcg_op, tcg_fpstatus); + break; + default: + g_assert_not_reached(); + } + + write_vec_element_i32(s, tcg_res, rd, pass, MO_16); + + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + } + + clear_vec_high(s, is_q, rd); + } + + if (tcg_rmode) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); + tcg_temp_free_i32(tcg_rmode); + } + + if (tcg_fpstatus) { + tcg_temp_free_ptr(tcg_fpstatus); + } +} + +/* AdvSIMD scalar x indexed element + * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0 + * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+ + * | 0 1 | U | 1 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd | + * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+ + * AdvSIMD vector x indexed element + * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+ + * | 0 | Q | U | 0 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd | + * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+ + */ +static void disas_simd_indexed(DisasContext *s, uint32_t insn) +{ + /* This encoding has two kinds of instruction: + * normal, where we perform elt x idxelt => elt for each + * element in the vector + * long, where we perform elt x idxelt and generate a result of + * double the width of the input element + * The long ops have a 'part' specifier (ie come in INSN, INSN2 pairs). 
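+     * For the vector long ops, Q selects the 'part': the *2 forms read the high half of the source registers.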
+ */ + bool is_scalar = extract32(insn, 28, 1); + bool is_q = extract32(insn, 30, 1); + bool u = extract32(insn, 29, 1); + int size = extract32(insn, 22, 2); + int l = extract32(insn, 21, 1); + int m = extract32(insn, 20, 1); + /* Note that the Rm field here is only 4 bits, not 5 as it usually is */ + int rm = extract32(insn, 16, 4); + int opcode = extract32(insn, 12, 4); + int h = extract32(insn, 11, 1); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool is_long = false; + int is_fp = 0; + bool is_fp16 = false; + int index; + TCGv_ptr fpst; + + switch (16 * u + opcode) { + case 0x08: /* MUL */ + case 0x10: /* MLA */ + case 0x14: /* MLS */ + if (is_scalar) { + unallocated_encoding(s); + return; + } + break; + case 0x02: /* SMLAL, SMLAL2 */ + case 0x12: /* UMLAL, UMLAL2 */ + case 0x06: /* SMLSL, SMLSL2 */ + case 0x16: /* UMLSL, UMLSL2 */ + case 0x0a: /* SMULL, SMULL2 */ + case 0x1a: /* UMULL, UMULL2 */ + if (is_scalar) { + unallocated_encoding(s); + return; + } + is_long = true; + break; + case 0x03: /* SQDMLAL, SQDMLAL2 */ + case 0x07: /* SQDMLSL, SQDMLSL2 */ + case 0x0b: /* SQDMULL, SQDMULL2 */ + is_long = true; + break; + case 0x0c: /* SQDMULH */ + case 0x0d: /* SQRDMULH */ + break; + case 0x01: /* FMLA */ + case 0x05: /* FMLS */ + case 0x09: /* FMUL */ + case 0x19: /* FMULX */ + is_fp = 1; + break; + case 0x1d: /* SQRDMLAH */ + case 0x1f: /* SQRDMLSH */ + if (!dc_isar_feature(aa64_rdm, s)) { + unallocated_encoding(s); + return; + } + break; + case 0x0e: /* SDOT */ + case 0x1e: /* UDOT */ + if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_dp, s)) { + unallocated_encoding(s); + return; + } + break; + case 0x0f: + switch (size) { + case 0: /* SUDOT */ + case 2: /* USDOT */ + if (is_scalar || !dc_isar_feature(aa64_i8mm, s)) { + unallocated_encoding(s); + return; + } + size = MO_32; + break; + case 1: /* BFDOT */ + if (is_scalar || !dc_isar_feature(aa64_bf16, s)) { + unallocated_encoding(s); + return; + } + size = MO_32; + break; + case 3: /* BFMLAL{B,T} */ + if (is_scalar || !dc_isar_feature(aa64_bf16, s)) { + unallocated_encoding(s); + return; + } + /* can't set is_fp without other incorrect size checks */ + size = MO_16; + break; + default: + unallocated_encoding(s); + return; + } + break; + case 0x11: /* FCMLA #0 */ + case 0x13: /* FCMLA #90 */ + case 0x15: /* FCMLA #180 */ + case 0x17: /* FCMLA #270 */ + if (is_scalar || !dc_isar_feature(aa64_fcma, s)) { + unallocated_encoding(s); + return; + } + is_fp = 2; + break; + case 0x00: /* FMLAL */ + case 0x04: /* FMLSL */ + case 0x18: /* FMLAL2 */ + case 0x1c: /* FMLSL2 */ + if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_fhm, s)) { + unallocated_encoding(s); + return; + } + size = MO_16; + /* is_fp, but we pass cpu_env not fp_status. */ + break; + default: + unallocated_encoding(s); + return; + } + + switch (is_fp) { + case 1: /* normal fp */ + /* convert insn encoded size to MemOp size */ + switch (size) { + case 0: /* half-precision */ + size = MO_16; + is_fp16 = true; + break; + case MO_32: /* single precision */ + case MO_64: /* double precision */ + break; + default: + unallocated_encoding(s); + return; + } + break; + + case 2: /* complex fp */ + /* Each indexable element is a complex pair. 
*/ + size += 1; + switch (size) { + case MO_32: + if (h && !is_q) { + unallocated_encoding(s); + return; + } + is_fp16 = true; + break; + case MO_64: + break; + default: + unallocated_encoding(s); + return; + } + break; + + default: /* integer */ + switch (size) { + case MO_8: + case MO_64: + unallocated_encoding(s); + return; + } + break; + } + if (is_fp16 && !dc_isar_feature(aa64_fp16, s)) { + unallocated_encoding(s); + return; + } + + /* Given MemOp size, adjust register and indexing. */ + switch (size) { + case MO_16: + index = h << 2 | l << 1 | m; + break; + case MO_32: + index = h << 1 | l; + rm |= m << 4; + break; + case MO_64: + if (l || !is_q) { + unallocated_encoding(s); + return; + } + index = h; + rm |= m << 4; + break; + default: + g_assert_not_reached(); + } + + if (!fp_access_check(s)) { + return; + } + + if (is_fp) { + fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); + } else { + fpst = NULL; + } + + switch (16 * u + opcode) { + case 0x0e: /* SDOT */ + case 0x1e: /* UDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + u ? gen_helper_gvec_udot_idx_b + : gen_helper_gvec_sdot_idx_b); + return; + case 0x0f: + switch (extract32(insn, 22, 2)) { + case 0: /* SUDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + gen_helper_gvec_sudot_idx_b); + return; + case 1: /* BFDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + gen_helper_gvec_bfdot_idx); + return; + case 2: /* USDOT */ + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, + gen_helper_gvec_usdot_idx_b); + return; + case 3: /* BFMLAL{B,T} */ + gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, 0, (index << 1) | is_q, + gen_helper_gvec_bfmlal_idx); + return; + } + g_assert_not_reached(); + case 0x11: /* FCMLA #0 */ + case 0x13: /* FCMLA #90 */ + case 0x15: /* FCMLA #180 */ + case 0x17: /* FCMLA #270 */ + { + int rot = extract32(insn, 13, 2); + int data = (index << 2) | rot; + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, rd), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, + size == MO_64 + ? gen_helper_gvec_fcmlas_idx + : gen_helper_gvec_fcmlah_idx); + tcg_temp_free_ptr(fpst); + } + return; + + case 0x00: /* FMLAL */ + case 0x04: /* FMLSL */ + case 0x18: /* FMLAL2 */ + case 0x1c: /* FMLSL2 */ + { + int is_s = extract32(opcode, 2, 1); + int is_2 = u; + int data = (index << 2) | (is_2 << 1) | is_s; + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), + data, gen_helper_gvec_fmlal_idx_a64); + } + return; + + case 0x08: /* MUL */ + if (!is_long && !is_scalar) { + static gen_helper_gvec_3 * const fns[3] = { + gen_helper_gvec_mul_idx_h, + gen_helper_gvec_mul_idx_s, + gen_helper_gvec_mul_idx_d, + }; + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s), + index, fns[size - 1]); + return; + } + break; + + case 0x10: /* MLA */ + if (!is_long && !is_scalar) { + static gen_helper_gvec_4 * const fns[3] = { + gen_helper_gvec_mla_idx_h, + gen_helper_gvec_mla_idx_s, + gen_helper_gvec_mla_idx_d, + }; + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, rd), + is_q ? 
16 : 8, vec_full_reg_size(s), + index, fns[size - 1]); + return; + } + break; + + case 0x14: /* MLS */ + if (!is_long && !is_scalar) { + static gen_helper_gvec_4 * const fns[3] = { + gen_helper_gvec_mls_idx_h, + gen_helper_gvec_mls_idx_s, + gen_helper_gvec_mls_idx_d, + }; + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, rd), + is_q ? 16 : 8, vec_full_reg_size(s), + index, fns[size - 1]); + return; + } + break; + } + + if (size == 3) { + TCGv_i64 tcg_idx = tcg_temp_new_i64(); + int pass; + + assert(is_fp && is_q && !is_long); + + read_vec_element(s, tcg_idx, rm, index, MO_64); + + for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + + switch (16 * u + opcode) { + case 0x05: /* FMLS */ + /* As usual for ARM, separate negation for fused multiply-add */ + gen_helper_vfp_negd(tcg_op, tcg_op); + /* fall through */ + case 0x01: /* FMLA */ + read_vec_element(s, tcg_res, rd, pass, MO_64); + gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst); + break; + case 0x09: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst); + break; + case 0x19: /* FMULX */ + gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + + write_vec_element(s, tcg_res, rd, pass, MO_64); + tcg_temp_free_i64(tcg_op); + tcg_temp_free_i64(tcg_res); + } + + tcg_temp_free_i64(tcg_idx); + clear_vec_high(s, !is_scalar, rd); + } else if (!is_long) { + /* 32 bit floating point, or 16 or 32 bit integer. + * For the 16 bit scalar case we use the usual Neon helpers and + * rely on the fact that 0 op 0 == 0 with no side effects. + */ + TCGv_i32 tcg_idx = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 4 : 2; + } + + read_vec_element_i32(s, tcg_idx, rm, index, size); + + if (size == 1 && !is_scalar) { + /* The simplest way to handle the 16x16 indexed ops is to duplicate + * the index into both halves of the 32 bit tcg_idx and then use + * the usual Neon helpers. + */ + tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); + } + + for (pass = 0; pass < maxpasses; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32); + + switch (16 * u + opcode) { + case 0x08: /* MUL */ + case 0x10: /* MLA */ + case 0x14: /* MLS */ + { + static NeonGenTwoOpFn * const fns[2][2] = { + { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, + { tcg_gen_add_i32, tcg_gen_sub_i32 }, + }; + NeonGenTwoOpFn *genfn; + bool is_sub = opcode == 0x4; + + if (size == 1) { + gen_helper_neon_mul_u16(tcg_res, tcg_op, tcg_idx); + } else { + tcg_gen_mul_i32(tcg_res, tcg_op, tcg_idx); + } + if (opcode == 0x8) { + break; + } + read_vec_element_i32(s, tcg_op, rd, pass, MO_32); + genfn = fns[size - 1][is_sub]; + genfn(tcg_res, tcg_op, tcg_res); + break; + } + case 0x05: /* FMLS */ + case 0x01: /* FMLA */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? 
size : MO_32); + switch (size) { + case 1: + if (opcode == 0x5) { + /* As usual for ARM, separate negation for fused + * multiply-add */ + tcg_gen_xori_i32(tcg_op, tcg_op, 0x80008000); + } + if (is_scalar) { + gen_helper_advsimd_muladdh(tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + } else { + gen_helper_advsimd_muladd2h(tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + } + break; + case 2: + if (opcode == 0x5) { + /* As usual for ARM, separate negation for + * fused multiply-add */ + tcg_gen_xori_i32(tcg_op, tcg_op, 0x80000000); + } + gen_helper_vfp_muladds(tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x09: /* FMUL */ + switch (size) { + case 1: + if (is_scalar) { + gen_helper_advsimd_mulh(tcg_res, tcg_op, + tcg_idx, fpst); + } else { + gen_helper_advsimd_mul2h(tcg_res, tcg_op, + tcg_idx, fpst); + } + break; + case 2: + gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x19: /* FMULX */ + switch (size) { + case 1: + if (is_scalar) { + gen_helper_advsimd_mulxh(tcg_res, tcg_op, + tcg_idx, fpst); + } else { + gen_helper_advsimd_mulx2h(tcg_res, tcg_op, + tcg_idx, fpst); + } + break; + case 2: + gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x0c: /* SQDMULH */ + if (size == 1) { + gen_helper_neon_qdmulh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx); + } else { + gen_helper_neon_qdmulh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx); + } + break; + case 0x0d: /* SQRDMULH */ + if (size == 1) { + gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx); + } else { + gen_helper_neon_qrdmulh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx); + } + break; + case 0x1d: /* SQRDMLAH */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? size : MO_32); + if (size == 1) { + gen_helper_neon_qrdmlah_s16(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } else { + gen_helper_neon_qrdmlah_s32(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } + break; + case 0x1f: /* SQRDMLSH */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? size : MO_32); + if (size == 1) { + gen_helper_neon_qrdmlsh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } else { + gen_helper_neon_qrdmlsh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } + break; + default: + g_assert_not_reached(); + } + + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); + } + + tcg_temp_free_i32(tcg_idx); + clear_vec_high(s, is_q, rd); + } else { + /* long ops: 16x16->32 or 32x32->64 */ + TCGv_i64 tcg_res[2]; + int pass; + bool satop = extract32(opcode, 0, 1); + MemOp memop = MO_32; + + if (satop || !u) { + memop |= MO_SIGN; + } + + if (size == 2) { + TCGv_i64 tcg_idx = tcg_temp_new_i64(); + + read_vec_element(s, tcg_idx, rm, index, memop); + + for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_passres; + int passelt; + + if (is_scalar) { + passelt = 0; + } else { + passelt = pass + (is_q * 2); + } + + read_vec_element(s, tcg_op, rn, passelt, memop); + + tcg_res[pass] = tcg_temp_new_i64(); + + if (opcode == 0xa || opcode == 0xb) { + /* Non-accumulating ops */ + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + tcg_gen_mul_i64(tcg_passres, tcg_op, tcg_idx); + tcg_temp_free_i64(tcg_op); + + if (satop) { + /* saturating, doubling */ + gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + } + + if (opcode == 0xa || opcode == 0xb) { + continue; + } + + /* Accumulating op: handle accumulate step */ + read_vec_element(s, tcg_res[pass], rd, pass, MO_64); + + switch (opcode) { + case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + break; + case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); + break; + case 0x7: /* SQDMLSL, SQDMLSL2 */ + tcg_gen_neg_i64(tcg_passres, tcg_passres); + /* fall through */ + case 0x3: /* SQDMLAL, SQDMLAL2 */ + gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env, + tcg_res[pass], + tcg_passres); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i64(tcg_passres); + } + tcg_temp_free_i64(tcg_idx); + + clear_vec_high(s, !is_scalar, rd); + } else { + TCGv_i32 tcg_idx = tcg_temp_new_i32(); + + assert(size == 1); + read_vec_element_i32(s, tcg_idx, rm, index, size); + + if (!is_scalar) { + /* The simplest way to handle the 16x16 indexed ops is to + * duplicate the index into both halves of the 32 bit tcg_idx + * and then use the usual Neon helpers. + */ + tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); + } + + for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i64 tcg_passres; + + if (is_scalar) { + read_vec_element_i32(s, tcg_op, rn, pass, size); + } else { + read_vec_element_i32(s, tcg_op, rn, + pass + (is_q * 2), MO_32); + } + + tcg_res[pass] = tcg_temp_new_i64(); + + if (opcode == 0xa || opcode == 0xb) { + /* Non-accumulating ops */ + tcg_passres = tcg_res[pass]; + } else { + tcg_passres = tcg_temp_new_i64(); + } + + if (memop & MO_SIGN) { + gen_helper_neon_mull_s16(tcg_passres, tcg_op, tcg_idx); + } else { + gen_helper_neon_mull_u16(tcg_passres, tcg_op, tcg_idx); + } + if (satop) { + gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, + tcg_passres, tcg_passres); + } + tcg_temp_free_i32(tcg_op); + + if (opcode == 0xa || opcode == 0xb) { + continue; + } + + /* Accumulating op: handle accumulate step */ + read_vec_element(s, tcg_res[pass], rd, pass, MO_64); + + switch (opcode) { + case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ + gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass], + tcg_passres); + break; + case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ + gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass], + tcg_passres); + break; + case 0x7: /* SQDMLSL, SQDMLSL2 */ + gen_helper_neon_negl_u32(tcg_passres, tcg_passres); + /* fall through */ + case 0x3: /* SQDMLAL, SQDMLAL2 */ + gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env, + tcg_res[pass], + tcg_passres); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i64(tcg_passres); + } + tcg_temp_free_i32(tcg_idx); + + if (is_scalar) { + tcg_gen_ext32u_i64(tcg_res[0], tcg_res[0]); + } + } + + if (is_scalar) { + tcg_res[1] = tcg_constant_i64(0); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } + } + + if (fpst) { + tcg_temp_free_ptr(fpst); + } +} + +/* Crypto AES + * 31 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----------------+------+-----------+--------+-----+------+------+ + * | 0 1 0 0 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd | + * +-----------------+------+-----------+--------+-----+------+------+ + */ +static void disas_crypto_aes(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int decrypt; + gen_helper_gvec_2 *genfn2 = NULL; + gen_helper_gvec_3 *genfn3 = NULL; + + if (!dc_isar_feature(aa64_aes, s) || size != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0x4: /* AESE */ + decrypt = 0; + genfn3 = gen_helper_crypto_aese; + break; + case 0x6: /* AESMC */ + decrypt = 0; + genfn2 = gen_helper_crypto_aesmc; + break; + case 0x5: /* AESD */ + decrypt = 1; + genfn3 = gen_helper_crypto_aese; + break; + case 0x7: /* AESIMC */ + decrypt = 1; + genfn2 = gen_helper_crypto_aesmc; + break; + default: + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + if (genfn2) { + gen_gvec_op2_ool(s, true, rd, rn, decrypt, genfn2); + } else { + gen_gvec_op3_ool(s, true, rd, rd, rn, decrypt, genfn3); + } +} + +/* Crypto three-reg SHA + * 31 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 + * +-----------------+------+---+------+---+--------+-----+------+------+ + * | 0 1 0 1 1 1 1 0 | size | 0 | Rm | 0 | opcode | 0 0 | Rn | Rd | + * +-----------------+------+---+------+---+--------+-----+------+------+ + */ +static void disas_crypto_three_reg_sha(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 
12, 3); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + gen_helper_gvec_3 *genfn; + bool feature; + + if (size != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0: /* SHA1C */ + genfn = gen_helper_crypto_sha1c; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 1: /* SHA1P */ + genfn = gen_helper_crypto_sha1p; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 2: /* SHA1M */ + genfn = gen_helper_crypto_sha1m; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 3: /* SHA1SU0 */ + genfn = gen_helper_crypto_sha1su0; + feature = dc_isar_feature(aa64_sha1, s); + break; + case 4: /* SHA256H */ + genfn = gen_helper_crypto_sha256h; + feature = dc_isar_feature(aa64_sha256, s); + break; + case 5: /* SHA256H2 */ + genfn = gen_helper_crypto_sha256h2; + feature = dc_isar_feature(aa64_sha256, s); + break; + case 6: /* SHA256SU1 */ + genfn = gen_helper_crypto_sha256su1; + feature = dc_isar_feature(aa64_sha256, s); + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + gen_gvec_op3_ool(s, true, rd, rn, rm, 0, genfn); +} + +/* Crypto two-reg SHA + * 31 24 23 22 21 17 16 12 11 10 9 5 4 0 + * +-----------------+------+-----------+--------+-----+------+------+ + * | 0 1 0 1 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd | + * +-----------------+------+-----------+--------+-----+------+------+ + */ +static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn) +{ + int size = extract32(insn, 22, 2); + int opcode = extract32(insn, 12, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + gen_helper_gvec_2 *genfn; + bool feature; + + if (size != 0) { + unallocated_encoding(s); + return; + } + + switch (opcode) { + case 0: /* SHA1H */ + feature = dc_isar_feature(aa64_sha1, s); + genfn = gen_helper_crypto_sha1h; + break; + case 1: /* SHA1SU1 */ + feature = dc_isar_feature(aa64_sha1, s); + genfn = gen_helper_crypto_sha1su1; + break; + case 2: /* SHA256SU0 */ + feature = dc_isar_feature(aa64_sha256, s); + genfn = gen_helper_crypto_sha256su0; + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + gen_gvec_op2_ool(s, true, rd, rn, 0, genfn); +} + +static void gen_rax1_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) +{ + tcg_gen_rotli_i64(d, m, 1); + tcg_gen_xor_i64(d, d, n); +} + +static void gen_rax1_vec(unsigned vece, TCGv_vec d, TCGv_vec n, TCGv_vec m) +{ + tcg_gen_rotli_vec(vece, d, m, 1); + tcg_gen_xor_vec(vece, d, d, n); +} + +void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; + static const GVecGen3 op = { + .fni8 = gen_rax1_i64, + .fniv = gen_rax1_vec, + .opt_opc = vecop_list, + .fno = gen_helper_crypto_rax1, + .vece = MO_64, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &op); +} + +/* Crypto three-reg SHA512 + * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +-----------------------+------+---+---+-----+--------+------+------+ + * | 1 1 0 0 1 1 1 0 0 1 1 | Rm | 1 | O | 0 0 | opcode | Rn | Rd | + * +-----------------------+------+---+---+-----+--------+------+------+ + */ +static void disas_crypto_three_reg_sha512(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int o = 
extract32(insn, 14, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool feature; + gen_helper_gvec_3 *oolfn = NULL; + GVecGen3Fn *gvecfn = NULL; + + if (o == 0) { + switch (opcode) { + case 0: /* SHA512H */ + feature = dc_isar_feature(aa64_sha512, s); + oolfn = gen_helper_crypto_sha512h; + break; + case 1: /* SHA512H2 */ + feature = dc_isar_feature(aa64_sha512, s); + oolfn = gen_helper_crypto_sha512h2; + break; + case 2: /* SHA512SU1 */ + feature = dc_isar_feature(aa64_sha512, s); + oolfn = gen_helper_crypto_sha512su1; + break; + case 3: /* RAX1 */ + feature = dc_isar_feature(aa64_sha3, s); + gvecfn = gen_gvec_rax1; + break; + default: + g_assert_not_reached(); + } + } else { + switch (opcode) { + case 0: /* SM3PARTW1 */ + feature = dc_isar_feature(aa64_sm3, s); + oolfn = gen_helper_crypto_sm3partw1; + break; + case 1: /* SM3PARTW2 */ + feature = dc_isar_feature(aa64_sm3, s); + oolfn = gen_helper_crypto_sm3partw2; + break; + case 2: /* SM4EKEY */ + feature = dc_isar_feature(aa64_sm4, s); + oolfn = gen_helper_crypto_sm4ekey; + break; + default: + unallocated_encoding(s); + return; + } + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (oolfn) { + gen_gvec_op3_ool(s, true, rd, rn, rm, 0, oolfn); + } else { + gen_gvec_fn3(s, true, rd, rn, rm, gvecfn, MO_64); + } +} + +/* Crypto two-reg SHA512 + * 31 12 11 10 9 5 4 0 + * +-----------------------------------------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 | opcode | Rn | Rd | + * +-----------------------------------------+--------+------+------+ + */ +static void disas_crypto_two_reg_sha512(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool feature; + + switch (opcode) { + case 0: /* SHA512SU0 */ + feature = dc_isar_feature(aa64_sha512, s); + break; + case 1: /* SM4E */ + feature = dc_isar_feature(aa64_sm4, s); + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0: /* SHA512SU0 */ + gen_gvec_op2_ool(s, true, rd, rn, 0, gen_helper_crypto_sha512su0); + break; + case 1: /* SM4E */ + gen_gvec_op3_ool(s, true, rd, rd, rn, 0, gen_helper_crypto_sm4e); + break; + default: + g_assert_not_reached(); + } +} + +/* Crypto four-register + * 31 23 22 21 20 16 15 14 10 9 5 4 0 + * +-------------------+-----+------+---+------+------+------+ + * | 1 1 0 0 1 1 1 0 0 | Op0 | Rm | 0 | Ra | Rn | Rd | + * +-------------------+-----+------+---+------+------+------+ + */ +static void disas_crypto_four_reg(DisasContext *s, uint32_t insn) +{ + int op0 = extract32(insn, 21, 2); + int rm = extract32(insn, 16, 5); + int ra = extract32(insn, 10, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + bool feature; + + switch (op0) { + case 0: /* EOR3 */ + case 1: /* BCAX */ + feature = dc_isar_feature(aa64_sha3, s); + break; + case 2: /* SM3SS1 */ + feature = dc_isar_feature(aa64_sm3, s); + break; + default: + unallocated_encoding(s); + return; + } + + if (!feature) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (op0 < 2) { + TCGv_i64 tcg_op1, tcg_op2, tcg_op3, tcg_res[2]; + int pass; + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_op3 = tcg_temp_new_i64(); + tcg_res[0] = tcg_temp_new_i64(); + 
tcg_res[1] = tcg_temp_new_i64(); + + for (pass = 0; pass < 2; pass++) { + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + read_vec_element(s, tcg_op3, ra, pass, MO_64); + + if (op0 == 0) { + /* EOR3 */ + tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op3); + } else { + /* BCAX */ + tcg_gen_andc_i64(tcg_res[pass], tcg_op2, tcg_op3); + } + tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_op3); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); + } else { + TCGv_i32 tcg_op1, tcg_op2, tcg_op3, tcg_res, tcg_zero; + + tcg_op1 = tcg_temp_new_i32(); + tcg_op2 = tcg_temp_new_i32(); + tcg_op3 = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + tcg_zero = tcg_constant_i32(0); + + read_vec_element_i32(s, tcg_op1, rn, 3, MO_32); + read_vec_element_i32(s, tcg_op2, rm, 3, MO_32); + read_vec_element_i32(s, tcg_op3, ra, 3, MO_32); + + tcg_gen_rotri_i32(tcg_res, tcg_op1, 20); + tcg_gen_add_i32(tcg_res, tcg_res, tcg_op2); + tcg_gen_add_i32(tcg_res, tcg_res, tcg_op3); + tcg_gen_rotri_i32(tcg_res, tcg_res, 25); + + write_vec_element_i32(s, tcg_zero, rd, 0, MO_32); + write_vec_element_i32(s, tcg_zero, rd, 1, MO_32); + write_vec_element_i32(s, tcg_zero, rd, 2, MO_32); + write_vec_element_i32(s, tcg_res, rd, 3, MO_32); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); + } +} + +/* Crypto XAR + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 1 0 0 | Rm | imm6 | Rn | Rd | + * +-----------------------+------+--------+------+------+ + */ +static void disas_crypto_xar(DisasContext *s, uint32_t insn) +{ + int rm = extract32(insn, 16, 5); + int imm6 = extract32(insn, 10, 6); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + if (!dc_isar_feature(aa64_sha3, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + gen_gvec_xar(MO_64, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), imm6, 16, + vec_full_reg_size(s)); +} + +/* Crypto three-reg imm2 + * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +-----------------------+------+-----+------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 0 1 0 | Rm | 1 0 | imm2 | opcode | Rn | Rd | + * +-----------------------+------+-----+------+--------+------+------+ + */ +static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_crypto_sm3tt1a, gen_helper_crypto_sm3tt1b, + gen_helper_crypto_sm3tt2a, gen_helper_crypto_sm3tt2b, + }; + int opcode = extract32(insn, 10, 2); + int imm2 = extract32(insn, 12, 2); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + + if (!dc_isar_feature(aa64_sm3, s)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + gen_gvec_op3_ool(s, true, rd, rn, rm, imm2, fns[opcode]); +} + +/* C3.6 Data processing - SIMD, inc Crypto + * + * As the decode gets a little complex we are using a table based + * approach for this part of the decode. 
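+ * Each entry is a (pattern, mask, fn) triple; lookup_disas_fn() returns the
+ * fn of the first entry for which (insn & mask) == pattern, so more specific
+ * patterns must be listed before any more general pattern that overlaps them.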
+ */ +static const AArch64DecodeTable data_proc_simd[] = { + /* pattern , mask , fn */ + { 0x0e200400, 0x9f200400, disas_simd_three_reg_same }, + { 0x0e008400, 0x9f208400, disas_simd_three_reg_same_extra }, + { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff }, + { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc }, + { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes }, + { 0x0e000400, 0x9fe08400, disas_simd_copy }, + { 0x0f000000, 0x9f000400, disas_simd_indexed }, /* vector indexed */ + /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */ + { 0x0f000400, 0x9ff80400, disas_simd_mod_imm }, + { 0x0f000400, 0x9f800400, disas_simd_shift_imm }, + { 0x0e000000, 0xbf208c00, disas_simd_tb }, + { 0x0e000800, 0xbf208c00, disas_simd_zip_trn }, + { 0x2e000000, 0xbf208400, disas_simd_ext }, + { 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same }, + { 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra }, + { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff }, + { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc }, + { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise }, + { 0x5e000400, 0xdfe08400, disas_simd_scalar_copy }, + { 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */ + { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm }, + { 0x4e280800, 0xff3e0c00, disas_crypto_aes }, + { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha }, + { 0x5e280800, 0xff3e0c00, disas_crypto_two_reg_sha }, + { 0xce608000, 0xffe0b000, disas_crypto_three_reg_sha512 }, + { 0xcec08000, 0xfffff000, disas_crypto_two_reg_sha512 }, + { 0xce000000, 0xff808000, disas_crypto_four_reg }, + { 0xce800000, 0xffe00000, disas_crypto_xar }, + { 0xce408000, 0xffe0c000, disas_crypto_three_reg_imm2 }, + { 0x0e400400, 0x9f60c400, disas_simd_three_reg_same_fp16 }, + { 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 }, + { 0x5e400400, 0xdf60c400, disas_simd_scalar_three_reg_same_fp16 }, + { 0x00000000, 0x00000000, NULL } +}; + +static void disas_data_proc_simd(DisasContext *s, uint32_t insn) +{ + /* Note that this is called with all non-FP cases from + * table C3-6 so it must UNDEF for entries not specifically + * allocated to instructions in that table. + */ + AArch64DecodeFn *fn = lookup_disas_fn(&data_proc_simd[0], insn); + if (fn) { + fn(s, insn); + } else { + unallocated_encoding(s); + } +} + +/* C3.6 Data processing - SIMD and floating point */ +static void disas_data_proc_simd_fp(DisasContext *s, uint32_t insn) +{ + if (extract32(insn, 28, 1) == 1 && extract32(insn, 30, 1) == 0) { + disas_data_proc_fp(s, insn); + } else { + /* SIMD, including crypto */ + disas_data_proc_simd(s, insn); + } +} + +/* + * Include the generated SME FA64 decoder. + */ + +#include "decode-sme-fa64.c.inc" + +static bool trans_OK(DisasContext *s, arg_OK *a) +{ + return true; +} + +static bool trans_FAIL(DisasContext *s, arg_OK *a) +{ + s->is_nonstreaming = true; + return true; +} + +/** + * is_guarded_page: + * @env: The cpu environment + * @s: The DisasContext + * + * Return true if the page is guarded. + */ +static bool is_guarded_page(CPUARMState *env, DisasContext *s) +{ + uint64_t addr = s->base.pc_first; +#ifdef CONFIG_USER_ONLY + return page_get_flags(addr) & PAGE_BTI; +#else + CPUTLBEntryFull *full; + void *host; + int mmu_idx = arm_to_core_mmu_idx(s->mmu_idx); + int flags; + + /* + * We test this immediately after reading an insn, which means + * that the TLB entry must be present and valid, and thus this + * access will never raise an exception. 
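+     * probe_access_full() therefore just fills in 'full' with the existing
+     * CPUTLBEntryFull, whose 'guarded' field records the page's BTI
+     * guarded-page (GP) attribute.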
+ */ + flags = probe_access_full(env, addr, MMU_INST_FETCH, mmu_idx, + false, &host, &full, 0); + assert(!(flags & TLB_INVALID_MASK)); + + return full->guarded; +#endif +} + +/** + * btype_destination_ok: + * @insn: The instruction at the branch destination + * @bt: SCTLR_ELx.BT + * @btype: PSTATE.BTYPE, and is non-zero + * + * On a guarded page, there are a limited number of insns + * that may be present at the branch target: + * - branch target identifiers, + * - paciasp, pacibsp, + * - BRK insn + * - HLT insn + * Anything else causes a Branch Target Exception. + * + * Return true if the branch is compatible, false to raise BTITRAP. + */ +static bool btype_destination_ok(uint32_t insn, bool bt, int btype) +{ + if ((insn & 0xfffff01fu) == 0xd503201fu) { + /* HINT space */ + switch (extract32(insn, 5, 7)) { + case 0b011001: /* PACIASP */ + case 0b011011: /* PACIBSP */ + /* + * If SCTLR_ELx.BT, then PACI*SP are not compatible + * with btype == 3. Otherwise all btype are ok. + */ + return !bt || btype != 3; + case 0b100000: /* BTI */ + /* Not compatible with any btype. */ + return false; + case 0b100010: /* BTI c */ + /* Not compatible with btype == 3 */ + return btype != 3; + case 0b100100: /* BTI j */ + /* Not compatible with btype == 2 */ + return btype != 2; + case 0b100110: /* BTI jc */ + /* Compatible with any btype. */ + return true; + } + } else { + switch (insn & 0xffe0001fu) { + case 0xd4200000u: /* BRK */ + case 0xd4400000u: /* HLT */ + /* Give priority to the breakpoint exception. */ + return true; + } + } + return false; +} + +static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, + CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + ARMCPU *arm_cpu = env_archcpu(env); + CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb); + int bound, core_mmu_idx; + + dc->isar = &arm_cpu->isar; + dc->condjmp = 0; + dc->pc_save = dc->base.pc_first; + dc->aarch64 = true; + dc->thumb = false; + dc->sctlr_b = 0; + dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? 
MO_BE : MO_LE; + dc->condexec_mask = 0; + dc->condexec_cond = 0; + core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); + dc->mmu_idx = core_to_aa64_mmu_idx(core_mmu_idx); + dc->tbii = EX_TBFLAG_A64(tb_flags, TBII); + dc->tbid = EX_TBFLAG_A64(tb_flags, TBID); + dc->tcma = EX_TBFLAG_A64(tb_flags, TCMA); + dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); +#if !defined(CONFIG_USER_ONLY) + dc->user = (dc->current_el == 0); +#endif + dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL); + dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM); + dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL); + dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE); + dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC); + dc->fgt_eret = EX_TBFLAG_A64(tb_flags, FGT_ERET); + dc->sve_excp_el = EX_TBFLAG_A64(tb_flags, SVEEXC_EL); + dc->sme_excp_el = EX_TBFLAG_A64(tb_flags, SMEEXC_EL); + dc->vl = (EX_TBFLAG_A64(tb_flags, VL) + 1) * 16; + dc->svl = (EX_TBFLAG_A64(tb_flags, SVL) + 1) * 16; + dc->pauth_active = EX_TBFLAG_A64(tb_flags, PAUTH_ACTIVE); + dc->bt = EX_TBFLAG_A64(tb_flags, BT); + dc->btype = EX_TBFLAG_A64(tb_flags, BTYPE); + dc->unpriv = EX_TBFLAG_A64(tb_flags, UNPRIV); + dc->ata = EX_TBFLAG_A64(tb_flags, ATA); + dc->mte_active[0] = EX_TBFLAG_A64(tb_flags, MTE_ACTIVE); + dc->mte_active[1] = EX_TBFLAG_A64(tb_flags, MTE0_ACTIVE); + dc->pstate_sm = EX_TBFLAG_A64(tb_flags, PSTATE_SM); + dc->pstate_za = EX_TBFLAG_A64(tb_flags, PSTATE_ZA); + dc->sme_trap_nonstreaming = EX_TBFLAG_A64(tb_flags, SME_TRAP_NONSTREAMING); + dc->vec_len = 0; + dc->vec_stride = 0; + dc->cp_regs = arm_cpu->cp_regs; + dc->features = env->features; + dc->dcz_blocksize = arm_cpu->dcz_blocksize; + +#ifdef CONFIG_USER_ONLY + /* In sve_probe_page, we assume TBI is enabled. */ + tcg_debug_assert(dc->tbid & 1); +#endif + + /* Single step state. The code-generation logic here is: + * SS_ACTIVE == 0: + * generate code with no special handling for single-stepping (except + * that anything that can make us go to SS_ACTIVE == 1 must end the TB; + * this happens anyway because those changes are all system register or + * PSTATE writes). + * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending) + * emit code for one insn + * emit code to clear PSTATE.SS + * emit code to generate software step exception for completed step + * end TB (as usual for having generated an exception) + * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending) + * emit code to generate a software step exception + * end the TB + */ + dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE); + dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS); + dc->is_ldex = false; + + /* Bound the number of insns to execute to those left on the page. */ + bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4; + + /* If architectural single step active, limit to 1. 
*/ + if (dc->ss_active) { + bound = 1; + } + dc->base.max_insns = MIN(dc->base.max_insns, bound); + + init_tmp_a64_array(dc); +} + +static void aarch64_tr_tb_start(DisasContextBase *db, CPUState *cpu) +{ +} + +static void aarch64_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + target_ulong pc_arg = dc->base.pc_next; + + if (TARGET_TB_PCREL) { + pc_arg &= ~TARGET_PAGE_MASK; + } + tcg_gen_insn_start(pc_arg, 0, 0); + dc->insn_start = tcg_last_op(); +} + +static void aarch64_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *s = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + uint64_t pc = s->base.pc_next; + uint32_t insn; + + /* Singlestep exceptions have the highest priority. */ + if (s->ss_active && !s->pstate_ss) { + /* Singlestep state is Active-pending. + * If we're in this state at the start of a TB then either + * a) we just took an exception to an EL which is being debugged + * and this is the first insn in the exception handler + * b) debug exceptions were masked and we just unmasked them + * without changing EL (eg by clearing PSTATE.D) + * In either case we're going to take a swstep exception in the + * "did not step an insn" case, and so the syndrome ISV and EX + * bits should be zero. + */ + assert(s->base.num_insns == 1); + gen_swstep_exception(s, 0, 0); + s->base.is_jmp = DISAS_NORETURN; + s->base.pc_next = pc + 4; + return; + } + + if (pc & 3) { + /* + * PC alignment fault. This has priority over the instruction abort + * that we would receive from a translation fault via arm_ldl_code. + * This should only be possible after an indirect branch, at the + * start of the TB. + */ + assert(s->base.num_insns == 1); + gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc)); + s->base.is_jmp = DISAS_NORETURN; + s->base.pc_next = QEMU_ALIGN_UP(pc, 4); + return; + } + + s->pc_curr = pc; + insn = arm_ldl_code(env, &s->base, pc, s->sctlr_b); + s->insn = insn; + s->base.pc_next = pc + 4; + + s->fp_access_checked = false; + s->sve_access_checked = false; + + if (s->pstate_il) { + /* + * Illegal execution state. This has priority over BTI + * exceptions, but comes after instruction abort exceptions. + */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate()); + return; + } + + if (dc_isar_feature(aa64_bti, s)) { + if (s->base.num_insns == 1) { + /* + * At the first insn of the TB, compute s->guarded_page. + * We delayed computing this until successfully reading + * the first insn of the TB, above. This (mostly) ensures + * that the softmmu tlb entry has been populated, and the + * page table GP bit is available. + * + * Note that we need to compute this even if btype == 0, + * because this value is used for BR instructions later + * where ENV is not available. + */ + s->guarded_page = is_guarded_page(env, s); + + /* First insn can have btype set to non-zero. */ + tcg_debug_assert(s->btype >= 0); + + /* + * Note that the Branch Target Exception has fairly high + * priority -- below debugging exceptions but above most + * everything else. This allows us to handle this now + * instead of waiting until the insn is otherwise decoded. + */ + if (s->btype != 0 + && s->guarded_page + && !btype_destination_ok(insn, s->bt, s->btype)) { + gen_exception_insn(s, 0, EXCP_UDEF, syn_btitrap(s->btype)); + return; + } + } else { + /* Not the first insn: btype must be 0. 
*/ + tcg_debug_assert(s->btype == 0); + } + } + + s->is_nonstreaming = false; + if (s->sme_trap_nonstreaming) { + disas_sme_fa64(s, insn); + } + + switch (extract32(insn, 25, 4)) { + case 0x0: + if (!extract32(insn, 31, 1) || !disas_sme(s, insn)) { + unallocated_encoding(s); + } + break; + case 0x1: case 0x3: /* UNALLOCATED */ + unallocated_encoding(s); + break; + case 0x2: + if (!disas_sve(s, insn)) { + unallocated_encoding(s); + } + break; + case 0x8: case 0x9: /* Data processing - immediate */ + disas_data_proc_imm(s, insn); + break; + case 0xa: case 0xb: /* Branch, exception generation and system insns */ + disas_b_exc_sys(s, insn); + break; + case 0x4: + case 0x6: + case 0xc: + case 0xe: /* Loads and stores */ + disas_ldst(s, insn); + break; + case 0x5: + case 0xd: /* Data processing - register */ + disas_data_proc_reg(s, insn); + break; + case 0x7: + case 0xf: /* Data processing - SIMD and floating point */ + disas_data_proc_simd_fp(s, insn); + break; + default: + assert(FALSE); /* all 15 cases should be handled above */ + break; + } + + /* if we allocated any temporaries, free them here */ + free_tmp_a64(s); + + /* + * After execution of most insns, btype is reset to 0. + * Note that we set btype == -1 when the insn sets btype. + */ + if (s->btype > 0 && s->base.is_jmp != DISAS_NORETURN) { + reset_btype(s); + } + + translator_loop_temp_check(&s->base); +} + +static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + if (unlikely(dc->ss_active)) { + /* Note that this means single stepping WFI doesn't halt the CPU. + * For conditional branch insns this is harmless unreachable code as + * gen_goto_tb() has already handled emitting the debug exception + * (and thus a tb-jump is not possible when singlestepping). + */ + switch (dc->base.is_jmp) { + default: + gen_a64_update_pc(dc, 4); + /* fall through */ + case DISAS_EXIT: + case DISAS_JUMP: + gen_step_complete_exception(dc); + break; + case DISAS_NORETURN: + break; + } + } else { + switch (dc->base.is_jmp) { + case DISAS_NEXT: + case DISAS_TOO_MANY: + gen_goto_tb(dc, 1, 4); + break; + default: + case DISAS_UPDATE_EXIT: + gen_a64_update_pc(dc, 4); + /* fall through */ + case DISAS_EXIT: + tcg_gen_exit_tb(NULL, 0); + break; + case DISAS_UPDATE_NOCHAIN: + gen_a64_update_pc(dc, 4); + /* fall through */ + case DISAS_JUMP: + tcg_gen_lookup_and_goto_ptr(); + break; + case DISAS_NORETURN: + case DISAS_SWI: + break; + case DISAS_WFE: + gen_a64_update_pc(dc, 4); + gen_helper_wfe(cpu_env); + break; + case DISAS_YIELD: + gen_a64_update_pc(dc, 4); + gen_helper_yield(cpu_env); + break; + case DISAS_WFI: + /* + * This is a special case because we don't want to just halt + * the CPU if trying to debug across a WFI. + */ + gen_a64_update_pc(dc, 4); + gen_helper_wfi(cpu_env, tcg_constant_i32(4)); + /* + * The helper doesn't necessarily throw an exception, but we + * must go back to the main loop to check for interrupts anyway. 
+ */ + tcg_gen_exit_tb(NULL, 0); + break; + } + } +} + +static void aarch64_tr_disas_log(const DisasContextBase *dcbase, + CPUState *cpu, FILE *logfile) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first)); + target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size); +} + +const TranslatorOps aarch64_translator_ops = { + .init_disas_context = aarch64_tr_init_disas_context, + .tb_start = aarch64_tr_tb_start, + .insn_start = aarch64_tr_insn_start, + .translate_insn = aarch64_tr_translate_insn, + .tb_stop = aarch64_tr_tb_stop, + .disas_log = aarch64_tr_disas_log, +}; diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h new file mode 100644 index 0000000..ad3762d --- /dev/null +++ b/target/arm/tcg/translate-a64.h @@ -0,0 +1,201 @@ +/* + * AArch64 translation, common definitions. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#ifndef TARGET_ARM_TRANSLATE_A64_H +#define TARGET_ARM_TRANSLATE_A64_H + +TCGv_i64 new_tmp_a64(DisasContext *s); +TCGv_i64 new_tmp_a64_local(DisasContext *s); +TCGv_i64 new_tmp_a64_zero(DisasContext *s); +TCGv_i64 cpu_reg(DisasContext *s, int reg); +TCGv_i64 cpu_reg_sp(DisasContext *s, int reg); +TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf); +TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf); +void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v); +bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, + unsigned int imms, unsigned int immr); +bool sve_access_check(DisasContext *s); +bool sme_enabled_check(DisasContext *s); +bool sme_enabled_check_with_svcr(DisasContext *s, unsigned); + +/* This function corresponds to CheckStreamingSVEEnabled. */ +static inline bool sme_sm_enabled_check(DisasContext *s) +{ + return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK); +} + +/* This function corresponds to CheckSMEAndZAEnabled. */ +static inline bool sme_za_enabled_check(DisasContext *s) +{ + return sme_enabled_check_with_svcr(s, R_SVCR_ZA_MASK); +} + +/* Note that this function corresponds to CheckStreamingSVEAndZAEnabled. */ +static inline bool sme_smza_enabled_check(DisasContext *s) +{ + return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK | R_SVCR_ZA_MASK); +} + +TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr); +TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_size); +TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int size); + +/* We should have at some point before trying to access an FP register + * done the necessary access check, so assert that + * (a) we did the check and + * (b) we didn't then just plough ahead anyway if it failed. + * Print the instruction pattern in the abort message so we can figure + * out what we need to fix if a user encounters this problem in the wild. 
+ */ +static inline void assert_fp_access_checked(DisasContext *s) +{ +#ifdef CONFIG_DEBUG_TCG + if (unlikely(!s->fp_access_checked || s->fp_excp_el)) { + fprintf(stderr, "target-arm: FP access check missing for " + "instruction 0x%08x\n", s->insn); + abort(); + } +#endif +} + +/* Return the offset into CPUARMState of an element of specified + * size, 'element' places in from the least significant end of + * the FP/vector register Qn. + */ +static inline int vec_reg_offset(DisasContext *s, int regno, + int element, MemOp size) +{ + int element_size = 1 << size; + int offs = element * element_size; +#if HOST_BIG_ENDIAN + /* This is complicated slightly because vfp.zregs[n].d[0] is + * still the lowest and vfp.zregs[n].d[15] the highest of the + * 256 byte vector, even on big endian systems. + * + * Calculate the offset assuming fully little-endian, + * then XOR to account for the order of the 8-byte units. + * + * For 16 byte elements, the two 8 byte halves will not form a + * host int128 if the host is bigendian, since they're in the + * wrong order. However the only 16 byte operation we have is + * a move, so we can ignore this for the moment. More complicated + * operations will have to special case loading and storing from + * the zregs array. + */ + if (element_size < 8) { + offs ^= 8 - element_size; + } +#endif + offs += offsetof(CPUARMState, vfp.zregs[regno]); + assert_fp_access_checked(s); + return offs; +} + +/* Return the offset info CPUARMState of the "whole" vector register Qn. */ +static inline int vec_full_reg_offset(DisasContext *s, int regno) +{ + assert_fp_access_checked(s); + return offsetof(CPUARMState, vfp.zregs[regno]); +} + +/* Return a newly allocated pointer to the vector register. */ +static inline TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, vec_full_reg_offset(s, regno)); + return ret; +} + +/* Return the byte size of the "whole" vector register, VL / 8. */ +static inline int vec_full_reg_size(DisasContext *s) +{ + return s->vl; +} + +/* Return the byte size of the vector register, SVL / 8. */ +static inline int streaming_vec_reg_size(DisasContext *s) +{ + return s->svl; +} + +/* + * Return the offset info CPUARMState of the predicate vector register Pn. + * Note for this purpose, FFR is P16. + */ +static inline int pred_full_reg_offset(DisasContext *s, int regno) +{ + return offsetof(CPUARMState, vfp.pregs[regno]); +} + +/* Return the byte size of the whole predicate register, VL / 64. */ +static inline int pred_full_reg_size(DisasContext *s) +{ + return s->vl >> 3; +} + +/* Return the byte size of the predicate register, SVL / 64. */ +static inline int streaming_pred_reg_size(DisasContext *s) +{ + return s->svl >> 3; +} + +/* + * Round up the size of a register to a size allowed by + * the tcg vector infrastructure. Any operation which uses this + * size may assume that the bits above pred_full_reg_size are zero, + * and must leave them the same way. + * + * Note that this is not needed for the vector registers as they + * are always properly sized for tcg vectors. + */ +static inline int size_for_gvec(int size) +{ + if (size <= 8) { + return 8; + } else { + return QEMU_ALIGN_UP(size, 16); + } +} + +static inline int pred_gvec_reg_size(DisasContext *s) +{ + return size_for_gvec(pred_full_reg_size(s)); +} + +/* Return a newly allocated pointer to the predicate register. 
*/ +static inline TCGv_ptr pred_full_reg_ptr(DisasContext *s, int regno) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, pred_full_reg_offset(s, regno)); + return ret; +} + +bool disas_sve(DisasContext *, uint32_t); +bool disas_sme(DisasContext *, uint32_t); + +void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, int64_t shift, + uint32_t opr_sz, uint32_t max_sz); + +void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); +void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); + +#endif /* TARGET_ARM_TRANSLATE_A64_H */ diff --git a/target/arm/tcg/translate-m-nocp.c b/target/arm/tcg/translate-m-nocp.c new file mode 100644 index 0000000..5df7d46 --- /dev/null +++ b/target/arm/tcg/translate-m-nocp.c @@ -0,0 +1,788 @@ +/* + * ARM translation: M-profile NOCP special-case instructions + * + * Copyright (c) 2020 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "translate.h" +#include "translate-a32.h" + +#include "decode-m-nocp.c.inc" + +/* + * Decode VLLDM and VLSTM are nonstandard because: + * * if there is no FPU then these insns must NOP in + * Secure state and UNDEF in Nonsecure state + * * if there is an FPU then these insns do not have + * the usual behaviour that vfp_access_check() provides of + * being controlled by CPACR/NSACR enable bits or the + * lazy-stacking logic. + */ +static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a) +{ + TCGv_i32 fptr; + + if (!arm_dc_feature(s, ARM_FEATURE_M) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + if (a->op) { + /* + * T2 encoding ({D0-D31} reglist): v8.1M and up. We choose not + * to take the IMPDEF option to make memory accesses to the stack + * slots that correspond to the D16-D31 registers (discarding + * read data and writing UNKNOWN values), so for us the T2 + * encoding behaves identically to the T1 encoding. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + } else { + /* + * T1 encoding ({D0-D15} reglist); undef if we have 32 Dregs. + * This is currently architecturally impossible, but we add the + * check to stay in line with the pseudocode. Note that we must + * emit code for the UNDEF so it takes precedence over the NOCP. + */ + if (dc_isar_feature(aa32_simd_r32, s)) { + unallocated_encoding(s); + return true; + } + } + + /* + * If not secure, UNDEF. We must emit code for this + * rather than returning false so that this takes + * precedence over the m-nocp.decode NOCP fallback. + */ + if (!s->v8m_secure) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + /* If no fpu, NOP. 
*/ + if (!dc_isar_feature(aa32_vfp, s)) { + clear_eci_state(s); + return true; + } + + fptr = load_reg(s, a->rn); + if (a->l) { + gen_helper_v7m_vlldm(cpu_env, fptr); + } else { + gen_helper_v7m_vlstm(cpu_env, fptr); + } + tcg_temp_free_i32(fptr); + + clear_eci_state(s); + + /* + * End the TB, because we have updated FP control bits, + * and possibly VPR or LTPSIZE. + */ + s->base.is_jmp = DISAS_UPDATE_EXIT; + return true; +} + +static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) +{ + int btmreg, topreg; + TCGv_i64 zero; + TCGv_i32 aspen, sfpa; + + if (!dc_isar_feature(aa32_m_sec_state, s)) { + /* Before v8.1M, fall through in decode to NOCP check */ + return false; + } + + /* Explicitly UNDEF because this takes precedence over NOCP */ + if (!arm_dc_feature(s, ARM_FEATURE_M_MAIN) || !s->v8m_secure) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + if (!dc_isar_feature(aa32_vfp_simd, s)) { + /* NOP if we have neither FP nor MVE */ + clear_eci_state(s); + return true; + } + + /* + * If FPCCR.ASPEN != 0 && CONTROL_S.SFPA == 0 then there is no + * active floating point context so we must NOP (without doing + * any lazy state preservation or the NOCP check). + */ + aspen = load_cpu_field(v7m.fpccr[M_REG_S]); + sfpa = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_andi_i32(sfpa, sfpa, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_or_i32(sfpa, sfpa, aspen); + arm_gen_condlabel(s); + tcg_gen_brcondi_i32(TCG_COND_EQ, sfpa, 0, s->condlabel.label); + + if (s->fp_excp_el != 0) { + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + + topreg = a->vd + a->imm - 1; + btmreg = a->vd; + + /* Convert to Sreg numbers if the insn specified in Dregs */ + if (a->size == 3) { + topreg = topreg * 2 + 1; + btmreg *= 2; + } + + if (topreg > 63 || (topreg > 31 && !(topreg & 1))) { + /* UNPREDICTABLE: we choose to undef */ + unallocated_encoding(s); + return true; + } + + /* Silently ignore requests to clear D16-D31 if they don't exist */ + if (topreg > 31 && !dc_isar_feature(aa32_simd_r32, s)) { + topreg = 31; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Zero the Sregs from btmreg to topreg inclusive. */ + zero = tcg_constant_i64(0); + if (btmreg & 1) { + write_neon_element64(zero, btmreg >> 1, 1, MO_32); + btmreg++; + } + for (; btmreg + 1 <= topreg; btmreg += 2) { + write_neon_element64(zero, btmreg >> 1, 0, MO_64); + } + if (btmreg == topreg) { + write_neon_element64(zero, btmreg >> 1, 0, MO_32); + btmreg++; + } + assert(btmreg == topreg + 1); + if (dc_isar_feature(aa32_mve, s)) { + store_cpu_field(tcg_constant_i32(0), v7m.vpr); + } + + clear_eci_state(s); + return true; +} + +/* + * M-profile provides two different sets of instructions that can + * access floating point system registers: VMSR/VMRS (which move + * to/from a general purpose register) and VLDR/VSTR sysreg (which + * move directly to/from memory). In some cases there are also side + * effects which must happen after any write to memory (which could + * cause an exception). So we implement the common logic for the + * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), + * which take pointers to callback functions which will perform the + * actual "read/write general purpose register" and "read/write + * memory" operations. 
+ */ + +/* + * Emit code to store the sysreg to its final destination; frees the + * TCG temp 'value' it is passed. do_access is true to do the store, + * and false to skip it and only perform side-effects like base + * register writeback. + */ +typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access); +/* + * Emit code to load the value to be copied to the sysreg; returns + * a new TCG temporary. do_access is true to do the store, + * and false to skip it and only perform side-effects like base + * register writeback. + */ +typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque, + bool do_access); + +/* Common decode/access checks for fp sysreg read/write */ +typedef enum FPSysRegCheckResult { + FPSysRegCheckFailed, /* caller should return false */ + FPSysRegCheckDone, /* caller should return true */ + FPSysRegCheckContinue, /* caller should continue generating code */ +} FPSysRegCheckResult; + +static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno) +{ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return FPSysRegCheckFailed; + } + + switch (regno) { + case ARM_VFP_FPSCR: + case QEMU_VFP_FPSCR_NZCV: + break; + case ARM_VFP_FPSCR_NZCVQC: + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return FPSysRegCheckFailed; + } + break; + case ARM_VFP_FPCXT_S: + case ARM_VFP_FPCXT_NS: + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return FPSysRegCheckFailed; + } + if (!s->v8m_secure) { + return FPSysRegCheckFailed; + } + break; + case ARM_VFP_VPR: + case ARM_VFP_P0: + if (!dc_isar_feature(aa32_mve, s)) { + return FPSysRegCheckFailed; + } + break; + default: + return FPSysRegCheckFailed; + } + + /* + * FPCXT_NS is a special case: it has specific handling for + * "current FP state is inactive", and must do the PreserveFPState() + * but not the usual full set of actions done by ExecuteFPCheck(). + * So we don't call vfp_access_check() and the callers must handle this. + */ + if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) { + return FPSysRegCheckDone; + } + return FPSysRegCheckContinue; +} + +static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, + TCGLabel *label) +{ + /* + * FPCXT_NS is a special case: it has specific handling for + * "current FP state is inactive", and must do the PreserveFPState() + * but not the usual full set of actions done by ExecuteFPCheck(). + * We don't have a TB flag that matches the fpInactive check, so we + * do it at runtime as we don't expect FPCXT_NS accesses to be frequent. 
+ * + * Emit code that checks fpInactive and does a conditional + * branch to label based on it: + * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) + * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) + */ + assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); + + /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ + TCGv_i32 aspen, fpca; + aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); + fpca = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); + tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); + tcg_gen_or_i32(fpca, fpca, aspen); + tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); + tcg_temp_free_i32(aspen); + tcg_temp_free_i32(fpca); +} + +static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, + fp_sysreg_loadfn *loadfn, + void *opaque) +{ + /* Do a write to an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = loadfn(s, opaque, true); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + break; + case ARM_VFP_FPSCR_NZCVQC: + { + TCGv_i32 fpscr; + tmp = loadfn(s, opaque, true); + if (dc_isar_feature(aa32_mve, s)) { + /* QC is only present for MVE; otherwise RES0 */ + TCGv_i32 qc = tcg_temp_new_i32(); + tcg_gen_andi_i32(qc, tmp, FPCR_QC); + /* + * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; + * here writing the same value into all elements is simplest. + */ + tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), + 16, 16, qc); + } + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); + tcg_gen_or_i32(fpscr, fpscr, tmp); + store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); + tcg_temp_free_i32(tmp); + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGLabel *lab_active = gen_new_label(); + + lab_end = gen_new_label(); + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* + * fpInactive case: write is a NOP, so only do side effects + * like register writeback before we branch to end + */ + loadfn(s, opaque, false); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS writes + * behave the same as FPCXT_S writes. + */ + if (!vfp_access_check_m(s, true)) { + /* + * This was only a conditional exception, so override + * gen_exception_insn_el()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + } + /* fall through */ + case ARM_VFP_FPCXT_S: + { + TCGv_i32 sfpa, control; + /* + * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes + * bits [27:0] from value and zeroes bits [31:28]. 
+ */ + tmp = loadfn(s, opaque, true); + sfpa = tcg_temp_new_i32(); + tcg_gen_shri_i32(sfpa, tmp, 31); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_deposit_i32(control, control, sfpa, + R_V7M_CONTROL_SFPA_SHIFT, 1); + store_cpu_field(control, v7m.control[M_REG_S]); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(sfpa); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + loadfn(s, opaque, false); + break; + } + tmp = loadfn(s, opaque, true); + store_cpu_field(tmp, v7m.vpr); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + break; + case ARM_VFP_P0: + { + TCGv_i32 vpr; + tmp = loadfn(s, opaque, true); + vpr = load_cpu_field(v7m.vpr); + tcg_gen_deposit_i32(vpr, vpr, tmp, + R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + store_cpu_field(vpr, v7m.vpr); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + tcg_temp_free_i32(tmp); + break; + } + default: + g_assert_not_reached(); + } + if (lab_end) { + gen_set_label(lab_end); + } + return true; +} + +static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, + fp_sysreg_storefn *storefn, + void *opaque) +{ + /* Do a read from an M-profile floating point system register */ + TCGv_i32 tmp; + TCGLabel *lab_end = NULL; + bool lookup_tb = false; + + switch (fp_sysreg_checks(s, regno)) { + case FPSysRegCheckFailed: + return false; + case FPSysRegCheckDone: + return true; + case FPSysRegCheckContinue: + break; + } + + if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { + /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ + regno = QEMU_VFP_FPSCR_NZCV; + } + + switch (regno) { + case ARM_VFP_FPSCR: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_FPSCR_NZCVQC: + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); + storefn(s, opaque, tmp, true); + break; + case QEMU_VFP_FPSCR_NZCV: + /* + * Read just NZCV; this is a special case to avoid the + * helper call for the "VMRS to CPSR.NZCV" insn. + */ + tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_FPCXT_S: + { + TCGv_i32 control, sfpa, fpscr; + /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(sfpa); + /* + * Store result before updating FPSCR etc, in case + * it is a memory write which causes an exception. + */ + storefn(s, opaque, tmp, true); + /* + * Now we must reset FPSCR from FPDSCR_NS, and clear + * CONTROL.SFPA; so we'll end the TB here. 
+ */ + tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); + store_cpu_field(control, v7m.control[M_REG_S]); + fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + lookup_tb = true; + break; + } + case ARM_VFP_FPCXT_NS: + { + TCGv_i32 control, sfpa, fpscr, fpdscr; + TCGLabel *lab_active = gen_new_label(); + + lookup_tb = true; + + gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); + /* fpInactive case: reads as FPDSCR_NS */ + TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); + storefn(s, opaque, tmp, true); + lab_end = gen_new_label(); + tcg_gen_br(lab_end); + + gen_set_label(lab_active); + /* + * !fpInactive: if FPU disabled, take NOCP exception; + * otherwise PreserveFPState(), and then FPCXT_NS + * reads the same as FPCXT_S. + */ + if (!vfp_access_check_m(s, true)) { + /* + * This was only a conditional exception, so override + * gen_exception_insn_el()'s default to DISAS_NORETURN + */ + s->base.is_jmp = DISAS_NEXT; + break; + } + tmp = tcg_temp_new_i32(); + sfpa = tcg_temp_new_i32(); + fpscr = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(fpscr, cpu_env); + tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); + tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); + tcg_gen_or_i32(tmp, tmp, sfpa); + tcg_temp_free_i32(control); + /* Store result before updating FPSCR, in case it faults */ + storefn(s, opaque, tmp, true); + /* If SFPA is zero then set FPSCR from FPDSCR_NS */ + fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); + tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, tcg_constant_i32(0), + fpdscr, fpscr); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(sfpa); + tcg_temp_free_i32(fpdscr); + tcg_temp_free_i32(fpscr); + break; + } + case ARM_VFP_VPR: + /* Behaves as NOP if not privileged */ + if (IS_USER(s)) { + storefn(s, opaque, NULL, false); + break; + } + tmp = load_cpu_field(v7m.vpr); + storefn(s, opaque, tmp, true); + break; + case ARM_VFP_P0: + tmp = load_cpu_field(v7m.vpr); + tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); + storefn(s, opaque, tmp, true); + break; + default: + g_assert_not_reached(); + } + + if (lab_end) { + gen_set_label(lab_end); + } + if (lookup_tb) { + gen_lookup_tb(s); + } + return true; +} + +static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) +{ + arg_VMSR_VMRS *a = opaque; + + if (!do_access) { + return; + } + + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR */ + gen_set_nzcv(value); + tcg_temp_free_i32(value); + } else { + store_reg(s, a->rt, value); + } +} + +static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access) +{ + arg_VMSR_VMRS *a = opaque; + + if (!do_access) { + return NULL; + } + return load_reg(s, a->rt); +} + +static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) +{ + /* + * Accesses to R15 are UNPREDICTABLE; we choose to undef. + * FPSCR -> r15 is a special case which writes to the PSR flags; + * set a->reg to a special value to tell gen_M_fp_sysreg_read() + * we only care about the top 4 bits of FPSCR there. 
+ */ + if (a->rt == 15) { + if (a->l && a->reg == ARM_VFP_FPSCR) { + a->reg = QEMU_VFP_FPSCR_NZCV; + } else { + return false; + } + } + + if (a->l) { + /* VMRS, move FP system register to gp register */ + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); + } else { + /* VMSR, move gp register to FP system register */ + return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); + } +} + +static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value, + bool do_access) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + + if (!a->a) { + offset = -offset; + } + + if (!do_access && !a->w) { + return; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (do_access) { + gen_aa32_st_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + tcg_temp_free_i32(value); + } + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } +} + +static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque, + bool do_access) +{ + arg_vldr_sysreg *a = opaque; + uint32_t offset = a->imm; + TCGv_i32 addr; + TCGv_i32 value = NULL; + + if (!a->a) { + offset = -offset; + } + + if (!do_access && !a->w) { + return NULL; + } + + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (do_access) { + value = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, value, addr, get_mem_index(s), + MO_UL | MO_ALIGN | s->be_data); + } + + if (a->w) { + /* writeback */ + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + return value; +} + +static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); +} + +static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + if (a->rn == 15) { + return false; + } + return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); +} + +static bool trans_NOCP(DisasContext *s, arg_nocp *a) +{ + /* + * Handle M-profile early check for disabled coprocessor: + * all we need to do here is emit the NOCP exception if + * the coprocessor is disabled. Otherwise we return false + * and the real VFP/etc decode will handle the insn. 
+ */ + assert(arm_dc_feature(s, ARM_FEATURE_M)); + + if (a->cp == 11) { + a->cp = 10; + } + if (arm_dc_feature(s, ARM_FEATURE_V8_1M) && + (a->cp == 8 || a->cp == 9 || a->cp == 14 || a->cp == 15)) { + /* in v8.1M cp 8, 9, 14, 15 also are governed by the cp10 enable */ + a->cp = 10; + } + + if (a->cp != 10) { + gen_exception_insn(s, 0, EXCP_NOCP, syn_uncategorized()); + return true; + } + + if (s->fp_excp_el != 0) { + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + + return false; +} + +static bool trans_NOCP_8_1(DisasContext *s, arg_nocp *a) +{ + /* This range needs a coprocessor check for v8.1M and later only */ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + return trans_NOCP(s, a); +} diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c new file mode 100644 index 0000000..db7ea3f --- /dev/null +++ b/target/arm/tcg/translate-mve.c @@ -0,0 +1,2310 @@ +/* + * ARM translation: M-profile MVE instructions + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +static inline int vidup_imm(DisasContext *s, int x) +{ + return 1 << x; +} + +/* Include the generated decoder */ +#include "decode-mve.c.inc" + +typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenLdStSGFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenLdStIlFn(TCGv_ptr, TCGv_i32, TCGv_i32); +typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenTwoOpShiftFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenLongDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64); +typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenOneOpImmFn(TCGv_ptr, TCGv_ptr, TCGv_i64); +typedef void MVEGenVIDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32); +typedef void MVEGenVIWDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32); +typedef void MVEGenCmpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void MVEGenScalarCmpFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenVABAVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenDualAccOpFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void MVEGenVCVTRmodeFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); + +/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ +static inline long mve_qreg_offset(unsigned reg) +{ + return offsetof(CPUARMState, vfp.zregs[reg].d[0]); +} + +static TCGv_ptr mve_qreg_ptr(unsigned reg) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, 
cpu_env, mve_qreg_offset(reg)); + return ret; +} + +static bool mve_no_predication(DisasContext *s) +{ + /* + * Return true if we are executing the entire MVE instruction + * with no predication or partial-execution, and so we can safely + * use an inline TCG vector implementation. + */ + return s->eci == 0 && s->mve_no_pred; +} + +static bool mve_check_qreg_bank(DisasContext *s, int qmask) +{ + /* + * Check whether Qregs are in range. For v8.1M only Q0..Q7 + * are supported, see VFPSmallRegisterBank(). + */ + return qmask < 8; +} + +bool mve_eci_check(DisasContext *s) +{ + /* + * This is a beatwise insn: check that ECI is valid (not a + * reserved value) and note that we are handling it. + * Return true if OK, false if we generated an exception. + */ + s->eci_handled = true; + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + /* Reserved value: INVSTATE UsageFault */ + gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); + return false; + } +} + +void mve_update_eci(DisasContext *s) +{ + /* + * The helper function will always update the CPUState field, + * so we only need to update the DisasContext field. + */ + if (s->eci) { + s->eci = (s->eci == ECI_A0A1A2B0) ? ECI_A0 : ECI_NONE; + } +} + +void mve_update_and_store_eci(DisasContext *s) +{ + /* + * For insns which don't call a helper function that will call + * mve_advance_vpt(), this version updates s->eci and also stores + * it out to the CPUState field. + */ + if (s->eci) { + mve_update_eci(s); + store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits); + } +} + +static bool mve_skip_first_beat(DisasContext *s) +{ + /* Return true if PSR.ECI says we must skip the first beat of this insn */ + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return true; + default: + g_assert_not_reached(); + } +} + +static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn, + unsigned msize) +{ + TCGv_i32 addr; + uint32_t offset; + TCGv_ptr qreg; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn) { + return false; + } + + /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + if (a->rn == 15 || (a->rn == 13 && a->w)) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + offset = a->imm << msize; + if (!a->a) { + offset = -offset; + } + addr = load_reg(s, a->rn); + if (a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + + qreg = mve_qreg_ptr(a->qd); + fn(cpu_env, qreg, addr); + tcg_temp_free_ptr(qreg); + + /* + * Writeback always happens after the last beat of the insn, + * regardless of predication + */ + if (a->w) { + if (!a->p) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + mve_update_eci(s); + return true; +} + +static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) +{ + static MVEGenLdStFn * const ldstfns[4][2] = { + { gen_helper_mve_vstrb, gen_helper_mve_vldrb }, + { gen_helper_mve_vstrh, gen_helper_mve_vldrh }, + { gen_helper_mve_vstrw, gen_helper_mve_vldrw }, + { NULL, NULL } + }; + return do_ldst(s, a, ldstfns[a->size][a->l], a->size); +} + +#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST, MSIZE) \ + static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \ + { \ + static MVEGenLdStFn * const ldstfns[2][2] = { \ + { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \ + { NULL, gen_helper_mve_##ULD }, \ + }; \ 
+ return do_ldst(s, a, ldstfns[a->u][a->l], MSIZE); \ + } + +DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h, MO_8) +DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w, MO_8) +DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w, MO_16) + +static bool do_ldst_sg(DisasContext *s, arg_vldst_sg *a, MVEGenLdStSGFn fn) +{ + TCGv_i32 addr; + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn || a->rn == 15) { + /* Rn case is UNPREDICTABLE */ + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + addr = load_reg(s, a->rn); + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, addr); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + tcg_temp_free_i32(addr); + mve_update_eci(s); + return true; +} + +/* + * The naming scheme here is "vldrb_sg_sh == in-memory byte loads + * signextended to halfword elements in register". _os_ indicates that + * the offsets in Qm should be scaled by the element size. + */ +/* This macro is just to make the arrays more compact in these functions */ +#define F(N) gen_helper_mve_##N + +/* VLDRB/VSTRB (ie msize 1) with OS=1 is UNPREDICTABLE; we UNDEF */ +static bool trans_VLDR_S_sg(DisasContext *s, arg_vldst_sg *a) +{ + static MVEGenLdStSGFn * const fns[2][4][4] = { { + { NULL, F(vldrb_sg_sh), F(vldrb_sg_sw), NULL }, + { NULL, NULL, F(vldrh_sg_sw), NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } + }, { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, F(vldrh_sg_os_sw), NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } + } + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); +} + +static bool trans_VLDR_U_sg(DisasContext *s, arg_vldst_sg *a) +{ + static MVEGenLdStSGFn * const fns[2][4][4] = { { + { F(vldrb_sg_ub), F(vldrb_sg_uh), F(vldrb_sg_uw), NULL }, + { NULL, F(vldrh_sg_uh), F(vldrh_sg_uw), NULL }, + { NULL, NULL, F(vldrw_sg_uw), NULL }, + { NULL, NULL, NULL, F(vldrd_sg_ud) } + }, { + { NULL, NULL, NULL, NULL }, + { NULL, F(vldrh_sg_os_uh), F(vldrh_sg_os_uw), NULL }, + { NULL, NULL, F(vldrw_sg_os_uw), NULL }, + { NULL, NULL, NULL, F(vldrd_sg_os_ud) } + } + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); +} + +static bool trans_VSTR_sg(DisasContext *s, arg_vldst_sg *a) +{ + static MVEGenLdStSGFn * const fns[2][4][4] = { { + { F(vstrb_sg_ub), F(vstrb_sg_uh), F(vstrb_sg_uw), NULL }, + { NULL, F(vstrh_sg_uh), F(vstrh_sg_uw), NULL }, + { NULL, NULL, F(vstrw_sg_uw), NULL }, + { NULL, NULL, NULL, F(vstrd_sg_ud) } + }, { + { NULL, NULL, NULL, NULL }, + { NULL, F(vstrh_sg_os_uh), F(vstrh_sg_os_uw), NULL }, + { NULL, NULL, F(vstrw_sg_os_uw), NULL }, + { NULL, NULL, NULL, F(vstrd_sg_os_ud) } + } + }; + return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); +} + +#undef F + +static bool do_ldst_sg_imm(DisasContext *s, arg_vldst_sg_imm *a, + MVEGenLdStSGFn *fn, unsigned msize) +{ + uint32_t offset; + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + offset = a->imm << msize; + if (!a->a) { + offset = -offset; + } + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, tcg_constant_i32(offset)); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return 
true; +} + +static bool trans_VLDRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vldrw_sg_uw, + gen_helper_mve_vldrw_sg_wb_uw, + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg_imm(s, a, fns[a->w], MO_32); +} + +static bool trans_VLDRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vldrd_sg_ud, + gen_helper_mve_vldrd_sg_wb_ud, + }; + if (a->qd == a->qm) { + return false; /* UNPREDICTABLE */ + } + return do_ldst_sg_imm(s, a, fns[a->w], MO_64); +} + +static bool trans_VSTRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vstrw_sg_uw, + gen_helper_mve_vstrw_sg_wb_uw, + }; + return do_ldst_sg_imm(s, a, fns[a->w], MO_32); +} + +static bool trans_VSTRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) +{ + static MVEGenLdStSGFn * const fns[] = { + gen_helper_mve_vstrd_sg_ud, + gen_helper_mve_vstrd_sg_wb_ud, + }; + return do_ldst_sg_imm(s, a, fns[a->w], MO_64); +} + +static bool do_vldst_il(DisasContext *s, arg_vldst_il *a, MVEGenLdStIlFn *fn, + int addrinc) +{ + TCGv_i32 rn; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn || (a->rn == 13 && a->w) || a->rn == 15) { + /* Variously UNPREDICTABLE or UNDEF or related-encoding */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + rn = load_reg(s, a->rn); + /* + * We pass the index of Qd, not a pointer, because the helper must + * access multiple Q registers starting at Qd and working up. + */ + fn(cpu_env, tcg_constant_i32(a->qd), rn); + + if (a->w) { + tcg_gen_addi_i32(rn, rn, addrinc); + store_reg(s, a->rn, rn); + } else { + tcg_temp_free_i32(rn); + } + mve_update_and_store_eci(s); + return true; +} + +/* This macro is just to make the arrays more compact in these functions */ +#define F(N) gen_helper_mve_##N + +static bool trans_VLD2(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vld20b), F(vld20h), F(vld20w), NULL, }, + { F(vld21b), F(vld21h), F(vld21w), NULL, }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }; + if (a->qd > 6) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 32); +} + +static bool trans_VLD4(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vld40b), F(vld40h), F(vld40w), NULL, }, + { F(vld41b), F(vld41h), F(vld41w), NULL, }, + { F(vld42b), F(vld42h), F(vld42w), NULL, }, + { F(vld43b), F(vld43h), F(vld43w), NULL, }, + }; + if (a->qd > 4) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 64); +} + +static bool trans_VST2(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vst20b), F(vst20h), F(vst20w), NULL, }, + { F(vst21b), F(vst21h), F(vst21w), NULL, }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }; + if (a->qd > 6) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 32); +} + +static bool trans_VST4(DisasContext *s, arg_vldst_il *a) +{ + static MVEGenLdStIlFn * const fns[4][4] = { + { F(vst40b), F(vst40h), F(vst40w), NULL, }, + { F(vst41b), F(vst41h), F(vst41w), NULL, }, + { F(vst42b), F(vst42h), F(vst42w), NULL, }, + { F(vst43b), F(vst43h), F(vst43w), NULL, }, + }; + if (a->qd > 4) { + return false; + } + return do_vldst_il(s, a, fns[a->pat][a->size], 64); +} + +#undef F + +static bool trans_VDUP(DisasContext *s, arg_VDUP *a) +{ 
+ TCGv_ptr qd; + TCGv_i32 rt; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd)) { + return false; + } + if (a->rt == 13 || a->rt == 15) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + rt = load_reg(s, a->rt); + if (mve_no_predication(s)) { + tcg_gen_gvec_dup_i32(a->size, mve_qreg_offset(a->qd), 16, 16, rt); + } else { + qd = mve_qreg_ptr(a->qd); + tcg_gen_dup_i32(a->size, rt, rt); + gen_helper_mve_vdup(cpu_env, qd, rt); + tcg_temp_free_ptr(qd); + } + tcg_temp_free_i32(rt); + mve_update_eci(s); + return true; +} + +static bool do_1op_vec(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn, + GVecGen2Fn vecfn) +{ + TCGv_ptr qd, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + if (vecfn && mve_no_predication(s)) { + vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + } + mve_update_eci(s); + return true; +} + +static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) +{ + return do_1op_vec(s, a, fn, NULL); +} + +#define DO_1OP_VEC(INSN, FN, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_1op_vec(s, a, fns[a->size], VECFN); \ + } + +#define DO_1OP(INSN, FN) DO_1OP_VEC(INSN, FN, NULL) + +DO_1OP(VCLZ, vclz) +DO_1OP(VCLS, vcls) +DO_1OP_VEC(VABS, vabs, tcg_gen_gvec_abs) +DO_1OP_VEC(VNEG, vneg, tcg_gen_gvec_neg) +DO_1OP(VQABS, vqabs) +DO_1OP(VQNEG, vqneg) +DO_1OP(VMAXA, vmaxa) +DO_1OP(VMINA, vmina) + +/* + * For simple float/int conversions we use the fixed-point + * conversion helpers with a zero shift count + */ +#define DO_VCVT(INSN, HFN, SFN) \ + static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_##HFN(env, qd, qm, tcg_constant_i32(0)); \ + } \ + static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_##SFN(env, qd, qm, tcg_constant_i32(0)); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + NULL, \ + gen_##INSN##h, \ + gen_##INSN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_VCVT(VCVT_SF, vcvt_sh, vcvt_sf) +DO_VCVT(VCVT_UF, vcvt_uh, vcvt_uf) +DO_VCVT(VCVT_FS, vcvt_hs, vcvt_fs) +DO_VCVT(VCVT_FU, vcvt_hu, vcvt_fu) + +static bool do_vcvt_rmode(DisasContext *s, arg_1op *a, + enum arm_fprounding rmode, bool u) +{ + /* + * Handle VCVT fp to int with specified rounding mode. + * This is a 1op fn but we must pass the rounding mode as + * an immediate to the helper. 
+ */ + TCGv_ptr qd, qm; + static MVEGenVCVTRmodeFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vcvt_rm_sh, gen_helper_mve_vcvt_rm_uh }, + { gen_helper_mve_vcvt_rm_ss, gen_helper_mve_vcvt_rm_us }, + { NULL, NULL }, + }; + MVEGenVCVTRmodeFn *fn = fns[a->size][u]; + + if (!dc_isar_feature(aa32_mve_fp, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, tcg_constant_i32(arm_rmode_to_sf(rmode))); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_VCVT_RMODE(INSN, RMODE, U) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + return do_vcvt_rmode(s, a, RMODE, U); \ + } \ + +DO_VCVT_RMODE(VCVTAS, FPROUNDING_TIEAWAY, false) +DO_VCVT_RMODE(VCVTAU, FPROUNDING_TIEAWAY, true) +DO_VCVT_RMODE(VCVTNS, FPROUNDING_TIEEVEN, false) +DO_VCVT_RMODE(VCVTNU, FPROUNDING_TIEEVEN, true) +DO_VCVT_RMODE(VCVTPS, FPROUNDING_POSINF, false) +DO_VCVT_RMODE(VCVTPU, FPROUNDING_POSINF, true) +DO_VCVT_RMODE(VCVTMS, FPROUNDING_NEGINF, false) +DO_VCVT_RMODE(VCVTMU, FPROUNDING_NEGINF, true) + +#define DO_VCVT_SH(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_1op(s, a, gen_helper_mve_##FN); \ + } \ + +DO_VCVT_SH(VCVTB_SH, vcvtb_sh) +DO_VCVT_SH(VCVTT_SH, vcvtt_sh) +DO_VCVT_SH(VCVTB_HS, vcvtb_hs) +DO_VCVT_SH(VCVTT_HS, vcvtt_hs) + +#define DO_VRINT(INSN, RMODE) \ + static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_vrint_rm_h(env, qd, qm, \ + tcg_constant_i32(arm_rmode_to_sf(RMODE))); \ + } \ + static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ + { \ + gen_helper_mve_vrint_rm_s(env, qd, qm, \ + tcg_constant_i32(arm_rmode_to_sf(RMODE))); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + NULL, \ + gen_##INSN##h, \ + gen_##INSN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_VRINT(VRINTN, FPROUNDING_TIEEVEN) +DO_VRINT(VRINTA, FPROUNDING_TIEAWAY) +DO_VRINT(VRINTZ, FPROUNDING_ZERO) +DO_VRINT(VRINTM, FPROUNDING_NEGINF) +DO_VRINT(VRINTP, FPROUNDING_POSINF) + +static bool trans_VRINTX(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vrintx_h, + gen_helper_mve_vrintx_s, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +/* Narrowing moves: only size 0 and 1 are valid */ +#define DO_VMOVN(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_1op *a) \ + { \ + static MVEGenOneOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + NULL, \ + NULL, \ + }; \ + return do_1op(s, a, fns[a->size]); \ + } + +DO_VMOVN(VMOVNB, vmovnb) +DO_VMOVN(VMOVNT, vmovnt) +DO_VMOVN(VQMOVUNB, vqmovunb) +DO_VMOVN(VQMOVUNT, vqmovunt) +DO_VMOVN(VQMOVN_BS, vqmovnbs) +DO_VMOVN(VQMOVN_TS, vqmovnts) +DO_VMOVN(VQMOVN_BU, vqmovnbu) +DO_VMOVN(VQMOVN_TU, vqmovntu) + +static bool trans_VREV16(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev16b, + NULL, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV32(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { 
+ gen_helper_mve_vrev32b, + gen_helper_mve_vrev32h, + NULL, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VREV64(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + gen_helper_mve_vrev64b, + gen_helper_mve_vrev64h, + gen_helper_mve_vrev64w, + NULL, + }; + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VMVN(DisasContext *s, arg_1op *a) +{ + return do_1op_vec(s, a, gen_helper_mve_vmvn, tcg_gen_gvec_not); +} + +static bool trans_VABS_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfabsh, + gen_helper_mve_vfabss, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +static bool trans_VNEG_fp(DisasContext *s, arg_1op *a) +{ + static MVEGenOneOpFn * const fns[] = { + NULL, + gen_helper_mve_vfnegh, + gen_helper_mve_vfnegs, + NULL, + }; + if (!dc_isar_feature(aa32_mve_fp, s)) { + return false; + } + return do_1op(s, a, fns[a->size]); +} + +static bool do_2op_vec(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn, + GVecGen3Fn *vecfn) +{ + TCGv_ptr qd, qn, qm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + if (vecfn && mve_no_predication(s)) { + vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qn), + mve_qreg_offset(a->qm), 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qn, qm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + } + mve_update_eci(s); + return true; +} + +static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn *fn) +{ + return do_2op_vec(s, a, fn, NULL); +} + +#define DO_LOGIC(INSN, HELPER, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + return do_2op_vec(s, a, HELPER, VECFN); \ + } + +DO_LOGIC(VAND, gen_helper_mve_vand, tcg_gen_gvec_and) +DO_LOGIC(VBIC, gen_helper_mve_vbic, tcg_gen_gvec_andc) +DO_LOGIC(VORR, gen_helper_mve_vorr, tcg_gen_gvec_or) +DO_LOGIC(VORN, gen_helper_mve_vorn, tcg_gen_gvec_orc) +DO_LOGIC(VEOR, gen_helper_mve_veor, tcg_gen_gvec_xor) + +static bool trans_VPSEL(DisasContext *s, arg_2op *a) +{ + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + return do_2op(s, a, gen_helper_mve_vpsel); +} + +#define DO_2OP_VEC(INSN, FN, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + static MVEGenTwoOpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op_vec(s, a, fns[a->size], VECFN); \ + } + +#define DO_2OP(INSN, FN) DO_2OP_VEC(INSN, FN, NULL) + +DO_2OP_VEC(VADD, vadd, tcg_gen_gvec_add) +DO_2OP_VEC(VSUB, vsub, tcg_gen_gvec_sub) +DO_2OP_VEC(VMUL, vmul, tcg_gen_gvec_mul) +DO_2OP(VMULH_S, vmulhs) +DO_2OP(VMULH_U, vmulhu) +DO_2OP(VRMULH_S, vrmulhs) +DO_2OP(VRMULH_U, vrmulhu) +DO_2OP_VEC(VMAX_S, vmaxs, tcg_gen_gvec_smax) +DO_2OP_VEC(VMAX_U, vmaxu, tcg_gen_gvec_umax) +DO_2OP_VEC(VMIN_S, vmins, tcg_gen_gvec_smin) +DO_2OP_VEC(VMIN_U, vminu, tcg_gen_gvec_umin) +DO_2OP(VABD_S, vabds) +DO_2OP(VABD_U, vabdu) +DO_2OP(VHADD_S, vhadds) +DO_2OP(VHADD_U, vhaddu) +DO_2OP(VHSUB_S, vhsubs) +DO_2OP(VHSUB_U, vhsubu) +DO_2OP(VMULL_BS, vmullbs) +DO_2OP(VMULL_BU, vmullbu) +DO_2OP(VMULL_TS, vmullts) +DO_2OP(VMULL_TU, vmulltu) +DO_2OP(VQDMULH, vqdmulh) +DO_2OP(VQRDMULH, vqrdmulh) +DO_2OP(VQADD_S, 
vqadds) +DO_2OP(VQADD_U, vqaddu) +DO_2OP(VQSUB_S, vqsubs) +DO_2OP(VQSUB_U, vqsubu) +DO_2OP(VSHL_S, vshls) +DO_2OP(VSHL_U, vshlu) +DO_2OP(VRSHL_S, vrshls) +DO_2OP(VRSHL_U, vrshlu) +DO_2OP(VQSHL_S, vqshls) +DO_2OP(VQSHL_U, vqshlu) +DO_2OP(VQRSHL_S, vqrshls) +DO_2OP(VQRSHL_U, vqrshlu) +DO_2OP(VQDMLADH, vqdmladh) +DO_2OP(VQDMLADHX, vqdmladhx) +DO_2OP(VQRDMLADH, vqrdmladh) +DO_2OP(VQRDMLADHX, vqrdmladhx) +DO_2OP(VQDMLSDH, vqdmlsdh) +DO_2OP(VQDMLSDHX, vqdmlsdhx) +DO_2OP(VQRDMLSDH, vqrdmlsdh) +DO_2OP(VQRDMLSDHX, vqrdmlsdhx) +DO_2OP(VRHADD_S, vrhadds) +DO_2OP(VRHADD_U, vrhaddu) +/* + * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose + * so we can reuse the DO_2OP macro. (Our implementation calculates the + * "expected" results in this case.) Similarly for VHCADD. + */ +DO_2OP(VCADD90, vcadd90) +DO_2OP(VCADD270, vcadd270) +DO_2OP(VHCADD90, vhcadd90) +DO_2OP(VHCADD270, vhcadd270) + +static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullbh, + gen_helper_mve_vqdmullbw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VQDMULLT(DisasContext *s, arg_2op *a) +{ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullth, + gen_helper_mve_vqdmulltw, + NULL, + }; + if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VMULLP_B(DisasContext *s, arg_2op *a) +{ + /* + * Note that a->size indicates the output size, ie VMULL.P8 + * is the 8x8->16 operation and a->size is MO_16; VMULL.P16 + * is the 16x16->32 operation and a->size is MO_32. + */ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vmullpbh, + gen_helper_mve_vmullpbw, + NULL, + }; + return do_2op(s, a, fns[a->size]); +} + +static bool trans_VMULLP_T(DisasContext *s, arg_2op *a) +{ + /* a->size is as for trans_VMULLP_B */ + static MVEGenTwoOpFn * const fns[] = { + NULL, + gen_helper_mve_vmullpth, + gen_helper_mve_vmullptw, + NULL, + }; + return do_2op(s, a, fns[a->size]); +} + +/* + * VADC and VSBC: these perform an add-with-carry or subtract-with-carry + * of the 32-bit elements in each lane of the input vectors, where the + * carry-out of each add is the carry-in of the next. The initial carry + * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C + * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C. + * These insns are subject to beat-wise execution. Partial execution + * of an I=1 (initial carry input fixed) insn which does not + * execute the first beat must start with the current FPSCR.NZCV + * value, not the fixed constant input. 
+ */ +static bool trans_VADC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vadc); +} + +static bool trans_VADCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VADC(s, a); + } + return do_2op(s, a, gen_helper_mve_vadci); +} + +static bool trans_VSBC(DisasContext *s, arg_2op *a) +{ + return do_2op(s, a, gen_helper_mve_vsbc); +} + +static bool trans_VSBCI(DisasContext *s, arg_2op *a) +{ + if (mve_skip_first_beat(s)) { + return trans_VSBC(s, a); + } + return do_2op(s, a, gen_helper_mve_vsbci); +} + +#define DO_2OP_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2op *a) \ + { \ + static MVEGenTwoOpFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_2op(s, a, fns[a->size]); \ + } + +DO_2OP_FP(VADD_fp, vfadd) +DO_2OP_FP(VSUB_fp, vfsub) +DO_2OP_FP(VMUL_fp, vfmul) +DO_2OP_FP(VABD_fp, vfabd) +DO_2OP_FP(VMAXNM, vmaxnm) +DO_2OP_FP(VMINNM, vminnm) +DO_2OP_FP(VCADD90_fp, vfcadd90) +DO_2OP_FP(VCADD270_fp, vfcadd270) +DO_2OP_FP(VFMA, vfma) +DO_2OP_FP(VFMS, vfms) +DO_2OP_FP(VCMUL0, vcmul0) +DO_2OP_FP(VCMUL90, vcmul90) +DO_2OP_FP(VCMUL180, vcmul180) +DO_2OP_FP(VCMUL270, vcmul270) +DO_2OP_FP(VCMLA0, vcmla0) +DO_2OP_FP(VCMLA90, vcmla90) +DO_2OP_FP(VCMLA180, vcmla180) +DO_2OP_FP(VCMLA270, vcmla270) +DO_2OP_FP(VMAXNMA, vmaxnma) +DO_2OP_FP(VMINNMA, vminnma) + +static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, + MVEGenTwoOpScalarFn fn) +{ + TCGv_ptr qd, qn; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qn) || + !fn) { + return false; + } + if (a->rm == 13 || a->rm == 15) { + /* UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + qn = mve_qreg_ptr(a->qn); + rm = load_reg(s, a->rm); + fn(cpu_env, qd, qn, rm); + tcg_temp_free_i32(rm); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qn); + mve_update_eci(s); + return true; +} + +#define DO_2OP_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ + { \ + static MVEGenTwoOpScalarFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2op_scalar(s, a, fns[a->size]); \ + } + +DO_2OP_SCALAR(VADD_scalar, vadd_scalar) +DO_2OP_SCALAR(VSUB_scalar, vsub_scalar) +DO_2OP_SCALAR(VMUL_scalar, vmul_scalar) +DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) +DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) +DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) +DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) +DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar) +DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar) +DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar) +DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar) +DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar) +DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar) +DO_2OP_SCALAR(VBRSR, vbrsr) +DO_2OP_SCALAR(VMLA, vmla) +DO_2OP_SCALAR(VMLAS, vmlas) +DO_2OP_SCALAR(VQDMLAH, vqdmlah) +DO_2OP_SCALAR(VQRDMLAH, vqrdmlah) +DO_2OP_SCALAR(VQDMLASH, vqdmlash) +DO_2OP_SCALAR(VQRDMLASH, vqrdmlash) + +static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullb_scalarh, + gen_helper_mve_vqdmullb_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} 
+ +static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a) +{ + static MVEGenTwoOpScalarFn * const fns[] = { + NULL, + gen_helper_mve_vqdmullt_scalarh, + gen_helper_mve_vqdmullt_scalarw, + NULL, + }; + if (a->qd == a->qn && a->size == MO_32) { + /* UNPREDICTABLE; we choose to undef */ + return false; + } + return do_2op_scalar(s, a, fns[a->size]); +} + + +#define DO_2OP_FP_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ + { \ + static MVEGenTwoOpScalarFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_2op_scalar(s, a, fns[a->size]); \ + } + +DO_2OP_FP_SCALAR(VADD_fp_scalar, vfadd_scalar) +DO_2OP_FP_SCALAR(VSUB_fp_scalar, vfsub_scalar) +DO_2OP_FP_SCALAR(VMUL_fp_scalar, vfmul_scalar) +DO_2OP_FP_SCALAR(VFMA_scalar, vfma_scalar) +DO_2OP_FP_SCALAR(VFMAS_scalar, vfmas_scalar) + +static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, + MVEGenLongDualAccOpFn *fn) +{ + TCGv_ptr qn, qm; + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qn | a->qm) || + !fn) { + return false; + } + /* + * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related + * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. + */ + if (a->rdahi == 13 || a->rdahi == 15) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current rda value, not 0. + */ + if (a->a || mve_skip_first_beat(s)) { + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + tcg_temp_free_i32(rdalo); + tcg_temp_free_i32(rdahi); + } else { + rda = tcg_const_i64(0); + } + + fn(rda, cpu_env, qn, qm, rda); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + + rdalo = tcg_temp_new_i32(); + rdahi = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + mve_update_eci(s); + return true; +} + +static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh }, + { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlaldavuh, NULL }, + { gen_helper_mve_vmlaldavuw, NULL }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh }, + { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw }, + { NULL, NULL }, + }; + return do_long_dual_acc(s, a, fns[a->size][a->x]); +} + +static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw, + }; + 
return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlaldavhuw, NULL, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a) +{ + static MVEGenLongDualAccOpFn * const fns[] = { + gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw, + }; + return do_long_dual_acc(s, a, fns[a->x]); +} + +static bool do_dual_acc(DisasContext *s, arg_vmladav *a, MVEGenDualAccOpFn *fn) +{ + TCGv_ptr qn, qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qn) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + + /* + * This insn is subject to beat-wise execution. Partial execution + * of an A=0 (no-accumulate) insn which does not execute the first + * beat must start with the current rda value, not 0. + */ + if (a->a || mve_skip_first_beat(s)) { + rda = load_reg(s, a->rda); + } else { + rda = tcg_const_i32(0); + } + + fn(rda, cpu_env, qn, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + + mve_update_eci(s); + return true; +} + +#define DO_DUAL_ACC(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vmladav *a) \ + { \ + static MVEGenDualAccOpFn * const fns[4][2] = { \ + { gen_helper_mve_##FN##b, gen_helper_mve_##FN##xb }, \ + { gen_helper_mve_##FN##h, gen_helper_mve_##FN##xh }, \ + { gen_helper_mve_##FN##w, gen_helper_mve_##FN##xw }, \ + { NULL, NULL }, \ + }; \ + return do_dual_acc(s, a, fns[a->size][a->x]); \ + } + +DO_DUAL_ACC(VMLADAV_S, vmladavs) +DO_DUAL_ACC(VMLSDAV, vmlsdav) + +static bool trans_VMLADAV_U(DisasContext *s, arg_vmladav *a) +{ + static MVEGenDualAccOpFn * const fns[4][2] = { + { gen_helper_mve_vmladavub, NULL }, + { gen_helper_mve_vmladavuh, NULL }, + { gen_helper_mve_vmladavuw, NULL }, + { NULL, NULL }, + }; + return do_dual_acc(s, a, fns[a->size][a->x]); +} + +static void gen_vpst(DisasContext *s, uint32_t mask) +{ + /* + * Set the VPR mask fields. We take advantage of MASK01 and MASK23 + * being adjacent fields in the register. + * + * Updating the masks is not predicated, but it is subject to beat-wise + * execution, and the mask is updated on the odd-numbered beats. + * So if PSR.ECI says we should skip beat 1, we mustn't update the + * 01 mask field. + */ + TCGv_i32 vpr = load_cpu_field(v7m.vpr); + switch (s->eci) { + case ECI_NONE: + case ECI_A0: + /* Update both 01 and 23 fields */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(mask | (mask << 4)), + R_V7M_VPR_MASK01_SHIFT, + R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH); + break; + case ECI_A0A1: + case ECI_A0A1A2: + case ECI_A0A1A2B0: + /* Update only the 23 mask field */ + tcg_gen_deposit_i32(vpr, vpr, + tcg_constant_i32(mask), + R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH); + break; + default: + g_assert_not_reached(); + } + store_cpu_field(vpr, v7m.vpr); +} + +static bool trans_VPST(DisasContext *s, arg_VPST *a) +{ + /* mask == 0 is a "related encoding" */ + if (!dc_isar_feature(aa32_mve, s) || !a->mask) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + gen_vpst(s, a->mask); + mve_update_and_store_eci(s); + return true; +} + +static bool trans_VPNOT(DisasContext *s, arg_VPNOT *a) +{ + /* + * Invert the predicate in VPR.P0. 
We have to call out to
+     * a helper because this insn itself is beatwise and can
+     * be predicated.
+     */
+    if (!dc_isar_feature(aa32_mve, s)) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    gen_helper_mve_vpnot(cpu_env);
+    /* This insn updates predication bits */
+    s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+    mve_update_eci(s);
+    return true;
+}
+
+static bool trans_VADDV(DisasContext *s, arg_VADDV *a)
+{
+    /* VADDV: vector add across vector */
+    static MVEGenVADDVFn * const fns[4][2] = {
+        { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub },
+        { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh },
+        { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw },
+        { NULL, NULL }
+    };
+    TCGv_ptr qm;
+    TCGv_i32 rda;
+
+    if (!dc_isar_feature(aa32_mve, s) ||
+        a->size == 3) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * This insn is subject to beat-wise execution. Partial execution
+     * of an A=0 (no-accumulate) insn which does not execute the first
+     * beat must start with the current value of Rda, not zero.
+     */
+    if (a->a || mve_skip_first_beat(s)) {
+        /* Accumulate input from Rda */
+        rda = load_reg(s, a->rda);
+    } else {
+        /* Accumulate starting at zero */
+        rda = tcg_const_i32(0);
+    }
+
+    qm = mve_qreg_ptr(a->qm);
+    fns[a->size][a->u](rda, cpu_env, qm, rda);
+    store_reg(s, a->rda, rda);
+    tcg_temp_free_ptr(qm);
+
+    mve_update_eci(s);
+    return true;
+}
+
+static bool trans_VADDLV(DisasContext *s, arg_VADDLV *a)
+{
+    /*
+     * Vector Add Long Across Vector: accumulate the 32-bit
+     * elements of the vector into a 64-bit result stored in
+     * a pair of general-purpose registers.
+     * No need to check Qm's bank: it is only 3 bits in decode.
+     */
+    TCGv_ptr qm;
+    TCGv_i64 rda;
+    TCGv_i32 rdalo, rdahi;
+
+    if (!dc_isar_feature(aa32_mve, s)) {
+        return false;
+    }
+    /*
+     * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related
+     * encoding; rdalo always has bit 0 clear so cannot be 13 or 15.
+     */
+    if (a->rdahi == 13 || a->rdahi == 15) {
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * This insn is subject to beat-wise execution. Partial execution
+     * of an A=0 (no-accumulate) insn which does not execute the first
+     * beat must start with the current value of RdaHi:RdaLo, not zero.
+ */ + if (a->a || mve_skip_first_beat(s)) { + /* Accumulate input from RdaHi:RdaLo */ + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + tcg_temp_free_i32(rdalo); + tcg_temp_free_i32(rdahi); + } else { + /* Accumulate starting at zero */ + rda = tcg_const_i64(0); + } + + qm = mve_qreg_ptr(a->qm); + if (a->u) { + gen_helper_mve_vaddlv_u(rda, cpu_env, qm, rda); + } else { + gen_helper_mve_vaddlv_s(rda, cpu_env, qm, rda); + } + tcg_temp_free_ptr(qm); + + rdalo = tcg_temp_new_i32(); + rdahi = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + mve_update_eci(s); + return true; +} + +static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn, + GVecGen2iFn *vecfn) +{ + TCGv_ptr qd; + uint64_t imm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + imm = asimd_imm_const(a->imm, a->cmode, a->op); + + if (vecfn && mve_no_predication(s)) { + vecfn(MO_64, mve_qreg_offset(a->qd), mve_qreg_offset(a->qd), + imm, 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + fn(cpu_env, qd, tcg_constant_i64(imm)); + tcg_temp_free_ptr(qd); + } + mve_update_eci(s); + return true; +} + +static void gen_gvec_vmovi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, c); +} + +static bool trans_Vimm_1r(DisasContext *s, arg_1imm *a) +{ + /* Handle decode of cmode/op here between VORR/VBIC/VMOV */ + MVEGenOneOpImmFn *fn; + GVecGen2iFn *vecfn; + + if ((a->cmode & 1) && a->cmode < 12) { + if (a->op) { + /* + * For op=1, the immediate will be inverted by asimd_imm_const(), + * so the VBIC becomes a logical AND operation. + */ + fn = gen_helper_mve_vandi; + vecfn = tcg_gen_gvec_andi; + } else { + fn = gen_helper_mve_vorri; + vecfn = tcg_gen_gvec_ori; + } + } else { + /* There is one unallocated cmode/op combination in this space */ + if (a->cmode == 15 && a->op == 1) { + return false; + } + /* asimd_imm_const() sorts out VMVNI vs VMOVI for us */ + fn = gen_helper_mve_vmovi; + vecfn = gen_gvec_vmovi; + } + return do_1imm(s, a, fn, vecfn); +} + +static bool do_2shift_vec(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn, + bool negateshift, GVecGen2iFn vecfn) +{ + TCGv_ptr qd, qm; + int shift = a->shift; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qd | a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * When we handle a right shift insn using a left-shift helper + * which permits a negative shift count to indicate a right-shift, + * we must negate the shift count. 
+ */ + if (negateshift) { + shift = -shift; + } + + if (vecfn && mve_no_predication(s)) { + vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), + shift, 16, 16); + } else { + qd = mve_qreg_ptr(a->qd); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qd, qm, tcg_constant_i32(shift)); + tcg_temp_free_ptr(qd); + tcg_temp_free_ptr(qm); + } + mve_update_eci(s); + return true; +} + +static bool do_2shift(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn, + bool negateshift) +{ + return do_2shift_vec(s, a, fn, negateshift, NULL); +} + +#define DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, VECFN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2shift_vec(s, a, fns[a->size], NEGATESHIFT, VECFN); \ + } + +#define DO_2SHIFT(INSN, FN, NEGATESHIFT) \ + DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, NULL) + +static void do_gvec_shri_s(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + /* + * We get here with a negated shift count, and we must handle + * shifts by the element size, which tcg_gen_gvec_sari() does not do. + */ + shift = -shift; + if (shift == (8 << vece)) { + shift--; + } + tcg_gen_gvec_sari(vece, dofs, aofs, shift, oprsz, maxsz); +} + +static void do_gvec_shri_u(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + /* + * We get here with a negated shift count, and we must handle + * shifts by the element size, which tcg_gen_gvec_shri() does not do. + */ + shift = -shift; + if (shift == (8 << vece)) { + tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, 0); + } else { + tcg_gen_gvec_shri(vece, dofs, aofs, shift, oprsz, maxsz); + } +} + +DO_2SHIFT_VEC(VSHLI, vshli_u, false, tcg_gen_gvec_shli) +DO_2SHIFT(VQSHLI_S, vqshli_s, false) +DO_2SHIFT(VQSHLI_U, vqshli_u, false) +DO_2SHIFT(VQSHLUI, vqshlui_s, false) +/* These right shifts use a left-shift helper with negated shift count */ +DO_2SHIFT_VEC(VSHRI_S, vshli_s, true, do_gvec_shri_s) +DO_2SHIFT_VEC(VSHRI_U, vshli_u, true, do_gvec_shri_u) +DO_2SHIFT(VRSHRI_S, vrshli_s, true) +DO_2SHIFT(VRSHRI_U, vrshli_u, true) + +DO_2SHIFT_VEC(VSRI, vsri, false, gen_gvec_sri) +DO_2SHIFT_VEC(VSLI, vsli, false, gen_gvec_sli) + +#define DO_2SHIFT_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_2shift(s, a, gen_helper_mve_##FN, false); \ + } + +DO_2SHIFT_FP(VCVT_SH_fixed, vcvt_sh) +DO_2SHIFT_FP(VCVT_UH_fixed, vcvt_uh) +DO_2SHIFT_FP(VCVT_HS_fixed, vcvt_hs) +DO_2SHIFT_FP(VCVT_HU_fixed, vcvt_hu) +DO_2SHIFT_FP(VCVT_SF_fixed, vcvt_sf) +DO_2SHIFT_FP(VCVT_UF_fixed, vcvt_uf) +DO_2SHIFT_FP(VCVT_FS_fixed, vcvt_fs) +DO_2SHIFT_FP(VCVT_FU_fixed, vcvt_fu) + +static bool do_2shift_scalar(DisasContext *s, arg_shl_scalar *a, + MVEGenTwoOpShiftFn *fn) +{ + TCGv_ptr qda; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qda) || + a->rm == 13 || a->rm == 15 || !fn) { + /* Rm cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qda = mve_qreg_ptr(a->qda); + rm = load_reg(s, a->rm); + fn(cpu_env, qda, qda, rm); + tcg_temp_free_ptr(qda); + tcg_temp_free_i32(rm); + mve_update_eci(s); + return true; +} + +#define DO_2SHIFT_SCALAR(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_shl_scalar *a) \ + { \ + 
static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_2shift_scalar(s, a, fns[a->size]); \ + } + +DO_2SHIFT_SCALAR(VSHL_S_scalar, vshli_s) +DO_2SHIFT_SCALAR(VSHL_U_scalar, vshli_u) +DO_2SHIFT_SCALAR(VRSHL_S_scalar, vrshli_s) +DO_2SHIFT_SCALAR(VRSHL_U_scalar, vrshli_u) +DO_2SHIFT_SCALAR(VQSHL_S_scalar, vqshli_s) +DO_2SHIFT_SCALAR(VQSHL_U_scalar, vqshli_u) +DO_2SHIFT_SCALAR(VQRSHL_S_scalar, vqrshli_s) +DO_2SHIFT_SCALAR(VQRSHL_U_scalar, vqrshli_u) + +#define DO_VSHLL(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ + { \ + static MVEGenTwoOpShiftFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + }; \ + return do_2shift_vec(s, a, fns[a->size], false, do_gvec_##FN); \ + } + +/* + * For the VSHLL vector helpers, the vece is the size of the input + * (ie MO_8 or MO_16); the helpers want to work in the output size. + * The shift count can be 0.., inclusive. (0 is VMOVL.) + */ +static void do_gvec_vshllbs(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + unsigned ibits = vece == MO_8 ? 8 : 16; + tcg_gen_gvec_shli(ovece, dofs, aofs, ibits, oprsz, maxsz); + tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); +} + +static void do_gvec_vshllbu(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + tcg_gen_gvec_andi(ovece, dofs, aofs, + ovece == MO_16 ? 0xff : 0xffff, oprsz, maxsz); + tcg_gen_gvec_shli(ovece, dofs, dofs, shift, oprsz, maxsz); +} + +static void do_gvec_vshllts(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + unsigned ibits = vece == MO_8 ? 8 : 16; + if (shift == 0) { + tcg_gen_gvec_sari(ovece, dofs, aofs, ibits, oprsz, maxsz); + } else { + tcg_gen_gvec_andi(ovece, dofs, aofs, + ovece == MO_16 ? 0xff00 : 0xffff0000, oprsz, maxsz); + tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); + } +} + +static void do_gvec_vshlltu(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + unsigned ovece = vece + 1; + unsigned ibits = vece == MO_8 ? 8 : 16; + if (shift == 0) { + tcg_gen_gvec_shri(ovece, dofs, aofs, ibits, oprsz, maxsz); + } else { + tcg_gen_gvec_andi(ovece, dofs, aofs, + ovece == MO_16 ? 
0xff00 : 0xffff0000, oprsz, maxsz);
+        tcg_gen_gvec_shri(ovece, dofs, dofs, ibits - shift, oprsz, maxsz);
+    }
+}
+
+DO_VSHLL(VSHLL_BS, vshllbs)
+DO_VSHLL(VSHLL_BU, vshllbu)
+DO_VSHLL(VSHLL_TS, vshllts)
+DO_VSHLL(VSHLL_TU, vshlltu)
+
+#define DO_2SHIFT_N(INSN, FN)                                   \
+    static bool trans_##INSN(DisasContext *s, arg_2shift *a)    \
+    {                                                           \
+        static MVEGenTwoOpShiftFn * const fns[] = {             \
+            gen_helper_mve_##FN##b,                             \
+            gen_helper_mve_##FN##h,                             \
+        };                                                      \
+        return do_2shift(s, a, fns[a->size], false);            \
+    }
+
+DO_2SHIFT_N(VSHRNB, vshrnb)
+DO_2SHIFT_N(VSHRNT, vshrnt)
+DO_2SHIFT_N(VRSHRNB, vrshrnb)
+DO_2SHIFT_N(VRSHRNT, vrshrnt)
+DO_2SHIFT_N(VQSHRNB_S, vqshrnb_s)
+DO_2SHIFT_N(VQSHRNT_S, vqshrnt_s)
+DO_2SHIFT_N(VQSHRNB_U, vqshrnb_u)
+DO_2SHIFT_N(VQSHRNT_U, vqshrnt_u)
+DO_2SHIFT_N(VQSHRUNB, vqshrunb)
+DO_2SHIFT_N(VQSHRUNT, vqshrunt)
+DO_2SHIFT_N(VQRSHRNB_S, vqrshrnb_s)
+DO_2SHIFT_N(VQRSHRNT_S, vqrshrnt_s)
+DO_2SHIFT_N(VQRSHRNB_U, vqrshrnb_u)
+DO_2SHIFT_N(VQRSHRNT_U, vqrshrnt_u)
+DO_2SHIFT_N(VQRSHRUNB, vqrshrunb)
+DO_2SHIFT_N(VQRSHRUNT, vqrshrunt)
+
+static bool trans_VSHLC(DisasContext *s, arg_VSHLC *a)
+{
+    /*
+     * Whole Vector Left Shift with Carry. The carry is taken
+     * from a general purpose register and written back there.
+     * An imm of 0 means "shift by 32".
+     */
+    TCGv_ptr qd;
+    TCGv_i32 rdm;
+
+    if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+        return false;
+    }
+    if (a->rdm == 13 || a->rdm == 15) {
+        /* CONSTRAINED UNPREDICTABLE: we UNDEF */
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    qd = mve_qreg_ptr(a->qd);
+    rdm = load_reg(s, a->rdm);
+    gen_helper_mve_vshlc(rdm, cpu_env, qd, rdm, tcg_constant_i32(a->imm));
+    store_reg(s, a->rdm, rdm);
+    tcg_temp_free_ptr(qd);
+    mve_update_eci(s);
+    return true;
+}
+
+static bool do_vidup(DisasContext *s, arg_vidup *a, MVEGenVIDUPFn *fn)
+{
+    TCGv_ptr qd;
+    TCGv_i32 rn;
+
+    /*
+     * Vector increment/decrement with wrap and duplicate (VIDUP, VDDUP).
+     * This fills the vector with elements of successively increasing
+     * or decreasing values, starting from Rn.
+     */
+    if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+        return false;
+    }
+    if (a->size == MO_64) {
+        /* size 0b11 is another encoding */
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    qd = mve_qreg_ptr(a->qd);
+    rn = load_reg(s, a->rn);
+    fn(rn, cpu_env, qd, rn, tcg_constant_i32(a->imm));
+    store_reg(s, a->rn, rn);
+    tcg_temp_free_ptr(qd);
+    mve_update_eci(s);
+    return true;
+}
+
+static bool do_viwdup(DisasContext *s, arg_viwdup *a, MVEGenVIWDUPFn *fn)
+{
+    TCGv_ptr qd;
+    TCGv_i32 rn, rm;
+
+    /*
+     * Vector increment/decrement with wrap and duplicate (VIWDUP, VDWDUP)
+     * This fills the vector with elements of successively increasing
+     * or decreasing values, starting from Rn. Rm specifies a point where
+     * the count wraps back around to 0. The updated offset is written back
+     * to Rn.
+     */
+    if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) {
+        return false;
+    }
+    if (!fn || a->rm == 13 || a->rm == 15) {
+        /*
+         * size 0b11 is another encoding; Rm == 13 is UNPREDICTABLE;
+         * Rm == 15 is UNPREDICTABLE; we choose to UNDEF.
+ */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qd = mve_qreg_ptr(a->qd); + rn = load_reg(s, a->rn); + rm = load_reg(s, a->rm); + fn(rn, cpu_env, qd, rn, rm, tcg_constant_i32(a->imm)); + store_reg(s, a->rn, rn); + tcg_temp_free_ptr(qd); + tcg_temp_free_i32(rm); + mve_update_eci(s); + return true; +} + +static bool trans_VIDUP(DisasContext *s, arg_vidup *a) +{ + static MVEGenVIDUPFn * const fns[] = { + gen_helper_mve_vidupb, + gen_helper_mve_viduph, + gen_helper_mve_vidupw, + NULL, + }; + return do_vidup(s, a, fns[a->size]); +} + +static bool trans_VDDUP(DisasContext *s, arg_vidup *a) +{ + static MVEGenVIDUPFn * const fns[] = { + gen_helper_mve_vidupb, + gen_helper_mve_viduph, + gen_helper_mve_vidupw, + NULL, + }; + /* VDDUP is just like VIDUP but with a negative immediate */ + a->imm = -a->imm; + return do_vidup(s, a, fns[a->size]); +} + +static bool trans_VIWDUP(DisasContext *s, arg_viwdup *a) +{ + static MVEGenVIWDUPFn * const fns[] = { + gen_helper_mve_viwdupb, + gen_helper_mve_viwduph, + gen_helper_mve_viwdupw, + NULL, + }; + return do_viwdup(s, a, fns[a->size]); +} + +static bool trans_VDWDUP(DisasContext *s, arg_viwdup *a) +{ + static MVEGenVIWDUPFn * const fns[] = { + gen_helper_mve_vdwdupb, + gen_helper_mve_vdwduph, + gen_helper_mve_vdwdupw, + NULL, + }; + return do_viwdup(s, a, fns[a->size]); +} + +static bool do_vcmp(DisasContext *s, arg_vcmp *a, MVEGenCmpFn *fn) +{ + TCGv_ptr qn, qm; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) || + !fn) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + qm = mve_qreg_ptr(a->qm); + fn(cpu_env, qn, qm); + tcg_temp_free_ptr(qn); + tcg_temp_free_ptr(qm); + if (a->mask) { + /* VPT */ + gen_vpst(s, a->mask); + } + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +static bool do_vcmp_scalar(DisasContext *s, arg_vcmp_scalar *a, + MVEGenScalarCmpFn *fn) +{ + TCGv_ptr qn; + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_mve, s) || !fn || a->rm == 13) { + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qn = mve_qreg_ptr(a->qn); + if (a->rm == 15) { + /* Encoding Rm=0b1111 means "constant zero" */ + rm = tcg_constant_i32(0); + } else { + rm = load_reg(s, a->rm); + } + fn(cpu_env, qn, rm); + tcg_temp_free_ptr(qn); + tcg_temp_free_i32(rm); + if (a->mask) { + /* VPT */ + gen_vpst(s, a->mask); + } + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +#define DO_VCMP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \ + { \ + static MVEGenCmpFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_vcmp(s, a, fns[a->size]); \ + } \ + static bool trans_##INSN##_scalar(DisasContext *s, \ + arg_vcmp_scalar *a) \ + { \ + static MVEGenScalarCmpFn * const fns[] = { \ + gen_helper_mve_##FN##_scalarb, \ + gen_helper_mve_##FN##_scalarh, \ + gen_helper_mve_##FN##_scalarw, \ + NULL, \ + }; \ + return do_vcmp_scalar(s, a, fns[a->size]); \ + } + +DO_VCMP(VCMPEQ, vcmpeq) +DO_VCMP(VCMPNE, vcmpne) +DO_VCMP(VCMPCS, vcmpcs) +DO_VCMP(VCMPHI, vcmphi) +DO_VCMP(VCMPGE, vcmpge) +DO_VCMP(VCMPLT, vcmplt) +DO_VCMP(VCMPGT, vcmpgt) +DO_VCMP(VCMPLE, vcmple) + +#define DO_VCMP_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \ + { \ + static 
MVEGenCmpFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_vcmp(s, a, fns[a->size]); \ + } \ + static bool trans_##INSN##_scalar(DisasContext *s, \ + arg_vcmp_scalar *a) \ + { \ + static MVEGenScalarCmpFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##_scalarh, \ + gen_helper_mve_##FN##_scalars, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_vcmp_scalar(s, a, fns[a->size]); \ + } + +DO_VCMP_FP(VCMPEQ_fp, vfcmpeq) +DO_VCMP_FP(VCMPNE_fp, vfcmpne) +DO_VCMP_FP(VCMPGE_fp, vfcmpge) +DO_VCMP_FP(VCMPLT_fp, vfcmplt) +DO_VCMP_FP(VCMPGT_fp, vfcmpgt) +DO_VCMP_FP(VCMPLE_fp, vfcmple) + +static bool do_vmaxv(DisasContext *s, arg_vmaxv *a, MVEGenVADDVFn fn) +{ + /* + * MIN/MAX operations across a vector: compute the min or + * max of the initial value in a general purpose register + * and all the elements in the vector, and store it back + * into the general purpose register. + */ + TCGv_ptr qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) || + !fn || a->rda == 13 || a->rda == 15) { + /* Rda cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qm = mve_qreg_ptr(a->qm); + rda = load_reg(s, a->rda); + fn(rda, cpu_env, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qm); + mve_update_eci(s); + return true; +} + +#define DO_VMAXV(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \ + { \ + static MVEGenVADDVFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_vmaxv(s, a, fns[a->size]); \ + } + +DO_VMAXV(VMAXV_S, vmaxvs) +DO_VMAXV(VMAXV_U, vmaxvu) +DO_VMAXV(VMAXAV, vmaxav) +DO_VMAXV(VMINV_S, vminvs) +DO_VMAXV(VMINV_U, vminvu) +DO_VMAXV(VMINAV, vminav) + +#define DO_VMAXV_FP(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \ + { \ + static MVEGenVADDVFn * const fns[] = { \ + NULL, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##s, \ + NULL, \ + }; \ + if (!dc_isar_feature(aa32_mve_fp, s)) { \ + return false; \ + } \ + return do_vmaxv(s, a, fns[a->size]); \ + } + +DO_VMAXV_FP(VMAXNMV, vmaxnmv) +DO_VMAXV_FP(VMINNMV, vminnmv) +DO_VMAXV_FP(VMAXNMAV, vmaxnmav) +DO_VMAXV_FP(VMINNMAV, vminnmav) + +static bool do_vabav(DisasContext *s, arg_vabav *a, MVEGenVABAVFn *fn) +{ + /* Absolute difference accumulated across vector */ + TCGv_ptr qn, qm; + TCGv_i32 rda; + + if (!dc_isar_feature(aa32_mve, s) || + !mve_check_qreg_bank(s, a->qm | a->qn) || + !fn || a->rda == 13 || a->rda == 15) { + /* Rda cases are UNPREDICTABLE */ + return false; + } + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + qm = mve_qreg_ptr(a->qm); + qn = mve_qreg_ptr(a->qn); + rda = load_reg(s, a->rda); + fn(rda, cpu_env, qn, qm, rda); + store_reg(s, a->rda, rda); + tcg_temp_free_ptr(qm); + tcg_temp_free_ptr(qn); + mve_update_eci(s); + return true; +} + +#define DO_VABAV(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_vabav *a) \ + { \ + static MVEGenVABAVFn * const fns[] = { \ + gen_helper_mve_##FN##b, \ + gen_helper_mve_##FN##h, \ + gen_helper_mve_##FN##w, \ + NULL, \ + }; \ + return do_vabav(s, a, fns[a->size]); \ + } + +DO_VABAV(VABAV_S, vabavs) +DO_VABAV(VABAV_U, vabavu) + +static bool trans_VMOV_to_2gp(DisasContext *s, arg_VMOV_to_2gp *a) +{ + /* + * VMOV two 32-bit vector lanes 
to two general-purpose registers.
+     * This insn is not predicated but it is subject to beat-wise
+     * execution if it is not in an IT block. For us this means
+     * only that if PSR.ECI says we should not be executing the beat
+     * corresponding to the lane of the vector register being accessed
+     * then we should skip performing the move, and that we need to do
+     * the usual check for bad ECI state and advance of ECI state.
+     * (If PSR.ECI is non-zero then we cannot be in an IT block.)
+     */
+    TCGv_i32 tmp;
+    int vd;
+
+    if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) ||
+        a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15 ||
+        a->rt == a->rt2) {
+        /* Rt/Rt2 cases are UNPREDICTABLE */
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /* Convert Qreg index to Dreg for read_neon_element32() etc */
+    vd = a->qd * 2;
+
+    if (!mve_skip_vmov(s, vd, a->idx, MO_32)) {
+        tmp = tcg_temp_new_i32();
+        read_neon_element32(tmp, vd, a->idx, MO_32);
+        store_reg(s, a->rt, tmp);
+    }
+    if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) {
+        tmp = tcg_temp_new_i32();
+        read_neon_element32(tmp, vd + 1, a->idx, MO_32);
+        store_reg(s, a->rt2, tmp);
+    }
+
+    mve_update_and_store_eci(s);
+    return true;
+}
+
+static bool trans_VMOV_from_2gp(DisasContext *s, arg_VMOV_to_2gp *a)
+{
+    /*
+     * VMOV two general-purpose registers to two 32-bit vector lanes.
+     * This insn is not predicated but it is subject to beat-wise
+     * execution if it is not in an IT block. For us this means
+     * only that if PSR.ECI says we should not be executing the beat
+     * corresponding to the lane of the vector register being accessed
+     * then we should skip performing the move, and that we need to do
+     * the usual check for bad ECI state and advance of ECI state.
+     * (If PSR.ECI is non-zero then we cannot be in an IT block.)
+     */
+    TCGv_i32 tmp;
+    int vd;
+
+    if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) ||
+        a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15) {
+        /* Rt/Rt2 cases are UNPREDICTABLE */
+        return false;
+    }
+    if (!mve_eci_check(s) || !vfp_access_check(s)) {
+        return true;
+    }
+
+    /* Convert Qreg idx to Dreg for read_neon_element32() etc */
+    vd = a->qd * 2;
+
+    if (!mve_skip_vmov(s, vd, a->idx, MO_32)) {
+        tmp = load_reg(s, a->rt);
+        write_neon_element32(tmp, vd, a->idx, MO_32);
+        tcg_temp_free_i32(tmp);
+    }
+    if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) {
+        tmp = load_reg(s, a->rt2);
+        write_neon_element32(tmp, vd + 1, a->idx, MO_32);
+        tcg_temp_free_i32(tmp);
+    }
+
+    mve_update_and_store_eci(s);
+    return true;
+}
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
new file mode 100644
index 0000000..4016339
--- /dev/null
+++ b/target/arm/tcg/translate-neon.c
@@ -0,0 +1,4064 @@
+/*
+ * ARM translation: AArch32 Neon instructions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2005-2007 CodeSourcery
+ * Copyright (c) 2007 OpenedHand, Ltd.
+ * Copyright (c) 2020 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +/* Include the generated Neon decoder */ +#include "decode-neon-dp.c.inc" +#include "decode-neon-ls.c.inc" +#include "decode-neon-shared.c.inc" + +static TCGv_ptr vfp_reg_ptr(bool dp, int reg) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg)); + return ret; +} + +static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop) +{ + long offset = neon_element_offset(reg, ele, mop & MO_SIZE); + + switch (mop) { + case MO_UB: + tcg_gen_ld8u_i32(var, cpu_env, offset); + break; + case MO_UW: + tcg_gen_ld16u_i32(var, cpu_env, offset); + break; + case MO_UL: + tcg_gen_ld_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop) +{ + long offset = neon_element_offset(reg, ele, mop & MO_SIZE); + + switch (mop) { + case MO_UB: + tcg_gen_ld8u_i64(var, cpu_env, offset); + break; + case MO_UW: + tcg_gen_ld16u_i64(var, cpu_env, offset); + break; + case MO_UL: + tcg_gen_ld32u_i64(var, cpu_env, offset); + break; + case MO_UQ: + tcg_gen_ld_i64(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var) +{ + long offset = neon_element_offset(reg, ele, size); + + switch (size) { + case MO_8: + tcg_gen_st8_i32(var, cpu_env, offset); + break; + case MO_16: + tcg_gen_st16_i32(var, cpu_env, offset); + break; + case MO_32: + tcg_gen_st_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var) +{ + long offset = neon_element_offset(reg, ele, size); + + switch (size) { + case MO_8: + tcg_gen_st8_i64(var, cpu_env, offset); + break; + case MO_16: + tcg_gen_st16_i64(var, cpu_env, offset); + break; + case MO_32: + tcg_gen_st32_i64(var, cpu_env, offset); + break; + case MO_64: + tcg_gen_st_i64(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm, + int data, gen_helper_gvec_4 *fn_gvec) +{ + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { + return false; + } + + /* + * UNDEF accesses to odd registers for each bit of Q. + * Q will be 0b111 for all Q-reg instructions, otherwise + * when we have mixed Q- and D-reg inputs. + */ + if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + int opr_sz = q ? 16 : 8; + tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd), + vfp_reg_offset(1, vn), + vfp_reg_offset(1, vm), + vfp_reg_offset(1, vd), + opr_sz, opr_sz, data, fn_gvec); + return true; +} + +static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm, + int data, ARMFPStatusFlavour fp_flavour, + gen_helper_gvec_4_ptr *fn_gvec_ptr) +{ + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { + return false; + } + + /* + * UNDEF accesses to odd registers for each bit of Q. 
+ * Q will be 0b111 for all Q-reg instructions, otherwise + * when we have mixed Q- and D-reg inputs. + */ + if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + int opr_sz = q ? 16 : 8; + TCGv_ptr fpst = fpstatus_ptr(fp_flavour); + + tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd), + vfp_reg_offset(1, vn), + vfp_reg_offset(1, vm), + vfp_reg_offset(1, vd), + fpst, opr_sz, opr_sz, data, fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a) +{ + if (!dc_isar_feature(aa32_vcma, s)) { + return false; + } + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot, + FPST_STD_F16, gen_helper_gvec_fcmlah); + } + return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot, + FPST_STD, gen_helper_gvec_fcmlas); +} + +static bool trans_VCADD(DisasContext *s, arg_VCADD *a) +{ + int opr_sz; + TCGv_ptr fpst; + gen_helper_gvec_3_ptr *fn_gvec_ptr; + + if (!dc_isar_feature(aa32_vcma, s) + || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn | a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + opr_sz = (1 + a->q) * 8; + fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD); + fn_gvec_ptr = (a->size == MO_16) ? + gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(1, a->vn), + vfp_reg_offset(1, a->vm), + fpst, opr_sz, opr_sz, a->rot, + fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_sdot_b); +} + +static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_udot_b); +} + +static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_usdot_b); +} + +static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_bfdot); +} + +static bool trans_VFML(DisasContext *s, arg_VFML *a) +{ + int opr_sz; + + if (!dc_isar_feature(aa32_fhm, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + (a->vd & 0x10)) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + opr_sz = (1 + a->q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(a->q, a->vn), + vfp_reg_offset(a->q, a->vm), + cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */ + gen_helper_gvec_fmlal_a32); + return true; +} + +static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a) +{ + int data = (a->index << 2) | a->rot; + + if (!dc_isar_feature(aa32_vcma, s)) { + return false; + } + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data, + FPST_STD_F16, gen_helper_gvec_fcmlah_idx); + } + return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data, + FPST_STD, gen_helper_gvec_fcmlas_idx); +} + +static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_sdot_idx_b); +} + +static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_dp, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_udot_idx_b); +} + +static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_usdot_idx_b); +} + +static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_sudot_idx_b); +} + +static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, + gen_helper_gvec_bfdot_idx); +} + +static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a) +{ + int opr_sz; + + if (!dc_isar_feature(aa32_fhm, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + opr_sz = (1 + a->q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(a->q, a->vn), + vfp_reg_offset(a->q, a->rm), + cpu_env, opr_sz, opr_sz, + (a->index << 2) | a->s, /* is_2 == 0 */ + gen_helper_gvec_fmlal_idx_a32); + return true; +} + +static struct { + int nregs; + int interleave; + int spacing; +} const neon_ls_element_type[11] = { + {1, 4, 1}, + {1, 4, 2}, + {4, 1, 1}, + {2, 2, 2}, + {1, 3, 1}, + {1, 3, 2}, + {3, 1, 1}, + {1, 1, 1}, + {1, 2, 1}, + {1, 2, 2}, + {2, 1, 1} +}; + +static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn, + int stride) +{ + if (rm != 15) { + TCGv_i32 base; + + base = load_reg(s, rn); + if (rm == 13) { + tcg_gen_addi_i32(base, base, stride); + } else { + TCGv_i32 index; + index = load_reg(s, rm); + tcg_gen_add_i32(base, base, index); + tcg_temp_free_i32(index); + } + store_reg(s, rn, base); + } +} + +static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) +{ + /* Neon load/store multiple structures */ + int nregs, interleave, spacing, reg, n; + MemOp mop, align, endian; + int mmu_idx = get_mem_index(s); + int size = a->size; + TCGv_i64 tmp64; + TCGv_i32 addr; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + if (a->itype > 10) { + return false; + } + /* Catch UNDEF cases for bad values of align field */ + switch (a->itype & 0xc) { + case 4: + if (a->align >= 2) { + return false; + } + break; + case 8: + if (a->align == 3) { + return false; + } + break; + default: + break; + } + nregs = neon_ls_element_type[a->itype].nregs; + interleave = neon_ls_element_type[a->itype].interleave; + spacing = neon_ls_element_type[a->itype].spacing; + if (size == 3 && (interleave | spacing) != 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* For our purposes, bytes are always little-endian. */ + endian = s->be_data; + if (size == 0) { + endian = MO_LE; + } + + /* Enforce alignment requested by the instruction */ + if (a->align) { + align = pow2_align(a->align + 2); /* 4 ** a->align */ + } else { + align = s->align_mem ? MO_ALIGN : 0; + } + + /* + * Consecutive little-endian elements from a single register + * can be promoted to a larger little-endian operation. + */ + if (interleave == 1 && endian == MO_LE) { + /* Retain any natural alignment. 
*/ + if (align == MO_ALIGN) { + align = pow2_align(size); + } + size = 3; + } + + tmp64 = tcg_temp_new_i64(); + addr = tcg_temp_new_i32(); + load_reg_var(s, addr, a->rn); + + mop = endian | size | align; + for (reg = 0; reg < nregs; reg++) { + for (n = 0; n < 8 >> size; n++) { + int xs; + for (xs = 0; xs < interleave; xs++) { + int tt = a->vd + reg + spacing * xs; + + if (a->l) { + gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop); + neon_store_element64(tt, n, size, tmp64); + } else { + neon_load_element64(tmp64, tt, n, size); + gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop); + } + tcg_gen_addi_i32(addr, addr, 1 << size); + + /* Subsequent memory operations inherit alignment */ + mop &= ~MO_AMASK; + } + } + } + tcg_temp_free_i32(addr); + tcg_temp_free_i64(tmp64); + + gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8); + return true; +} + +static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a) +{ + /* Neon load single structure to all lanes */ + int reg, stride, vec_size; + int vd = a->vd; + int size = a->size; + int nregs = a->n + 1; + TCGv_i32 addr, tmp; + MemOp mop, align; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + align = 0; + if (size == 3) { + if (nregs != 4 || a->a == 0) { + return false; + } + /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */ + size = MO_32; + align = MO_ALIGN_16; + } else if (a->a) { + switch (nregs) { + case 1: + if (size == 0) { + return false; + } + align = MO_ALIGN; + break; + case 2: + align = pow2_align(size + 1); + break; + case 3: + return false; + case 4: + if (size == 2) { + align = pow2_align(3); + } else { + align = pow2_align(size + 2); + } + break; + default: + g_assert_not_reached(); + } + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * VLD1 to all lanes: T bit indicates how many Dregs to write. + * VLD2/3/4 to all lanes: T bit indicates register stride. + */ + stride = a->t ? 2 : 1; + vec_size = nregs == 1 ? stride * 8 : 8; + mop = size | align; + tmp = tcg_temp_new_i32(); + addr = tcg_temp_new_i32(); + load_reg_var(s, addr, a->rn); + for (reg = 0; reg < nregs; reg++) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop); + if ((vd & 1) && vec_size == 16) { + /* + * We cannot write 16 bytes at once because the + * destination is unaligned. + */ + tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd), + 8, 8, tmp); + tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1), + neon_full_reg_offset(vd), 8, 8); + } else { + tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd), + vec_size, vec_size, tmp); + } + tcg_gen_addi_i32(addr, addr, 1 << size); + vd += stride; + + /* Subsequent memory operations inherit alignment */ + mop &= ~MO_AMASK; + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs); + + return true; +} + +static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a) +{ + /* Neon load/store single structure to one lane */ + int reg; + int nregs = a->n + 1; + int vd = a->vd; + TCGv_i32 addr, tmp; + MemOp mop; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + /* Catch the UNDEF cases. This is unavoidably a bit messy. 
*/ + switch (nregs) { + case 1: + if (a->stride != 1) { + return false; + } + if (((a->align & (1 << a->size)) != 0) || + (a->size == 2 && (a->align == 1 || a->align == 2))) { + return false; + } + break; + case 2: + if (a->size == 2 && (a->align & 2) != 0) { + return false; + } + break; + case 3: + if (a->align != 0) { + return false; + } + break; + case 4: + if (a->size == 2 && a->align == 3) { + return false; + } + break; + default: + g_assert_not_reached(); + } + if ((vd + a->stride * (nregs - 1)) > 31) { + /* + * Attempts to write off the end of the register file are + * UNPREDICTABLE; we choose to UNDEF because otherwise we would + * access off the end of the array that holds the register data. + */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Pick up SCTLR settings */ + mop = finalize_memop(s, a->size); + + if (a->align) { + MemOp align_op; + + switch (nregs) { + case 1: + /* For VLD1, use natural alignment. */ + align_op = MO_ALIGN; + break; + case 2: + /* For VLD2, use double alignment. */ + align_op = pow2_align(a->size + 1); + break; + case 4: + if (a->size == MO_32) { + /* + * For VLD4.32, align = 1 is double alignment, align = 2 is + * quad alignment; align = 3 is rejected above. + */ + align_op = pow2_align(a->size + a->align); + } else { + /* For VLD4.8 and VLD.16, we want quad alignment. */ + align_op = pow2_align(a->size + 2); + } + break; + default: + /* For VLD3, the alignment field is zero and rejected above. */ + g_assert_not_reached(); + } + + mop = (mop & ~MO_AMASK) | align_op; + } + + tmp = tcg_temp_new_i32(); + addr = tcg_temp_new_i32(); + load_reg_var(s, addr, a->rn); + + for (reg = 0; reg < nregs; reg++) { + if (a->l) { + gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop); + neon_store_element(vd, a->reg_idx, a->size, tmp); + } else { /* Store */ + neon_load_element(tmp, vd, a->reg_idx, a->size); + gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop); + } + vd += a->stride; + tcg_gen_addi_i32(addr, addr, 1 << a->size); + + /* Subsequent memory operations inherit alignment */ + mop &= ~MO_AMASK; + } + tcg_temp_free_i32(addr); + tcg_temp_free_i32(tmp); + + gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs); + + return true; +} + +static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn) +{ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rn_ofs = neon_full_reg_offset(a->vn); + int rm_ofs = neon_full_reg_offset(a->vm); + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn | a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size); + return true; +} + +#define DO_3SAME(INSN, FUNC) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + return do_3same(s, a, FUNC); \ + } + +DO_3SAME(VADD, tcg_gen_gvec_add) +DO_3SAME(VSUB, tcg_gen_gvec_sub) +DO_3SAME(VAND, tcg_gen_gvec_and) +DO_3SAME(VBIC, tcg_gen_gvec_andc) +DO_3SAME(VORR, tcg_gen_gvec_or) +DO_3SAME(VORN, tcg_gen_gvec_orc) +DO_3SAME(VEOR, tcg_gen_gvec_xor) +DO_3SAME(VSHL_S, gen_gvec_sshl) +DO_3SAME(VSHL_U, gen_gvec_ushl) +DO_3SAME(VQADD_S, gen_gvec_sqadd_qc) +DO_3SAME(VQADD_U, gen_gvec_uqadd_qc) +DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc) +DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc) + +/* These insns are all gvec_bitsel but with the inputs in various orders. */ +#define DO_3SAME_BITSEL(INSN, O1, O2, O3) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \ + } \ + DO_3SAME(INSN, gen_##INSN##_3s) + +DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs) +DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs) +DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs) + +#define DO_3SAME_NO_SZ_3(INSN, FUNC) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size == 3) { \ + return false; \ + } \ + return do_3same(s, a, FUNC); \ + } + +DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax) +DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax) +DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin) +DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin) +DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul) +DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla) +DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls) +DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst) +DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd) +DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba) +DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd) +DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba) + +#define DO_3SAME_CMP(INSN, COND) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \ + } \ + DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s) + +DO_3SAME_CMP(VCGT_S, TCG_COND_GT) +DO_3SAME_CMP(VCGT_U, TCG_COND_GTU) +DO_3SAME_CMP(VCGE_S, TCG_COND_GE) +DO_3SAME_CMP(VCGE_U, TCG_COND_GEU) +DO_3SAME_CMP(VCEQ, TCG_COND_EQ) + +#define WRAP_OOL_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \ + { \ + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \ + } + +WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b) + +static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a) +{ + if (a->size != 0) { + return false; + } + return do_3same(s, a, gen_VMUL_p_3s); +} + +#define DO_VQRDMLAH(INSN, FUNC) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (!dc_isar_feature(aa32_rdm, s)) { \ + return false; \ + } \ + if (a->size != 1 && a->size != 2) { \ + return false; \ + } \ + return do_3same(s, a, FUNC); \ + } + +DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc) +DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc) + +#define DO_SHA1(NAME, FUNC) \ + WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \ + static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \ + { 
\ + if (!dc_isar_feature(aa32_sha1, s)) { \ + return false; \ + } \ + return do_3same(s, a, gen_##NAME##_3s); \ + } + +DO_SHA1(SHA1C, gen_helper_crypto_sha1c) +DO_SHA1(SHA1P, gen_helper_crypto_sha1p) +DO_SHA1(SHA1M, gen_helper_crypto_sha1m) +DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0) + +#define DO_SHA2(NAME, FUNC) \ + WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \ + static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (!dc_isar_feature(aa32_sha2, s)) { \ + return false; \ + } \ + return do_3same(s, a, gen_##NAME##_3s); \ + } + +DO_SHA2(SHA256H, gen_helper_crypto_sha256h) +DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2) +DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1) + +#define DO_3SAME_64(INSN, FUNC) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 op = { .fni8 = FUNC }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \ + } \ + DO_3SAME(INSN, gen_##INSN##_3s) + +#define DO_3SAME_64_ENV(INSN, FUNC) \ + static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \ + { \ + FUNC(d, cpu_env, n, m); \ + } \ + DO_3SAME_64(INSN, gen_##INSN##_elt) + +DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64) +DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64) +DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64) +DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64) +DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64) +DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64) + +#define DO_3SAME_32(INSN, FUNC) \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 ops[4] = { \ + { .fni4 = gen_helper_neon_##FUNC##8 }, \ + { .fni4 = gen_helper_neon_##FUNC##16 }, \ + { .fni4 = gen_helper_neon_##FUNC##32 }, \ + { 0 }, \ + }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \ + } \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size > 2) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_3s); \ + } + +/* + * Some helper functions need to be passed the cpu_env. In order + * to use those with the gvec APIs like tcg_gen_gvec_3() we need + * to create wrapper functions whose prototype is a NeonGenTwoOpFn() + * and which call a NeonGenTwoOpEnvFn(). 
+ */ +#define WRAP_ENV_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \ + { \ + FUNC(d, cpu_env, n, m); \ + } + +#define DO_3SAME_32_ENV(INSN, FUNC) \ + WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \ + WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \ + WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 ops[4] = { \ + { .fni4 = gen_##INSN##_tramp8 }, \ + { .fni4 = gen_##INSN##_tramp16 }, \ + { .fni4 = gen_##INSN##_tramp32 }, \ + { 0 }, \ + }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \ + } \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size > 2) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_3s); \ + } + +DO_3SAME_32(VHADD_S, hadd_s) +DO_3SAME_32(VHADD_U, hadd_u) +DO_3SAME_32(VHSUB_S, hsub_s) +DO_3SAME_32(VHSUB_U, hsub_u) +DO_3SAME_32(VRHADD_S, rhadd_s) +DO_3SAME_32(VRHADD_U, rhadd_u) +DO_3SAME_32(VRSHL_S, rshl_s) +DO_3SAME_32(VRSHL_U, rshl_u) + +DO_3SAME_32_ENV(VQSHL_S, qshl_s) +DO_3SAME_32_ENV(VQSHL_U, qshl_u) +DO_3SAME_32_ENV(VQRSHL_S, qrshl_s) +DO_3SAME_32_ENV(VQRSHL_U, qrshl_u) + +static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn) +{ + /* Operations handled pairwise 32 bits at a time */ + TCGv_i32 tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + assert(a->q == 0); /* enforced by decode patterns */ + + /* + * Note that we have to be careful not to clobber the source operands + * in the "vm == vd" case by storing the result of the first pass too + * early. Since Q is 0 there are always just two passes, so instead + * of a complicated loop over each pass we just unroll. + */ + tmp = tcg_temp_new_i32(); + tmp2 = tcg_temp_new_i32(); + tmp3 = tcg_temp_new_i32(); + + read_neon_element32(tmp, a->vn, 0, MO_32); + read_neon_element32(tmp2, a->vn, 1, MO_32); + fn(tmp, tmp, tmp2); + + read_neon_element32(tmp3, a->vm, 0, MO_32); + read_neon_element32(tmp2, a->vm, 1, MO_32); + fn(tmp3, tmp3, tmp2); + + write_neon_element32(tmp, a->vd, 0, MO_32); + write_neon_element32(tmp3, a->vd, 1, MO_32); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp3); + return true; +} + +#define DO_3SAME_PAIR(INSN, func) \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + static NeonGenTwoOpFn * const fns[] = { \ + gen_helper_neon_##func##8, \ + gen_helper_neon_##func##16, \ + gen_helper_neon_##func##32, \ + }; \ + if (a->size > 2) { \ + return false; \ + } \ + return do_3same_pair(s, a, fns[a->size]); \ + } + +/* 32-bit pairwise ops end up the same as the elementwise versions. 
*/ +#define gen_helper_neon_pmax_s32 tcg_gen_smax_i32 +#define gen_helper_neon_pmax_u32 tcg_gen_umax_i32 +#define gen_helper_neon_pmin_s32 tcg_gen_smin_i32 +#define gen_helper_neon_pmin_u32 tcg_gen_umin_i32 +#define gen_helper_neon_padd_u32 tcg_gen_add_i32 + +DO_3SAME_PAIR(VPMAX_S, pmax_s) +DO_3SAME_PAIR(VPMIN_S, pmin_s) +DO_3SAME_PAIR(VPMAX_U, pmax_u) +DO_3SAME_PAIR(VPMIN_U, pmin_u) +DO_3SAME_PAIR(VPADD, padd_u) + +#define DO_3SAME_VQDMULH(INSN, FUNC) \ + WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \ + WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \ + static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static const GVecGen3 ops[2] = { \ + { .fni4 = gen_##INSN##_tramp16 }, \ + { .fni4 = gen_##INSN##_tramp32 }, \ + }; \ + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \ + } \ + static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size != 1 && a->size != 2) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_3s); \ + } + +DO_3SAME_VQDMULH(VQDMULH, qdmulh) +DO_3SAME_VQDMULH(VQRDMULH, qrdmulh) + +#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rn_ofs, uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + TCGv_ptr fpst = fpstatus_ptr(FPST); \ + tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \ + oprsz, maxsz, 0, FUNC); \ + tcg_temp_free_ptr(fpst); \ + } + +#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \ + WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \ + WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \ + static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + return do_3same(s, a, gen_##INSN##_fp16_3s); \ + } \ + return do_3same(s, a, gen_##INSN##_fp32_3s); \ + } + + +DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h) +DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h) +DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h) +DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h) +DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h) +DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h) +DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h) +DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h) +DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h) +DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h) +DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h) +DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h) +DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h) +DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h) +DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h) +DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h) +DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h) + +WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s) +WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h) +WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s) +WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h) + +static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a) +{ + if 
(!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_3same(s, a, gen_VMAXNM_fp16_3s); + } + return do_3same(s, a, gen_VMAXNM_fp32_3s); +} + +static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + return do_3same(s, a, gen_VMINNM_fp16_3s); + } + return do_3same(s, a, gen_VMINNM_fp32_3s); +} + +static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, + gen_helper_gvec_3_ptr *fn) +{ + /* FP pairwise operations */ + TCGv_ptr fpstatus; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + assert(a->q == 0); /* enforced by decode patterns */ + + + fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD); + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), + vfp_reg_offset(1, a->vn), + vfp_reg_offset(1, a->vm), + fpstatus, 8, 8, 0, fn); + tcg_temp_free_ptr(fpstatus); + + return true; +} + +/* + * For all the functions using this macro, size == 1 means fp16, + * which is an architecture extension we don't implement yet. + */ +#define DO_3S_FP_PAIR(INSN,FUNC) \ + static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \ + { \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + return do_3same_fp_pair(s, a, FUNC##h); \ + } \ + return do_3same_fp_pair(s, a, FUNC##s); \ + } + +DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd) +DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax) +DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin) + +static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn) +{ + /* Handle a 2-reg-shift insn which can be vectorized. */ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rm_ofs = neon_full_reg_offset(a->vm); + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size); + return true; +} + +#define DO_2SH(INSN, FUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_vector_2sh(s, a, FUNC); \ + } \ + +DO_2SH(VSHL, tcg_gen_gvec_shli) +DO_2SH(VSLI, gen_gvec_sli) +DO_2SH(VSRI, gen_gvec_sri) +DO_2SH(VSRA_S, gen_gvec_ssra) +DO_2SH(VSRA_U, gen_gvec_usra) +DO_2SH(VRSHR_S, gen_gvec_srshr) +DO_2SH(VRSHR_U, gen_gvec_urshr) +DO_2SH(VRSRA_S, gen_gvec_srsra) +DO_2SH(VRSRA_U, gen_gvec_ursra) + +static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a) +{ + /* Signed shift out of range results in all-sign-bits */ + a->shift = MIN(a->shift, (8 << a->size) - 1); + return do_vector_2sh(s, a, tcg_gen_gvec_sari); +} + +static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0); +} + +static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a) +{ + /* Shift out of range is architecturally valid and results in zero. */ + if (a->shift >= (8 << a->size)) { + return do_vector_2sh(s, a, gen_zero_rd_2sh); + } else { + return do_vector_2sh(s, a, tcg_gen_gvec_shri); + } +} + +static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a, + NeonGenTwo64OpEnvFn *fn) +{ + /* + * 2-reg-and-shift operations, size == 3 case, where the + * function needs to be passed cpu_env. + */ + TCGv_i64 constimm; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * To avoid excessive duplication of ops we implement shift + * by immediate using the variable shift operations. + */ + constimm = tcg_constant_i64(dup_const(a->size, a->shift)); + + for (pass = 0; pass < a->q + 1; pass++) { + TCGv_i64 tmp = tcg_temp_new_i64(); + + read_neon_element64(tmp, a->vm, pass, MO_64); + fn(tmp, cpu_env, tmp, constimm); + write_neon_element64(tmp, a->vd, pass, MO_64); + tcg_temp_free_i64(tmp); + } + return true; +} + +static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a, + NeonGenTwoOpEnvFn *fn) +{ + /* + * 2-reg-and-shift operations, size < 3 case, where the + * helper needs to be passed cpu_env. + */ + TCGv_i32 constimm, tmp; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * To avoid excessive duplication of ops we implement shift + * by immediate using the variable shift operations. + */ + constimm = tcg_constant_i32(dup_const(a->size, a->shift)); + tmp = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vm, pass, MO_32); + fn(tmp, cpu_env, tmp, constimm); + write_neon_element32(tmp, a->vd, pass, MO_32); + } + tcg_temp_free_i32(tmp); + return true; +} + +#define DO_2SHIFT_ENV(INSN, FUNC) \ + static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \ + } \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + static NeonGenTwoOpEnvFn * const fns[] = { \ + gen_helper_neon_##FUNC##8, \ + gen_helper_neon_##FUNC##16, \ + gen_helper_neon_##FUNC##32, \ + }; \ + assert(a->size < ARRAY_SIZE(fns)); \ + return do_2shift_env_32(s, a, fns[a->size]); \ + } + +DO_2SHIFT_ENV(VQSHLU, qshlu_s) +DO_2SHIFT_ENV(VQSHL_U, qshl_u) +DO_2SHIFT_ENV(VQSHL_S, qshl_s) + +static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a, + NeonGenTwo64OpFn *shiftfn, + NeonGenNarrowEnvFn *narrowfn) +{ + /* 2-reg-and-shift narrowing-shift operations, size == 3 case */ + TCGv_i64 constimm, rm1, rm2; + TCGv_i32 rd; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * This is always a right shift, and the shiftfn is always a + * left-shift helper, which thus needs the negated shift count. + */ + constimm = tcg_constant_i64(-a->shift); + rm1 = tcg_temp_new_i64(); + rm2 = tcg_temp_new_i64(); + rd = tcg_temp_new_i32(); + + /* Load both inputs first to avoid potential overwrite if rm == rd */ + read_neon_element64(rm1, a->vm, 0, MO_64); + read_neon_element64(rm2, a->vm, 1, MO_64); + + shiftfn(rm1, rm1, constimm); + narrowfn(rd, cpu_env, rm1); + write_neon_element32(rd, a->vd, 0, MO_32); + + shiftfn(rm2, rm2, constimm); + narrowfn(rd, cpu_env, rm2); + write_neon_element32(rd, a->vd, 1, MO_32); + + tcg_temp_free_i32(rd); + tcg_temp_free_i64(rm1); + tcg_temp_free_i64(rm2); + + return true; +} + +static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a, + NeonGenTwoOpFn *shiftfn, + NeonGenNarrowEnvFn *narrowfn) +{ + /* 2-reg-and-shift narrowing-shift operations, size < 3 case */ + TCGv_i32 constimm, rm1, rm2, rm3, rm4; + TCGv_i64 rtmp; + uint32_t imm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * This is always a right shift, and the shiftfn is always a + * left-shift helper, which thus needs the negated shift count + * duplicated into each lane of the immediate value. 
+ */ + if (a->size == 1) { + imm = (uint16_t)(-a->shift); + imm |= imm << 16; + } else { + /* size == 2 */ + imm = -a->shift; + } + constimm = tcg_constant_i32(imm); + + /* Load all inputs first to avoid potential overwrite */ + rm1 = tcg_temp_new_i32(); + rm2 = tcg_temp_new_i32(); + rm3 = tcg_temp_new_i32(); + rm4 = tcg_temp_new_i32(); + read_neon_element32(rm1, a->vm, 0, MO_32); + read_neon_element32(rm2, a->vm, 1, MO_32); + read_neon_element32(rm3, a->vm, 2, MO_32); + read_neon_element32(rm4, a->vm, 3, MO_32); + rtmp = tcg_temp_new_i64(); + + shiftfn(rm1, rm1, constimm); + shiftfn(rm2, rm2, constimm); + + tcg_gen_concat_i32_i64(rtmp, rm1, rm2); + tcg_temp_free_i32(rm2); + + narrowfn(rm1, cpu_env, rtmp); + write_neon_element32(rm1, a->vd, 0, MO_32); + tcg_temp_free_i32(rm1); + + shiftfn(rm3, rm3, constimm); + shiftfn(rm4, rm4, constimm); + + tcg_gen_concat_i32_i64(rtmp, rm3, rm4); + tcg_temp_free_i32(rm4); + + narrowfn(rm3, cpu_env, rtmp); + tcg_temp_free_i64(rtmp); + write_neon_element32(rm3, a->vd, 1, MO_32); + tcg_temp_free_i32(rm3); + return true; +} + +#define DO_2SN_64(INSN, FUNC, NARROWFUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \ + } +#define DO_2SN_32(INSN, FUNC, NARROWFUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \ + } + +static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) +{ + tcg_gen_extrl_i64_i32(dest, src); +} + +static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) +{ + gen_helper_neon_narrow_u16(dest, src); +} + +static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) +{ + gen_helper_neon_narrow_u8(dest, src); +} + +DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32) +DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16) +DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8) + +DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32) +DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16) +DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8) + +DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32) +DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16) +DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8) + +DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32) +DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16) +DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8) +DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32) +DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16) +DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8) + +DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32) +DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16) +DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8) + +DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32) +DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16) +DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8) + +DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32) +DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16) +DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, 
gen_helper_neon_narrow_sat_u8) + +static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a, + NeonGenWidenFn *widenfn, bool u) +{ + TCGv_i64 tmp; + TCGv_i32 rm0, rm1; + uint64_t widen_mask = 0; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* + * This is a widen-and-shift operation. The shift is always less + * than the width of the source type, so after widening the input + * vector we can simply shift the whole 64-bit widened register, + * and then clear the potential overflow bits resulting from left + * bits of the narrow input appearing as right bits of the left + * neighbour narrow input. Calculate a mask of bits to clear. + */ + if ((a->shift != 0) && (a->size < 2 || u)) { + int esize = 8 << a->size; + widen_mask = MAKE_64BIT_MASK(0, esize); + widen_mask >>= esize - a->shift; + widen_mask = dup_const(a->size + 1, widen_mask); + } + + rm0 = tcg_temp_new_i32(); + rm1 = tcg_temp_new_i32(); + read_neon_element32(rm0, a->vm, 0, MO_32); + read_neon_element32(rm1, a->vm, 1, MO_32); + tmp = tcg_temp_new_i64(); + + widenfn(tmp, rm0); + tcg_temp_free_i32(rm0); + if (a->shift != 0) { + tcg_gen_shli_i64(tmp, tmp, a->shift); + tcg_gen_andi_i64(tmp, tmp, ~widen_mask); + } + write_neon_element64(tmp, a->vd, 0, MO_64); + + widenfn(tmp, rm1); + tcg_temp_free_i32(rm1); + if (a->shift != 0) { + tcg_gen_shli_i64(tmp, tmp, a->shift); + tcg_gen_andi_i64(tmp, tmp, ~widen_mask); + } + write_neon_element64(tmp, a->vd, 1, MO_64); + tcg_temp_free_i64(tmp); + return true; +} + +static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + }; + return do_vshll_2sh(s, a, widenfn[a->size], false); +} + +static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + }; + return do_vshll_2sh(s, a, widenfn[a->size], true); +} + +static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a, + gen_helper_gvec_2_ptr *fn) +{ + /* FP operations in 2-reg-and-shift group */ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rm_ofs = neon_full_reg_offset(a->vm); + TCGv_ptr fpst; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm | a->vd) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(a->size == MO_16 ? 
FPST_STD_F16 : FPST_STD); + tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn); + tcg_temp_free_ptr(fpst); + return true; +} + +#define DO_FP_2SH(INSN, FUNC) \ + static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ + { \ + return do_fp_2sh(s, a, FUNC); \ + } + +DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf) +DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf) +DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs) +DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu) + +DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh) +DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh) +DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs) +DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu) + +static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a, + GVecGen2iFn *fn) +{ + uint64_t imm; + int reg_ofs, vec_size; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + reg_ofs = neon_full_reg_offset(a->vd); + vec_size = a->q ? 16 : 8; + imm = asimd_imm_const(a->imm, a->cmode, a->op); + + fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size); + return true; +} + +static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c); +} + +static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a) +{ + /* Handle decode of cmode/op here between VORR/VBIC/VMOV */ + GVecGen2iFn *fn; + + if ((a->cmode & 1) && a->cmode < 12) { + /* for op=1, the imm will be inverted, so BIC becomes AND. */ + fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori; + } else { + /* There is one unallocated cmode/op combination in this space */ + if (a->cmode == 15 && a->op == 1) { + return false; + } + fn = gen_VMOV_1r; + } + return do_1reg_imm(s, a, fn); +} + +static bool do_prewiden_3d(DisasContext *s, arg_3diff *a, + NeonGenWidenFn *widenfn, + NeonGenTwo64OpFn *opfn, + int src1_mop, int src2_mop) +{ + /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */ + TCGv_i64 rn0_64, rn1_64, rm_64; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* size == 3 case, which is an entirely different insn group */ + return false; + } + + if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rn0_64 = tcg_temp_new_i64(); + rn1_64 = tcg_temp_new_i64(); + rm_64 = tcg_temp_new_i64(); + + if (src1_mop >= 0) { + read_neon_element64(rn0_64, a->vn, 0, src1_mop); + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, 0, MO_32); + widenfn(rn0_64, tmp); + tcg_temp_free_i32(tmp); + } + if (src2_mop >= 0) { + read_neon_element64(rm_64, a->vm, 0, src2_mop); + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 0, MO_32); + widenfn(rm_64, tmp); + tcg_temp_free_i32(tmp); + } + + opfn(rn0_64, rn0_64, rm_64); + + /* + * Load second pass inputs before storing the first pass result, to + * avoid incorrect results if a narrow input overlaps with the result. 
+ */ + if (src1_mop >= 0) { + read_neon_element64(rn1_64, a->vn, 1, src1_mop); + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, 1, MO_32); + widenfn(rn1_64, tmp); + tcg_temp_free_i32(tmp); + } + if (src2_mop >= 0) { + read_neon_element64(rm_64, a->vm, 1, src2_mop); + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 1, MO_32); + widenfn(rm_64, tmp); + tcg_temp_free_i32(tmp); + } + + write_neon_element64(rn0_64, a->vd, 0, MO_64); + + opfn(rn1_64, rn1_64, rm_64); + write_neon_element64(rn1_64, a->vd, 1, MO_64); + + tcg_temp_free_i64(rn0_64); + tcg_temp_free_i64(rn1_64); + tcg_temp_free_i64(rm_64); + + return true; +} + +#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenWidenFn * const widenfn[] = { \ + gen_helper_neon_widen_##S##8, \ + gen_helper_neon_widen_##S##16, \ + NULL, NULL, \ + }; \ + static NeonGenTwo64OpFn * const addfn[] = { \ + gen_helper_neon_##OP##l_u16, \ + gen_helper_neon_##OP##l_u32, \ + tcg_gen_##OP##_i64, \ + NULL, \ + }; \ + int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \ + return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \ + SRC1WIDE ? MO_UQ : narrow_mop, \ + narrow_mop); \ + } + +DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN) +DO_PREWIDEN(VADDL_U, u, add, false, 0) +DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN) +DO_PREWIDEN(VSUBL_U, u, sub, false, 0) +DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN) +DO_PREWIDEN(VADDW_U, u, add, true, 0) +DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN) +DO_PREWIDEN(VSUBW_U, u, sub, true, 0) + +static bool do_narrow_3d(DisasContext *s, arg_3diff *a, + NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn) +{ + /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */ + TCGv_i64 rn_64, rm_64; + TCGv_i32 rd0, rd1; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn || !narrowfn) { + /* size == 3 case, which is an entirely different insn group */ + return false; + } + + if ((a->vn | a->vm) & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rn_64 = tcg_temp_new_i64(); + rm_64 = tcg_temp_new_i64(); + rd0 = tcg_temp_new_i32(); + rd1 = tcg_temp_new_i32(); + + read_neon_element64(rn_64, a->vn, 0, MO_64); + read_neon_element64(rm_64, a->vm, 0, MO_64); + + opfn(rn_64, rn_64, rm_64); + + narrowfn(rd0, rn_64); + + read_neon_element64(rn_64, a->vn, 1, MO_64); + read_neon_element64(rm_64, a->vm, 1, MO_64); + + opfn(rn_64, rn_64, rm_64); + + narrowfn(rd1, rn_64); + + write_neon_element32(rd0, a->vd, 0, MO_32); + write_neon_element32(rd1, a->vd, 1, MO_32); + + tcg_temp_free_i32(rd0); + tcg_temp_free_i32(rd1); + tcg_temp_free_i64(rn_64); + tcg_temp_free_i64(rm_64); + + return true; +} + +#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenTwo64OpFn * const addfn[] = { \ + gen_helper_neon_##OP##l_u16, \ + gen_helper_neon_##OP##l_u32, \ + tcg_gen_##OP##_i64, \ + NULL, \ + }; \ + static NeonGenNarrowFn * const narrowfn[] = { \ + gen_helper_neon_##NARROWTYPE##_high_u8, \ + gen_helper_neon_##NARROWTYPE##_high_u16, \ + EXTOP, \ + NULL, \ + }; \ + return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]); \ + } + +static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn) +{ + tcg_gen_addi_i64(rn, rn, 1u << 31); + tcg_gen_extrh_i64_i32(rd, rn); +} + +DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32) +DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32) +DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32) +DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32) + +static bool do_long_3d(DisasContext *s, arg_3diff *a, + NeonGenTwoOpWidenFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * 3-regs different lengths, long operations. + * These perform an operation on two inputs that returns a double-width + * result, and then possibly perform an accumulation operation of + * that result into the double-width destination. + */ + TCGv_i64 rd0, rd1, tmp; + TCGv_i32 rn, rm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* size == 3 case, which is an entirely different insn group */ + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rd0 = tcg_temp_new_i64(); + rd1 = tcg_temp_new_i64(); + + rn = tcg_temp_new_i32(); + rm = tcg_temp_new_i32(); + read_neon_element32(rn, a->vn, 0, MO_32); + read_neon_element32(rm, a->vm, 0, MO_32); + opfn(rd0, rn, rm); + + read_neon_element32(rn, a->vn, 1, MO_32); + read_neon_element32(rm, a->vm, 1, MO_32); + opfn(rd1, rn, rm); + tcg_temp_free_i32(rn); + tcg_temp_free_i32(rm); + + /* Don't store results until after all loads: they might overlap */ + if (accfn) { + tmp = tcg_temp_new_i64(); + read_neon_element64(tmp, a->vd, 0, MO_64); + accfn(rd0, tmp, rd0); + read_neon_element64(tmp, a->vd, 1, MO_64); + accfn(rd1, tmp, rd1); + tcg_temp_free_i64(tmp); + } + + write_neon_element64(rd0, a->vd, 0, MO_64); + write_neon_element64(rd1, a->vd, 1, MO_64); + tcg_temp_free_i64(rd0); + tcg_temp_free_i64(rd1); + + return true; +} + +static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_s16, + gen_helper_neon_abdl_s32, + gen_helper_neon_abdl_s64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_u16, + gen_helper_neon_abdl_u32, + gen_helper_neon_abdl_u64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_s16, + gen_helper_neon_abdl_s32, + gen_helper_neon_abdl_s64, + NULL, + }; + static NeonGenTwo64OpFn * const addfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], addfn[a->size]); +} + +static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_abdl_u16, + gen_helper_neon_abdl_u32, + gen_helper_neon_abdl_u64, + NULL, + }; + static NeonGenTwo64OpFn * const addfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], addfn[a->size]); +} + +static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + TCGv_i32 lo = tcg_temp_new_i32(); + TCGv_i32 hi = tcg_temp_new_i32(); + + tcg_gen_muls2_i32(lo, hi, rn, rm); + tcg_gen_concat_i32_i64(rd, lo, hi); + + tcg_temp_free_i32(lo); + tcg_temp_free_i32(hi); +} + +static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + TCGv_i32 lo = tcg_temp_new_i32(); + TCGv_i32 hi = tcg_temp_new_i32(); + + tcg_gen_mulu2_i32(lo, hi, rn, rm); + tcg_gen_concat_i32_i64(rd, lo, hi); + + tcg_temp_free_i32(lo); + tcg_temp_free_i32(hi); +} + +static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_mull_s8, + gen_helper_neon_mull_s16, + gen_mull_s32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + gen_helper_neon_mull_u8, + gen_helper_neon_mull_u16, + gen_mull_u32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +#define 
DO_VMLAL(INSN,MULL,ACC) \ + static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ + { \ + static NeonGenTwoOpWidenFn * const opfn[] = { \ + gen_helper_neon_##MULL##8, \ + gen_helper_neon_##MULL##16, \ + gen_##MULL##32, \ + NULL, \ + }; \ + static NeonGenTwo64OpFn * const accfn[] = { \ + gen_helper_neon_##ACC##l_u16, \ + gen_helper_neon_##ACC##l_u32, \ + tcg_gen_##ACC##_i64, \ + NULL, \ + }; \ + return do_long_3d(s, a, opfn[a->size], accfn[a->size]); \ + } + +DO_VMLAL(VMLAL_S,mull_s,add) +DO_VMLAL(VMLAL_U,mull_u,add) +DO_VMLAL(VMLSL_S,mull_s,sub) +DO_VMLAL(VMLSL_U,mull_u,sub) + +static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + gen_helper_neon_mull_s16(rd, rn, rm); + gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd); +} + +static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) +{ + gen_mull_s32(rd, rn, rm); + gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd); +} + +static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], NULL); +} + +static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm); +} + +static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm); +} + +static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLAL_acc_16, + gen_VQDMLAL_acc_32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], accfn[a->size]); +} + +static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + gen_helper_neon_negl_u32(rm, rm); + gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm); +} + +static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_neg_i64(rm, rm); + gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm); +} + +static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLSL_acc_16, + gen_VQDMLSL_acc_32, + NULL, + }; + + return do_long_3d(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a) +{ + gen_helper_gvec_3 *fn_gvec; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + switch (a->size) { + case 0: + fn_gvec = gen_helper_neon_pmull_h; + break; + case 2: + if (!dc_isar_feature(aa32_pmull, s)) { + return false; + } + fn_gvec = gen_helper_gvec_pmull_q; + break; + default: + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd), + neon_full_reg_offset(a->vn), + neon_full_reg_offset(a->vm), + 16, 16, 0, fn_gvec); + return true; +} + +static void gen_neon_dup_low16(TCGv_i32 var) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_ext16u_i32(var, var); + tcg_gen_shli_i32(tmp, var, 16); + tcg_gen_or_i32(var, var, tmp); + tcg_temp_free_i32(tmp); +} + +static void gen_neon_dup_high16(TCGv_i32 var) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_andi_i32(var, var, 0xffff0000); + tcg_gen_shri_i32(tmp, var, 16); + tcg_gen_or_i32(var, var, tmp); + tcg_temp_free_i32(tmp); +} + +static inline TCGv_i32 neon_get_scalar(int size, int reg) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + if (size == MO_16) { + read_neon_element32(tmp, reg & 7, reg >> 4, MO_32); + if (reg & 8) { + gen_neon_dup_high16(tmp); + } else { + gen_neon_dup_low16(tmp); + } + } else { + read_neon_element32(tmp, reg & 15, reg >> 4, MO_32); + } + return tmp; +} + +static bool do_2scalar(DisasContext *s, arg_2scalar *a, + NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn) +{ + /* + * Two registers and a scalar: perform an operation between + * the input elements and the scalar, and then possibly + * perform an accumulation operation of that result into the + * destination. + */ + TCGv_i32 scalar, tmp; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->q && ((a->vd | a->vn) & 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + scalar = neon_get_scalar(a->size, a->vm); + tmp = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vn, pass, MO_32); + opfn(tmp, tmp, scalar); + if (accfn) { + TCGv_i32 rd = tcg_temp_new_i32(); + read_neon_element32(rd, a->vd, pass, MO_32); + accfn(tmp, rd, tmp); + tcg_temp_free_i32(rd); + } + write_neon_element32(tmp, a->vd, pass, MO_32); + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(scalar); + return true; +} + +static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_helper_neon_mul_u16, + tcg_gen_mul_i32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], NULL); +} + +static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_helper_neon_mul_u16, + tcg_gen_mul_i32, + NULL, + }; + static NeonGenTwoOpFn * const accfn[] = { + NULL, + gen_helper_neon_add_u16, + tcg_gen_add_i32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_helper_neon_mul_u16, + tcg_gen_mul_i32, + NULL, + }; + static NeonGenTwoOpFn * const accfn[] = { + NULL, + gen_helper_neon_sub_u16, + tcg_gen_sub_i32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], accfn[a->size]); +} + +static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a, + gen_helper_gvec_3_ptr *fn) +{ + /* Two registers and a scalar, using gvec */ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rn_ofs = neon_full_reg_offset(a->vn); + int rm_ofs; + int idx; + TCGv_ptr fpstatus; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!fn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->q && ((a->vd | a->vn) & 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* a->vm is M:Vm, which encodes both register and index */ + idx = extract32(a->vm, a->size + 2, 2); + a->vm = extract32(a->vm, 0, a->size + 2); + rm_ofs = neon_full_reg_offset(a->vm); + + fpstatus = fpstatus_ptr(a->size == 1 ? 
FPST_STD_F16 : FPST_STD); + tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus, + vec_size, vec_size, idx, fn); + tcg_temp_free_ptr(fpstatus); + return true; +} + +#define DO_VMUL_F_2sc(NAME, FUNC) \ + static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \ + { \ + static gen_helper_gvec_3_ptr * const opfn[] = { \ + NULL, \ + gen_helper_##FUNC##_h, \ + gen_helper_##FUNC##_s, \ + NULL, \ + }; \ + if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + return do_2scalar_fp_vec(s, a, opfn[a->size]); \ + } + +DO_VMUL_F_2sc(VMUL, gvec_fmul_idx) +DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx) +DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx) + +WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16) +WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32) +WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16) +WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32) + +static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_VQDMULH_16, + gen_VQDMULH_32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], NULL); +} + +static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpFn * const opfn[] = { + NULL, + gen_VQRDMULH_16, + gen_VQRDMULH_32, + NULL, + }; + + return do_2scalar(s, a, opfn[a->size], NULL); +} + +static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a, + NeonGenThreeOpEnvFn *opfn) +{ + /* + * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn + * performs a kind of fused op-then-accumulate using a helper + * function that takes all of rd, rn and the scalar at once. + */ + TCGv_i32 scalar, rn, rd; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + if (!dc_isar_feature(aa32_rdm, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->q && ((a->vd | a->vn) & 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + scalar = neon_get_scalar(a->size, a->vm); + rn = tcg_temp_new_i32(); + rd = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 4 : 2); pass++) { + read_neon_element32(rn, a->vn, pass, MO_32); + read_neon_element32(rd, a->vd, pass, MO_32); + opfn(rd, cpu_env, rn, scalar, rd); + write_neon_element32(rd, a->vd, pass, MO_32); + } + tcg_temp_free_i32(rn); + tcg_temp_free_i32(rd); + tcg_temp_free_i32(scalar); + + return true; +} + +static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenThreeOpEnvFn *opfn[] = { + NULL, + gen_helper_neon_qrdmlah_s16, + gen_helper_neon_qrdmlah_s32, + NULL, + }; + return do_vqrdmlah_2sc(s, a, opfn[a->size]); +} + +static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenThreeOpEnvFn *opfn[] = { + NULL, + gen_helper_neon_qrdmlsh_s16, + gen_helper_neon_qrdmlsh_s32, + NULL, + }; + return do_vqrdmlah_2sc(s, a, opfn[a->size]); +} + +static bool do_2scalar_long(DisasContext *s, arg_2scalar *a, + NeonGenTwoOpWidenFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * Two registers and a scalar, long operations: perform an + * operation on the input elements and the scalar which produces + * a double-width result, and then possibly perform an accumulation + * operation of that result into the destination. 
+ */ + TCGv_i32 scalar, rn; + TCGv_i64 rn0_64, rn1_64; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!opfn) { + /* Bad size (including size == 3, which is a different insn group) */ + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + scalar = neon_get_scalar(a->size, a->vm); + + /* Load all inputs before writing any outputs, in case of overlap */ + rn = tcg_temp_new_i32(); + read_neon_element32(rn, a->vn, 0, MO_32); + rn0_64 = tcg_temp_new_i64(); + opfn(rn0_64, rn, scalar); + + read_neon_element32(rn, a->vn, 1, MO_32); + rn1_64 = tcg_temp_new_i64(); + opfn(rn1_64, rn, scalar); + tcg_temp_free_i32(rn); + tcg_temp_free_i32(scalar); + + if (accfn) { + TCGv_i64 t64 = tcg_temp_new_i64(); + read_neon_element64(t64, a->vd, 0, MO_64); + accfn(rn0_64, t64, rn0_64); + read_neon_element64(t64, a->vd, 1, MO_64); + accfn(rn1_64, t64, rn1_64); + tcg_temp_free_i64(t64); + } + + write_neon_element64(rn0_64, a->vd, 0, MO_64); + write_neon_element64(rn1_64, a->vd, 1, MO_64); + tcg_temp_free_i64(rn0_64); + tcg_temp_free_i64(rn1_64); + return true; +} + +static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_helper_neon_mull_s16, + gen_mull_s32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], NULL); +} + +static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_helper_neon_mull_u16, + gen_mull_u32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], NULL); +} + +#define DO_VMLAL_2SC(INSN, MULL, ACC) \ + static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \ + { \ + static NeonGenTwoOpWidenFn * const opfn[] = { \ + NULL, \ + gen_helper_neon_##MULL##16, \ + gen_##MULL##32, \ + NULL, \ + }; \ + static NeonGenTwo64OpFn * const accfn[] = { \ + NULL, \ + gen_helper_neon_##ACC##l_u32, \ + tcg_gen_##ACC##_i64, \ + NULL, \ + }; \ + return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \ + } + +DO_VMLAL_2SC(VMLAL_S, mull_s, add) +DO_VMLAL_2SC(VMLAL_U, mull_u, add) +DO_VMLAL_2SC(VMLSL_S, mull_s, sub) +DO_VMLAL_2SC(VMLSL_U, mull_u, sub) + +static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], NULL); +} + +static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLAL_acc_16, + gen_VQDMLAL_acc_32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a) +{ + static NeonGenTwoOpWidenFn * const opfn[] = { + NULL, + gen_VQDMULL_16, + gen_VQDMULL_32, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + NULL, + gen_VQDMLSL_acc_16, + gen_VQDMLSL_acc_32, + NULL, + }; + + return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); +} + +static bool trans_VEXT(DisasContext *s, arg_VEXT *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn | a->vm | a->vd) & a->q) { + return false; + } + + if (a->imm > 7 && !a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (!a->q) { + /* Extract 64 bits from */ + TCGv_i64 left, right, dest; + + left = tcg_temp_new_i64(); + right = tcg_temp_new_i64(); + dest = tcg_temp_new_i64(); + + read_neon_element64(right, a->vn, 0, MO_64); + read_neon_element64(left, a->vm, 0, MO_64); + tcg_gen_extract2_i64(dest, right, left, a->imm * 8); + write_neon_element64(dest, a->vd, 0, MO_64); + + tcg_temp_free_i64(left); + tcg_temp_free_i64(right); + tcg_temp_free_i64(dest); + } else { + /* Extract 128 bits from */ + TCGv_i64 left, middle, right, destleft, destright; + + left = tcg_temp_new_i64(); + middle = tcg_temp_new_i64(); + right = tcg_temp_new_i64(); + destleft = tcg_temp_new_i64(); + destright = tcg_temp_new_i64(); + + if (a->imm < 8) { + read_neon_element64(right, a->vn, 0, MO_64); + read_neon_element64(middle, a->vn, 1, MO_64); + tcg_gen_extract2_i64(destright, right, middle, a->imm * 8); + read_neon_element64(left, a->vm, 0, MO_64); + tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8); + } else { + read_neon_element64(right, a->vn, 1, MO_64); + read_neon_element64(middle, a->vm, 0, MO_64); + tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8); + read_neon_element64(left, a->vm, 1, MO_64); + tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8); + } + + write_neon_element64(destright, a->vd, 0, MO_64); + write_neon_element64(destleft, a->vd, 1, MO_64); + + tcg_temp_free_i64(destright); + tcg_temp_free_i64(destleft); + tcg_temp_free_i64(right); + tcg_temp_free_i64(middle); + tcg_temp_free_i64(left); + } + return true; +} + +static bool trans_VTBL(DisasContext *s, arg_VTBL *a) +{ + TCGv_i64 val, def; + TCGv_i32 desc; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if ((a->vn + a->len + 1) > 32) { + /* + * This is UNPREDICTABLE; we choose to UNDEF to avoid the + * helper function running off the end of the register file. + */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + desc = tcg_constant_i32((a->vn << 2) | a->len); + def = tcg_temp_new_i64(); + if (a->op) { + read_neon_element64(def, a->vd, 0, MO_64); + } else { + tcg_gen_movi_i64(def, 0); + } + val = tcg_temp_new_i64(); + read_neon_element64(val, a->vm, 0, MO_64); + + gen_helper_neon_tbl(val, cpu_env, desc, val, def); + write_neon_element64(val, a->vd, 0, MO_64); + + tcg_temp_free_i64(def); + tcg_temp_free_i64(val); + return true; +} + +static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd), + neon_element_offset(a->vm, a->index, a->size), + a->q ? 16 : 8, a->q ? 
16 : 8); + return true; +} + +static bool trans_VREV64(DisasContext *s, arg_VREV64 *a) +{ + int pass, half; + TCGv_i32 tmp[2]; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp[0] = tcg_temp_new_i32(); + tmp[1] = tcg_temp_new_i32(); + + for (pass = 0; pass < (a->q ? 2 : 1); pass++) { + for (half = 0; half < 2; half++) { + read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32); + switch (a->size) { + case 0: + tcg_gen_bswap32_i32(tmp[half], tmp[half]); + break; + case 1: + gen_swap_half(tmp[half], tmp[half]); + break; + case 2: + break; + default: + g_assert_not_reached(); + } + } + write_neon_element32(tmp[1], a->vd, pass * 2, MO_32); + write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32); + } + + tcg_temp_free_i32(tmp[0]); + tcg_temp_free_i32(tmp[1]); + return true; +} + +static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a, + NeonGenWidenFn *widenfn, + NeonGenTwo64OpFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * Pairwise long operations: widen both halves of the pair, + * combine the pairs with the opfn, and then possibly accumulate + * into the destination with the accfn. + */ + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!widenfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + for (pass = 0; pass < a->q + 1; pass++) { + TCGv_i32 tmp; + TCGv_i64 rm0_64, rm1_64, rd_64; + + rm0_64 = tcg_temp_new_i64(); + rm1_64 = tcg_temp_new_i64(); + rd_64 = tcg_temp_new_i64(); + + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, pass * 2, MO_32); + widenfn(rm0_64, tmp); + read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32); + widenfn(rm1_64, tmp); + tcg_temp_free_i32(tmp); + + opfn(rd_64, rm0_64, rm1_64); + tcg_temp_free_i64(rm0_64); + tcg_temp_free_i64(rm1_64); + + if (accfn) { + TCGv_i64 tmp64 = tcg_temp_new_i64(); + read_neon_element64(tmp64, a->vd, pass, MO_64); + accfn(rd_64, tmp64, rd_64); + tcg_temp_free_i64(tmp64); + } + write_neon_element64(rd_64, a->vd, pass, MO_64); + tcg_temp_free_i64(rd_64); + } + return true; +} + +static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); +} + +static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); +} + +static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + 
gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], + accfn[a->size]); +} + +static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], + accfn[a->size]); +} + +typedef void ZipFn(TCGv_ptr, TCGv_ptr); + +static bool do_zip_uzp(DisasContext *s, arg_2misc *a, + ZipFn *fn) +{ + TCGv_ptr pd, pm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!fn) { + /* Bad size or size/q combination */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + pd = vfp_reg_ptr(true, a->vd); + pm = vfp_reg_ptr(true, a->vm); + fn(pd, pm); + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(pm); + return true; +} + +static bool trans_VUZP(DisasContext *s, arg_2misc *a) +{ + static ZipFn * const fn[2][4] = { + { + gen_helper_neon_unzip8, + gen_helper_neon_unzip16, + NULL, + NULL, + }, { + gen_helper_neon_qunzip8, + gen_helper_neon_qunzip16, + gen_helper_neon_qunzip32, + NULL, + } + }; + return do_zip_uzp(s, a, fn[a->q][a->size]); +} + +static bool trans_VZIP(DisasContext *s, arg_2misc *a) +{ + static ZipFn * const fn[2][4] = { + { + gen_helper_neon_zip8, + gen_helper_neon_zip16, + NULL, + NULL, + }, { + gen_helper_neon_qzip8, + gen_helper_neon_qzip16, + gen_helper_neon_qzip32, + NULL, + } + }; + return do_zip_uzp(s, a, fn[a->q][a->size]); +} + +static bool do_vmovn(DisasContext *s, arg_2misc *a, + NeonGenNarrowEnvFn *narrowfn) +{ + TCGv_i64 rm; + TCGv_i32 rd0, rd1; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!narrowfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rm = tcg_temp_new_i64(); + rd0 = tcg_temp_new_i32(); + rd1 = tcg_temp_new_i32(); + + read_neon_element64(rm, a->vm, 0, MO_64); + narrowfn(rd0, cpu_env, rm); + read_neon_element64(rm, a->vm, 1, MO_64); + narrowfn(rd1, cpu_env, rm); + write_neon_element32(rd0, a->vd, 0, MO_32); + write_neon_element32(rd1, a->vd, 1, MO_32); + tcg_temp_free_i32(rd0); + tcg_temp_free_i32(rd1); + tcg_temp_free_i64(rm); + return true; +} + +#define DO_VMOVN(INSN, FUNC) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + static NeonGenNarrowEnvFn * const narrowfn[] = { \ + FUNC##8, \ + FUNC##16, \ + FUNC##32, \ + NULL, \ + }; \ + return do_vmovn(s, a, narrowfn[a->size]); \ + } + +DO_VMOVN(VMOVN, gen_neon_narrow_u) +DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat) +DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s) +DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u) + +static bool trans_VSHLL(DisasContext *s, arg_2misc *a) +{ + TCGv_i32 rm0, rm1; + TCGv_i64 rd; + static NeonGenWidenFn * const widenfns[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + NeonGenWidenFn *widenfn = widenfns[a->size]; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!widenfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rd = tcg_temp_new_i64(); + rm0 = tcg_temp_new_i32(); + rm1 = tcg_temp_new_i32(); + + read_neon_element32(rm0, a->vm, 0, MO_32); + read_neon_element32(rm1, a->vm, 1, MO_32); + + widenfn(rd, rm0); + tcg_gen_shli_i64(rd, rd, 8 << a->size); + write_neon_element64(rd, a->vd, 0, MO_64); + widenfn(rd, rm1); + tcg_gen_shli_i64(rd, rd, 8 << a->size); + write_neon_element64(rd, a->vd, 1, MO_64); + + tcg_temp_free_i64(rd); + tcg_temp_free_i32(rm0); + tcg_temp_free_i32(rm1); + return true; +} + +static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + TCGv_i32 dst0, dst1; + + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_STD); + tmp = tcg_temp_new_i64(); + dst0 = tcg_temp_new_i32(); + dst1 = tcg_temp_new_i32(); + + read_neon_element64(tmp, a->vm, 0, MO_64); + gen_helper_bfcvt_pair(dst0, tmp, fpst); + + read_neon_element64(tmp, a->vm, 1, MO_64); + gen_helper_bfcvt_pair(dst1, tmp, fpst); + + write_neon_element32(dst0, a->vd, 0, MO_32); + write_neon_element32(dst1, a->vd, 1, MO_32); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i32(dst0); + tcg_temp_free_i32(dst1); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp, tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_STD); + ahp = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 0, MO_32); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); + tmp2 = tcg_temp_new_i32(); + read_neon_element32(tmp2, a->vm, 1, MO_32); + gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp); + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp2, tmp2, tmp); + read_neon_element32(tmp, a->vm, 2, MO_32); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); + tmp3 = tcg_temp_new_i32(); + read_neon_element32(tmp3, a->vm, 3, MO_32); + write_neon_element32(tmp2, a->vd, 0, MO_32); + tcg_temp_free_i32(tmp2); + gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp); + tcg_gen_shli_i32(tmp3, tmp3, 16); + tcg_gen_or_i32(tmp3, tmp3, tmp); + write_neon_element32(tmp3, a->vd, 1, MO_32); + tcg_temp_free_i32(tmp3); + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp, tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_STD); + ahp = get_ahp_flag(); + tmp3 = tcg_temp_new_i32(); + tmp2 = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vm, 0, MO_32); + read_neon_element32(tmp2, a->vm, 1, MO_32); + tcg_gen_ext16u_i32(tmp3, tmp); + gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); + write_neon_element32(tmp3, a->vd, 0, MO_32); + tcg_gen_shri_i32(tmp, tmp, 16); + gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp); + write_neon_element32(tmp, a->vd, 1, MO_32); + tcg_temp_free_i32(tmp); + tcg_gen_ext16u_i32(tmp3, tmp2); + gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); + write_neon_element32(tmp3, a->vd, 2, MO_32); + tcg_temp_free_i32(tmp3); + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp); + write_neon_element32(tmp2, a->vd, 3, MO_32); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn) +{ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_full_reg_offset(a->vd); + int rm_ofs = neon_full_reg_offset(a->vm); + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size == 3) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size); + + return true; +} + +#define DO_2MISC_VEC(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_2misc_vec(s, a, FN); \ + } + +DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg) +DO_2MISC_VEC(VABS, tcg_gen_gvec_abs) +DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0) +DO_2MISC_VEC(VCGT0, gen_gvec_cgt0) +DO_2MISC_VEC(VCLE0, gen_gvec_cle0) +DO_2MISC_VEC(VCGE0, gen_gvec_cge0) +DO_2MISC_VEC(VCLT0, gen_gvec_clt0) + +static bool trans_VMVN(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc_vec(s, a, tcg_gen_gvec_not); +} + +#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, \ + uint32_t maxsz) \ + { \ + tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \ + DATA, FUNC); \ + } + +#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, \ + uint32_t maxsz) \ + { \ + tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \ + } + +WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0) +WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1) +WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0) +WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1) +WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0) +WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0) +WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0) + +#define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_2M_CRYPTO(AESE, aa32_aes, 0) +DO_2M_CRYPTO(AESD, aa32_aes, 0) +DO_2M_CRYPTO(AESMC, aa32_aes, 0) +DO_2M_CRYPTO(AESIMC, aa32_aes, 0) +DO_2M_CRYPTO(SHA1H, aa32_sha1, 2) +DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2) +DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2) + +static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn) +{ + TCGv_i32 tmp; + int pass; + + /* Handle a 2-reg-misc operation by iterating 32 bits at a time */ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!fn) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vm, pass, MO_32); + fn(tmp, tmp); + write_neon_element32(tmp, a->vd, pass, MO_32); + } + tcg_temp_free_i32(tmp); + + return true; +} + +static bool trans_VREV32(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + tcg_gen_bswap32_i32, + gen_swap_half, + NULL, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VREV16(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc(s, a, gen_rev16); +} + +static bool trans_VCLS(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_helper_neon_cls_s8, + gen_helper_neon_cls_s16, + gen_helper_neon_cls_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm) +{ + tcg_gen_clzi_i32(rd, rm, 32); +} + +static bool trans_VCLZ(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_helper_neon_clz_u8, + gen_helper_neon_clz_u16, + do_VCLZ_32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VCNT(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc(s, a, gen_helper_neon_cnt_u8); +} + +static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs, + vece == MO_16 ? 0x7fff : 0x7fffffff, + oprsz, maxsz); +} + +static bool trans_VABS_F(DisasContext *s, arg_2misc *a) +{ + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + } else if (a->size != MO_32) { + return false; + } + return do_2misc_vec(s, a, gen_VABS_F); +} + +static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t oprsz, uint32_t maxsz) +{ + tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs, + vece == MO_16 ? 
0x8000 : 0x80000000, + oprsz, maxsz); +} + +static bool trans_VNEG_F(DisasContext *s, arg_2misc *a) +{ + if (a->size == MO_16) { + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + } else if (a->size != MO_32) { + return false; + } + return do_2misc_vec(s, a, gen_VNEG_F); +} + +static bool trans_VRECPE(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + return do_2misc(s, a, gen_helper_recpe_u32); +} + +static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + return do_2misc(s, a, gen_helper_rsqrte_u32); +} + +#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \ + { \ + FUNC(d, cpu_env, m); \ + } + +WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8) +WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16) +WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32) +WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8) +WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16) +WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32) + +static bool trans_VQABS(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_VQABS_s8, + gen_VQABS_s16, + gen_VQABS_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VQNEG(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_VQNEG_s8, + gen_VQNEG_s16, + gen_VQNEG_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \ + static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static gen_helper_gvec_2_ptr * const fns[4] = { \ + NULL, HFUNC, SFUNC, NULL, \ + }; \ + TCGv_ptr fpst; \ + fpst = fpstatus_ptr(vece == MO_16 ? 
FPST_STD_F16 : FPST_STD); \ + tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \ + fns[vece]); \ + tcg_temp_free_ptr(fpst); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + } else if (a->size != MO_32) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s) +DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s) +DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s) +DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s) +DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s) +DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s) +DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s) +DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos) +DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos) +DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs) +DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs) + +DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s) + +static bool trans_VRINTX(DisasContext *s, arg_2misc *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + return trans_VRINTX_impl(s, a); +} + +#define DO_VEC_RMODE(INSN, RMODE, OP) \ + static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, \ + uint32_t oprsz, uint32_t maxsz) \ + { \ + static gen_helper_gvec_2_ptr * const fns[4] = { \ + NULL, \ + gen_helper_gvec_##OP##h, \ + gen_helper_gvec_##OP##s, \ + NULL, \ + }; \ + TCGv_ptr fpst; \ + fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \ + tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \ + arm_rmode_to_sf(RMODE), fns[vece]); \ + tcg_temp_free_ptr(fpst); \ + } \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \ + return false; \ + } \ + if (a->size == MO_16) { \ + if (!dc_isar_feature(aa32_fp16_arith, s)) { \ + return false; \ + } \ + } else if (a->size != MO_32) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u) +DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s) +DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u) +DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s) +DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u) +DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s) +DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u) +DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s) + +DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_) +DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_) +DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_) +DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_) +DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_) + +static bool trans_VSWP(DisasContext *s, arg_2misc *a) +{ + TCGv_i64 rm, rd; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size != 0) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rm = tcg_temp_new_i64(); + rd = tcg_temp_new_i64(); + for (pass = 0; pass < (a->q ? 2 : 1); pass++) { + read_neon_element64(rm, a->vm, pass, MO_64); + read_neon_element64(rd, a->vd, pass, MO_64); + write_neon_element64(rm, a->vd, pass, MO_64); + write_neon_element64(rd, a->vm, pass, MO_64); + } + tcg_temp_free_i64(rm); + tcg_temp_free_i64(rd); + + return true; +} +static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 rd, tmp; + + rd = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + + tcg_gen_shli_i32(rd, t0, 8); + tcg_gen_andi_i32(rd, rd, 0xff00ff00); + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); + tcg_gen_or_i32(rd, rd, tmp); + + tcg_gen_shri_i32(t1, t1, 8); + tcg_gen_andi_i32(t1, t1, 0x00ff00ff); + tcg_gen_andi_i32(tmp, t0, 0xff00ff00); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(rd); +} + +static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 rd, tmp; + + rd = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + + tcg_gen_shli_i32(rd, t0, 16); + tcg_gen_andi_i32(tmp, t1, 0xffff); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shri_i32(t1, t1, 16); + tcg_gen_andi_i32(tmp, t0, 0xffff0000); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(rd); +} + +static bool trans_VTRN(DisasContext *s, arg_2misc *a) +{ + TCGv_i32 tmp, tmp2; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + tmp2 = tcg_temp_new_i32(); + if (a->size == MO_32) { + for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) { + read_neon_element32(tmp, a->vm, pass, MO_32); + read_neon_element32(tmp2, a->vd, pass + 1, MO_32); + write_neon_element32(tmp2, a->vm, pass, MO_32); + write_neon_element32(tmp, a->vd, pass + 1, MO_32); + } + } else { + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + read_neon_element32(tmp, a->vm, pass, MO_32); + read_neon_element32(tmp2, a->vd, pass, MO_32); + if (a->size == MO_8) { + gen_neon_trn_u8(tmp, tmp2); + } else { + gen_neon_trn_u16(tmp, tmp2); + } + write_neon_element32(tmp2, a->vm, pass, MO_32); + write_neon_element32(tmp, a->vd, pass, MO_32); + } + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(tmp2); + return true; +} + +static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_smmla_b); +} + +static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_ummla_b); +} + +static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a) +{ + if (!dc_isar_feature(aa32_i8mm, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_usmmla_b); +} + +static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, + gen_helper_gvec_bfmmla); +} + +static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD, + gen_helper_gvec_bfmlal); +} + +static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a) +{ + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm, + (a->index << 1) | a->q, FPST_STD, + gen_helper_gvec_bfmlal_idx); +} diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c new file mode 100644 index 0000000..7b87a9d --- /dev/null +++ b/target/arm/tcg/translate-sme.c @@ -0,0 +1,373 @@ +/* + * AArch64 SME translation + * + * Copyright (c) 2022 Linaro, Ltd + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "tcg/tcg-gvec-desc.h" +#include "translate.h" +#include "exec/helper-gen.h" +#include "translate-a64.h" +#include "fpu/softfloat.h" + + +/* + * Include the generated decoder. + */ + +#include "decode-sme.c.inc" + + +/* + * Resolve tile.size[index] to a host pointer, where tile and index + * are always decoded together, dependent on the element size. + */ +static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, + int tile_index, bool vertical) +{ + int tile = tile_index >> (4 - esz); + int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz); + int pos, len, offset; + TCGv_i32 tmp; + TCGv_ptr addr; + + /* Compute the final index, which is Rs+imm. 
*/ + tmp = tcg_temp_new_i32(); + tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs)); + tcg_gen_addi_i32(tmp, tmp, index); + + /* Prepare a power-of-two modulo via extraction of @len bits. */ + len = ctz32(streaming_vec_reg_size(s)) - esz; + + if (vertical) { + /* + * Compute the byte offset of the index within the tile: + * (index % (svl / size)) * size + * = (index % (svl >> esz)) << esz + * Perform the power-of-two modulo via extraction of the low @len bits. + * Perform the multiply by shifting left by @pos bits. + * Perform these operations simultaneously via deposit into zero. + */ + pos = esz; + tcg_gen_deposit_z_i32(tmp, tmp, pos, len); + + /* + * For big-endian, adjust the indexed column byte offset within + * the uint64_t host words that make up env->zarray[]. + */ + if (HOST_BIG_ENDIAN && esz < MO_64) { + tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz)); + } + } else { + /* + * Compute the byte offset of the index within the tile: + * (index % (svl / size)) * (size * sizeof(row)) + * = (index % (svl >> esz)) << (esz + log2(sizeof(row))) + */ + pos = esz + ctz32(sizeof(ARMVectorReg)); + tcg_gen_deposit_z_i32(tmp, tmp, pos, len); + + /* Row slices are always aligned and need no endian adjustment. */ + } + + /* The tile byte offset within env->zarray is the row. */ + offset = tile * sizeof(ARMVectorReg); + + /* Include the byte offset of zarray to make this relative to env. */ + offset += offsetof(CPUARMState, zarray); + tcg_gen_addi_i32(tmp, tmp, offset); + + /* Add the byte offset to env to produce the final pointer. */ + addr = tcg_temp_new_ptr(); + tcg_gen_ext_i32_ptr(addr, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_add_ptr(addr, addr, cpu_env); + + return addr; +} + +static bool trans_ZERO(DisasContext *s, arg_ZERO *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_za_enabled_check(s)) { + gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm), + tcg_constant_i32(streaming_vec_reg_size(s))); + } + return true; +} + +static bool trans_MOVA(DisasContext *s, arg_MOVA *a) +{ + static gen_helper_gvec_4 * const h_fns[5] = { + gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, + gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d, + gen_helper_sve_sel_zpzz_q + }; + static gen_helper_gvec_3 * const cz_fns[5] = { + gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h, + gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d, + gen_helper_sme_mova_cz_q, + }; + static gen_helper_gvec_3 * const zc_fns[5] = { + gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h, + gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d, + gen_helper_sme_mova_zc_q, + }; + + TCGv_ptr t_za, t_zr, t_pg; + TCGv_i32 t_desc; + int svl; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sme_smza_enabled_check(s)) { + return true; + } + + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); + t_zr = vec_full_reg_ptr(s, a->zr); + t_pg = pred_full_reg_ptr(s, a->pg); + + svl = streaming_vec_reg_size(s); + t_desc = tcg_constant_i32(simd_desc(svl, svl, 0)); + + if (a->v) { + /* Vertical slice -- use sme mova helpers. */ + if (a->to_vec) { + zc_fns[a->esz](t_zr, t_za, t_pg, t_desc); + } else { + cz_fns[a->esz](t_za, t_zr, t_pg, t_desc); + } + } else { + /* Horizontal slice -- reuse sve sel helpers. 
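For intuition, the byte offset computed by get_tile_rowcol above boils down to a masked shift; a rough plain-C sketch of the offset relative to the start of zarray (ROW_BYTES stands in for sizeof(ARMVectorReg) and is only an assumption made for this example; the big-endian column fixup is omitted):

    #include <stdint.h>

    #define ROW_BYTES 256u   /* assumed sizeof(ARMVectorReg), for illustration only */

    static uint32_t tile_slice_offset(uint32_t tile, uint32_t index,
                                      uint32_t svl_bytes, uint32_t esz,
                                      int vertical)
    {
        uint32_t idx = index % (svl_bytes >> esz);    /* power-of-two modulo */
        uint32_t elem = vertical
                      ? idx << esz                    /* column: element-sized steps */
                      : idx * (ROW_BYTES << esz);     /* row slices sit (1 << esz) rows apart */
        return tile * ROW_BYTES + elem;               /* plus the tile's base row */
    }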
*/ + if (a->to_vec) { + h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc); + } else { + h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc); + } + } + + tcg_temp_free_ptr(t_za); + tcg_temp_free_ptr(t_zr); + tcg_temp_free_ptr(t_pg); + + return true; +} + +static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) +{ + typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32); + + /* + * Indexed by [esz][be][v][mte][st], which is (except for load/store) + * also the order in which the elements appear in the function names, + * and so how we must concatenate the pieces. + */ + +#define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F } +#define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) } +#define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) } +#define FN_END(L, B) { FN_HV(L), FN_HV(B) } + + static GenLdSt1 * const fns[5][2][2][2][2] = { + FN_END(b, b), + FN_END(h_le, h_be), + FN_END(s_le, s_be), + FN_END(d_le, d_be), + FN_END(q_le, q_be), + }; + +#undef FN_LS +#undef FN_MTE +#undef FN_HV +#undef FN_END + + TCGv_ptr t_za, t_pg; + TCGv_i64 addr; + int svl, desc = 0; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sme_smza_enabled_check(s)) { + return true; + } + + t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); + t_pg = pred_full_reg_ptr(s, a->pg); + addr = tcg_temp_new_i64(); + + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + + if (mte) { + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1); + desc <<= SVE_MTEDESC_SHIFT; + } else { + addr = clean_data_tbi(s, addr); + } + svl = streaming_vec_reg_size(s); + desc = simd_desc(svl, svl, desc); + + fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr, + tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_za); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i64(addr); + return true; +} + +typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int); + +static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn) +{ + int svl = streaming_vec_reg_size(s); + int imm = a->imm; + TCGv_ptr base; + + if (!sme_za_enabled_check(s)) { + return true; + } + + /* ZA[n] equates to ZA0H.B[n]. */ + base = get_tile_rowcol(s, MO_8, a->rv, imm, false); + + fn(s, base, 0, svl, a->rn, imm * svl); + + tcg_temp_free_ptr(base); + return true; +} + +TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr) +TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str) + +static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz, + gen_helper_gvec_4 *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, 0); + TCGv_ptr za, zn, pn, pm; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + /* Sum XZR+zad to find ZAd. 
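As a concrete reading of the fns[][][][][] table in trans_LDST1 above, the name pieces concatenate in exactly the index order (an illustrative expansion of the macros shown there):

    /* e.g. esz == MO_64, big-endian, horizontal slice, MTE enabled, load: */
    /*   fns[MO_64][1][0][1][0] -> FN_END(d_le, d_be) -> FN_HV(d_be)       */
    /*     -> FN_MTE(d_be_h) -> FN_LS(d_be_h_mte)                          */
    /*     -> gen_helper_sme_ld1d_be_h_mte                                 */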
*/ + za = get_tile_rowcol(s, esz, 31, a->zad, false); + zn = vec_full_reg_ptr(s, a->zn); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + + fn(za, zn, pn, pm, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(za); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pn); + tcg_temp_free_ptr(pm); + return true; +} + +TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s) +TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s) +TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d) +TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d) + +static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz, + gen_helper_gvec_5 *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, a->sub); + TCGv_ptr za, zn, zm, pn, pm; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + /* Sum XZR+zad to find ZAd. */ + za = get_tile_rowcol(s, esz, 31, a->zad, false); + zn = vec_full_reg_ptr(s, a->zn); + zm = vec_full_reg_ptr(s, a->zm); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + + fn(za, zn, zm, pn, pm, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(za); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pn); + tcg_temp_free_ptr(pm); + return true; +} + +static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz, + gen_helper_gvec_5_ptr *fn) +{ + int svl = streaming_vec_reg_size(s); + uint32_t desc = simd_desc(svl, svl, a->sub); + TCGv_ptr za, zn, zm, pn, pm, fpst; + + if (!sme_smza_enabled_check(s)) { + return true; + } + + /* Sum XZR+zad to find ZAd. */ + za = get_tile_rowcol(s, esz, 31, a->zad, false); + zn = vec_full_reg_ptr(s, a->zn); + zm = vec_full_reg_ptr(s, a->zm); + pn = pred_full_reg_ptr(s, a->pn); + pm = pred_full_reg_ptr(s, a->pm); + fpst = fpstatus_ptr(FPST_FPCR); + + fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(za); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pn); + tcg_temp_free_ptr(pm); + tcg_temp_free_ptr(fpst); + return true; +} + +TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h) +TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s) +TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d) + +/* TODO: FEAT_EBF16 */ +TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa) + +TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s) +TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s) +TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s) +TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s) + +TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d) +TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d) +TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d) +TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d) diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c new file mode 100644 index 0000000..621a2ab --- /dev/null +++ b/target/arm/tcg/translate-sve.c @@ -0,0 +1,7583 @@ +/* + * AArch64 SVE translation + * + * Copyright (c) 2018 Linaro, Ltd + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/exec-all.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "tcg/tcg-gvec-desc.h" +#include "qemu/log.h" +#include "arm_ldst.h" +#include "translate.h" +#include "internals.h" +#include "exec/helper-proto.h" +#include "exec/helper-gen.h" +#include "exec/log.h" +#include "translate-a64.h" +#include "fpu/softfloat.h" + + +typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, + TCGv_i64, uint32_t, uint32_t); + +typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + +typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i64, TCGv_i32); + +/* + * Helpers for extracting complex instruction fields. + */ + +/* See e.g. ASR (immediate, predicated). + * Returns -1 for unallocated encoding; diagnose later. + */ +static int tszimm_esz(DisasContext *s, int x) +{ + x >>= 3; /* discard imm3 */ + return 31 - clz32(x); +} + +static int tszimm_shr(DisasContext *s, int x) +{ + return (16 << tszimm_esz(s, x)) - x; +} + +/* See e.g. LSL (immediate, predicated). */ +static int tszimm_shl(DisasContext *s, int x) +{ + return x - (8 << tszimm_esz(s, x)); +} + +/* The SH bit is in bit 8. Extract the low 8 and shift. */ +static inline int expand_imm_sh8s(DisasContext *s, int x) +{ + return (int8_t)x << (x & 0x100 ? 8 : 0); +} + +static inline int expand_imm_sh8u(DisasContext *s, int x) +{ + return (uint8_t)x << (x & 0x100 ? 8 : 0); +} + +/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype) + * with unsigned data. C.f. SVE Memory Contiguous Load Group. + */ +static inline int msz_dtype(DisasContext *s, int msz) +{ + static const uint8_t dtype[4] = { 0, 5, 10, 15 }; + return dtype[msz]; +} + +/* + * Include the generated decoder. + */ + +#include "decode-sve.c.inc" + +/* + * Implement all of the translator functions referenced by the decoder. + */ + +/* Invoke an out-of-line helper on 2 Zregs. */ +static bool gen_gvec_ool_zz(DisasContext *s, gen_helper_gvec_2 *fn, + int rd, int rn, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_fpst_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, + int rd, int rn, int data, + ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + status, vsz, vsz, data, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, + arg_rr_esz *a, int data) +{ + return gen_gvec_fpst_zz(s, fn, a->rd, a->rn, data, + a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); +} + +/* Invoke an out-of-line helper on 3 Zregs. */ +static bool gen_gvec_ool_zzz(DisasContext *s, gen_helper_gvec_3 *fn, + int rd, int rn, int rm, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zzz(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rrr_esz *a, int data) +{ + return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, data); +} + +/* Invoke an out-of-line helper on 3 Zregs, plus float_status. */ +static bool gen_gvec_fpst_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + int rd, int rn, int rm, + int data, ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + status, vsz, vsz, data, fn); + + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + arg_rrr_esz *a, int data) +{ + return gen_gvec_fpst_zzz(s, fn, a->rd, a->rn, a->rm, data, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); +} + +/* Invoke an out-of-line helper on 4 Zregs. */ +static bool gen_gvec_ool_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, + int rd, int rn, int rm, int ra, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, + arg_rrrr_esz *a, int data) +{ + return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, data); +} + +static bool gen_gvec_ool_arg_zzxz(DisasContext *s, gen_helper_gvec_4 *fn, + arg_rrxr_esz *a) +{ + return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, a->index); +} + +/* Invoke an out-of-line helper on 4 Zregs, plus a pointer. */ +static bool gen_gvec_ptr_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int ra, + int data, TCGv_ptr ptr) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + ptr, vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_fpst_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int ra, + int data, ARMFPStatusFlavour flavour) +{ + TCGv_ptr status = fpstatus_ptr(flavour); + bool ret = gen_gvec_ptr_zzzz(s, fn, rd, rn, rm, ra, data, status); + tcg_temp_free_ptr(status); + return ret; +} + +/* Invoke an out-of-line helper on 4 Zregs, 1 Preg, plus fpst. 
*/ +static bool gen_gvec_fpst_zzzzp(DisasContext *s, gen_helper_gvec_5_ptr *fn, + int rd, int rn, int rm, int ra, int pg, + int data, ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_5_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + vec_full_reg_offset(s, ra), + pred_full_reg_offset(s, pg), + status, vsz, vsz, data, fn); + + tcg_temp_free_ptr(status); + } + return true; +} + +/* Invoke an out-of-line helper on 2 Zregs and a predicate. */ +static bool gen_gvec_ool_zzp(DisasContext *s, gen_helper_gvec_3 *fn, + int rd, int rn, int pg, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zpz(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rpr_esz *a, int data) +{ + return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, data); +} + +static bool gen_gvec_ool_arg_zpzi(DisasContext *s, gen_helper_gvec_3 *fn, + arg_rpri_esz *a) +{ + return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, a->imm); +} + +static bool gen_gvec_fpst_zzp(DisasContext *s, gen_helper_gvec_3_ptr *fn, + int rd, int rn, int pg, int data, + ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + status, vsz, vsz, data, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zpz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + arg_rpr_esz *a, int data, + ARMFPStatusFlavour flavour) +{ + return gen_gvec_fpst_zzp(s, fn, a->rd, a->rn, a->pg, data, flavour); +} + +/* Invoke an out-of-line helper on 3 Zregs and a predicate. */ +static bool gen_gvec_ool_zzzp(DisasContext *s, gen_helper_gvec_4 *fn, + int rd, int rn, int rm, int pg, int data) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + pred_full_reg_offset(s, pg), + vsz, vsz, data, fn); + } + return true; +} + +static bool gen_gvec_ool_arg_zpzz(DisasContext *s, gen_helper_gvec_4 *fn, + arg_rprr_esz *a, int data) +{ + return gen_gvec_ool_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, data); +} + +/* Invoke an out-of-line helper on 3 Zregs and a predicate. */ +static bool gen_gvec_fpst_zzzp(DisasContext *s, gen_helper_gvec_4_ptr *fn, + int rd, int rn, int rm, int pg, int data, + ARMFPStatusFlavour flavour) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(flavour); + + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + pred_full_reg_offset(s, pg), + status, vsz, vsz, data, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool gen_gvec_fpst_arg_zpzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, + arg_rprr_esz *a) +{ + return gen_gvec_fpst_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, 0, + a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); +} + +/* Invoke a vector expander on two Zregs and an immediate. */ +static bool gen_gvec_fn_zzi(DisasContext *s, GVecGen2iFn *gvec_fn, + int esz, int rd, int rn, uint64_t imm) +{ + if (gvec_fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gvec_fn(esz, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), imm, vsz, vsz); + } + return true; +} + +static bool gen_gvec_fn_arg_zzi(DisasContext *s, GVecGen2iFn *gvec_fn, + arg_rri_esz *a) +{ + if (a->esz < 0) { + /* Invalid tsz encoding -- see tszimm_esz. */ + return false; + } + return gen_gvec_fn_zzi(s, gvec_fn, a->esz, a->rd, a->rn, a->imm); +} + +/* Invoke a vector expander on three Zregs. */ +static bool gen_gvec_fn_zzz(DisasContext *s, GVecGen3Fn *gvec_fn, + int esz, int rd, int rn, int rm) +{ + if (gvec_fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gvec_fn(esz, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), vsz, vsz); + } + return true; +} + +static bool gen_gvec_fn_arg_zzz(DisasContext *s, GVecGen3Fn *fn, + arg_rrr_esz *a) +{ + return gen_gvec_fn_zzz(s, fn, a->esz, a->rd, a->rn, a->rm); +} + +/* Invoke a vector expander on four Zregs. */ +static bool gen_gvec_fn_arg_zzzz(DisasContext *s, GVecGen4Fn *gvec_fn, + arg_rrrr_esz *a) +{ + if (gvec_fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vec_full_reg_offset(s, a->ra), vsz, vsz); + } + return true; +} + +/* Invoke a vector move on two Zregs. */ +static bool do_mov_z(DisasContext *s, int rd, int rn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_mov(MO_8, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), vsz, vsz); + } + return true; +} + +/* Initialize a Zreg with replications of a 64-bit immediate. */ +static void do_dupi_z(DisasContext *s, int rd, uint64_t word) +{ + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word); +} + +/* Invoke a vector expander on three Pregs. */ +static bool gen_gvec_fn_ppp(DisasContext *s, GVecGen3Fn *gvec_fn, + int rd, int rn, int rm) +{ + if (sve_access_check(s)) { + unsigned psz = pred_gvec_reg_size(s); + gvec_fn(MO_64, pred_full_reg_offset(s, rd), + pred_full_reg_offset(s, rn), + pred_full_reg_offset(s, rm), psz, psz); + } + return true; +} + +/* Invoke a vector move on two Pregs. */ +static bool do_mov_p(DisasContext *s, int rd, int rn) +{ + if (sve_access_check(s)) { + unsigned psz = pred_gvec_reg_size(s); + tcg_gen_gvec_mov(MO_8, pred_full_reg_offset(s, rd), + pred_full_reg_offset(s, rn), psz, psz); + } + return true; +} + +/* Set the cpu flags as per a return from an SVE helper. */ +static void do_pred_flags(TCGv_i32 t) +{ + tcg_gen_mov_i32(cpu_NF, t); + tcg_gen_andi_i32(cpu_ZF, t, 2); + tcg_gen_andi_i32(cpu_CF, t, 1); + tcg_gen_movi_i32(cpu_VF, 0); +} + +/* Subroutines computing the ARM PredTest psuedofunction. 
*/ +static void do_predtest1(TCGv_i64 d, TCGv_i64 g) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + gen_helper_sve_predtest1(t, d, g); + do_pred_flags(t); + tcg_temp_free_i32(t); +} + +static void do_predtest(DisasContext *s, int dofs, int gofs, int words) +{ + TCGv_ptr dptr = tcg_temp_new_ptr(); + TCGv_ptr gptr = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_temp_new_i32(); + + tcg_gen_addi_ptr(dptr, cpu_env, dofs); + tcg_gen_addi_ptr(gptr, cpu_env, gofs); + + gen_helper_sve_predtest(t, dptr, gptr, tcg_constant_i32(words)); + tcg_temp_free_ptr(dptr); + tcg_temp_free_ptr(gptr); + + do_pred_flags(t); + tcg_temp_free_i32(t); +} + +/* For each element size, the bits within a predicate word that are active. */ +const uint64_t pred_esz_masks[5] = { + 0xffffffffffffffffull, 0x5555555555555555ull, + 0x1111111111111111ull, 0x0101010101010101ull, + 0x0001000100010001ull, +}; + +static bool trans_INVALID(DisasContext *s, arg_INVALID *a) +{ + unallocated_encoding(s); + return true; +} + +/* + *** SVE Logical - Unpredicated Group + */ + +TRANS_FEAT(AND_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_and, a) +TRANS_FEAT(ORR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_or, a) +TRANS_FEAT(EOR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_xor, a) +TRANS_FEAT(BIC_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_andc, a) + +static void gen_xar8_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + uint64_t mask = dup_const(MO_8, 0xff >> sh); + + tcg_gen_xor_i64(t, n, m); + tcg_gen_shri_i64(d, t, sh); + tcg_gen_shli_i64(t, t, 8 - sh); + tcg_gen_andi_i64(d, d, mask); + tcg_gen_andi_i64(t, t, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_xar16_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + uint64_t mask = dup_const(MO_16, 0xffff >> sh); + + tcg_gen_xor_i64(t, n, m); + tcg_gen_shri_i64(d, t, sh); + tcg_gen_shli_i64(t, t, 16 - sh); + tcg_gen_andi_i64(d, d, mask); + tcg_gen_andi_i64(t, t, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_xar_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, int32_t sh) +{ + tcg_gen_xor_i32(d, n, m); + tcg_gen_rotri_i32(d, d, sh); +} + +static void gen_xar_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) +{ + tcg_gen_xor_i64(d, n, m); + tcg_gen_rotri_i64(d, d, sh); +} + +static void gen_xar_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, int64_t sh) +{ + tcg_gen_xor_vec(vece, d, n, m); + tcg_gen_rotri_vec(vece, d, d, sh); +} + +void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, int64_t shift, + uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop[] = { INDEX_op_rotli_vec, 0 }; + static const GVecGen3i ops[4] = { + { .fni8 = gen_xar8_i64, + .fniv = gen_xar_vec, + .fno = gen_helper_sve2_xar_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fni8 = gen_xar16_i64, + .fniv = gen_xar_vec, + .fno = gen_helper_sve2_xar_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_xar_i32, + .fniv = gen_xar_vec, + .fno = gen_helper_sve2_xar_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_xar_i64, + .fniv = gen_xar_vec, + .fno = gen_helper_gvec_xar_d, + .opt_opc = vecop, + .vece = MO_64 } + }; + int esize = 8 << vece; + + /* The SVE2 range is 1 .. esize; the AdvSIMD range is 0 .. esize-1. */ + tcg_debug_assert(shift >= 0); + tcg_debug_assert(shift <= esize); + shift &= esize - 1; + + if (shift == 0) { + /* xar with no rotate devolves to xor. 
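For intuition, gen_xar8_i64 above performs a per-lane rotate with two shifts and a pair of masks; a rough, self-contained plain-C equivalent for the 8-bit case (illustrative only, valid for sh in 1..7 since sh == 0 takes the plain-XOR path):

    #include <stdint.h>

    /* XAR on eight packed 8-bit lanes: xor, then rotate each lane right
     * by sh without letting bits cross lane boundaries. */
    static uint64_t xar8_u64(uint64_t n, uint64_t m, unsigned sh)
    {
        uint64_t t = n ^ m;
        uint64_t mask = 0x0101010101010101ull * (0xffu >> sh);  /* dup_const(MO_8, 0xff >> sh) */

        return ((t >> sh) & mask) | ((t << (8 - sh)) & ~mask);
    }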
*/ + tcg_gen_gvec_xor(vece, rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz); + } else { + tcg_gen_gvec_3i(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, + shift, &ops[vece]); + } +} + +static bool trans_XAR(DisasContext *s, arg_rrri_esz *a) +{ + if (a->esz < 0 || !dc_isar_feature(aa64_sve2, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + gen_gvec_xar(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), a->imm, vsz, vsz); + } + return true; +} + +static void gen_eor3_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_xor_i64(d, n, m); + tcg_gen_xor_i64(d, d, k); +} + +static void gen_eor3_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + tcg_gen_xor_vec(vece, d, n, m); + tcg_gen_xor_vec(vece, d, d, k); +} + +static void gen_eor3(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_eor3_i64, + .fniv = gen_eor3_vec, + .fno = gen_helper_sve2_eor3, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(EOR3, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_eor3, a) + +static void gen_bcax_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_andc_i64(d, m, k); + tcg_gen_xor_i64(d, d, n); +} + +static void gen_bcax_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + tcg_gen_andc_vec(vece, d, m, k); + tcg_gen_xor_vec(vece, d, d, n); +} + +static void gen_bcax(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_bcax_i64, + .fniv = gen_bcax_vec, + .fno = gen_helper_sve2_bcax, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(BCAX, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bcax, a) + +static void gen_bsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + /* BSL differs from the generic bitsel in argument ordering. 
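+ tcg_gen_gvec_bitsel takes the select mask as its first source operand,
+ which is why 'a' rather than 'n' is passed first below.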
*/ + tcg_gen_gvec_bitsel(vece, d, a, n, m, oprsz, maxsz); +} + +TRANS_FEAT(BSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl, a) + +static void gen_bsl1n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_andc_i64(n, k, n); + tcg_gen_andc_i64(m, m, k); + tcg_gen_or_i64(d, n, m); +} + +static void gen_bsl1n_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + if (TCG_TARGET_HAS_bitsel_vec) { + tcg_gen_not_vec(vece, n, n); + tcg_gen_bitsel_vec(vece, d, k, n, m); + } else { + tcg_gen_andc_vec(vece, n, k, n); + tcg_gen_andc_vec(vece, m, m, k); + tcg_gen_or_vec(vece, d, n, m); + } +} + +static void gen_bsl1n(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_bsl1n_i64, + .fniv = gen_bsl1n_vec, + .fno = gen_helper_sve2_bsl1n, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(BSL1N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl1n, a) + +static void gen_bsl2n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + /* + * Z[dn] = (n & k) | (~m & ~k) + * = | ~(m | k) + */ + tcg_gen_and_i64(n, n, k); + if (TCG_TARGET_HAS_orc_i64) { + tcg_gen_or_i64(m, m, k); + tcg_gen_orc_i64(d, n, m); + } else { + tcg_gen_nor_i64(m, m, k); + tcg_gen_or_i64(d, n, m); + } +} + +static void gen_bsl2n_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + if (TCG_TARGET_HAS_bitsel_vec) { + tcg_gen_not_vec(vece, m, m); + tcg_gen_bitsel_vec(vece, d, k, n, m); + } else { + tcg_gen_and_vec(vece, n, n, k); + tcg_gen_or_vec(vece, m, m, k); + tcg_gen_orc_vec(vece, d, n, m); + } +} + +static void gen_bsl2n(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_bsl2n_i64, + .fniv = gen_bsl2n_vec, + .fno = gen_helper_sve2_bsl2n, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(BSL2N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl2n, a) + +static void gen_nbsl_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) +{ + tcg_gen_and_i64(n, n, k); + tcg_gen_andc_i64(m, m, k); + tcg_gen_nor_i64(d, n, m); +} + +static void gen_nbsl_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec k) +{ + tcg_gen_bitsel_vec(vece, d, k, n, m); + tcg_gen_not_vec(vece, d, d); +} + +static void gen_nbsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen4 op = { + .fni8 = gen_nbsl_i64, + .fniv = gen_nbsl_vec, + .fno = gen_helper_sve2_nbsl, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); +} + +TRANS_FEAT(NBSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_nbsl, a) + +/* + *** SVE Integer Arithmetic - Unpredicated Group + */ + +TRANS_FEAT(ADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_add, a) +TRANS_FEAT(SUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sub, a) +TRANS_FEAT(SQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ssadd, a) +TRANS_FEAT(SQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sssub, a) +TRANS_FEAT(UQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_usadd, a) +TRANS_FEAT(UQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ussub, a) + +/* + *** SVE Integer Arithmetic - Binary Predicated Group + */ + +/* Select active elememnts from Zn and inactive elements from Zm, + * storing the result in Zd. 
+ */ +static bool do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz) +{ + static gen_helper_gvec_4 * const fns[4] = { + gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, + gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d + }; + return gen_gvec_ool_zzzp(s, fns[esz], rd, rn, rm, pg, 0); +} + +#define DO_ZPZZ(NAME, FEAT, name) \ + static gen_helper_gvec_4 * const name##_zpzz_fns[4] = { \ + gen_helper_##name##_zpzz_b, gen_helper_##name##_zpzz_h, \ + gen_helper_##name##_zpzz_s, gen_helper_##name##_zpzz_d, \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpzz, \ + name##_zpzz_fns[a->esz], a, 0) + +DO_ZPZZ(AND_zpzz, aa64_sve, sve_and) +DO_ZPZZ(EOR_zpzz, aa64_sve, sve_eor) +DO_ZPZZ(ORR_zpzz, aa64_sve, sve_orr) +DO_ZPZZ(BIC_zpzz, aa64_sve, sve_bic) + +DO_ZPZZ(ADD_zpzz, aa64_sve, sve_add) +DO_ZPZZ(SUB_zpzz, aa64_sve, sve_sub) + +DO_ZPZZ(SMAX_zpzz, aa64_sve, sve_smax) +DO_ZPZZ(UMAX_zpzz, aa64_sve, sve_umax) +DO_ZPZZ(SMIN_zpzz, aa64_sve, sve_smin) +DO_ZPZZ(UMIN_zpzz, aa64_sve, sve_umin) +DO_ZPZZ(SABD_zpzz, aa64_sve, sve_sabd) +DO_ZPZZ(UABD_zpzz, aa64_sve, sve_uabd) + +DO_ZPZZ(MUL_zpzz, aa64_sve, sve_mul) +DO_ZPZZ(SMULH_zpzz, aa64_sve, sve_smulh) +DO_ZPZZ(UMULH_zpzz, aa64_sve, sve_umulh) + +DO_ZPZZ(ASR_zpzz, aa64_sve, sve_asr) +DO_ZPZZ(LSR_zpzz, aa64_sve, sve_lsr) +DO_ZPZZ(LSL_zpzz, aa64_sve, sve_lsl) + +static gen_helper_gvec_4 * const sdiv_fns[4] = { + NULL, NULL, gen_helper_sve_sdiv_zpzz_s, gen_helper_sve_sdiv_zpzz_d +}; +TRANS_FEAT(SDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, sdiv_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const udiv_fns[4] = { + NULL, NULL, gen_helper_sve_udiv_zpzz_s, gen_helper_sve_udiv_zpzz_d +}; +TRANS_FEAT(UDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, udiv_fns[a->esz], a, 0) + +TRANS_FEAT(SEL_zpzz, aa64_sve, do_sel_z, a->rd, a->rn, a->rm, a->pg, a->esz) + +/* + *** SVE Integer Arithmetic - Unary Predicated Group + */ + +#define DO_ZPZ(NAME, FEAT, name) \ + static gen_helper_gvec_3 * const name##_fns[4] = { \ + gen_helper_##name##_b, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d, \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpz, name##_fns[a->esz], a, 0) + +DO_ZPZ(CLS, aa64_sve, sve_cls) +DO_ZPZ(CLZ, aa64_sve, sve_clz) +DO_ZPZ(CNT_zpz, aa64_sve, sve_cnt_zpz) +DO_ZPZ(CNOT, aa64_sve, sve_cnot) +DO_ZPZ(NOT_zpz, aa64_sve, sve_not_zpz) +DO_ZPZ(ABS, aa64_sve, sve_abs) +DO_ZPZ(NEG, aa64_sve, sve_neg) +DO_ZPZ(RBIT, aa64_sve, sve_rbit) + +static gen_helper_gvec_3 * const fabs_fns[4] = { + NULL, gen_helper_sve_fabs_h, + gen_helper_sve_fabs_s, gen_helper_sve_fabs_d, +}; +TRANS_FEAT(FABS, aa64_sve, gen_gvec_ool_arg_zpz, fabs_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const fneg_fns[4] = { + NULL, gen_helper_sve_fneg_h, + gen_helper_sve_fneg_s, gen_helper_sve_fneg_d, +}; +TRANS_FEAT(FNEG, aa64_sve, gen_gvec_ool_arg_zpz, fneg_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sxtb_fns[4] = { + NULL, gen_helper_sve_sxtb_h, + gen_helper_sve_sxtb_s, gen_helper_sve_sxtb_d, +}; +TRANS_FEAT(SXTB, aa64_sve, gen_gvec_ool_arg_zpz, sxtb_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const uxtb_fns[4] = { + NULL, gen_helper_sve_uxtb_h, + gen_helper_sve_uxtb_s, gen_helper_sve_uxtb_d, +}; +TRANS_FEAT(UXTB, aa64_sve, gen_gvec_ool_arg_zpz, uxtb_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sxth_fns[4] = { + NULL, NULL, gen_helper_sve_sxth_s, gen_helper_sve_sxth_d +}; +TRANS_FEAT(SXTH, aa64_sve, gen_gvec_ool_arg_zpz, sxth_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const uxth_fns[4] = { + NULL, NULL, 
gen_helper_sve_uxth_s, gen_helper_sve_uxth_d +}; +TRANS_FEAT(UXTH, aa64_sve, gen_gvec_ool_arg_zpz, uxth_fns[a->esz], a, 0) + +TRANS_FEAT(SXTW, aa64_sve, gen_gvec_ool_arg_zpz, + a->esz == 3 ? gen_helper_sve_sxtw_d : NULL, a, 0) +TRANS_FEAT(UXTW, aa64_sve, gen_gvec_ool_arg_zpz, + a->esz == 3 ? gen_helper_sve_uxtw_d : NULL, a, 0) + +/* + *** SVE Integer Reduction Group + */ + +typedef void gen_helper_gvec_reduc(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_i32); +static bool do_vpz_ool(DisasContext *s, arg_rpr_esz *a, + gen_helper_gvec_reduc *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zn, t_pg; + TCGv_i32 desc; + TCGv_i64 temp; + + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + temp = tcg_temp_new_i64(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + fn(temp, t_zn, t_pg, desc); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + + write_fp_dreg(s, a->rd, temp); + tcg_temp_free_i64(temp); + return true; +} + +#define DO_VPZ(NAME, name) \ + static gen_helper_gvec_reduc * const name##_fns[4] = { \ + gen_helper_sve_##name##_b, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_vpz_ool, a, name##_fns[a->esz]) + +DO_VPZ(ORV, orv) +DO_VPZ(ANDV, andv) +DO_VPZ(EORV, eorv) + +DO_VPZ(UADDV, uaddv) +DO_VPZ(SMAXV, smaxv) +DO_VPZ(UMAXV, umaxv) +DO_VPZ(SMINV, sminv) +DO_VPZ(UMINV, uminv) + +static gen_helper_gvec_reduc * const saddv_fns[4] = { + gen_helper_sve_saddv_b, gen_helper_sve_saddv_h, + gen_helper_sve_saddv_s, NULL +}; +TRANS_FEAT(SADDV, aa64_sve, do_vpz_ool, a, saddv_fns[a->esz]) + +#undef DO_VPZ + +/* + *** SVE Shift by Immediate - Predicated Group + */ + +/* + * Copy Zn into Zd, storing zeros into inactive elements. + * If invert, store zeros into the active elements. + */ +static bool do_movz_zpz(DisasContext *s, int rd, int rn, int pg, + int esz, bool invert) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_movz_b, gen_helper_sve_movz_h, + gen_helper_sve_movz_s, gen_helper_sve_movz_d, + }; + return gen_gvec_ool_zzp(s, fns[esz], rd, rn, pg, invert); +} + +static bool do_shift_zpzi(DisasContext *s, arg_rpri_esz *a, bool asr, + gen_helper_gvec_3 * const fns[4]) +{ + int max; + + if (a->esz < 0) { + /* Invalid tsz encoding -- see tszimm_esz. */ + return false; + } + + /* + * Shift by element size is architecturally valid. + * For arithmetic right-shift, it's the same as by one less. + * For logical shifts and ASRD, it is a zeroing operation. 
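+ * (e.g. a predicated LSR by 8 on byte elements zeroes the active elements
+ * and leaves the inactive ones unchanged, via do_movz_zpz with invert set.)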
+ */ + max = 8 << a->esz; + if (a->imm >= max) { + if (asr) { + a->imm = max - 1; + } else { + return do_movz_zpz(s, a->rd, a->rd, a->pg, a->esz, true); + } + } + return gen_gvec_ool_arg_zpzi(s, fns[a->esz], a); +} + +static gen_helper_gvec_3 * const asr_zpzi_fns[4] = { + gen_helper_sve_asr_zpzi_b, gen_helper_sve_asr_zpzi_h, + gen_helper_sve_asr_zpzi_s, gen_helper_sve_asr_zpzi_d, +}; +TRANS_FEAT(ASR_zpzi, aa64_sve, do_shift_zpzi, a, true, asr_zpzi_fns) + +static gen_helper_gvec_3 * const lsr_zpzi_fns[4] = { + gen_helper_sve_lsr_zpzi_b, gen_helper_sve_lsr_zpzi_h, + gen_helper_sve_lsr_zpzi_s, gen_helper_sve_lsr_zpzi_d, +}; +TRANS_FEAT(LSR_zpzi, aa64_sve, do_shift_zpzi, a, false, lsr_zpzi_fns) + +static gen_helper_gvec_3 * const lsl_zpzi_fns[4] = { + gen_helper_sve_lsl_zpzi_b, gen_helper_sve_lsl_zpzi_h, + gen_helper_sve_lsl_zpzi_s, gen_helper_sve_lsl_zpzi_d, +}; +TRANS_FEAT(LSL_zpzi, aa64_sve, do_shift_zpzi, a, false, lsl_zpzi_fns) + +static gen_helper_gvec_3 * const asrd_fns[4] = { + gen_helper_sve_asrd_b, gen_helper_sve_asrd_h, + gen_helper_sve_asrd_s, gen_helper_sve_asrd_d, +}; +TRANS_FEAT(ASRD, aa64_sve, do_shift_zpzi, a, false, asrd_fns) + +static gen_helper_gvec_3 * const sqshl_zpzi_fns[4] = { + gen_helper_sve2_sqshl_zpzi_b, gen_helper_sve2_sqshl_zpzi_h, + gen_helper_sve2_sqshl_zpzi_s, gen_helper_sve2_sqshl_zpzi_d, +}; +TRANS_FEAT(SQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : sqshl_zpzi_fns[a->esz], a) + +static gen_helper_gvec_3 * const uqshl_zpzi_fns[4] = { + gen_helper_sve2_uqshl_zpzi_b, gen_helper_sve2_uqshl_zpzi_h, + gen_helper_sve2_uqshl_zpzi_s, gen_helper_sve2_uqshl_zpzi_d, +}; +TRANS_FEAT(UQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : uqshl_zpzi_fns[a->esz], a) + +static gen_helper_gvec_3 * const srshr_fns[4] = { + gen_helper_sve2_srshr_b, gen_helper_sve2_srshr_h, + gen_helper_sve2_srshr_s, gen_helper_sve2_srshr_d, +}; +TRANS_FEAT(SRSHR, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : srshr_fns[a->esz], a) + +static gen_helper_gvec_3 * const urshr_fns[4] = { + gen_helper_sve2_urshr_b, gen_helper_sve2_urshr_h, + gen_helper_sve2_urshr_s, gen_helper_sve2_urshr_d, +}; +TRANS_FEAT(URSHR, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : urshr_fns[a->esz], a) + +static gen_helper_gvec_3 * const sqshlu_fns[4] = { + gen_helper_sve2_sqshlu_b, gen_helper_sve2_sqshlu_h, + gen_helper_sve2_sqshlu_s, gen_helper_sve2_sqshlu_d, +}; +TRANS_FEAT(SQSHLU, aa64_sve2, gen_gvec_ool_arg_zpzi, + a->esz < 0 ? NULL : sqshlu_fns[a->esz], a) + +/* + *** SVE Bitwise Shift - Predicated Group + */ + +#define DO_ZPZW(NAME, name) \ + static gen_helper_gvec_4 * const name##_zpzw_fns[4] = { \ + gen_helper_sve_##name##_zpzw_b, gen_helper_sve_##name##_zpzw_h, \ + gen_helper_sve_##name##_zpzw_s, NULL \ + }; \ + TRANS_FEAT(NAME##_zpzw, aa64_sve, gen_gvec_ool_arg_zpzz, \ + a->esz < 0 ? NULL : name##_zpzw_fns[a->esz], a, 0) + +DO_ZPZW(ASR, asr) +DO_ZPZW(LSR, lsr) +DO_ZPZW(LSL, lsl) + +#undef DO_ZPZW + +/* + *** SVE Bitwise Shift - Unpredicated Group + */ + +static bool do_shift_imm(DisasContext *s, arg_rri_esz *a, bool asr, + void (*gvec_fn)(unsigned, uint32_t, uint32_t, + int64_t, uint32_t, uint32_t)) +{ + if (a->esz < 0) { + /* Invalid tsz encoding -- see tszimm_esz. */ + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + /* Shift by element size is architecturally valid. For + arithmetic right-shift, it's the same as by one less. + Otherwise it is a zeroing operation. 
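+ (Here the zeroing applies to the whole register: the unpredicated form
+ writes zero to Zd via do_dupi_z rather than merging.)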
*/ + if (a->imm >= 8 << a->esz) { + if (asr) { + a->imm = (8 << a->esz) - 1; + } else { + do_dupi_z(s, a->rd, 0); + return true; + } + } + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +TRANS_FEAT(ASR_zzi, aa64_sve, do_shift_imm, a, true, tcg_gen_gvec_sari) +TRANS_FEAT(LSR_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shri) +TRANS_FEAT(LSL_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shli) + +#define DO_ZZW(NAME, name) \ + static gen_helper_gvec_3 * const name##_zzw_fns[4] = { \ + gen_helper_sve_##name##_zzw_b, gen_helper_sve_##name##_zzw_h, \ + gen_helper_sve_##name##_zzw_s, NULL \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_arg_zzz, \ + name##_zzw_fns[a->esz], a, 0) + +DO_ZZW(ASR_zzw, asr) +DO_ZZW(LSR_zzw, lsr) +DO_ZZW(LSL_zzw, lsl) + +#undef DO_ZZW + +/* + *** SVE Integer Multiply-Add Group + */ + +static bool do_zpzzz_ool(DisasContext *s, arg_rprrr_esz *a, + gen_helper_gvec_5 *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_5_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->ra), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + vsz, vsz, 0, fn); + } + return true; +} + +static gen_helper_gvec_5 * const mla_fns[4] = { + gen_helper_sve_mla_b, gen_helper_sve_mla_h, + gen_helper_sve_mla_s, gen_helper_sve_mla_d, +}; +TRANS_FEAT(MLA, aa64_sve, do_zpzzz_ool, a, mla_fns[a->esz]) + +static gen_helper_gvec_5 * const mls_fns[4] = { + gen_helper_sve_mls_b, gen_helper_sve_mls_h, + gen_helper_sve_mls_s, gen_helper_sve_mls_d, +}; +TRANS_FEAT(MLS, aa64_sve, do_zpzzz_ool, a, mls_fns[a->esz]) + +/* + *** SVE Index Generation Group + */ + +static bool do_index(DisasContext *s, int esz, int rd, + TCGv_i64 start, TCGv_i64 incr) +{ + unsigned vsz; + TCGv_i32 desc; + TCGv_ptr t_zd; + + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + t_zd = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); + if (esz == 3) { + gen_helper_sve_index_d(t_zd, start, incr, desc); + } else { + typedef void index_fn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32); + static index_fn * const fns[3] = { + gen_helper_sve_index_b, + gen_helper_sve_index_h, + gen_helper_sve_index_s, + }; + TCGv_i32 s32 = tcg_temp_new_i32(); + TCGv_i32 i32 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(s32, start); + tcg_gen_extrl_i64_i32(i32, incr); + fns[esz](t_zd, s32, i32, desc); + + tcg_temp_free_i32(s32); + tcg_temp_free_i32(i32); + } + tcg_temp_free_ptr(t_zd); + return true; +} + +TRANS_FEAT(INDEX_ii, aa64_sve, do_index, a->esz, a->rd, + tcg_constant_i64(a->imm1), tcg_constant_i64(a->imm2)) +TRANS_FEAT(INDEX_ir, aa64_sve, do_index, a->esz, a->rd, + tcg_constant_i64(a->imm), cpu_reg(s, a->rm)) +TRANS_FEAT(INDEX_ri, aa64_sve, do_index, a->esz, a->rd, + cpu_reg(s, a->rn), tcg_constant_i64(a->imm)) +TRANS_FEAT(INDEX_rr, aa64_sve, do_index, a->esz, a->rd, + cpu_reg(s, a->rn), cpu_reg(s, a->rm)) + +/* + *** SVE Stack Allocation Group + */ + +static bool trans_ADDVL(DisasContext *s, arg_ADDVL *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * vec_full_reg_size(s)); + } + return true; +} + +static bool trans_ADDSVL(DisasContext *s, arg_ADDSVL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return 
false; + } + if (sme_enabled_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * streaming_vec_reg_size(s)); + } + return true; +} + +static bool trans_ADDPL(DisasContext *s, arg_ADDPL *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * pred_full_reg_size(s)); + } + return true; +} + +static bool trans_ADDSPL(DisasContext *s, arg_ADDSPL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 rd = cpu_reg_sp(s, a->rd); + TCGv_i64 rn = cpu_reg_sp(s, a->rn); + tcg_gen_addi_i64(rd, rn, a->imm * streaming_pred_reg_size(s)); + } + return true; +} + +static bool trans_RDVL(DisasContext *s, arg_RDVL *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + tcg_gen_movi_i64(reg, a->imm * vec_full_reg_size(s)); + } + return true; +} + +static bool trans_RDSVL(DisasContext *s, arg_RDSVL *a) +{ + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (sme_enabled_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + tcg_gen_movi_i64(reg, a->imm * streaming_vec_reg_size(s)); + } + return true; +} + +/* + *** SVE Compute Vector Address Group + */ + +static bool do_adr(DisasContext *s, arg_rrri *a, gen_helper_gvec_3 *fn) +{ + return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, a->imm); +} + +TRANS_FEAT_NONSTREAMING(ADR_p32, aa64_sve, do_adr, a, gen_helper_sve_adr_p32) +TRANS_FEAT_NONSTREAMING(ADR_p64, aa64_sve, do_adr, a, gen_helper_sve_adr_p64) +TRANS_FEAT_NONSTREAMING(ADR_s32, aa64_sve, do_adr, a, gen_helper_sve_adr_s32) +TRANS_FEAT_NONSTREAMING(ADR_u32, aa64_sve, do_adr, a, gen_helper_sve_adr_u32) + +/* + *** SVE Integer Misc - Unpredicated Group + */ + +static gen_helper_gvec_2 * const fexpa_fns[4] = { + NULL, gen_helper_sve_fexpa_h, + gen_helper_sve_fexpa_s, gen_helper_sve_fexpa_d, +}; +TRANS_FEAT_NONSTREAMING(FEXPA, aa64_sve, gen_gvec_ool_zz, + fexpa_fns[a->esz], a->rd, a->rn, 0) + +static gen_helper_gvec_3 * const ftssel_fns[4] = { + NULL, gen_helper_sve_ftssel_h, + gen_helper_sve_ftssel_s, gen_helper_sve_ftssel_d, +}; +TRANS_FEAT_NONSTREAMING(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz, + ftssel_fns[a->esz], a, 0) + +/* + *** SVE Predicate Logical Operations Group + */ + +static bool do_pppp_flags(DisasContext *s, arg_rprr_s *a, + const GVecGen4 *gvec_op) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned psz = pred_gvec_reg_size(s); + int dofs = pred_full_reg_offset(s, a->rd); + int nofs = pred_full_reg_offset(s, a->rn); + int mofs = pred_full_reg_offset(s, a->rm); + int gofs = pred_full_reg_offset(s, a->pg); + + if (!a->s) { + tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op); + return true; + } + + if (psz == 8) { + /* Do the operation and the flags generation in temps. */ + TCGv_i64 pd = tcg_temp_new_i64(); + TCGv_i64 pn = tcg_temp_new_i64(); + TCGv_i64 pm = tcg_temp_new_i64(); + TCGv_i64 pg = tcg_temp_new_i64(); + + tcg_gen_ld_i64(pn, cpu_env, nofs); + tcg_gen_ld_i64(pm, cpu_env, mofs); + tcg_gen_ld_i64(pg, cpu_env, gofs); + + gvec_op->fni8(pd, pn, pm, pg); + tcg_gen_st_i64(pd, cpu_env, dofs); + + do_predtest1(pd, pg); + + tcg_temp_free_i64(pd); + tcg_temp_free_i64(pn); + tcg_temp_free_i64(pm); + tcg_temp_free_i64(pg); + } else { + /* The operation and flags generation is large. 
The computation + * of the flags depends on the original contents of the guarding + * predicate. If the destination overwrites the guarding predicate, + * then the easiest way to get this right is to save a copy. + */ + int tofs = gofs; + if (a->rd == a->pg) { + tofs = offsetof(CPUARMState, vfp.preg_tmp); + tcg_gen_gvec_mov(0, tofs, gofs, psz, psz); + } + + tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op); + do_predtest(s, dofs, tofs, psz / 8); + } + return true; +} + +static void gen_and_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_and_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_and_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_and_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_AND_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_and_pg_i64, + .fniv = gen_and_pg_vec, + .fno = gen_helper_sve_and_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!a->s) { + if (a->rn == a->rm) { + if (a->pg == a->rn) { + return do_mov_p(s, a->rd, a->rn); + } + return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->pg); + } else if (a->pg == a->rn || a->pg == a->rm) { + return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->rm); + } + } + return do_pppp_flags(s, a, &op); +} + +static void gen_bic_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_andc_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_bic_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_andc_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_BIC_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_bic_pg_i64, + .fniv = gen_bic_pg_vec, + .fno = gen_helper_sve_bic_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!a->s && a->pg == a->rn) { + return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->rn, a->rm); + } + return do_pppp_flags(s, a, &op); +} + +static void gen_eor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_xor_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_eor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_xor_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_EOR_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_eor_pg_i64, + .fniv = gen_eor_pg_vec, + .fno = gen_helper_sve_eor_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Alias NOT (predicate) is EOR Pd.B, Pg/Z, Pn.B, Pg.B */ + if (!a->s && a->pg == a->rm) { + return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->pg, a->rn); + } + return do_pppp_flags(s, a, &op); +} + +static bool trans_SEL_pppp(DisasContext *s, arg_rprr_s *a) +{ + if (a->s || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned psz = pred_gvec_reg_size(s); + tcg_gen_gvec_bitsel(MO_8, pred_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->pg), + pred_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->rm), psz, psz); + } + return true; +} + +static void gen_orr_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_or_i64(pd, pn, pm); + 
tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_orr_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_or_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_ORR_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_orr_pg_i64, + .fniv = gen_orr_pg_vec, + .fno = gen_helper_sve_orr_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!a->s && a->pg == a->rn && a->rn == a->rm) { + return do_mov_p(s, a->rd, a->rn); + } + return do_pppp_flags(s, a, &op); +} + +static void gen_orn_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_orc_i64(pd, pn, pm); + tcg_gen_and_i64(pd, pd, pg); +} + +static void gen_orn_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_orc_vec(vece, pd, pn, pm); + tcg_gen_and_vec(vece, pd, pd, pg); +} + +static bool trans_ORN_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_orn_pg_i64, + .fniv = gen_orn_pg_vec, + .fno = gen_helper_sve_orn_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return do_pppp_flags(s, a, &op); +} + +static void gen_nor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_or_i64(pd, pn, pm); + tcg_gen_andc_i64(pd, pg, pd); +} + +static void gen_nor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_or_vec(vece, pd, pn, pm); + tcg_gen_andc_vec(vece, pd, pg, pd); +} + +static bool trans_NOR_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_nor_pg_i64, + .fniv = gen_nor_pg_vec, + .fno = gen_helper_sve_nor_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return do_pppp_flags(s, a, &op); +} + +static void gen_nand_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) +{ + tcg_gen_and_i64(pd, pn, pm); + tcg_gen_andc_i64(pd, pg, pd); +} + +static void gen_nand_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, + TCGv_vec pm, TCGv_vec pg) +{ + tcg_gen_and_vec(vece, pd, pn, pm); + tcg_gen_andc_vec(vece, pd, pg, pd); +} + +static bool trans_NAND_pppp(DisasContext *s, arg_rprr_s *a) +{ + static const GVecGen4 op = { + .fni8 = gen_nand_pg_i64, + .fniv = gen_nand_pg_vec, + .fno = gen_helper_sve_nand_pppp, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return do_pppp_flags(s, a, &op); +} + +/* + *** SVE Predicate Misc Group + */ + +static bool trans_PTEST(DisasContext *s, arg_PTEST *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int nofs = pred_full_reg_offset(s, a->rn); + int gofs = pred_full_reg_offset(s, a->pg); + int words = DIV_ROUND_UP(pred_full_reg_size(s), 8); + + if (words == 1) { + TCGv_i64 pn = tcg_temp_new_i64(); + TCGv_i64 pg = tcg_temp_new_i64(); + + tcg_gen_ld_i64(pn, cpu_env, nofs); + tcg_gen_ld_i64(pg, cpu_env, gofs); + do_predtest1(pn, pg); + + tcg_temp_free_i64(pn); + tcg_temp_free_i64(pg); + } else { + do_predtest(s, nofs, gofs, words); + } + } + return true; +} + +/* See the ARM pseudocode DecodePredCount. 
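+ e.g. with a 256-bit vector and 32-bit elements there are 8 elements, so
+ VL8 yields 8, VL16 yields 0 (the vector is too short) and MUL3 yields 6.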
*/ +static unsigned decode_pred_count(unsigned fullsz, int pattern, int esz) +{ + unsigned elements = fullsz >> esz; + unsigned bound; + + switch (pattern) { + case 0x0: /* POW2 */ + return pow2floor(elements); + case 0x1: /* VL1 */ + case 0x2: /* VL2 */ + case 0x3: /* VL3 */ + case 0x4: /* VL4 */ + case 0x5: /* VL5 */ + case 0x6: /* VL6 */ + case 0x7: /* VL7 */ + case 0x8: /* VL8 */ + bound = pattern; + break; + case 0x9: /* VL16 */ + case 0xa: /* VL32 */ + case 0xb: /* VL64 */ + case 0xc: /* VL128 */ + case 0xd: /* VL256 */ + bound = 16 << (pattern - 9); + break; + case 0x1d: /* MUL4 */ + return elements - elements % 4; + case 0x1e: /* MUL3 */ + return elements - elements % 3; + case 0x1f: /* ALL */ + return elements; + default: /* #uimm5 */ + return 0; + } + return elements >= bound ? bound : 0; +} + +/* This handles all of the predicate initialization instructions, + * PTRUE, PFALSE, SETFFR. For PFALSE, we will have set PAT == 32 + * so that decode_pred_count returns 0. For SETFFR, we will have + * set RD == 16 == FFR. + */ +static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned ofs = pred_full_reg_offset(s, rd); + unsigned numelem, setsz, i; + uint64_t word, lastword; + TCGv_i64 t; + + numelem = decode_pred_count(fullsz, pat, esz); + + /* Determine what we must store into each bit, and how many. */ + if (numelem == 0) { + lastword = word = 0; + setsz = fullsz; + } else { + setsz = numelem << esz; + lastword = word = pred_esz_masks[esz]; + if (setsz % 64) { + lastword &= MAKE_64BIT_MASK(0, setsz % 64); + } + } + + t = tcg_temp_new_i64(); + if (fullsz <= 64) { + tcg_gen_movi_i64(t, lastword); + tcg_gen_st_i64(t, cpu_env, ofs); + goto done; + } + + if (word == lastword) { + unsigned maxsz = size_for_gvec(fullsz / 8); + unsigned oprsz = size_for_gvec(setsz / 8); + + if (oprsz * 8 == setsz) { + tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word); + goto done; + } + } + + setsz /= 8; + fullsz /= 8; + + tcg_gen_movi_i64(t, word); + for (i = 0; i < QEMU_ALIGN_DOWN(setsz, 8); i += 8) { + tcg_gen_st_i64(t, cpu_env, ofs + i); + } + if (lastword != word) { + tcg_gen_movi_i64(t, lastword); + tcg_gen_st_i64(t, cpu_env, ofs + i); + i += 8; + } + if (i < fullsz) { + tcg_gen_movi_i64(t, 0); + for (; i < fullsz; i += 8) { + tcg_gen_st_i64(t, cpu_env, ofs + i); + } + } + + done: + tcg_temp_free_i64(t); + + /* PTRUES */ + if (setflag) { + tcg_gen_movi_i32(cpu_NF, -(word != 0)); + tcg_gen_movi_i32(cpu_CF, word == 0); + tcg_gen_movi_i32(cpu_VF, 0); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + } + return true; +} + +TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s) + +/* Note pat == 31 is #all, to set all elements. */ +TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve, + do_predset, 0, FFR_PRED_NUM, 31, false) + +/* Note pat == 32 is #unimp, to set no elements. */ +TRANS_FEAT(PFALSE, aa64_sve, do_predset, 0, a->rd, 32, false) + +static bool trans_RDFFR_p(DisasContext *s, arg_RDFFR_p *a) +{ + /* The path through do_pppp_flags is complicated enough to want to avoid + * duplication. Frob the arguments into the form of a predicated AND. 
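+ * RDFFR Pd.B, Pg/Z thus becomes AND Pd.B, Pg/Z, FFR.B, FFR.B, and the
+ * flag-setting variant falls through to do_pppp_flags unchanged.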
+ */ + arg_rprr_s alt_a = { + .rd = a->rd, .pg = a->pg, .s = a->s, + .rn = FFR_PRED_NUM, .rm = FFR_PRED_NUM, + }; + + s->is_nonstreaming = true; + return trans_AND_pppp(s, &alt_a); +} + +TRANS_FEAT_NONSTREAMING(RDFFR, aa64_sve, do_mov_p, a->rd, FFR_PRED_NUM) +TRANS_FEAT_NONSTREAMING(WRFFR, aa64_sve, do_mov_p, FFR_PRED_NUM, a->rn) + +static bool do_pfirst_pnext(DisasContext *s, arg_rr_esz *a, + void (*gen_fn)(TCGv_i32, TCGv_ptr, + TCGv_ptr, TCGv_i32)) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGv_ptr t_pd = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + TCGv_i32 t; + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + + tcg_gen_addi_ptr(t_pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->rn)); + t = tcg_temp_new_i32(); + + gen_fn(t, t_pd, t_pg, tcg_constant_i32(desc)); + tcg_temp_free_ptr(t_pd); + tcg_temp_free_ptr(t_pg); + + do_pred_flags(t); + tcg_temp_free_i32(t); + return true; +} + +TRANS_FEAT(PFIRST, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pfirst) +TRANS_FEAT(PNEXT, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pnext) + +/* + *** SVE Element Count Group + */ + +/* Perform an inline saturating addition of a 32-bit value within + * a 64-bit register. The second operand is known to be positive, + * which halves the comparisions we must perform to bound the result. + */ +static void do_sat_addsub_32(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) +{ + int64_t ibound; + + /* Use normal 64-bit arithmetic to detect 32-bit overflow. */ + if (u) { + tcg_gen_ext32u_i64(reg, reg); + } else { + tcg_gen_ext32s_i64(reg, reg); + } + if (d) { + tcg_gen_sub_i64(reg, reg, val); + ibound = (u ? 0 : INT32_MIN); + tcg_gen_smax_i64(reg, reg, tcg_constant_i64(ibound)); + } else { + tcg_gen_add_i64(reg, reg, val); + ibound = (u ? UINT32_MAX : INT32_MAX); + tcg_gen_smin_i64(reg, reg, tcg_constant_i64(ibound)); + } +} + +/* Similarly with 64-bit values. */ +static void do_sat_addsub_64(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t2; + + if (u) { + if (d) { + tcg_gen_sub_i64(t0, reg, val); + t2 = tcg_constant_i64(0); + tcg_gen_movcond_i64(TCG_COND_LTU, reg, reg, val, t2, t0); + } else { + tcg_gen_add_i64(t0, reg, val); + t2 = tcg_constant_i64(-1); + tcg_gen_movcond_i64(TCG_COND_LTU, reg, t0, reg, t2, t0); + } + } else { + TCGv_i64 t1 = tcg_temp_new_i64(); + if (d) { + /* Detect signed overflow for subtraction. */ + tcg_gen_xor_i64(t0, reg, val); + tcg_gen_sub_i64(t1, reg, val); + tcg_gen_xor_i64(reg, reg, t1); + tcg_gen_and_i64(t0, t0, reg); + + /* Bound the result. */ + tcg_gen_movi_i64(reg, INT64_MIN); + t2 = tcg_constant_i64(0); + tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, reg, t1); + } else { + /* Detect signed overflow for addition. */ + tcg_gen_xor_i64(t0, reg, val); + tcg_gen_add_i64(reg, reg, val); + tcg_gen_xor_i64(t1, reg, val); + tcg_gen_andc_i64(t0, t1, t0); + + /* Bound the result. */ + tcg_gen_movi_i64(t1, INT64_MAX); + t2 = tcg_constant_i64(0); + tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, t1, reg); + } + tcg_temp_free_i64(t1); + } + tcg_temp_free_i64(t0); +} + +/* Similarly with a vector and a scalar operand. 
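+ Decrement is implemented by negating the scalar and reusing the
+ saturating-add helpers, except for the unsigned 64-bit case which has a
+ dedicated saturating-subtract helper.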
*/ +static void do_sat_addsub_vec(DisasContext *s, int esz, int rd, int rn, + TCGv_i64 val, bool u, bool d) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr dptr, nptr; + TCGv_i32 t32, desc; + TCGv_i64 t64; + + dptr = tcg_temp_new_ptr(); + nptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(dptr, cpu_env, vec_full_reg_offset(s, rd)); + tcg_gen_addi_ptr(nptr, cpu_env, vec_full_reg_offset(s, rn)); + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + + switch (esz) { + case MO_8: + t32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t32, val); + if (d) { + tcg_gen_neg_i32(t32, t32); + } + if (u) { + gen_helper_sve_uqaddi_b(dptr, nptr, t32, desc); + } else { + gen_helper_sve_sqaddi_b(dptr, nptr, t32, desc); + } + tcg_temp_free_i32(t32); + break; + + case MO_16: + t32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t32, val); + if (d) { + tcg_gen_neg_i32(t32, t32); + } + if (u) { + gen_helper_sve_uqaddi_h(dptr, nptr, t32, desc); + } else { + gen_helper_sve_sqaddi_h(dptr, nptr, t32, desc); + } + tcg_temp_free_i32(t32); + break; + + case MO_32: + t64 = tcg_temp_new_i64(); + if (d) { + tcg_gen_neg_i64(t64, val); + } else { + tcg_gen_mov_i64(t64, val); + } + if (u) { + gen_helper_sve_uqaddi_s(dptr, nptr, t64, desc); + } else { + gen_helper_sve_sqaddi_s(dptr, nptr, t64, desc); + } + tcg_temp_free_i64(t64); + break; + + case MO_64: + if (u) { + if (d) { + gen_helper_sve_uqsubi_d(dptr, nptr, val, desc); + } else { + gen_helper_sve_uqaddi_d(dptr, nptr, val, desc); + } + } else if (d) { + t64 = tcg_temp_new_i64(); + tcg_gen_neg_i64(t64, val); + gen_helper_sve_sqaddi_d(dptr, nptr, t64, desc); + tcg_temp_free_i64(t64); + } else { + gen_helper_sve_sqaddi_d(dptr, nptr, val, desc); + } + break; + + default: + g_assert_not_reached(); + } + + tcg_temp_free_ptr(dptr); + tcg_temp_free_ptr(nptr); +} + +static bool trans_CNT_r(DisasContext *s, arg_CNT_r *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + tcg_gen_movi_i64(cpu_reg(s, a->rd), numelem * a->imm); + } + return true; +} + +static bool trans_INCDEC_r(DisasContext *s, arg_incdec_cnt *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm * (a->d ? -1 : 1); + TCGv_i64 reg = cpu_reg(s, a->rd); + + tcg_gen_addi_i64(reg, reg, inc); + } + return true; +} + +static bool trans_SINCDEC_r_32(DisasContext *s, arg_incdec_cnt *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + TCGv_i64 reg = cpu_reg(s, a->rd); + + /* Use normal 64-bit arithmetic to detect 32-bit overflow. 
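+ When the increment is zero there is nothing to saturate, but the result
+ still has to be re-extended to 32 bits according to the U flag.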
*/ + if (inc == 0) { + if (a->u) { + tcg_gen_ext32u_i64(reg, reg); + } else { + tcg_gen_ext32s_i64(reg, reg); + } + } else { + do_sat_addsub_32(reg, tcg_constant_i64(inc), a->u, a->d); + } + return true; +} + +static bool trans_SINCDEC_r_64(DisasContext *s, arg_incdec_cnt *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + TCGv_i64 reg = cpu_reg(s, a->rd); + + if (inc != 0) { + do_sat_addsub_64(reg, tcg_constant_i64(inc), a->u, a->d); + } + return true; +} + +static bool trans_INCDEC_v(DisasContext *s, arg_incdec2_cnt *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + + if (inc != 0) { + if (sve_access_check(s)) { + tcg_gen_gvec_adds(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + tcg_constant_i64(a->d ? -inc : inc), + fullsz, fullsz); + } + } else { + do_mov_z(s, a->rd, a->rn); + } + return true; +} + +static bool trans_SINCDEC_v(DisasContext *s, arg_incdec2_cnt *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + + if (inc != 0) { + if (sve_access_check(s)) { + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, + tcg_constant_i64(inc), a->u, a->d); + } + } else { + do_mov_z(s, a->rd, a->rn); + } + return true; +} + +/* + *** SVE Bitwise Immediate Group + */ + +static bool do_zz_dbm(DisasContext *s, arg_rr_dbm *a, GVecGen2iFn *gvec_fn) +{ + uint64_t imm; + if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), + extract32(a->dbm, 0, 6), + extract32(a->dbm, 6, 6))) { + return false; + } + return gen_gvec_fn_zzi(s, gvec_fn, MO_64, a->rd, a->rn, imm); +} + +TRANS_FEAT(AND_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_andi) +TRANS_FEAT(ORR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_ori) +TRANS_FEAT(EOR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_xori) + +static bool trans_DUPM(DisasContext *s, arg_DUPM *a) +{ + uint64_t imm; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), + extract32(a->dbm, 0, 6), + extract32(a->dbm, 6, 6))) { + return false; + } + if (sve_access_check(s)) { + do_dupi_z(s, a->rd, imm); + } + return true; +} + +/* + *** SVE Integer Wide Immediate - Predicated Group + */ + +/* Implement all merging copies. This is used for CPY (immediate), + * FCPY, CPY (scalar), CPY (SIMD&FP scalar). 
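+ * The value is passed as a 64-bit scalar; the per-esz helpers write it to
+ * the active elements and keep the inactive elements from Zn.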
+ */ +static void do_cpy_m(DisasContext *s, int esz, int rd, int rn, int pg, + TCGv_i64 val) +{ + typedef void gen_cpy(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_cpy * const fns[4] = { + gen_helper_sve_cpy_m_b, gen_helper_sve_cpy_m_h, + gen_helper_sve_cpy_m_s, gen_helper_sve_cpy_m_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + fns[esz](t_zd, t_zn, t_pg, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); +} + +static bool trans_FCPY(DisasContext *s, arg_FCPY *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + /* Decode the VFP immediate. */ + uint64_t imm = vfp_expand_imm(a->esz, a->imm); + do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(imm)); + } + return true; +} + +static bool trans_CPY_m_i(DisasContext *s, arg_rpri_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(a->imm)); + } + return true; +} + +static bool trans_CPY_z_i(DisasContext *s, arg_CPY_z_i *a) +{ + static gen_helper_gvec_2i * const fns[4] = { + gen_helper_sve_cpy_z_b, gen_helper_sve_cpy_z_h, + gen_helper_sve_cpy_z_s, gen_helper_sve_cpy_z_d, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->pg), + tcg_constant_i64(a->imm), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +/* + *** SVE Permute Extract Group + */ + +static bool do_EXT(DisasContext *s, int rd, int rn, int rm, int imm) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned n_ofs = imm >= vsz ? 0 : imm; + unsigned n_siz = vsz - n_ofs; + unsigned d = vec_full_reg_offset(s, rd); + unsigned n = vec_full_reg_offset(s, rn); + unsigned m = vec_full_reg_offset(s, rm); + + /* Use host vector move insns if we have appropriate sizes + * and no unfortunate overlap. 
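+ * The result is Zn[n_ofs..] followed by the low n_ofs bytes of Zm.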
+ */ + if (m != d + && n_ofs == size_for_gvec(n_ofs) + && n_siz == size_for_gvec(n_siz) + && (d != n || n_siz <= n_ofs)) { + tcg_gen_gvec_mov(0, d, n + n_ofs, n_siz, n_siz); + if (n_ofs != 0) { + tcg_gen_gvec_mov(0, d + n_siz, m, n_ofs, n_ofs); + } + } else { + tcg_gen_gvec_3_ool(d, n, m, vsz, vsz, n_ofs, gen_helper_sve_ext); + } + return true; +} + +TRANS_FEAT(EXT, aa64_sve, do_EXT, a->rd, a->rn, a->rm, a->imm) +TRANS_FEAT(EXT_sve2, aa64_sve2, do_EXT, a->rd, a->rn, (a->rn + 1) % 32, a->imm) + +/* + *** SVE Permute - Unpredicated Group + */ + +static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if ((a->imm & 0x1f) == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned dofs = vec_full_reg_offset(s, a->rd); + unsigned esz, index; + + esz = ctz32(a->imm); + index = a->imm >> (esz + 1); + + if ((index << esz) < vsz) { + unsigned nofs = vec_reg_offset(s, a->rn, index, esz); + tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz); + } else { + /* + * While dup_mem handles 128-bit elements, dup_imm does not. + * Thankfully element size doesn't matter for splatting zero. + */ + tcg_gen_gvec_dup_imm(MO_64, dofs, vsz, vsz, 0); + } + } + return true; +} + +static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) +{ + typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_insr * const fns[4] = { + gen_helper_sve_insr_b, gen_helper_sve_insr_h, + gen_helper_sve_insr_s, gen_helper_sve_insr_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + + fns[a->esz](t_zd, t_zn, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); +} + +static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 t = tcg_temp_new_i64(); + tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64)); + do_insr_i64(s, a, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_insr_i64(s, a, cpu_reg(s, a->rm)); + } + return true; +} + +static gen_helper_gvec_2 * const rev_fns[4] = { + gen_helper_sve_rev_b, gen_helper_sve_rev_h, + gen_helper_sve_rev_s, gen_helper_sve_rev_d +}; +TRANS_FEAT(REV_v, aa64_sve, gen_gvec_ool_zz, rev_fns[a->esz], a->rd, a->rn, 0) + +static gen_helper_gvec_3 * const sve_tbl_fns[4] = { + gen_helper_sve_tbl_b, gen_helper_sve_tbl_h, + gen_helper_sve_tbl_s, gen_helper_sve_tbl_d +}; +TRANS_FEAT(TBL, aa64_sve, gen_gvec_ool_arg_zzz, sve_tbl_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const sve2_tbl_fns[4] = { + gen_helper_sve2_tbl_b, gen_helper_sve2_tbl_h, + gen_helper_sve2_tbl_s, gen_helper_sve2_tbl_d +}; +TRANS_FEAT(TBL_sve2, aa64_sve2, gen_gvec_ool_zzzz, sve2_tbl_fns[a->esz], + a->rd, a->rn, (a->rn + 1) % 32, a->rm, 0) + +static 
gen_helper_gvec_3 * const tbx_fns[4] = { + gen_helper_sve2_tbx_b, gen_helper_sve2_tbx_h, + gen_helper_sve2_tbx_s, gen_helper_sve2_tbx_d +}; +TRANS_FEAT(TBX, aa64_sve2, gen_gvec_ool_arg_zzz, tbx_fns[a->esz], a, 0) + +static bool trans_UNPK(DisasContext *s, arg_UNPK *a) +{ + static gen_helper_gvec_2 * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h }, + { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s }, + { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d }, + }; + + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + + (a->h ? vsz / 2 : 0), + vsz, vsz, 0, fns[a->esz][a->u]); + } + return true; +} + +/* + *** SVE Permute - Predicates Group + */ + +static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd, + gen_helper_gvec_3 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_ptr t_m = tcg_temp_new_ptr(); + uint32_t desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd); + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm)); + + fn(t_d, t_n, t_m, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + tcg_temp_free_ptr(t_m); + return true; +} + +static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd, + gen_helper_gvec_2 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + uint32_t desc = 0; + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd); + + fn(t_d, t_n, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + return true; +} + +TRANS_FEAT(ZIP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_zip_p) +TRANS_FEAT(ZIP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_zip_p) +TRANS_FEAT(UZP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_uzp_p) +TRANS_FEAT(UZP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_uzp_p) +TRANS_FEAT(TRN1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_trn_p) +TRANS_FEAT(TRN2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_trn_p) + +TRANS_FEAT(REV_p, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_rev_p) +TRANS_FEAT(PUNPKLO, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_punpk_p) +TRANS_FEAT(PUNPKHI, aa64_sve, do_perm_pred2, a, 1, gen_helper_sve_punpk_p) + +/* + *** SVE Permute - Interleaving Group + */ + +static gen_helper_gvec_3 * const zip_fns[4] = { + gen_helper_sve_zip_b, gen_helper_sve_zip_h, + gen_helper_sve_zip_s, gen_helper_sve_zip_d, +}; +TRANS_FEAT(ZIP1_z, aa64_sve, gen_gvec_ool_arg_zzz, + zip_fns[a->esz], a, 0) +TRANS_FEAT(ZIP2_z, aa64_sve, gen_gvec_ool_arg_zzz, + zip_fns[a->esz], a, vec_full_reg_size(s) / 2) + +TRANS_FEAT(ZIP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_zip_q, a, 0) +TRANS_FEAT(ZIP2_q, aa64_sve_f64mm, 
gen_gvec_ool_arg_zzz, + gen_helper_sve2_zip_q, a, + QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2) + +static gen_helper_gvec_3 * const uzp_fns[4] = { + gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, + gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, +}; + +TRANS_FEAT(UZP1_z, aa64_sve, gen_gvec_ool_arg_zzz, + uzp_fns[a->esz], a, 0) +TRANS_FEAT(UZP2_z, aa64_sve, gen_gvec_ool_arg_zzz, + uzp_fns[a->esz], a, 1 << a->esz) + +TRANS_FEAT(UZP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_uzp_q, a, 0) +TRANS_FEAT(UZP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_uzp_q, a, 16) + +static gen_helper_gvec_3 * const trn_fns[4] = { + gen_helper_sve_trn_b, gen_helper_sve_trn_h, + gen_helper_sve_trn_s, gen_helper_sve_trn_d, +}; + +TRANS_FEAT(TRN1_z, aa64_sve, gen_gvec_ool_arg_zzz, + trn_fns[a->esz], a, 0) +TRANS_FEAT(TRN2_z, aa64_sve, gen_gvec_ool_arg_zzz, + trn_fns[a->esz], a, 1 << a->esz) + +TRANS_FEAT(TRN1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_trn_q, a, 0) +TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, + gen_helper_sve2_trn_q, a, 16) + +/* + *** SVE Permute Vector - Predicated Group + */ + +static gen_helper_gvec_3 * const compact_fns[4] = { + NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d +}; +TRANS_FEAT_NONSTREAMING(COMPACT, aa64_sve, gen_gvec_ool_arg_zpz, + compact_fns[a->esz], a, 0) + +/* Call the helper that computes the ARM LastActiveElement pseudocode + * function, scaled by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg) +{ + /* Predicate sizes may be smaller and cannot use simd_desc. We cannot + * round up, as we do elsewhere, because we need the exact size. + */ + TCGv_ptr t_p = tcg_temp_new_ptr(); + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); + desc = FIELD_DP32(desc, PREDDESC, ESZ, esz); + + tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg)); + + gen_helper_sve_last_active_element(ret, t_p, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_p); +} + +/* Increment LAST to the offset of the next element in the vector, + * wrapping around to 0. + */ +static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + tcg_gen_addi_i32(last, last, 1 << esz); + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_constant_i32(vsz); + TCGv_i32 zero = tcg_constant_i32(0); + tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last); + } +} + +/* If LAST < 0, set LAST to the offset of the last element in the vector. */ +static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_constant_i32(vsz - (1 << esz)); + TCGv_i32 zero = tcg_constant_i32(0); + tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last); + } +} + +/* Load an unsigned element of ESZ from BASE+OFS. */ +static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz) +{ + TCGv_i64 r = tcg_temp_new_i64(); + + switch (esz) { + case 0: + tcg_gen_ld8u_i64(r, base, ofs); + break; + case 1: + tcg_gen_ld16u_i64(r, base, ofs); + break; + case 2: + tcg_gen_ld32u_i64(r, base, ofs); + break; + case 3: + tcg_gen_ld_i64(r, base, ofs); + break; + default: + g_assert_not_reached(); + } + return r; +} + +/* Load an unsigned element of ESZ from RM[LAST]. 
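+ LAST is a byte offset into the Zreg, as produced by find_last_active
+ (already scaled by the element size).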
*/ +static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last, + int rm, int esz) +{ + TCGv_ptr p = tcg_temp_new_ptr(); + TCGv_i64 r; + + /* Convert offset into vector into offset into ENV. + * The final adjustment for the vector register base + * is added via constant offset to the load. + */ +#if HOST_BIG_ENDIAN + /* Adjust for element ordering. See vec_reg_offset. */ + if (esz < 3) { + tcg_gen_xori_i32(last, last, 8 - (1 << esz)); + } +#endif + tcg_gen_ext_i32_ptr(p, last); + tcg_gen_add_ptr(p, p, cpu_env); + + r = load_esz(p, vec_full_reg_offset(s, rm), esz); + tcg_temp_free_ptr(p); + + return r; +} + +/* Compute CLAST for a Zreg. */ +static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before) +{ + TCGv_i32 last; + TCGLabel *over; + TCGv_i64 ele; + unsigned vsz, esz = a->esz; + + if (!sve_access_check(s)) { + return true; + } + + last = tcg_temp_local_new_i32(); + over = gen_new_label(); + + find_last_active(s, last, esz, a->pg); + + /* There is of course no movcond for a 2048-bit vector, + * so we must branch over the actual store. + */ + tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over); + + if (!before) { + incr_last_active(s, last, esz); + } + + ele = load_last_active(s, last, a->rm, esz); + tcg_temp_free_i32(last); + + vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele); + tcg_temp_free_i64(ele); + + /* If this insn used MOVPRFX, we may need a second move. */ + if (a->rd != a->rn) { + TCGLabel *done = gen_new_label(); + tcg_gen_br(done); + + gen_set_label(over); + do_mov_z(s, a->rd, a->rn); + + gen_set_label(done); + } else { + gen_set_label(over); + } + return true; +} + +TRANS_FEAT(CLASTA_z, aa64_sve, do_clast_vector, a, false) +TRANS_FEAT(CLASTB_z, aa64_sve, do_clast_vector, a, true) + +/* Compute CLAST for a scalar. */ +static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm, + bool before, TCGv_i64 reg_val) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ele, cmp; + + find_last_active(s, last, esz, pg); + + /* Extend the original value of last prior to incrementing. */ + cmp = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(cmp, last); + + if (!before) { + incr_last_active(s, last, esz); + } + + /* The conceit here is that while last < 0 indicates not found, after + * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address + * from which we can load garbage. We then discard the garbage with + * a conditional move. + */ + ele = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + + tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, tcg_constant_i64(0), + ele, reg_val); + + tcg_temp_free_i64(cmp); + tcg_temp_free_i64(ele); +} + +/* Compute CLAST for a Vreg. */ +static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + int esz = a->esz; + int ofs = vec_reg_offset(s, a->rd, 0, esz); + TCGv_i64 reg = load_esz(cpu_env, ofs, esz); + + do_clast_scalar(s, esz, a->pg, a->rn, before, reg); + write_fp_dreg(s, a->rd, reg); + tcg_temp_free_i64(reg); + } + return true; +} + +TRANS_FEAT(CLASTA_v, aa64_sve, do_clast_fp, a, false) +TRANS_FEAT(CLASTB_v, aa64_sve, do_clast_fp, a, true) + +/* Compute CLAST for a Xreg. 
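+ The low bits of Xdn are zero-extended from the element size first, so
+ that when no element is active the properly extended original value is
+ what remains in the register.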
*/ +static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + TCGv_i64 reg; + + if (!sve_access_check(s)) { + return true; + } + + reg = cpu_reg(s, a->rd); + switch (a->esz) { + case 0: + tcg_gen_ext8u_i64(reg, reg); + break; + case 1: + tcg_gen_ext16u_i64(reg, reg); + break; + case 2: + tcg_gen_ext32u_i64(reg, reg); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg); + return true; +} + +TRANS_FEAT(CLASTA_r, aa64_sve, do_clast_general, a, false) +TRANS_FEAT(CLASTB_r, aa64_sve, do_clast_general, a, true) + +/* Compute LAST for a scalar. */ +static TCGv_i64 do_last_scalar(DisasContext *s, int esz, + int pg, int rm, bool before) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ret; + + find_last_active(s, last, esz, pg); + if (before) { + wrap_last_active(s, last, esz); + } else { + incr_last_active(s, last, esz); + } + + ret = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + return ret; +} + +/* Compute LAST for a Vreg. */ +static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + write_fp_dreg(s, a->rd, val); + tcg_temp_free_i64(val); + } + return true; +} + +TRANS_FEAT(LASTA_v, aa64_sve, do_last_fp, a, false) +TRANS_FEAT(LASTB_v, aa64_sve, do_last_fp, a, true) + +/* Compute LAST for a Xreg. */ +static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + tcg_gen_mov_i64(cpu_reg(s, a->rd), val); + tcg_temp_free_i64(val); + } + return true; +} + +TRANS_FEAT(LASTA_r, aa64_sve, do_last_general, a, false) +TRANS_FEAT(LASTB_r, aa64_sve, do_last_general, a, true) + +static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int ofs = vec_reg_offset(s, a->rn, 0, a->esz); + TCGv_i64 t = load_esz(cpu_env, ofs, a->esz); + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t); + tcg_temp_free_i64(t); + } + return true; +} + +static gen_helper_gvec_3 * const revb_fns[4] = { + NULL, gen_helper_sve_revb_h, + gen_helper_sve_revb_s, gen_helper_sve_revb_d, +}; +TRANS_FEAT(REVB, aa64_sve, gen_gvec_ool_arg_zpz, revb_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const revh_fns[4] = { + NULL, NULL, gen_helper_sve_revh_s, gen_helper_sve_revh_d, +}; +TRANS_FEAT(REVH, aa64_sve, gen_gvec_ool_arg_zpz, revh_fns[a->esz], a, 0) + +TRANS_FEAT(REVW, aa64_sve, gen_gvec_ool_arg_zpz, + a->esz == 3 ? 
gen_helper_sve_revw_d : NULL, a, 0) + +TRANS_FEAT(REVD, aa64_sme, gen_gvec_ool_arg_zpz, gen_helper_sme_revd_q, a, 0) + +TRANS_FEAT(SPLICE, aa64_sve, gen_gvec_ool_arg_zpzz, + gen_helper_sve_splice, a, a->esz) + +TRANS_FEAT(SPLICE_sve2, aa64_sve2, gen_gvec_ool_zzzp, gen_helper_sve_splice, + a->rd, a->rn, (a->rn + 1) % 32, a->pg, a->esz) + +/* + *** SVE Integer Compare - Vectors Group + */ + +static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_flags_4 *gen_fn) +{ + TCGv_ptr pd, zn, zm, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_temp_new_i32(); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + zm = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, zm, pg, tcg_constant_i32(simd_desc(vsz, vsz, 0))); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZZ(NAME, name) \ + static gen_helper_gvec_flags_4 * const name##_ppzz_fns[4] = { \ + gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \ + gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \ + }; \ + TRANS_FEAT(NAME##_ppzz, aa64_sve, do_ppzz_flags, \ + a, name##_ppzz_fns[a->esz]) + +DO_PPZZ(CMPEQ, cmpeq) +DO_PPZZ(CMPNE, cmpne) +DO_PPZZ(CMPGT, cmpgt) +DO_PPZZ(CMPGE, cmpge) +DO_PPZZ(CMPHI, cmphi) +DO_PPZZ(CMPHS, cmphs) + +#undef DO_PPZZ + +#define DO_PPZW(NAME, name) \ + static gen_helper_gvec_flags_4 * const name##_ppzw_fns[4] = { \ + gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \ + gen_helper_sve_##name##_ppzw_s, NULL \ + }; \ + TRANS_FEAT(NAME##_ppzw, aa64_sve, do_ppzz_flags, \ + a, name##_ppzw_fns[a->esz]) + +DO_PPZW(CMPEQ, cmpeq) +DO_PPZW(CMPNE, cmpne) +DO_PPZW(CMPGT, cmpgt) +DO_PPZW(CMPGE, cmpge) +DO_PPZW(CMPHI, cmphi) +DO_PPZW(CMPHS, cmphs) +DO_PPZW(CMPLT, cmplt) +DO_PPZW(CMPLE, cmple) +DO_PPZW(CMPLO, cmplo) +DO_PPZW(CMPLS, cmpls) + +#undef DO_PPZW + +/* + *** SVE Integer Compare - Immediate Groups + */ + +static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a, + gen_helper_gvec_flags_3 *gen_fn) +{ + TCGv_ptr pd, zn, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_temp_new_i32(); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, pg, tcg_constant_i32(simd_desc(vsz, vsz, a->imm))); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZI(NAME, name) \ + static gen_helper_gvec_flags_3 * const name##_ppzi_fns[4] = { \ + gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \ + gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \ + }; \ + TRANS_FEAT(NAME##_ppzi, aa64_sve, do_ppzi_flags, a, \ + name##_ppzi_fns[a->esz]) + +DO_PPZI(CMPEQ, cmpeq) 
+DO_PPZI(CMPNE, cmpne) +DO_PPZI(CMPGT, cmpgt) +DO_PPZI(CMPGE, cmpge) +DO_PPZI(CMPHI, cmphi) +DO_PPZI(CMPHS, cmphs) +DO_PPZI(CMPLT, cmplt) +DO_PPZI(CMPLE, cmple) +DO_PPZI(CMPLO, cmplo) +DO_PPZI(CMPLS, cmpls) + +#undef DO_PPZI + +/* + *** SVE Partition Break Group + */ + +static bool do_brk3(DisasContext *s, arg_rprr_s *a, + gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr m = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz)); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + TCGv_i32 t = tcg_temp_new_i32(); + fn_s(t, d, n, m, g, desc); + do_pred_flags(t); + tcg_temp_free_i32(t); + } else { + fn(d, n, m, g, desc); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(m); + tcg_temp_free_ptr(g); + return true; +} + +static bool do_brk2(DisasContext *s, arg_rpr_s *a, + gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz)); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + TCGv_i32 t = tcg_temp_new_i32(); + fn_s(t, d, n, g, desc); + do_pred_flags(t); + tcg_temp_free_i32(t); + } else { + fn(d, n, g, desc); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(g); + return true; +} + +TRANS_FEAT(BRKPA, aa64_sve, do_brk3, a, + gen_helper_sve_brkpa, gen_helper_sve_brkpas) +TRANS_FEAT(BRKPB, aa64_sve, do_brk3, a, + gen_helper_sve_brkpb, gen_helper_sve_brkpbs) + +TRANS_FEAT(BRKA_m, aa64_sve, do_brk2, a, + gen_helper_sve_brka_m, gen_helper_sve_brkas_m) +TRANS_FEAT(BRKB_m, aa64_sve, do_brk2, a, + gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m) + +TRANS_FEAT(BRKA_z, aa64_sve, do_brk2, a, + gen_helper_sve_brka_z, gen_helper_sve_brkas_z) +TRANS_FEAT(BRKB_z, aa64_sve, do_brk2, a, + gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z) + +TRANS_FEAT(BRKN, aa64_sve, do_brk2, a, + gen_helper_sve_brkn, gen_helper_sve_brkns) + +/* + *** SVE Predicate Count Group + */ + +static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg) +{ + unsigned psz = pred_full_reg_size(s); + + if (psz <= 8) { + uint64_t psz_mask; + + tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn)); + if (pn != pg) { + TCGv_i64 g = tcg_temp_new_i64(); + tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_and_i64(val, val, g); + tcg_temp_free_i64(g); + } + + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. 
+ */ + psz_mask = MAKE_64BIT_MASK(0, psz * 8); + tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask); + + tcg_gen_ctpop_i64(val, val); + } else { + TCGv_ptr t_pn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + unsigned desc = 0; + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, psz); + desc = FIELD_DP32(desc, PREDDESC, ESZ, esz); + + tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + gen_helper_sve_cntp(val, t_pn, t_pg, tcg_constant_i32(desc)); + tcg_temp_free_ptr(t_pn); + tcg_temp_free_ptr(t_pg); + } +} + +static bool trans_CNTP(DisasContext *s, arg_CNTP *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg); + } + return true; +} + +static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + if (a->d) { + tcg_gen_sub_i64(reg, reg, val); + } else { + tcg_gen_add_i64(reg, reg, val); + } + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 val = tcg_temp_new_i64(); + GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds; + + do_cntp(s, val, a->esz, a->pg, a->pg); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), val, vsz, vsz); + } + return true; +} + +static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_32(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_64(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_temp_new_i64(); + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d); + } + return true; +} + +/* + *** SVE Integer Compare Scalars Group + */ + +static bool trans_CTERM(DisasContext *s, arg_CTERM *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ); + TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf); + TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf); + TCGv_i64 cmp = tcg_temp_new_i64(); + + tcg_gen_setcond_i64(cond, cmp, rn, rm); + tcg_gen_extrl_i64_i32(cpu_NF, cmp); + tcg_temp_free_i64(cmp); + + /* VF = !NF & !CF. */ + tcg_gen_xori_i32(cpu_VF, cpu_NF, 1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF); + + /* Both NF and VF actually look at bit 31. 
*/ + tcg_gen_neg_i32(cpu_NF, cpu_NF); + tcg_gen_neg_i32(cpu_VF, cpu_VF); + return true; +} + +static bool trans_WHILE(DisasContext *s, arg_WHILE *a) +{ + TCGv_i64 op0, op1, t0, t1, tmax; + TCGv_i32 t2; + TCGv_ptr ptr; + unsigned vsz = vec_full_reg_size(s); + unsigned desc = 0; + TCGCond cond; + uint64_t maxval; + /* Note that GE/HS has a->eq == 0 and GT/HI has a->eq == 1. */ + bool eq = a->eq == a->lt; + + /* The greater-than conditions are all SVE2. */ + if (a->lt + ? !dc_isar_feature(aa64_sve, s) + : !dc_isar_feature(aa64_sve2, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + op0 = read_cpu_reg(s, a->rn, 1); + op1 = read_cpu_reg(s, a->rm, 1); + + if (!a->sf) { + if (a->u) { + tcg_gen_ext32u_i64(op0, op0); + tcg_gen_ext32u_i64(op1, op1); + } else { + tcg_gen_ext32s_i64(op0, op0); + tcg_gen_ext32s_i64(op1, op1); + } + } + + /* For the helper, compress the different conditions into a computation + * of how many iterations for which the condition is true. + */ + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + + if (a->lt) { + tcg_gen_sub_i64(t0, op1, op0); + if (a->u) { + maxval = a->sf ? UINT64_MAX : UINT32_MAX; + cond = eq ? TCG_COND_LEU : TCG_COND_LTU; + } else { + maxval = a->sf ? INT64_MAX : INT32_MAX; + cond = eq ? TCG_COND_LE : TCG_COND_LT; + } + } else { + tcg_gen_sub_i64(t0, op0, op1); + if (a->u) { + maxval = 0; + cond = eq ? TCG_COND_GEU : TCG_COND_GTU; + } else { + maxval = a->sf ? INT64_MIN : INT32_MIN; + cond = eq ? TCG_COND_GE : TCG_COND_GT; + } + } + + tmax = tcg_constant_i64(vsz >> a->esz); + if (eq) { + /* Equality means one more iteration. */ + tcg_gen_addi_i64(t0, t0, 1); + + /* + * For the less-than while, if op1 is maxval (and the only time + * the addition above could overflow), then we produce an all-true + * predicate by setting the count to the vector length. This is + * because the pseudocode is described as an increment + compare + * loop, and the maximum integer would always compare true. + * Similarly, the greater-than while has the same issue with the + * minimum integer due to the decrement + compare loop. + */ + tcg_gen_movi_i64(t1, maxval); + tcg_gen_movcond_i64(TCG_COND_EQ, t0, op1, t1, tmax, t0); + } + + /* Bound to the maximum. */ + tcg_gen_umin_i64(t0, t0, tmax); + + /* Set the count to zero if the condition is false. */ + tcg_gen_movi_i64(t1, 0); + tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); + tcg_temp_free_i64(t1); + + /* Since we're bounded, pass as a 32-bit type. */ + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, t0); + tcg_temp_free_i64(t0); + + /* Scale elements to bits. 
*/ + tcg_gen_shli_i32(t2, t2, a->esz); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + if (a->lt) { + gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); + } else { + gen_helper_sve_whileg(t2, ptr, t2, tcg_constant_i32(desc)); + } + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + return true; +} + +static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) +{ + TCGv_i64 op0, op1, diff, t1, tmax; + TCGv_i32 t2; + TCGv_ptr ptr; + unsigned vsz = vec_full_reg_size(s); + unsigned desc = 0; + + if (!dc_isar_feature(aa64_sve2, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + op0 = read_cpu_reg(s, a->rn, 1); + op1 = read_cpu_reg(s, a->rm, 1); + + tmax = tcg_constant_i64(vsz); + diff = tcg_temp_new_i64(); + + if (a->rw) { + /* WHILERW */ + /* diff = abs(op1 - op0), noting that op0/1 are unsigned. */ + t1 = tcg_temp_new_i64(); + tcg_gen_sub_i64(diff, op0, op1); + tcg_gen_sub_i64(t1, op1, op0); + tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, diff, t1); + tcg_temp_free_i64(t1); + /* Round down to a multiple of ESIZE. */ + tcg_gen_andi_i64(diff, diff, -1 << a->esz); + /* If op1 == op0, diff == 0, and the condition is always true. */ + tcg_gen_movcond_i64(TCG_COND_EQ, diff, op0, op1, tmax, diff); + } else { + /* WHILEWR */ + tcg_gen_sub_i64(diff, op1, op0); + /* Round down to a multiple of ESIZE. */ + tcg_gen_andi_i64(diff, diff, -1 << a->esz); + /* If op0 >= op1, diff <= 0, the condition is always true. */ + tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, tmax, diff); + } + + /* Bound to the maximum. */ + tcg_gen_umin_i64(diff, diff, tmax); + + /* Since we're bounded, pass as a 32-bit type. */ + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, diff); + tcg_temp_free_i64(diff); + + desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); + desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + return true; +} + +/* + *** SVE Integer Wide Immediate - Unpredicated Group + */ + +static bool trans_FDUP(DisasContext *s, arg_FDUP *a) +{ + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + uint64_t imm; + + /* Decode the VFP immediate. 
*/ + imm = vfp_expand_imm(a->esz, a->imm); + tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm); + } + return true; +} + +static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm); + } + return true; +} + +TRANS_FEAT(ADD_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_addi, a) + +static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a) +{ + a->imm = -a->imm; + return trans_ADD_zzi(s, a); +} + +static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_sub_vec, 0 }; + static const GVecGen2s op[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_b, + .opt_opc = vecop_list, + .vece = MO_8, + .scalar_first = true }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_h, + .opt_opc = vecop_list, + .vece = MO_16, + .scalar_first = true }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_s, + .opt_opc = vecop_list, + .vece = MO_32, + .scalar_first = true }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_d, + .opt_opc = vecop_list, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64, + .scalar_first = true } + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, tcg_constant_i64(a->imm), &op[a->esz]); + } + return true; +} + +TRANS_FEAT(MUL_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_muli, a) + +static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, bool u, bool d) +{ + if (sve_access_check(s)) { + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, + tcg_constant_i64(a->imm), u, d); + } + return true; +} + +TRANS_FEAT(SQADD_zzi, aa64_sve, do_zzi_sat, a, false, false) +TRANS_FEAT(UQADD_zzi, aa64_sve, do_zzi_sat, a, true, false) +TRANS_FEAT(SQSUB_zzi, aa64_sve, do_zzi_sat, a, false, true) +TRANS_FEAT(UQSUB_zzi, aa64_sve, do_zzi_sat, a, true, true) + +static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + tcg_constant_i64(a->imm), vsz, vsz, 0, fn); + } + return true; +} + +#define DO_ZZI(NAME, name) \ + static gen_helper_gvec_2i * const name##i_fns[4] = { \ + gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \ + gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \ + }; \ + TRANS_FEAT(NAME##_zzi, aa64_sve, do_zzi_ool, a, name##i_fns[a->esz]) + +DO_ZZI(SMAX, smax) +DO_ZZI(UMAX, umax) +DO_ZZI(SMIN, smin) +DO_ZZI(UMIN, umin) + +#undef DO_ZZI + +static gen_helper_gvec_4 * const dot_fns[2][2] = { + { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h }, + { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h } +}; +TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz, + dot_fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0) + +/* + * SVE Multiply - Indexed + */ + +TRANS_FEAT(SDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_b, a) +TRANS_FEAT(SDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sdot_idx_h, a) +TRANS_FEAT(UDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, + 
gen_helper_gvec_udot_idx_b, a) +TRANS_FEAT(UDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_udot_idx_h, a) + +TRANS_FEAT(SUDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_sudot_idx_b, a) +TRANS_FEAT(USDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_usdot_idx_b, a) + +#define DO_SVE2_RRX(NAME, FUNC) \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ + a->rd, a->rn, a->rm, a->index) + +DO_SVE2_RRX(MUL_zzx_h, gen_helper_gvec_mul_idx_h) +DO_SVE2_RRX(MUL_zzx_s, gen_helper_gvec_mul_idx_s) +DO_SVE2_RRX(MUL_zzx_d, gen_helper_gvec_mul_idx_d) + +DO_SVE2_RRX(SQDMULH_zzx_h, gen_helper_sve2_sqdmulh_idx_h) +DO_SVE2_RRX(SQDMULH_zzx_s, gen_helper_sve2_sqdmulh_idx_s) +DO_SVE2_RRX(SQDMULH_zzx_d, gen_helper_sve2_sqdmulh_idx_d) + +DO_SVE2_RRX(SQRDMULH_zzx_h, gen_helper_sve2_sqrdmulh_idx_h) +DO_SVE2_RRX(SQRDMULH_zzx_s, gen_helper_sve2_sqrdmulh_idx_s) +DO_SVE2_RRX(SQRDMULH_zzx_d, gen_helper_sve2_sqrdmulh_idx_d) + +#undef DO_SVE2_RRX + +#define DO_SVE2_RRX_TB(NAME, FUNC, TOP) \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ + a->rd, a->rn, a->rm, (a->index << 1) | TOP) + +DO_SVE2_RRX_TB(SQDMULLB_zzx_s, gen_helper_sve2_sqdmull_idx_s, false) +DO_SVE2_RRX_TB(SQDMULLB_zzx_d, gen_helper_sve2_sqdmull_idx_d, false) +DO_SVE2_RRX_TB(SQDMULLT_zzx_s, gen_helper_sve2_sqdmull_idx_s, true) +DO_SVE2_RRX_TB(SQDMULLT_zzx_d, gen_helper_sve2_sqdmull_idx_d, true) + +DO_SVE2_RRX_TB(SMULLB_zzx_s, gen_helper_sve2_smull_idx_s, false) +DO_SVE2_RRX_TB(SMULLB_zzx_d, gen_helper_sve2_smull_idx_d, false) +DO_SVE2_RRX_TB(SMULLT_zzx_s, gen_helper_sve2_smull_idx_s, true) +DO_SVE2_RRX_TB(SMULLT_zzx_d, gen_helper_sve2_smull_idx_d, true) + +DO_SVE2_RRX_TB(UMULLB_zzx_s, gen_helper_sve2_umull_idx_s, false) +DO_SVE2_RRX_TB(UMULLB_zzx_d, gen_helper_sve2_umull_idx_d, false) +DO_SVE2_RRX_TB(UMULLT_zzx_s, gen_helper_sve2_umull_idx_s, true) +DO_SVE2_RRX_TB(UMULLT_zzx_d, gen_helper_sve2_umull_idx_d, true) + +#undef DO_SVE2_RRX_TB + +#define DO_SVE2_RRXR(NAME, FUNC) \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzxz, FUNC, a) + +DO_SVE2_RRXR(MLA_zzxz_h, gen_helper_gvec_mla_idx_h) +DO_SVE2_RRXR(MLA_zzxz_s, gen_helper_gvec_mla_idx_s) +DO_SVE2_RRXR(MLA_zzxz_d, gen_helper_gvec_mla_idx_d) + +DO_SVE2_RRXR(MLS_zzxz_h, gen_helper_gvec_mls_idx_h) +DO_SVE2_RRXR(MLS_zzxz_s, gen_helper_gvec_mls_idx_s) +DO_SVE2_RRXR(MLS_zzxz_d, gen_helper_gvec_mls_idx_d) + +DO_SVE2_RRXR(SQRDMLAH_zzxz_h, gen_helper_sve2_sqrdmlah_idx_h) +DO_SVE2_RRXR(SQRDMLAH_zzxz_s, gen_helper_sve2_sqrdmlah_idx_s) +DO_SVE2_RRXR(SQRDMLAH_zzxz_d, gen_helper_sve2_sqrdmlah_idx_d) + +DO_SVE2_RRXR(SQRDMLSH_zzxz_h, gen_helper_sve2_sqrdmlsh_idx_h) +DO_SVE2_RRXR(SQRDMLSH_zzxz_s, gen_helper_sve2_sqrdmlsh_idx_s) +DO_SVE2_RRXR(SQRDMLSH_zzxz_d, gen_helper_sve2_sqrdmlsh_idx_d) + +#undef DO_SVE2_RRXR + +#define DO_SVE2_RRXR_TB(NAME, FUNC, TOP) \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \ + a->rd, a->rn, a->rm, a->ra, (a->index << 1) | TOP) + +DO_SVE2_RRXR_TB(SQDMLALB_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, false) +DO_SVE2_RRXR_TB(SQDMLALB_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, false) +DO_SVE2_RRXR_TB(SQDMLALT_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, true) +DO_SVE2_RRXR_TB(SQDMLALT_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, true) + +DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, false) +DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, false) +DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, true) +DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, true) + 
+DO_SVE2_RRXR_TB(SMLALB_zzxw_s, gen_helper_sve2_smlal_idx_s, false) +DO_SVE2_RRXR_TB(SMLALB_zzxw_d, gen_helper_sve2_smlal_idx_d, false) +DO_SVE2_RRXR_TB(SMLALT_zzxw_s, gen_helper_sve2_smlal_idx_s, true) +DO_SVE2_RRXR_TB(SMLALT_zzxw_d, gen_helper_sve2_smlal_idx_d, true) + +DO_SVE2_RRXR_TB(UMLALB_zzxw_s, gen_helper_sve2_umlal_idx_s, false) +DO_SVE2_RRXR_TB(UMLALB_zzxw_d, gen_helper_sve2_umlal_idx_d, false) +DO_SVE2_RRXR_TB(UMLALT_zzxw_s, gen_helper_sve2_umlal_idx_s, true) +DO_SVE2_RRXR_TB(UMLALT_zzxw_d, gen_helper_sve2_umlal_idx_d, true) + +DO_SVE2_RRXR_TB(SMLSLB_zzxw_s, gen_helper_sve2_smlsl_idx_s, false) +DO_SVE2_RRXR_TB(SMLSLB_zzxw_d, gen_helper_sve2_smlsl_idx_d, false) +DO_SVE2_RRXR_TB(SMLSLT_zzxw_s, gen_helper_sve2_smlsl_idx_s, true) +DO_SVE2_RRXR_TB(SMLSLT_zzxw_d, gen_helper_sve2_smlsl_idx_d, true) + +DO_SVE2_RRXR_TB(UMLSLB_zzxw_s, gen_helper_sve2_umlsl_idx_s, false) +DO_SVE2_RRXR_TB(UMLSLB_zzxw_d, gen_helper_sve2_umlsl_idx_d, false) +DO_SVE2_RRXR_TB(UMLSLT_zzxw_s, gen_helper_sve2_umlsl_idx_s, true) +DO_SVE2_RRXR_TB(UMLSLT_zzxw_d, gen_helper_sve2_umlsl_idx_d, true) + +#undef DO_SVE2_RRXR_TB + +#define DO_SVE2_RRXR_ROT(NAME, FUNC) \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \ + a->rd, a->rn, a->rm, a->ra, (a->index << 2) | a->rot) + +DO_SVE2_RRXR_ROT(CMLA_zzxz_h, gen_helper_sve2_cmla_idx_h) +DO_SVE2_RRXR_ROT(CMLA_zzxz_s, gen_helper_sve2_cmla_idx_s) + +DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_h, gen_helper_sve2_sqrdcmlah_idx_h) +DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_s, gen_helper_sve2_sqrdcmlah_idx_s) + +DO_SVE2_RRXR_ROT(CDOT_zzxw_s, gen_helper_sve2_cdot_idx_s) +DO_SVE2_RRXR_ROT(CDOT_zzxw_d, gen_helper_sve2_cdot_idx_d) + +#undef DO_SVE2_RRXR_ROT + +/* + *** SVE Floating Point Multiply-Add Indexed Group + */ + +static bool do_FMLA_zzxz(DisasContext *s, arg_rrxr_esz *a, bool sub) +{ + static gen_helper_gvec_4_ptr * const fns[4] = { + NULL, + gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_fmla_idx_s, + gen_helper_gvec_fmla_idx_d, + }; + return gen_gvec_fpst_zzzz(s, fns[a->esz], a->rd, a->rn, a->rm, a->ra, + (a->index << 1) | sub, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); +} + +TRANS_FEAT(FMLA_zzxz, aa64_sve, do_FMLA_zzxz, a, false) +TRANS_FEAT(FMLS_zzxz, aa64_sve, do_FMLA_zzxz, a, true) + +/* + *** SVE Floating Point Multiply Indexed Group + */ + +static gen_helper_gvec_3_ptr * const fmul_idx_fns[4] = { + NULL, gen_helper_gvec_fmul_idx_h, + gen_helper_gvec_fmul_idx_s, gen_helper_gvec_fmul_idx_d, +}; +TRANS_FEAT(FMUL_zzx, aa64_sve, gen_gvec_fpst_zzz, + fmul_idx_fns[a->esz], a->rd, a->rn, a->rm, a->index, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +/* + *** SVE Floating Point Fast Reduction Group + */ + +typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); + +static bool do_reduce(DisasContext *s, arg_rpr_esz *a, + gen_helper_fp_reduce *fn) +{ + unsigned vsz, p2vsz; + TCGv_i32 t_desc; + TCGv_ptr t_zn, t_pg, status; + TCGv_i64 temp; + + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + p2vsz = pow2ceil(vsz); + t_desc = tcg_constant_i32(simd_desc(vsz, vsz, p2vsz)); + temp = tcg_temp_new_i64(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + status = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + + fn(temp, t_zn, t_pg, status, t_desc); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(status); + + write_fp_dreg(s, a->rd, temp); + tcg_temp_free_i64(temp); + return true; +} + +#define DO_VPZ(NAME, name) \ + static gen_helper_fp_reduce * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_reduce, a, name##_fns[a->esz]) + +DO_VPZ(FADDV, faddv) +DO_VPZ(FMINNMV, fminnmv) +DO_VPZ(FMAXNMV, fmaxnmv) +DO_VPZ(FMINV, fminv) +DO_VPZ(FMAXV, fmaxv) + +#undef DO_VPZ + +/* + *** SVE Floating Point Unary Operations - Unpredicated Group + */ + +static gen_helper_gvec_2_ptr * const frecpe_fns[] = { + NULL, gen_helper_gvec_frecpe_h, + gen_helper_gvec_frecpe_s, gen_helper_gvec_frecpe_d, +}; +TRANS_FEAT(FRECPE, aa64_sve, gen_gvec_fpst_arg_zz, frecpe_fns[a->esz], a, 0) + +static gen_helper_gvec_2_ptr * const frsqrte_fns[] = { + NULL, gen_helper_gvec_frsqrte_h, + gen_helper_gvec_frsqrte_s, gen_helper_gvec_frsqrte_d, +}; +TRANS_FEAT(FRSQRTE, aa64_sve, gen_gvec_fpst_arg_zz, frsqrte_fns[a->esz], a, 0) + +/* + *** SVE Floating Point Compare with Zero Group + */ + +static bool do_ppz_fp(DisasContext *s, arg_rpr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = + fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + + tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_PPZ(NAME, name) \ + static gen_helper_gvec_3_ptr * const name##_fns[] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_ppz_fp, a, name##_fns[a->esz]) + +DO_PPZ(FCMGE_ppz0, fcmge0) +DO_PPZ(FCMGT_ppz0, fcmgt0) +DO_PPZ(FCMLE_ppz0, fcmle0) +DO_PPZ(FCMLT_ppz0, fcmlt0) +DO_PPZ(FCMEQ_ppz0, fcmeq0) +DO_PPZ(FCMNE_ppz0, fcmne0) + +#undef DO_PPZ + +/* + *** SVE floating-point trig multiply-add coefficient + */ + +static gen_helper_gvec_3_ptr * const ftmad_fns[4] = { + NULL, gen_helper_sve_ftmad_h, + gen_helper_sve_ftmad_s, gen_helper_sve_ftmad_d, +}; +TRANS_FEAT_NONSTREAMING(FTMAD, aa64_sve, gen_gvec_fpst_zzz, + ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +/* + *** SVE Floating Point Accumulating Reduction Group + */ + +static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) +{ + typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + static fadda_fn * const fns[3] = { + gen_helper_sve_fadda_h, + gen_helper_sve_fadda_s, + gen_helper_sve_fadda_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_rm, t_pg, t_fpst; + TCGv_i64 t_val; + TCGv_i32 t_desc; + + if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz); + t_rm = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + t_fpst = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + t_desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + + fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc); + + tcg_temp_free_ptr(t_fpst); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_rm); + + write_fp_dreg(s, a->rd, t_val); + tcg_temp_free_i64(t_val); + return true; +} + +/* + *** SVE Floating Point Arithmetic - Unpredicated Group + */ + +#define DO_FP3(NAME, name) \ + static gen_helper_gvec_3_ptr * const name##_fns[4] = { \ + NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_arg_zzz, name##_fns[a->esz], a, 0) + +DO_FP3(FADD_zzz, fadd) +DO_FP3(FSUB_zzz, fsub) +DO_FP3(FMUL_zzz, fmul) +DO_FP3(FRECPS, recps) +DO_FP3(FRSQRTS, rsqrts) + +#undef DO_FP3 + +static gen_helper_gvec_3_ptr * const ftsmul_fns[4] = { + NULL, gen_helper_gvec_ftsmul_h, + gen_helper_gvec_ftsmul_s, gen_helper_gvec_ftsmul_d +}; +TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz, + ftsmul_fns[a->esz], a, 0) + +/* + *** SVE Floating Point Arithmetic - Predicated Group + */ + +#define DO_ZPZZ_FP(NAME, FEAT, name) \ + static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \ + NULL, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, name##_zpzz_fns[a->esz], a) + +DO_ZPZZ_FP(FADD_zpzz, aa64_sve, sve_fadd) +DO_ZPZZ_FP(FSUB_zpzz, aa64_sve, sve_fsub) +DO_ZPZZ_FP(FMUL_zpzz, aa64_sve, sve_fmul) +DO_ZPZZ_FP(FMIN_zpzz, aa64_sve, sve_fmin) +DO_ZPZZ_FP(FMAX_zpzz, aa64_sve, sve_fmax) +DO_ZPZZ_FP(FMINNM_zpzz, aa64_sve, sve_fminnum) +DO_ZPZZ_FP(FMAXNM_zpzz, aa64_sve, sve_fmaxnum) +DO_ZPZZ_FP(FABD, aa64_sve, sve_fabd) +DO_ZPZZ_FP(FSCALE, aa64_sve, sve_fscalbn) +DO_ZPZZ_FP(FDIV, aa64_sve, sve_fdiv) +DO_ZPZZ_FP(FMULX, aa64_sve, sve_fmulx) + +typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_i64, TCGv_ptr, TCGv_i32); + +static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16, + TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zd, t_zn, t_pg, status; + TCGv_i32 desc; + + t_zd = tcg_temp_new_ptr(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + status = fpstatus_ptr(is_fp16 ? 
FPST_FPCR_F16 : FPST_FPCR); + desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); + fn(t_zd, t_zn, t_pg, scalar, status, desc); + + tcg_temp_free_ptr(status); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_zd); +} + +static bool do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm, + gen_helper_sve_fp2scalar *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16, + tcg_constant_i64(imm), fn); + } + return true; +} + +#define DO_FP_IMM(NAME, name, const0, const1) \ + static gen_helper_sve_fp2scalar * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d \ + }; \ + static uint64_t const name##_const[4][2] = { \ + { -1, -1 }, \ + { float16_##const0, float16_##const1 }, \ + { float32_##const0, float32_##const1 }, \ + { float64_##const0, float64_##const1 }, \ + }; \ + TRANS_FEAT(NAME##_zpzi, aa64_sve, do_fp_imm, a, \ + name##_const[a->esz][a->imm], name##_fns[a->esz]) + +DO_FP_IMM(FADD, fadds, half, one) +DO_FP_IMM(FSUB, fsubs, half, one) +DO_FP_IMM(FMUL, fmuls, half, two) +DO_FP_IMM(FSUBR, fsubrs, half, one) +DO_FP_IMM(FMAXNM, fmaxnms, zero, one) +DO_FP_IMM(FMINNM, fminnms, zero, one) +DO_FP_IMM(FMAX, fmaxs, zero, one) +DO_FP_IMM(FMIN, fmins, zero, one) + +#undef DO_FP_IMM + +static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_FPCMP(NAME, name) \ + static gen_helper_gvec_4_ptr * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + TRANS_FEAT(NAME##_ppzz, aa64_sve, do_fp_cmp, a, name##_fns[a->esz]) + +DO_FPCMP(FCMGE, fcmge) +DO_FPCMP(FCMGT, fcmgt) +DO_FPCMP(FCMEQ, fcmeq) +DO_FPCMP(FCMNE, fcmne) +DO_FPCMP(FCMUO, fcmuo) +DO_FPCMP(FACGE, facge) +DO_FPCMP(FACGT, facgt) + +#undef DO_FPCMP + +static gen_helper_gvec_4_ptr * const fcadd_fns[] = { + NULL, gen_helper_sve_fcadd_h, + gen_helper_sve_fcadd_s, gen_helper_sve_fcadd_d, +}; +TRANS_FEAT(FCADD, aa64_sve, gen_gvec_fpst_zzzp, fcadd_fns[a->esz], + a->rd, a->rn, a->rm, a->pg, a->rot, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +#define DO_FMLA(NAME, name) \ + static gen_helper_gvec_5_ptr * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, name##_fns[a->esz], \ + a->rd, a->rn, a->rm, a->ra, a->pg, 0, \ + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +DO_FMLA(FMLA_zpzzz, fmla_zpzzz) +DO_FMLA(FMLS_zpzzz, fmls_zpzzz) +DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz) +DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz) + +#undef DO_FMLA + +static gen_helper_gvec_5_ptr * const fcmla_fns[4] = { + NULL, gen_helper_sve_fcmla_zpzzz_h, + gen_helper_sve_fcmla_zpzzz_s, gen_helper_sve_fcmla_zpzzz_d, +}; +TRANS_FEAT(FCMLA_zpzzz, aa64_sve, gen_gvec_fpst_zzzzp, fcmla_fns[a->esz], + a->rd, a->rn, a->rm, a->ra, a->pg, a->rot, + a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR) + +static gen_helper_gvec_4_ptr * const fcmla_idx_fns[4] = { + NULL, gen_helper_gvec_fcmlah_idx, gen_helper_gvec_fcmlas_idx, NULL +}; +TRANS_FEAT(FCMLA_zzxz, aa64_sve, gen_gvec_fpst_zzzz, fcmla_idx_fns[a->esz], + a->rd, a->rn, a->rm, a->ra, a->index * 4 + a->rot, + a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +/* + *** SVE Floating Point Unary Operations Predicated Group + */ + +TRANS_FEAT(FCVT_sh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_sh, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_hs, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_hs, a, 0, FPST_FPCR) + +TRANS_FEAT(BFCVT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, + gen_helper_sve_bfcvt, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVT_dh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_dh, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_hd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_hd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_ds, a, 0, FPST_FPCR) +TRANS_FEAT(FCVT_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvt_sd, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTZS_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZU_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZS_hs, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_hs, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZU_hs, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_hs, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZS_hd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_hd, a, 0, FPST_FPCR_F16) +TRANS_FEAT(FCVTZU_hd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_hd, a, 0, FPST_FPCR_F16) + +TRANS_FEAT(FCVTZS_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_ss, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_ss, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZS_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_sd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_sd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZS_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_ds, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_ds, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTZS_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzs_dd, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTZU_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_fcvtzu_dd, a, 0, FPST_FPCR) + +static gen_helper_gvec_3_ptr * const frint_fns[] = { + NULL, + gen_helper_sve_frint_h, + gen_helper_sve_frint_s, + gen_helper_sve_frint_d +}; +TRANS_FEAT(FRINTI, aa64_sve, gen_gvec_fpst_arg_zpz, frint_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +static gen_helper_gvec_3_ptr * const frintx_fns[] = { + NULL, + gen_helper_sve_frintx_h, + gen_helper_sve_frintx_s, + gen_helper_sve_frintx_d +}; +TRANS_FEAT(FRINTX, aa64_sve, gen_gvec_fpst_arg_zpz, frintx_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); + +static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a, + int mode, gen_helper_gvec_3_ptr *fn) +{ + unsigned vsz; + TCGv_i32 tmode; + TCGv_ptr status; + + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + tmode = tcg_const_i32(mode); + status = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); + + gen_helper_set_rmode(tmode, tmode, status); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + + gen_helper_set_rmode(tmode, tmode, status); + tcg_temp_free_i32(tmode); + tcg_temp_free_ptr(status); + return true; +} + +TRANS_FEAT(FRINTN, aa64_sve, do_frint_mode, a, + float_round_nearest_even, frint_fns[a->esz]) +TRANS_FEAT(FRINTP, aa64_sve, do_frint_mode, a, + float_round_up, frint_fns[a->esz]) +TRANS_FEAT(FRINTM, aa64_sve, do_frint_mode, a, + float_round_down, frint_fns[a->esz]) +TRANS_FEAT(FRINTZ, aa64_sve, do_frint_mode, a, + float_round_to_zero, frint_fns[a->esz]) +TRANS_FEAT(FRINTA, aa64_sve, do_frint_mode, a, + float_round_ties_away, frint_fns[a->esz]) + +static gen_helper_gvec_3_ptr * const frecpx_fns[] = { + NULL, gen_helper_sve_frecpx_h, + gen_helper_sve_frecpx_s, gen_helper_sve_frecpx_d, +}; +TRANS_FEAT(FRECPX, aa64_sve, gen_gvec_fpst_arg_zpz, frecpx_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +static gen_helper_gvec_3_ptr * const fsqrt_fns[] = { + NULL, gen_helper_sve_fsqrt_h, + gen_helper_sve_fsqrt_s, gen_helper_sve_fsqrt_d, +}; +TRANS_FEAT(FSQRT, aa64_sve, gen_gvec_fpst_arg_zpz, fsqrt_fns[a->esz], + a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) + +TRANS_FEAT(SCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(SCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_sh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(SCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_dh, a, 0, FPST_FPCR_F16) + +TRANS_FEAT(SCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_ss, a, 0, FPST_FPCR) +TRANS_FEAT(SCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_ds, a, 0, FPST_FPCR) + +TRANS_FEAT(SCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_sd, a, 0, FPST_FPCR) +TRANS_FEAT(SCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_scvt_dd, a, 0, FPST_FPCR) + +TRANS_FEAT(UCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_hh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(UCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_sh, a, 0, FPST_FPCR_F16) +TRANS_FEAT(UCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_dh, a, 0, FPST_FPCR_F16) + +TRANS_FEAT(UCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_ss, a, 0, FPST_FPCR) +TRANS_FEAT(UCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_ds, a, 0, FPST_FPCR) +TRANS_FEAT(UCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_sd, a, 0, FPST_FPCR) + +TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, + gen_helper_sve_ucvt_dd, a, 0, FPST_FPCR) + +/* + *** SVE Memory - 32-bit Gather and Unsized Contiguous Group + */ + +/* Subroutine loading a vector register at VOFS of LEN bytes. + * The load should begin at the address Rn + IMM. 
+ */ + +void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, + int len, int rn, int imm) +{ + int len_align = QEMU_ALIGN_DOWN(len, 8); + int len_remain = len % 8; + int nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 dirty_addr, clean_addr, t0, t1; + + dirty_addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); + tcg_temp_free_i64(dirty_addr); + + /* + * Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. + */ + if (nparts <= 4) { + int i; + + t0 = tcg_temp_new_i64(); + for (i = 0; i < len_align; i += 8) { + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_st_i64(t0, base, vofs + i); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + } + tcg_temp_free_i64(t0); + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr tp, i = tcg_const_local_ptr(0); + + /* Copy the clean address into a local temp, live across the loop. */ + t0 = clean_addr; + clean_addr = new_tmp_a64_local(s); + tcg_gen_mov_i64(clean_addr, t0); + + if (base != cpu_env) { + TCGv_ptr b = tcg_temp_local_new_ptr(); + tcg_gen_mov_ptr(b, base); + base = b; + } + + gen_set_label(loop); + + t0 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + + tp = tcg_temp_new_ptr(); + tcg_gen_add_ptr(tp, base, i); + tcg_gen_addi_ptr(i, i, 8); + tcg_gen_st_i64(t0, tp, vofs); + tcg_temp_free_ptr(tp); + tcg_temp_free_i64(t0); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + + if (base != cpu_env) { + tcg_temp_free_ptr(base); + assert(len_remain == 0); + } + } + + /* + * Predicate register loads can be any multiple of 2. + * Note that we still store the entire 64-bit unit into cpu_env. + */ + if (len_remain) { + t0 = tcg_temp_new_i64(); + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, + MO_LE | ctz32(len_remain)); + break; + + case 6: + t1 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_addi_i64(clean_addr, clean_addr, 4); + tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW); + tcg_gen_deposit_i64(t0, t0, t1, 32, 32); + tcg_temp_free_i64(t1); + break; + + default: + g_assert_not_reached(); + } + tcg_gen_st_i64(t0, base, vofs + len_align); + tcg_temp_free_i64(t0); + } +} + +/* Similarly for stores. */ +void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, + int len, int rn, int imm) +{ + int len_align = QEMU_ALIGN_DOWN(len, 8); + int len_remain = len % 8; + int nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 dirty_addr, clean_addr, t0; + + dirty_addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); + tcg_temp_free_i64(dirty_addr); + + /* Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. There is no nice way to force + * a little-endian store for aarch64_be-linux-user out of line. + * + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. 
+ */ + if (nparts <= 4) { + int i; + + t0 = tcg_temp_new_i64(); + for (i = 0; i < len_align; i += 8) { + tcg_gen_ld_i64(t0, base, vofs + i); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + } + tcg_temp_free_i64(t0); + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr tp, i = tcg_const_local_ptr(0); + + /* Copy the clean address into a local temp, live across the loop. */ + t0 = clean_addr; + clean_addr = new_tmp_a64_local(s); + tcg_gen_mov_i64(clean_addr, t0); + + if (base != cpu_env) { + TCGv_ptr b = tcg_temp_local_new_ptr(); + tcg_gen_mov_ptr(b, base); + base = b; + } + + gen_set_label(loop); + + t0 = tcg_temp_new_i64(); + tp = tcg_temp_new_ptr(); + tcg_gen_add_ptr(tp, base, i); + tcg_gen_ld_i64(t0, tp, vofs); + tcg_gen_addi_ptr(i, i, 8); + tcg_temp_free_ptr(tp); + + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_temp_free_i64(t0); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + + if (base != cpu_env) { + tcg_temp_free_ptr(base); + assert(len_remain == 0); + } + } + + /* Predicate register stores can be any multiple of 2. */ + if (len_remain) { + t0 = tcg_temp_new_i64(); + tcg_gen_ld_i64(t0, base, vofs + len_align); + + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_st_i64(t0, clean_addr, midx, + MO_LE | ctz32(len_remain)); + break; + + case 6: + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_addi_i64(clean_addr, clean_addr, 4); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW); + break; + + default: + g_assert_not_reached(); + } + tcg_temp_free_i64(t0); + } +} + +static bool trans_LDR_zri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_LDR_pri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_zri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_pri(DisasContext *s, arg_rri *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); + } + return true; +} + +/* + *** SVE Memory - Contiguous Load Group + */ + +/* The memory mode of the dtype. */ +static const MemOp dtype_mop[16] = { + MO_UB, MO_UB, MO_UB, MO_UB, + MO_SL, MO_UW, MO_UW, MO_UW, + MO_SW, MO_SW, MO_UL, MO_UL, + MO_SB, MO_SB, MO_SB, MO_UQ +}; + +#define dtype_msz(x) (dtype_mop[x] & MO_SIZE) + +/* The vector element size of dtype. 
*/ +static const uint8_t dtype_esz[16] = { + 0, 1, 2, 3, + 3, 1, 2, 3, + 3, 2, 2, 3, + 3, 2, 1, 3 +}; + +static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + int dtype, uint32_t mte_n, bool is_write, + gen_helper_gvec_mem *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + int desc = 0; + + /* + * For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. + */ + if (s->mte_active[0]) { + int msz = dtype_msz(dtype); + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1); + desc <<= SVE_MTEDESC_SHIFT; + } else { + addr = clean_data_tbi(s, addr); + } + + desc = simd_desc(vsz, vsz, zt | desc); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + fn(cpu_env, t_pg, addr, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_pg); +} + +/* Indexed by [mte][be][dtype][nreg] */ +static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { + { /* mte inactive, little-endian */ + { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_le_r, gen_helper_sve_ld2hh_le_r, + gen_helper_sve_ld3hh_le_r, gen_helper_sve_ld4hh_le_r }, + { gen_helper_sve_ld1hsu_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_le_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_le_r, gen_helper_sve_ld2ss_le_r, + gen_helper_sve_ld3ss_le_r, gen_helper_sve_ld4ss_le_r }, + { gen_helper_sve_ld1sdu_le_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r, + gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } }, + + /* mte inactive, big-endian */ + { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_be_r, gen_helper_sve_ld2hh_be_r, + gen_helper_sve_ld3hh_be_r, gen_helper_sve_ld4hh_be_r }, + { gen_helper_sve_ld1hsu_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_be_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_be_r, gen_helper_sve_ld2ss_be_r, + gen_helper_sve_ld3ss_be_r, gen_helper_sve_ld4ss_be_r }, + { gen_helper_sve_ld1sdu_be_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r, + gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } }, + + { /* mte active, little-endian */ + { { gen_helper_sve_ld1bb_r_mte, + 
gen_helper_sve_ld2bb_r_mte, + gen_helper_sve_ld3bb_r_mte, + gen_helper_sve_ld4bb_r_mte }, + { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_le_r_mte, + gen_helper_sve_ld2hh_le_r_mte, + gen_helper_sve_ld3hh_le_r_mte, + gen_helper_sve_ld4hh_le_r_mte }, + { gen_helper_sve_ld1hsu_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_le_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_le_r_mte, + gen_helper_sve_ld2ss_le_r_mte, + gen_helper_sve_ld3ss_le_r_mte, + gen_helper_sve_ld4ss_le_r_mte }, + { gen_helper_sve_ld1sdu_le_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_le_r_mte, + gen_helper_sve_ld2dd_le_r_mte, + gen_helper_sve_ld3dd_le_r_mte, + gen_helper_sve_ld4dd_le_r_mte } }, + + /* mte active, big-endian */ + { { gen_helper_sve_ld1bb_r_mte, + gen_helper_sve_ld2bb_r_mte, + gen_helper_sve_ld3bb_r_mte, + gen_helper_sve_ld4bb_r_mte }, + { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_be_r_mte, + gen_helper_sve_ld2hh_be_r_mte, + gen_helper_sve_ld3hh_be_r_mte, + gen_helper_sve_ld4hh_be_r_mte }, + { gen_helper_sve_ld1hsu_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_be_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_be_r_mte, + gen_helper_sve_ld2ss_be_r_mte, + gen_helper_sve_ld3ss_be_r_mte, + gen_helper_sve_ld4ss_be_r_mte }, + { gen_helper_sve_ld1sdu_be_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_be_r_mte, + gen_helper_sve_ld2dd_be_r_mte, + gen_helper_sve_ld3dd_be_r_mte, + gen_helper_sve_ld4dd_be_r_mte } } }, +}; + +static void do_ld_zpa(DisasContext *s, int zt, int pg, + TCGv_i64 addr, int dtype, int nreg) +{ + gen_helper_gvec_mem *fn + = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][nreg]; + + /* + * While there are holes in the table, they are not + * accessible via the instruction encoding. 
+ */ + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); +} + +static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) +{ + if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) + << dtype_msz(a->dtype)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a) +{ + static gen_helper_gvec_mem * const fns[2][2][16] = { + { /* mte inactive, little-endian */ + { gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_le_r, + gen_helper_sve_ldff1hh_le_r, + gen_helper_sve_ldff1hsu_le_r, + gen_helper_sve_ldff1hdu_le_r, + + gen_helper_sve_ldff1hds_le_r, + gen_helper_sve_ldff1hss_le_r, + gen_helper_sve_ldff1ss_le_r, + gen_helper_sve_ldff1sdu_le_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_le_r }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_be_r, + gen_helper_sve_ldff1hh_be_r, + gen_helper_sve_ldff1hsu_be_r, + gen_helper_sve_ldff1hdu_be_r, + + gen_helper_sve_ldff1hds_be_r, + gen_helper_sve_ldff1hss_be_r, + gen_helper_sve_ldff1ss_be_r, + gen_helper_sve_ldff1sdu_be_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_be_r } }, + + { /* mte active, little-endian */ + { gen_helper_sve_ldff1bb_r_mte, + gen_helper_sve_ldff1bhu_r_mte, + gen_helper_sve_ldff1bsu_r_mte, + gen_helper_sve_ldff1bdu_r_mte, + + gen_helper_sve_ldff1sds_le_r_mte, + gen_helper_sve_ldff1hh_le_r_mte, + gen_helper_sve_ldff1hsu_le_r_mte, + gen_helper_sve_ldff1hdu_le_r_mte, + + gen_helper_sve_ldff1hds_le_r_mte, + gen_helper_sve_ldff1hss_le_r_mte, + gen_helper_sve_ldff1ss_le_r_mte, + gen_helper_sve_ldff1sdu_le_r_mte, + + gen_helper_sve_ldff1bds_r_mte, + gen_helper_sve_ldff1bss_r_mte, + gen_helper_sve_ldff1bhs_r_mte, + gen_helper_sve_ldff1dd_le_r_mte }, + + /* mte active, big-endian */ + { gen_helper_sve_ldff1bb_r_mte, + gen_helper_sve_ldff1bhu_r_mte, + gen_helper_sve_ldff1bsu_r_mte, + gen_helper_sve_ldff1bdu_r_mte, + + gen_helper_sve_ldff1sds_be_r_mte, + gen_helper_sve_ldff1hh_be_r_mte, + gen_helper_sve_ldff1hsu_be_r_mte, + gen_helper_sve_ldff1hdu_be_r_mte, + + gen_helper_sve_ldff1hds_be_r_mte, + gen_helper_sve_ldff1hss_be_r_mte, + gen_helper_sve_ldff1ss_be_r_mte, + gen_helper_sve_ldff1sdu_be_r_mte, + + gen_helper_sve_ldff1bds_r_mte, + gen_helper_sve_ldff1bss_r_mte, + gen_helper_sve_ldff1bhs_r_mte, + gen_helper_sve_ldff1dd_be_r_mte } }, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), 
dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false, + fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]); + } + return true; +} + +static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a) +{ + static gen_helper_gvec_mem * const fns[2][2][16] = { + { /* mte inactive, little-endian */ + { gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + gen_helper_sve_ldnf1sds_le_r, + gen_helper_sve_ldnf1hh_le_r, + gen_helper_sve_ldnf1hsu_le_r, + gen_helper_sve_ldnf1hdu_le_r, + + gen_helper_sve_ldnf1hds_le_r, + gen_helper_sve_ldnf1hss_le_r, + gen_helper_sve_ldnf1ss_le_r, + gen_helper_sve_ldnf1sdu_le_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_le_r }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + gen_helper_sve_ldnf1sds_be_r, + gen_helper_sve_ldnf1hh_be_r, + gen_helper_sve_ldnf1hsu_be_r, + gen_helper_sve_ldnf1hdu_be_r, + + gen_helper_sve_ldnf1hds_be_r, + gen_helper_sve_ldnf1hss_be_r, + gen_helper_sve_ldnf1ss_be_r, + gen_helper_sve_ldnf1sdu_be_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_be_r } }, + + { /* mte inactive, little-endian */ + { gen_helper_sve_ldnf1bb_r_mte, + gen_helper_sve_ldnf1bhu_r_mte, + gen_helper_sve_ldnf1bsu_r_mte, + gen_helper_sve_ldnf1bdu_r_mte, + + gen_helper_sve_ldnf1sds_le_r_mte, + gen_helper_sve_ldnf1hh_le_r_mte, + gen_helper_sve_ldnf1hsu_le_r_mte, + gen_helper_sve_ldnf1hdu_le_r_mte, + + gen_helper_sve_ldnf1hds_le_r_mte, + gen_helper_sve_ldnf1hss_le_r_mte, + gen_helper_sve_ldnf1ss_le_r_mte, + gen_helper_sve_ldnf1sdu_le_r_mte, + + gen_helper_sve_ldnf1bds_r_mte, + gen_helper_sve_ldnf1bss_r_mte, + gen_helper_sve_ldnf1bhs_r_mte, + gen_helper_sve_ldnf1dd_le_r_mte }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldnf1bb_r_mte, + gen_helper_sve_ldnf1bhu_r_mte, + gen_helper_sve_ldnf1bsu_r_mte, + gen_helper_sve_ldnf1bdu_r_mte, + + gen_helper_sve_ldnf1sds_be_r_mte, + gen_helper_sve_ldnf1hh_be_r_mte, + gen_helper_sve_ldnf1hsu_be_r_mte, + gen_helper_sve_ldnf1hdu_be_r_mte, + + gen_helper_sve_ldnf1hds_be_r_mte, + gen_helper_sve_ldnf1hss_be_r_mte, + gen_helper_sve_ldnf1ss_be_r_mte, + gen_helper_sve_ldnf1sdu_be_r_mte, + + gen_helper_sve_ldnf1bds_r_mte, + gen_helper_sve_ldnf1bss_r_mte, + gen_helper_sve_ldnf1bhs_r_mte, + gen_helper_sve_ldnf1dd_be_r_mte } }, + }; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + int off = (a->imm * elements) << dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); + do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false, + fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]); + } + return true; +} + +static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + int poff; + + /* Load the first quadword using the normal predicated load helpers. */ + poff = pred_full_reg_offset(s, pg); + if (vsz > 16) { + /* + * Zero-extend the first 16 bits of the predicate into a temporary. 
+ * This avoids triggering an assert making sure we don't have bits + * set within a predicate beyond VQ, but we have lowered VQ to 1 + * for this load operation. + */ + TCGv_i64 tmp = tcg_temp_new_i64(); +#if HOST_BIG_ENDIAN + poff += 6; +#endif + tcg_gen_ld16u_i64(tmp, cpu_env, poff); + + poff = offsetof(CPUARMState, vfp.preg_tmp); + tcg_gen_st_i64(tmp, cpu_env, poff); + tcg_temp_free_i64(tmp); + } + + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_pg, cpu_env, poff); + + gen_helper_gvec_mem *fn + = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; + fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(16, 16, zt))); + + tcg_temp_free_ptr(t_pg); + + /* Replicate that first quadword. */ + if (vsz > 16) { + int doff = vec_full_reg_offset(s, zt); + tcg_gen_gvec_dup_mem(4, doff + 16, doff, vsz - 16, vsz - 16); + } +} + +static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a) +{ + if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + int msz = dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ldrq(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16); + do_ldrq(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) +{ + unsigned vsz = vec_full_reg_size(s); + unsigned vsz_r32; + TCGv_ptr t_pg; + int poff, doff; + + if (vsz < 32) { + /* + * Note that this UNDEFINED check comes after CheckSVEEnabled() + * in the ARM pseudocode, which is the sve_access_check() done + * in our caller. We should not now return false from the caller. + */ + unallocated_encoding(s); + return; + } + + /* Load the first octaword using the normal predicated load helpers. */ + + poff = pred_full_reg_offset(s, pg); + if (vsz > 32) { + /* + * Zero-extend the first 32 bits of the predicate into a temporary. + * This avoids triggering an assert making sure we don't have bits + * set within a predicate beyond VQ, but we have lowered VQ to 2 + * for this load operation. + */ + TCGv_i64 tmp = tcg_temp_new_i64(); +#if HOST_BIG_ENDIAN + poff += 4; +#endif + tcg_gen_ld32u_i64(tmp, cpu_env, poff); + + poff = offsetof(CPUARMState, vfp.preg_tmp); + tcg_gen_st_i64(tmp, cpu_env, poff); + tcg_temp_free_i64(tmp); + } + + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_pg, cpu_env, poff); + + gen_helper_gvec_mem *fn + = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; + fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(32, 32, zt))); + + tcg_temp_free_ptr(t_pg); + + /* + * Replicate that first octaword. + * The replication happens in units of 32; if the full vector size + * is not a multiple of 32, the final bits are zeroed. 
+ */ + doff = vec_full_reg_offset(s, zt); + vsz_r32 = QEMU_ALIGN_DOWN(vsz, 32); + if (vsz >= 64) { + tcg_gen_gvec_dup_mem(5, doff + 32, doff, vsz_r32 - 32, vsz_r32 - 32); + } + vsz -= vsz_r32; + if (vsz) { + tcg_gen_gvec_dup_imm(MO_64, doff + vsz_r32, vsz, vsz, 0); + } +} + +static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a) +{ + if (!dc_isar_feature(aa64_sve_f64mm, s)) { + return false; + } + if (a->rm == 31) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ldro(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a) +{ + if (!dc_isar_feature(aa64_sve_f64mm, s)) { + return false; + } + s->is_nonstreaming = true; + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32); + do_ldro(s, a->rd, a->pg, addr, a->dtype); + } + return true; +} + +/* Load and broadcast element. */ +static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) +{ + unsigned vsz = vec_full_reg_size(s); + unsigned psz = pred_full_reg_size(s); + unsigned esz = dtype_esz[a->dtype]; + unsigned msz = dtype_msz(a->dtype); + TCGLabel *over; + TCGv_i64 temp, clean_addr; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + over = gen_new_label(); + + /* If the guarding predicate has no bits set, no load occurs. */ + if (psz <= 8) { + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. + */ + uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8); + temp = tcg_temp_new_i64(); + tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg)); + tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask); + tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over); + tcg_temp_free_i64(temp); + } else { + TCGv_i32 t32 = tcg_temp_new_i32(); + find_last_active(s, t32, esz, a->pg); + tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over); + tcg_temp_free_i32(t32); + } + + /* Load the data. */ + temp = tcg_temp_new_i64(); + tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << msz); + clean_addr = gen_mte_check1(s, temp, false, true, msz); + + tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s), + finalize_memop(s, dtype_mop[a->dtype])); + + /* Broadcast to *all* elements. */ + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, temp); + tcg_temp_free_i64(temp); + + /* Zero the inactive elements. 
*/ + gen_set_label(over); + return do_movz_zpz(s, a->rd, a->rd, a->pg, esz, false); +} + +static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + int msz, int esz, int nreg) +{ + static gen_helper_gvec_mem * const fn_single[2][2][4][4] = { + { { { gen_helper_sve_st1bb_r, + gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, + gen_helper_sve_st1bd_r }, + { NULL, + gen_helper_sve_st1hh_le_r, + gen_helper_sve_st1hs_le_r, + gen_helper_sve_st1hd_le_r }, + { NULL, NULL, + gen_helper_sve_st1ss_le_r, + gen_helper_sve_st1sd_le_r }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_le_r } }, + { { gen_helper_sve_st1bb_r, + gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, + gen_helper_sve_st1bd_r }, + { NULL, + gen_helper_sve_st1hh_be_r, + gen_helper_sve_st1hs_be_r, + gen_helper_sve_st1hd_be_r }, + { NULL, NULL, + gen_helper_sve_st1ss_be_r, + gen_helper_sve_st1sd_be_r }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_be_r } } }, + + { { { gen_helper_sve_st1bb_r_mte, + gen_helper_sve_st1bh_r_mte, + gen_helper_sve_st1bs_r_mte, + gen_helper_sve_st1bd_r_mte }, + { NULL, + gen_helper_sve_st1hh_le_r_mte, + gen_helper_sve_st1hs_le_r_mte, + gen_helper_sve_st1hd_le_r_mte }, + { NULL, NULL, + gen_helper_sve_st1ss_le_r_mte, + gen_helper_sve_st1sd_le_r_mte }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_le_r_mte } }, + { { gen_helper_sve_st1bb_r_mte, + gen_helper_sve_st1bh_r_mte, + gen_helper_sve_st1bs_r_mte, + gen_helper_sve_st1bd_r_mte }, + { NULL, + gen_helper_sve_st1hh_be_r_mte, + gen_helper_sve_st1hs_be_r_mte, + gen_helper_sve_st1hd_be_r_mte }, + { NULL, NULL, + gen_helper_sve_st1ss_be_r_mte, + gen_helper_sve_st1sd_be_r_mte }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_be_r_mte } } }, + }; + static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = { + { { { gen_helper_sve_st2bb_r, + gen_helper_sve_st2hh_le_r, + gen_helper_sve_st2ss_le_r, + gen_helper_sve_st2dd_le_r }, + { gen_helper_sve_st3bb_r, + gen_helper_sve_st3hh_le_r, + gen_helper_sve_st3ss_le_r, + gen_helper_sve_st3dd_le_r }, + { gen_helper_sve_st4bb_r, + gen_helper_sve_st4hh_le_r, + gen_helper_sve_st4ss_le_r, + gen_helper_sve_st4dd_le_r } }, + { { gen_helper_sve_st2bb_r, + gen_helper_sve_st2hh_be_r, + gen_helper_sve_st2ss_be_r, + gen_helper_sve_st2dd_be_r }, + { gen_helper_sve_st3bb_r, + gen_helper_sve_st3hh_be_r, + gen_helper_sve_st3ss_be_r, + gen_helper_sve_st3dd_be_r }, + { gen_helper_sve_st4bb_r, + gen_helper_sve_st4hh_be_r, + gen_helper_sve_st4ss_be_r, + gen_helper_sve_st4dd_be_r } } }, + { { { gen_helper_sve_st2bb_r_mte, + gen_helper_sve_st2hh_le_r_mte, + gen_helper_sve_st2ss_le_r_mte, + gen_helper_sve_st2dd_le_r_mte }, + { gen_helper_sve_st3bb_r_mte, + gen_helper_sve_st3hh_le_r_mte, + gen_helper_sve_st3ss_le_r_mte, + gen_helper_sve_st3dd_le_r_mte }, + { gen_helper_sve_st4bb_r_mte, + gen_helper_sve_st4hh_le_r_mte, + gen_helper_sve_st4ss_le_r_mte, + gen_helper_sve_st4dd_le_r_mte } }, + { { gen_helper_sve_st2bb_r_mte, + gen_helper_sve_st2hh_be_r_mte, + gen_helper_sve_st2ss_be_r_mte, + gen_helper_sve_st2dd_be_r_mte }, + { gen_helper_sve_st3bb_r_mte, + gen_helper_sve_st3hh_be_r_mte, + gen_helper_sve_st3ss_be_r_mte, + gen_helper_sve_st3dd_be_r_mte }, + { gen_helper_sve_st4bb_r_mte, + gen_helper_sve_st4hh_be_r_mte, + gen_helper_sve_st4ss_be_r_mte, + gen_helper_sve_st4dd_be_r_mte } } }, + }; + gen_helper_gvec_mem *fn; + int be = s->be_data == MO_BE; + + if (nreg == 0) { + /* ST1 */ + fn = fn_single[s->mte_active[0]][be][msz][esz]; + nreg = 1; + } else { + /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ + assert(msz == esz); + fn 
= fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; + } + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); +} + +static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (a->rm == 31 || a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + if (a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> a->esz; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) << a->msz); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +/* + *** SVE gather loads / scatter stores + */ + +static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, + int scale, TCGv_i64 scalar, int msz, bool is_write, + gen_helper_gvec_mem_scatter *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zm = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + TCGv_ptr t_zt = tcg_temp_new_ptr(); + int desc = 0; + + if (s->mte_active[0]) { + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << msz) - 1); + desc <<= SVE_MTEDESC_SHIFT; + } + desc = simd_desc(vsz, vsz, desc | scale); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm)); + tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt)); + fn(cpu_env, t_zt, t_pg, t_zm, scalar, tcg_constant_i32(desc)); + + tcg_temp_free_ptr(t_zt); + tcg_temp_free_ptr(t_zm); + tcg_temp_free_ptr(t_pg); +} + +/* Indexed by [mte][be][ff][xs][u][msz]. 
*/ +static gen_helper_gvec_mem_scatter * const +gather_load_fn32[2][2][2][2][2][3] = { + { /* MTE Inactive */ + { /* Little-endian */ + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_le_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_le_zsu, + gen_helper_sve_ldss_le_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_le_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_le_zss, + gen_helper_sve_ldss_le_zss, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_le_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_le_zsu, + gen_helper_sve_ldffss_le_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_le_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_le_zss, + gen_helper_sve_ldffss_le_zss, } } } }, + + { /* Big-endian */ + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_be_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_be_zsu, + gen_helper_sve_ldss_be_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_be_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_be_zss, + gen_helper_sve_ldss_be_zss, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_be_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_be_zsu, + gen_helper_sve_ldffss_be_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_be_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_be_zss, + gen_helper_sve_ldffss_be_zss, } } } } }, + { /* MTE Active */ + { /* Little-endian */ + { { { gen_helper_sve_ldbss_zsu_mte, + gen_helper_sve_ldhss_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldbsu_zsu_mte, + gen_helper_sve_ldhsu_le_zsu_mte, + gen_helper_sve_ldss_le_zsu_mte, } }, + { { gen_helper_sve_ldbss_zss_mte, + gen_helper_sve_ldhss_le_zss_mte, + NULL, }, + { gen_helper_sve_ldbsu_zss_mte, + gen_helper_sve_ldhsu_le_zss_mte, + gen_helper_sve_ldss_le_zss_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu_mte, + gen_helper_sve_ldffhss_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zsu_mte, + gen_helper_sve_ldffhsu_le_zsu_mte, + gen_helper_sve_ldffss_le_zsu_mte, } }, + { { gen_helper_sve_ldffbss_zss_mte, + gen_helper_sve_ldffhss_le_zss_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zss_mte, + gen_helper_sve_ldffhsu_le_zss_mte, + gen_helper_sve_ldffss_le_zss_mte, } } } }, + + { /* Big-endian */ + { { { gen_helper_sve_ldbss_zsu_mte, + gen_helper_sve_ldhss_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldbsu_zsu_mte, + gen_helper_sve_ldhsu_be_zsu_mte, + gen_helper_sve_ldss_be_zsu_mte, } }, + { { gen_helper_sve_ldbss_zss_mte, + gen_helper_sve_ldhss_be_zss_mte, + NULL, }, + { gen_helper_sve_ldbsu_zss_mte, + gen_helper_sve_ldhsu_be_zss_mte, + gen_helper_sve_ldss_be_zss_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu_mte, + gen_helper_sve_ldffhss_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zsu_mte, + gen_helper_sve_ldffhsu_be_zsu_mte, + gen_helper_sve_ldffss_be_zsu_mte, } }, + { { gen_helper_sve_ldffbss_zss_mte, + gen_helper_sve_ldffhss_be_zss_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zss_mte, + gen_helper_sve_ldffhsu_be_zss_mte, + gen_helper_sve_ldffss_be_zss_mte, } } } } }, +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. 
*/ +static gen_helper_gvec_mem_scatter * const +gather_load_fn64[2][2][2][3][2][4] = { + { /* MTE Inactive */ + { /* Little-endian */ + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_le_zsu, + gen_helper_sve_ldsds_le_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_le_zsu, + gen_helper_sve_ldsdu_le_zsu, + gen_helper_sve_lddd_le_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_le_zss, + gen_helper_sve_ldsds_le_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_le_zss, + gen_helper_sve_ldsdu_le_zss, + gen_helper_sve_lddd_le_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_le_zd, + gen_helper_sve_ldsds_le_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_le_zd, + gen_helper_sve_ldsdu_le_zd, + gen_helper_sve_lddd_le_zd, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_le_zsu, + gen_helper_sve_ldffsds_le_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_le_zsu, + gen_helper_sve_ldffsdu_le_zsu, + gen_helper_sve_ldffdd_le_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_le_zss, + gen_helper_sve_ldffsds_le_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_le_zss, + gen_helper_sve_ldffsdu_le_zss, + gen_helper_sve_ldffdd_le_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_le_zd, + gen_helper_sve_ldffsds_le_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_le_zd, + gen_helper_sve_ldffsdu_le_zd, + gen_helper_sve_ldffdd_le_zd, } } } }, + { /* Big-endian */ + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_be_zsu, + gen_helper_sve_ldsds_be_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_be_zsu, + gen_helper_sve_ldsdu_be_zsu, + gen_helper_sve_lddd_be_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_be_zss, + gen_helper_sve_ldsds_be_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_be_zss, + gen_helper_sve_ldsdu_be_zss, + gen_helper_sve_lddd_be_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_be_zd, + gen_helper_sve_ldsds_be_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_be_zd, + gen_helper_sve_ldsdu_be_zd, + gen_helper_sve_lddd_be_zd, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_be_zsu, + gen_helper_sve_ldffsds_be_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_be_zsu, + gen_helper_sve_ldffsdu_be_zsu, + gen_helper_sve_ldffdd_be_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_be_zss, + gen_helper_sve_ldffsds_be_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_be_zss, + gen_helper_sve_ldffsdu_be_zss, + gen_helper_sve_ldffdd_be_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_be_zd, + gen_helper_sve_ldffsds_be_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_be_zd, + gen_helper_sve_ldffsdu_be_zd, + gen_helper_sve_ldffdd_be_zd, } } } } }, + { /* MTE Active */ + { /* Little-endian */ + { { { gen_helper_sve_ldbds_zsu_mte, + gen_helper_sve_ldhds_le_zsu_mte, + gen_helper_sve_ldsds_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldbdu_zsu_mte, + gen_helper_sve_ldhdu_le_zsu_mte, + gen_helper_sve_ldsdu_le_zsu_mte, + gen_helper_sve_lddd_le_zsu_mte, } }, + { { gen_helper_sve_ldbds_zss_mte, + gen_helper_sve_ldhds_le_zss_mte, + gen_helper_sve_ldsds_le_zss_mte, + NULL, }, + { gen_helper_sve_ldbdu_zss_mte, + gen_helper_sve_ldhdu_le_zss_mte, 
+ gen_helper_sve_ldsdu_le_zss_mte, + gen_helper_sve_lddd_le_zss_mte, } }, + { { gen_helper_sve_ldbds_zd_mte, + gen_helper_sve_ldhds_le_zd_mte, + gen_helper_sve_ldsds_le_zd_mte, + NULL, }, + { gen_helper_sve_ldbdu_zd_mte, + gen_helper_sve_ldhdu_le_zd_mte, + gen_helper_sve_ldsdu_le_zd_mte, + gen_helper_sve_lddd_le_zd_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu_mte, + gen_helper_sve_ldffhds_le_zsu_mte, + gen_helper_sve_ldffsds_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zsu_mte, + gen_helper_sve_ldffhdu_le_zsu_mte, + gen_helper_sve_ldffsdu_le_zsu_mte, + gen_helper_sve_ldffdd_le_zsu_mte, } }, + { { gen_helper_sve_ldffbds_zss_mte, + gen_helper_sve_ldffhds_le_zss_mte, + gen_helper_sve_ldffsds_le_zss_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zss_mte, + gen_helper_sve_ldffhdu_le_zss_mte, + gen_helper_sve_ldffsdu_le_zss_mte, + gen_helper_sve_ldffdd_le_zss_mte, } }, + { { gen_helper_sve_ldffbds_zd_mte, + gen_helper_sve_ldffhds_le_zd_mte, + gen_helper_sve_ldffsds_le_zd_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zd_mte, + gen_helper_sve_ldffhdu_le_zd_mte, + gen_helper_sve_ldffsdu_le_zd_mte, + gen_helper_sve_ldffdd_le_zd_mte, } } } }, + { /* Big-endian */ + { { { gen_helper_sve_ldbds_zsu_mte, + gen_helper_sve_ldhds_be_zsu_mte, + gen_helper_sve_ldsds_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldbdu_zsu_mte, + gen_helper_sve_ldhdu_be_zsu_mte, + gen_helper_sve_ldsdu_be_zsu_mte, + gen_helper_sve_lddd_be_zsu_mte, } }, + { { gen_helper_sve_ldbds_zss_mte, + gen_helper_sve_ldhds_be_zss_mte, + gen_helper_sve_ldsds_be_zss_mte, + NULL, }, + { gen_helper_sve_ldbdu_zss_mte, + gen_helper_sve_ldhdu_be_zss_mte, + gen_helper_sve_ldsdu_be_zss_mte, + gen_helper_sve_lddd_be_zss_mte, } }, + { { gen_helper_sve_ldbds_zd_mte, + gen_helper_sve_ldhds_be_zd_mte, + gen_helper_sve_ldsds_be_zd_mte, + NULL, }, + { gen_helper_sve_ldbdu_zd_mte, + gen_helper_sve_ldhdu_be_zd_mte, + gen_helper_sve_ldsdu_be_zd_mte, + gen_helper_sve_lddd_be_zd_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu_mte, + gen_helper_sve_ldffhds_be_zsu_mte, + gen_helper_sve_ldffsds_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zsu_mte, + gen_helper_sve_ldffhdu_be_zsu_mte, + gen_helper_sve_ldffsdu_be_zsu_mte, + gen_helper_sve_ldffdd_be_zsu_mte, } }, + { { gen_helper_sve_ldffbds_zss_mte, + gen_helper_sve_ldffhds_be_zss_mte, + gen_helper_sve_ldffsds_be_zss_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zss_mte, + gen_helper_sve_ldffhdu_be_zss_mte, + gen_helper_sve_ldffsdu_be_zss_mte, + gen_helper_sve_ldffdd_be_zss_mte, } }, + { { gen_helper_sve_ldffbds_zd_mte, + gen_helper_sve_ldffhds_be_zd_mte, + gen_helper_sve_ldffsds_be_zd_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zd_mte, + gen_helper_sve_ldffhdu_be_zd_mte, + gen_helper_sve_ldffsdu_be_zd_mte, + gen_helper_sve_ldffdd_be_zd_mte, } } } } }, +}; + +static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[mte][be][a->ff][a->xs][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz]; + break; + } + assert(fn != NULL); + + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), a->msz, false, fn); + return true; +} + +static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) +{ + 
gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz || (a->esz == a->msz && !a->u)) { + return false; + } + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[mte][be][a->ff][0][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[mte][be][a->ff][2][a->u][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. + */ + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + tcg_constant_i64(a->imm << a->msz), a->msz, false, fn); + return true; +} + +static bool trans_LDNT1_zprz(DisasContext *s, arg_LD1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz + !a->u) { + return false; + } + if (!dc_isar_feature(aa64_sve2, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[mte][be][0][0][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[mte][be][0][2][a->u][a->msz]; + break; + } + assert(fn != NULL); + + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + cpu_reg(s, a->rm), a->msz, false, fn); + return true; +} + +/* Indexed by [mte][be][xs][msz]. */ +static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][2][2][3] = { + { /* MTE Inactive */ + { /* Little-endian */ + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_le_zsu, + gen_helper_sve_stss_le_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_le_zss, + gen_helper_sve_stss_le_zss, } }, + { /* Big-endian */ + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_be_zsu, + gen_helper_sve_stss_be_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_be_zss, + gen_helper_sve_stss_be_zss, } } }, + { /* MTE Active */ + { /* Little-endian */ + { gen_helper_sve_stbs_zsu_mte, + gen_helper_sve_sths_le_zsu_mte, + gen_helper_sve_stss_le_zsu_mte, }, + { gen_helper_sve_stbs_zss_mte, + gen_helper_sve_sths_le_zss_mte, + gen_helper_sve_stss_le_zss_mte, } }, + { /* Big-endian */ + { gen_helper_sve_stbs_zsu_mte, + gen_helper_sve_sths_be_zsu_mte, + gen_helper_sve_stss_be_zsu_mte, }, + { gen_helper_sve_stbs_zss_mte, + gen_helper_sve_sths_be_zss_mte, + gen_helper_sve_stss_be_zss_mte, } } }, +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. 
*/ +static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = { + { /* MTE Inactive */ + { /* Little-endian */ + { gen_helper_sve_stbd_zsu, + gen_helper_sve_sthd_le_zsu, + gen_helper_sve_stsd_le_zsu, + gen_helper_sve_stdd_le_zsu, }, + { gen_helper_sve_stbd_zss, + gen_helper_sve_sthd_le_zss, + gen_helper_sve_stsd_le_zss, + gen_helper_sve_stdd_le_zss, }, + { gen_helper_sve_stbd_zd, + gen_helper_sve_sthd_le_zd, + gen_helper_sve_stsd_le_zd, + gen_helper_sve_stdd_le_zd, } }, + { /* Big-endian */ + { gen_helper_sve_stbd_zsu, + gen_helper_sve_sthd_be_zsu, + gen_helper_sve_stsd_be_zsu, + gen_helper_sve_stdd_be_zsu, }, + { gen_helper_sve_stbd_zss, + gen_helper_sve_sthd_be_zss, + gen_helper_sve_stsd_be_zss, + gen_helper_sve_stdd_be_zss, }, + { gen_helper_sve_stbd_zd, + gen_helper_sve_sthd_be_zd, + gen_helper_sve_stsd_be_zd, + gen_helper_sve_stdd_be_zd, } } }, + { /* MTE Inactive */ + { /* Little-endian */ + { gen_helper_sve_stbd_zsu_mte, + gen_helper_sve_sthd_le_zsu_mte, + gen_helper_sve_stsd_le_zsu_mte, + gen_helper_sve_stdd_le_zsu_mte, }, + { gen_helper_sve_stbd_zss_mte, + gen_helper_sve_sthd_le_zss_mte, + gen_helper_sve_stsd_le_zss_mte, + gen_helper_sve_stdd_le_zss_mte, }, + { gen_helper_sve_stbd_zd_mte, + gen_helper_sve_sthd_le_zd_mte, + gen_helper_sve_stsd_le_zd_mte, + gen_helper_sve_stdd_le_zd_mte, } }, + { /* Big-endian */ + { gen_helper_sve_stbd_zsu_mte, + gen_helper_sve_sthd_be_zsu_mte, + gen_helper_sve_stsd_be_zsu_mte, + gen_helper_sve_stdd_be_zsu_mte, }, + { gen_helper_sve_stbd_zss_mte, + gen_helper_sve_sthd_be_zss_mte, + gen_helper_sve_stsd_be_zss_mte, + gen_helper_sve_stdd_be_zss_mte, }, + { gen_helper_sve_stbd_zd_mte, + gen_helper_sve_sthd_be_zd_mte, + gen_helper_sve_stsd_be_zd_mte, + gen_helper_sve_stdd_be_zd_mte, } } }, +}; + +static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz || (a->msz == 0 && a->scale)) { + return false; + } + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[mte][be][a->xs][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[mte][be][a->xs][a->msz]; + break; + default: + g_assert_not_reached(); + } + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), a->msz, true, fn); + return true; +} + +static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz) { + return false; + } + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[mte][be][0][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[mte][be][2][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. 
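+     * The vector register zn is passed in place of the offset vector and
+     * the scaled immediate becomes the scalar base, with a scale of zero.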
+ */ + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + tcg_constant_i64(a->imm << a->msz), a->msz, true, fn); + return true; +} + +static bool trans_STNT1_zprz(DisasContext *s, arg_ST1_zprz *a) +{ + gen_helper_gvec_mem_scatter *fn; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; + + if (a->esz < a->msz) { + return false; + } + if (!dc_isar_feature(aa64_sve2, s)) { + return false; + } + s->is_nonstreaming = true; + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[mte][be][0][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[mte][be][2][a->msz]; + break; + default: + g_assert_not_reached(); + } + + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, + cpu_reg(s, a->rm), a->msz, true, fn); + return true; +} + +/* + * Prefetches + */ + +static bool trans_PRF(DisasContext *s, arg_PRF *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + (void)sve_access_check(s); + return true; +} + +static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a) +{ + if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + (void)sve_access_check(s); + return true; +} + +static bool trans_PRF_ns(DisasContext *s, arg_PRF_ns *a) +{ + if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + /* Prefetch is a nop within QEMU. */ + s->is_nonstreaming = true; + (void)sve_access_check(s); + return true; +} + +/* + * Move Prefix + * + * TODO: The implementation so far could handle predicated merging movprfx. + * The helper functions as written take an extra source register to + * use in the operation, but the result is only written when predication + * succeeds. For unpredicated movprfx, we need to rearrange the helpers + * to allow the final write back to the destination to be unconditional. + * For predicated zeroing movprfx, we need to rearrange the helpers to + * allow the final write back to zero inactives. + * + * In the meantime, just emit the moves. 
+ */ + +TRANS_FEAT(MOVPRFX, aa64_sve, do_mov_z, a->rd, a->rn) +TRANS_FEAT(MOVPRFX_m, aa64_sve, do_sel_z, a->rd, a->rn, a->rd, a->pg, a->esz) +TRANS_FEAT(MOVPRFX_z, aa64_sve, do_movz_zpz, a->rd, a->rn, a->pg, a->esz, false) + +/* + * SVE2 Integer Multiply - Unpredicated + */ + +TRANS_FEAT(MUL_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, tcg_gen_gvec_mul, a) + +static gen_helper_gvec_3 * const smulh_zzz_fns[4] = { + gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h, + gen_helper_gvec_smulh_s, gen_helper_gvec_smulh_d, +}; +TRANS_FEAT(SMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + smulh_zzz_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const umulh_zzz_fns[4] = { + gen_helper_gvec_umulh_b, gen_helper_gvec_umulh_h, + gen_helper_gvec_umulh_s, gen_helper_gvec_umulh_d, +}; +TRANS_FEAT(UMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + umulh_zzz_fns[a->esz], a, 0) + +TRANS_FEAT(PMUL_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + gen_helper_gvec_pmul_b, a, 0) + +static gen_helper_gvec_3 * const sqdmulh_zzz_fns[4] = { + gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h, + gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d, +}; +TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqdmulh_zzz_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sqrdmulh_zzz_fns[4] = { + gen_helper_sve2_sqrdmulh_b, gen_helper_sve2_sqrdmulh_h, + gen_helper_sve2_sqrdmulh_s, gen_helper_sve2_sqrdmulh_d, +}; +TRANS_FEAT(SQRDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqrdmulh_zzz_fns[a->esz], a, 0) + +/* + * SVE2 Integer - Predicated + */ + +static gen_helper_gvec_4 * const sadlp_fns[4] = { + NULL, gen_helper_sve2_sadalp_zpzz_h, + gen_helper_sve2_sadalp_zpzz_s, gen_helper_sve2_sadalp_zpzz_d, +}; +TRANS_FEAT(SADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz, + sadlp_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const uadlp_fns[4] = { + NULL, gen_helper_sve2_uadalp_zpzz_h, + gen_helper_sve2_uadalp_zpzz_s, gen_helper_sve2_uadalp_zpzz_d, +}; +TRANS_FEAT(UADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz, + uadlp_fns[a->esz], a, 0) + +/* + * SVE2 integer unary operations (predicated) + */ + +TRANS_FEAT(URECPE, aa64_sve2, gen_gvec_ool_arg_zpz, + a->esz == 2 ? gen_helper_sve2_urecpe_s : NULL, a, 0) + +TRANS_FEAT(URSQRTE, aa64_sve2, gen_gvec_ool_arg_zpz, + a->esz == 2 ? 
gen_helper_sve2_ursqrte_s : NULL, a, 0) + +static gen_helper_gvec_3 * const sqabs_fns[4] = { + gen_helper_sve2_sqabs_b, gen_helper_sve2_sqabs_h, + gen_helper_sve2_sqabs_s, gen_helper_sve2_sqabs_d, +}; +TRANS_FEAT(SQABS, aa64_sve2, gen_gvec_ool_arg_zpz, sqabs_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const sqneg_fns[4] = { + gen_helper_sve2_sqneg_b, gen_helper_sve2_sqneg_h, + gen_helper_sve2_sqneg_s, gen_helper_sve2_sqneg_d, +}; +TRANS_FEAT(SQNEG, aa64_sve2, gen_gvec_ool_arg_zpz, sqneg_fns[a->esz], a, 0) + +DO_ZPZZ(SQSHL, aa64_sve2, sve2_sqshl) +DO_ZPZZ(SQRSHL, aa64_sve2, sve2_sqrshl) +DO_ZPZZ(SRSHL, aa64_sve2, sve2_srshl) + +DO_ZPZZ(UQSHL, aa64_sve2, sve2_uqshl) +DO_ZPZZ(UQRSHL, aa64_sve2, sve2_uqrshl) +DO_ZPZZ(URSHL, aa64_sve2, sve2_urshl) + +DO_ZPZZ(SHADD, aa64_sve2, sve2_shadd) +DO_ZPZZ(SRHADD, aa64_sve2, sve2_srhadd) +DO_ZPZZ(SHSUB, aa64_sve2, sve2_shsub) + +DO_ZPZZ(UHADD, aa64_sve2, sve2_uhadd) +DO_ZPZZ(URHADD, aa64_sve2, sve2_urhadd) +DO_ZPZZ(UHSUB, aa64_sve2, sve2_uhsub) + +DO_ZPZZ(ADDP, aa64_sve2, sve2_addp) +DO_ZPZZ(SMAXP, aa64_sve2, sve2_smaxp) +DO_ZPZZ(UMAXP, aa64_sve2, sve2_umaxp) +DO_ZPZZ(SMINP, aa64_sve2, sve2_sminp) +DO_ZPZZ(UMINP, aa64_sve2, sve2_uminp) + +DO_ZPZZ(SQADD_zpzz, aa64_sve2, sve2_sqadd) +DO_ZPZZ(UQADD_zpzz, aa64_sve2, sve2_uqadd) +DO_ZPZZ(SQSUB_zpzz, aa64_sve2, sve2_sqsub) +DO_ZPZZ(UQSUB_zpzz, aa64_sve2, sve2_uqsub) +DO_ZPZZ(SUQADD, aa64_sve2, sve2_suqadd) +DO_ZPZZ(USQADD, aa64_sve2, sve2_usqadd) + +/* + * SVE2 Widening Integer Arithmetic + */ + +static gen_helper_gvec_3 * const saddl_fns[4] = { + NULL, gen_helper_sve2_saddl_h, + gen_helper_sve2_saddl_s, gen_helper_sve2_saddl_d, +}; +TRANS_FEAT(SADDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + saddl_fns[a->esz], a, 0) +TRANS_FEAT(SADDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + saddl_fns[a->esz], a, 3) +TRANS_FEAT(SADDLBT, aa64_sve2, gen_gvec_ool_arg_zzz, + saddl_fns[a->esz], a, 2) + +static gen_helper_gvec_3 * const ssubl_fns[4] = { + NULL, gen_helper_sve2_ssubl_h, + gen_helper_sve2_ssubl_s, gen_helper_sve2_ssubl_d, +}; +TRANS_FEAT(SSUBLB, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 0) +TRANS_FEAT(SSUBLT, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 3) +TRANS_FEAT(SSUBLBT, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 2) +TRANS_FEAT(SSUBLTB, aa64_sve2, gen_gvec_ool_arg_zzz, + ssubl_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const sabdl_fns[4] = { + NULL, gen_helper_sve2_sabdl_h, + gen_helper_sve2_sabdl_s, gen_helper_sve2_sabdl_d, +}; +TRANS_FEAT(SABDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + sabdl_fns[a->esz], a, 0) +TRANS_FEAT(SABDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + sabdl_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const uaddl_fns[4] = { + NULL, gen_helper_sve2_uaddl_h, + gen_helper_sve2_uaddl_s, gen_helper_sve2_uaddl_d, +}; +TRANS_FEAT(UADDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + uaddl_fns[a->esz], a, 0) +TRANS_FEAT(UADDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + uaddl_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const usubl_fns[4] = { + NULL, gen_helper_sve2_usubl_h, + gen_helper_sve2_usubl_s, gen_helper_sve2_usubl_d, +}; +TRANS_FEAT(USUBLB, aa64_sve2, gen_gvec_ool_arg_zzz, + usubl_fns[a->esz], a, 0) +TRANS_FEAT(USUBLT, aa64_sve2, gen_gvec_ool_arg_zzz, + usubl_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const uabdl_fns[4] = { + NULL, gen_helper_sve2_uabdl_h, + gen_helper_sve2_uabdl_s, gen_helper_sve2_uabdl_d, +}; +TRANS_FEAT(UABDLB, aa64_sve2, gen_gvec_ool_arg_zzz, + uabdl_fns[a->esz], a, 0) +TRANS_FEAT(UABDLT, aa64_sve2, gen_gvec_ool_arg_zzz, + uabdl_fns[a->esz], a, 
3) + +static gen_helper_gvec_3 * const sqdmull_fns[4] = { + NULL, gen_helper_sve2_sqdmull_zzz_h, + gen_helper_sve2_sqdmull_zzz_s, gen_helper_sve2_sqdmull_zzz_d, +}; +TRANS_FEAT(SQDMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqdmull_fns[a->esz], a, 0) +TRANS_FEAT(SQDMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + sqdmull_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const smull_fns[4] = { + NULL, gen_helper_sve2_smull_zzz_h, + gen_helper_sve2_smull_zzz_s, gen_helper_sve2_smull_zzz_d, +}; +TRANS_FEAT(SMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + smull_fns[a->esz], a, 0) +TRANS_FEAT(SMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + smull_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const umull_fns[4] = { + NULL, gen_helper_sve2_umull_zzz_h, + gen_helper_sve2_umull_zzz_s, gen_helper_sve2_umull_zzz_d, +}; +TRANS_FEAT(UMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + umull_fns[a->esz], a, 0) +TRANS_FEAT(UMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, + umull_fns[a->esz], a, 3) + +static gen_helper_gvec_3 * const eoril_fns[4] = { + gen_helper_sve2_eoril_b, gen_helper_sve2_eoril_h, + gen_helper_sve2_eoril_s, gen_helper_sve2_eoril_d, +}; +TRANS_FEAT(EORBT, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 2) +TRANS_FEAT(EORTB, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 1) + +static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_gvec_pmull_q, gen_helper_sve2_pmull_h, + NULL, gen_helper_sve2_pmull_d, + }; + + if (a->esz == 0) { + if (!dc_isar_feature(aa64_sve2_pmull128, s)) { + return false; + } + s->is_nonstreaming = true; + } else if (!dc_isar_feature(aa64_sve, s)) { + return false; + } + return gen_gvec_ool_arg_zzz(s, fns[a->esz], a, sel); +} + +TRANS_FEAT(PMULLB, aa64_sve2, do_trans_pmull, a, false) +TRANS_FEAT(PMULLT, aa64_sve2, do_trans_pmull, a, true) + +static gen_helper_gvec_3 * const saddw_fns[4] = { + NULL, gen_helper_sve2_saddw_h, + gen_helper_sve2_saddw_s, gen_helper_sve2_saddw_d, +}; +TRANS_FEAT(SADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 0) +TRANS_FEAT(SADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const ssubw_fns[4] = { + NULL, gen_helper_sve2_ssubw_h, + gen_helper_sve2_ssubw_s, gen_helper_sve2_ssubw_d, +}; +TRANS_FEAT(SSUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 0) +TRANS_FEAT(SSUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const uaddw_fns[4] = { + NULL, gen_helper_sve2_uaddw_h, + gen_helper_sve2_uaddw_s, gen_helper_sve2_uaddw_d, +}; +TRANS_FEAT(UADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 0) +TRANS_FEAT(UADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const usubw_fns[4] = { + NULL, gen_helper_sve2_usubw_h, + gen_helper_sve2_usubw_s, gen_helper_sve2_usubw_d, +}; +TRANS_FEAT(USUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 0) +TRANS_FEAT(USUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 1) + +static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm) +{ + int top = imm & 1; + int shl = imm >> 1; + int halfbits = 4 << vece; + + if (top) { + if (shl == halfbits) { + TCGv_vec t = tcg_temp_new_vec_matching(d); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); + } else { + tcg_gen_sari_vec(vece, d, n, halfbits); + tcg_gen_shli_vec(vece, d, d, shl); + } + } else { + 
tcg_gen_shli_vec(vece, d, n, halfbits); + tcg_gen_sari_vec(vece, d, d, halfbits - shl); + } +} + +static void gen_ushll_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int imm) +{ + int halfbits = 4 << vece; + int top = imm & 1; + int shl = (imm >> 1); + int shift; + uint64_t mask; + + mask = MAKE_64BIT_MASK(0, halfbits); + mask <<= shl; + mask = dup_const(vece, mask); + + shift = shl - top * halfbits; + if (shift < 0) { + tcg_gen_shri_i64(d, n, -shift); + } else { + tcg_gen_shli_i64(d, n, shift); + } + tcg_gen_andi_i64(d, d, mask); +} + +static void gen_ushll16_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) +{ + gen_ushll_i64(MO_16, d, n, imm); +} + +static void gen_ushll32_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) +{ + gen_ushll_i64(MO_32, d, n, imm); +} + +static void gen_ushll64_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) +{ + gen_ushll_i64(MO_64, d, n, imm); +} + +static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm) +{ + int halfbits = 4 << vece; + int top = imm & 1; + int shl = imm >> 1; + + if (top) { + if (shl == halfbits) { + TCGv_vec t = tcg_temp_new_vec_matching(d); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); + } else { + tcg_gen_shri_vec(vece, d, n, halfbits); + tcg_gen_shli_vec(vece, d, d, shl); + } + } else { + if (shl == 0) { + TCGv_vec t = tcg_temp_new_vec_matching(d); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); + } else { + tcg_gen_shli_vec(vece, d, n, halfbits); + tcg_gen_shri_vec(vece, d, d, halfbits - shl); + } + } +} + +static bool do_shll_tb(DisasContext *s, arg_rri_esz *a, + const GVecGen2i ops[3], bool sel) +{ + + if (a->esz < 0 || a->esz > 2) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, (a->imm << 1) | sel, + &ops[a->esz]); + } + return true; +} + +static const TCGOpcode sshll_list[] = { + INDEX_op_shli_vec, INDEX_op_sari_vec, 0 +}; +static const GVecGen2i sshll_ops[3] = { + { .fniv = gen_sshll_vec, + .opt_opc = sshll_list, + .fno = gen_helper_sve2_sshll_h, + .vece = MO_16 }, + { .fniv = gen_sshll_vec, + .opt_opc = sshll_list, + .fno = gen_helper_sve2_sshll_s, + .vece = MO_32 }, + { .fniv = gen_sshll_vec, + .opt_opc = sshll_list, + .fno = gen_helper_sve2_sshll_d, + .vece = MO_64 } +}; +TRANS_FEAT(SSHLLB, aa64_sve2, do_shll_tb, a, sshll_ops, false) +TRANS_FEAT(SSHLLT, aa64_sve2, do_shll_tb, a, sshll_ops, true) + +static const TCGOpcode ushll_list[] = { + INDEX_op_shli_vec, INDEX_op_shri_vec, 0 +}; +static const GVecGen2i ushll_ops[3] = { + { .fni8 = gen_ushll16_i64, + .fniv = gen_ushll_vec, + .opt_opc = ushll_list, + .fno = gen_helper_sve2_ushll_h, + .vece = MO_16 }, + { .fni8 = gen_ushll32_i64, + .fniv = gen_ushll_vec, + .opt_opc = ushll_list, + .fno = gen_helper_sve2_ushll_s, + .vece = MO_32 }, + { .fni8 = gen_ushll64_i64, + .fniv = gen_ushll_vec, + .opt_opc = ushll_list, + .fno = gen_helper_sve2_ushll_d, + .vece = MO_64 }, +}; +TRANS_FEAT(USHLLB, aa64_sve2, do_shll_tb, a, ushll_ops, false) +TRANS_FEAT(USHLLT, aa64_sve2, do_shll_tb, a, ushll_ops, true) + +static gen_helper_gvec_3 * const bext_fns[4] = { + gen_helper_sve2_bext_b, gen_helper_sve2_bext_h, + gen_helper_sve2_bext_s, gen_helper_sve2_bext_d, +}; +TRANS_FEAT_NONSTREAMING(BEXT, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bext_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const bdep_fns[4] = { + 
gen_helper_sve2_bdep_b, gen_helper_sve2_bdep_h, + gen_helper_sve2_bdep_s, gen_helper_sve2_bdep_d, +}; +TRANS_FEAT_NONSTREAMING(BDEP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bdep_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const bgrp_fns[4] = { + gen_helper_sve2_bgrp_b, gen_helper_sve2_bgrp_h, + gen_helper_sve2_bgrp_s, gen_helper_sve2_bgrp_d, +}; +TRANS_FEAT_NONSTREAMING(BGRP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, + bgrp_fns[a->esz], a, 0) + +static gen_helper_gvec_3 * const cadd_fns[4] = { + gen_helper_sve2_cadd_b, gen_helper_sve2_cadd_h, + gen_helper_sve2_cadd_s, gen_helper_sve2_cadd_d, +}; +TRANS_FEAT(CADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz, + cadd_fns[a->esz], a, 0) +TRANS_FEAT(CADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz, + cadd_fns[a->esz], a, 1) + +static gen_helper_gvec_3 * const sqcadd_fns[4] = { + gen_helper_sve2_sqcadd_b, gen_helper_sve2_sqcadd_h, + gen_helper_sve2_sqcadd_s, gen_helper_sve2_sqcadd_d, +}; +TRANS_FEAT(SQCADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz, + sqcadd_fns[a->esz], a, 0) +TRANS_FEAT(SQCADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz, + sqcadd_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const sabal_fns[4] = { + NULL, gen_helper_sve2_sabal_h, + gen_helper_sve2_sabal_s, gen_helper_sve2_sabal_d, +}; +TRANS_FEAT(SABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 0) +TRANS_FEAT(SABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const uabal_fns[4] = { + NULL, gen_helper_sve2_uabal_h, + gen_helper_sve2_uabal_s, gen_helper_sve2_uabal_d, +}; +TRANS_FEAT(UABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 0) +TRANS_FEAT(UABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 1) + +static bool do_adcl(DisasContext *s, arg_rrrr_esz *a, bool sel) +{ + static gen_helper_gvec_4 * const fns[2] = { + gen_helper_sve2_adcl_s, + gen_helper_sve2_adcl_d, + }; + /* + * Note that in this case the ESZ field encodes both size and sign. + * Split out 'subtract' into bit 1 of the data field for the helper. 
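+     * Bit 0 of esz selects the 32-bit or 64-bit helper; bit 1 is the
+     * subtract flag (the SBCLB/SBCLT forms) and is passed through as data.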
+ */ + return gen_gvec_ool_arg_zzzz(s, fns[a->esz & 1], a, (a->esz & 2) | sel); +} + +TRANS_FEAT(ADCLB, aa64_sve2, do_adcl, a, false) +TRANS_FEAT(ADCLT, aa64_sve2, do_adcl, a, true) + +TRANS_FEAT(SSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ssra, a) +TRANS_FEAT(USRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_usra, a) +TRANS_FEAT(SRSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_srsra, a) +TRANS_FEAT(URSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ursra, a) +TRANS_FEAT(SRI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sri, a) +TRANS_FEAT(SLI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sli, a) + +TRANS_FEAT(SABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_saba, a) +TRANS_FEAT(UABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_uaba, a) + +static bool do_narrow_extract(DisasContext *s, arg_rri_esz *a, + const GVecGen2 ops[3]) +{ + if (a->esz < 0 || a->esz > MO_32 || a->imm != 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, &ops[a->esz]); + } + return true; +} + +static const TCGOpcode sqxtn_list[] = { + INDEX_op_shli_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 +}; + +static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t mask = (1ull << halfbits) - 1; + int64_t min = -1ull << (halfbits - 1); + int64_t max = -min - 1; + + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, d, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, d, d, t); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_and_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtnb_ops[3] = { + { .fniv = gen_sqxtnb_vec, + .opt_opc = sqxtn_list, + .fno = gen_helper_sve2_sqxtnb_h, + .vece = MO_16 }, + { .fniv = gen_sqxtnb_vec, + .opt_opc = sqxtn_list, + .fno = gen_helper_sve2_sqxtnb_s, + .vece = MO_32 }, + { .fniv = gen_sqxtnb_vec, + .opt_opc = sqxtn_list, + .fno = gen_helper_sve2_sqxtnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops) + +static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t mask = (1ull << halfbits) - 1; + int64_t min = -1ull << (halfbits - 1); + int64_t max = -min - 1; + + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtnt_ops[3] = { + { .fniv = gen_sqxtnt_vec, + .opt_opc = sqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtnt_h, + .vece = MO_16 }, + { .fniv = gen_sqxtnt_vec, + .opt_opc = sqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtnt_s, + .vece = MO_32 }, + { .fniv = gen_sqxtnt_vec, + .opt_opc = sqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTNT, aa64_sve2, do_narrow_extract, a, sqxtnt_ops) + +static const TCGOpcode uqxtn_list[] = { + INDEX_op_shli_vec, INDEX_op_umin_vec, 0 +}; + +static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const GVecGen2 
uqxtnb_ops[3] = { + { .fniv = gen_uqxtnb_vec, + .opt_opc = uqxtn_list, + .fno = gen_helper_sve2_uqxtnb_h, + .vece = MO_16 }, + { .fniv = gen_uqxtnb_vec, + .opt_opc = uqxtn_list, + .fno = gen_helper_sve2_uqxtnb_s, + .vece = MO_32 }, + { .fniv = gen_uqxtnb_vec, + .opt_opc = uqxtn_list, + .fno = gen_helper_sve2_uqxtnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops) + +static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const GVecGen2 uqxtnt_ops[3] = { + { .fniv = gen_uqxtnt_vec, + .opt_opc = uqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_uqxtnt_h, + .vece = MO_16 }, + { .fniv = gen_uqxtnt_vec, + .opt_opc = uqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_uqxtnt_s, + .vece = MO_32 }, + { .fniv = gen_uqxtnt_vec, + .opt_opc = uqxtn_list, + .load_dest = true, + .fno = gen_helper_sve2_uqxtnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQXTNT, aa64_sve2, do_narrow_extract, a, uqxtnt_ops) + +static const TCGOpcode sqxtun_list[] = { + INDEX_op_shli_vec, INDEX_op_umin_vec, INDEX_op_smax_vec, 0 +}; + +static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, d, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtunb_ops[3] = { + { .fniv = gen_sqxtunb_vec, + .opt_opc = sqxtun_list, + .fno = gen_helper_sve2_sqxtunb_h, + .vece = MO_16 }, + { .fniv = gen_sqxtunb_vec, + .opt_opc = sqxtun_list, + .fno = gen_helper_sve2_sqxtunb_s, + .vece = MO_32 }, + { .fniv = gen_sqxtunb_vec, + .opt_opc = sqxtun_list, + .fno = gen_helper_sve2_sqxtunb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops) + +static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = (1ull << halfbits) - 1; + + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const GVecGen2 sqxtunt_ops[3] = { + { .fniv = gen_sqxtunt_vec, + .opt_opc = sqxtun_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtunt_h, + .vece = MO_16 }, + { .fniv = gen_sqxtunt_vec, + .opt_opc = sqxtun_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtunt_s, + .vece = MO_32 }, + { .fniv = gen_sqxtunt_vec, + .opt_opc = sqxtun_list, + .load_dest = true, + .fno = gen_helper_sve2_sqxtunt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQXTUNT, aa64_sve2, do_narrow_extract, a, sqxtunt_ops) + +static bool do_shr_narrow(DisasContext *s, arg_rri_esz *a, + const GVecGen2i ops[3]) +{ + if (a->esz < 0 || a->esz > MO_32) { + return false; + } + assert(a->imm > 0 && a->imm <= (8 << a->esz)); + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, a->imm, &ops[a->esz]); + } + return true; +} + +static void 
gen_shrnb_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr) +{ + int halfbits = 4 << vece; + uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits)); + + tcg_gen_shri_i64(d, n, shr); + tcg_gen_andi_i64(d, d, mask); +} + +static void gen_shrnb16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnb_i64(MO_16, d, n, shr); +} + +static void gen_shrnb32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnb_i64(MO_32, d, n, shr); +} + +static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnb_i64(MO_64, d, n, shr); +} + +static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + uint64_t mask = MAKE_64BIT_MASK(0, halfbits); + + tcg_gen_shri_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 }; +static const GVecGen2i shrnb_ops[3] = { + { .fni8 = gen_shrnb16_i64, + .fniv = gen_shrnb_vec, + .opt_opc = shrnb_vec_list, + .fno = gen_helper_sve2_shrnb_h, + .vece = MO_16 }, + { .fni8 = gen_shrnb32_i64, + .fniv = gen_shrnb_vec, + .opt_opc = shrnb_vec_list, + .fno = gen_helper_sve2_shrnb_s, + .vece = MO_32 }, + { .fni8 = gen_shrnb64_i64, + .fniv = gen_shrnb_vec, + .opt_opc = shrnb_vec_list, + .fno = gen_helper_sve2_shrnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SHRNB, aa64_sve2, do_shr_narrow, a, shrnb_ops) + +static void gen_shrnt_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr) +{ + int halfbits = 4 << vece; + uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits)); + + tcg_gen_shli_i64(n, n, halfbits - shr); + tcg_gen_andi_i64(n, n, ~mask); + tcg_gen_andi_i64(d, d, mask); + tcg_gen_or_i64(d, d, n); +} + +static void gen_shrnt16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnt_i64(MO_16, d, n, shr); +} + +static void gen_shrnt32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + gen_shrnt_i64(MO_32, d, n, shr); +} + +static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) +{ + tcg_gen_shri_i64(n, n, shr); + tcg_gen_deposit_i64(d, d, n, 32, 32); +} + +static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + uint64_t mask = MAKE_64BIT_MASK(0, halfbits); + + tcg_gen_shli_vec(vece, n, n, halfbits - shr); + tcg_gen_dupi_vec(vece, t, mask); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 }; +static const GVecGen2i shrnt_ops[3] = { + { .fni8 = gen_shrnt16_i64, + .fniv = gen_shrnt_vec, + .opt_opc = shrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_shrnt_h, + .vece = MO_16 }, + { .fni8 = gen_shrnt32_i64, + .fniv = gen_shrnt_vec, + .opt_opc = shrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_shrnt_s, + .vece = MO_32 }, + { .fni8 = gen_shrnt64_i64, + .fniv = gen_shrnt_vec, + .opt_opc = shrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_shrnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SHRNT, aa64_sve2, do_shr_narrow, a, shrnt_ops) + +static const GVecGen2i rshrnb_ops[3] = { + { .fno = gen_helper_sve2_rshrnb_h }, + { .fno = gen_helper_sve2_rshrnb_s }, + { .fno = gen_helper_sve2_rshrnb_d }, +}; +TRANS_FEAT(RSHRNB, aa64_sve2, do_shr_narrow, a, rshrnb_ops) + +static const GVecGen2i rshrnt_ops[3] = { + { .fno = gen_helper_sve2_rshrnt_h }, + { .fno = gen_helper_sve2_rshrnt_s }, + { .fno = gen_helper_sve2_rshrnt_d }, +}; 
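/*
 * A minimal scalar sketch of the per-container semantics behind the
 * narrowing expansions above; the helper names are hypothetical and the
 * snippet is illustrative only, assuming halfbits is 8, 16 or 32. Each
 * wide container is reduced to its low halfbits bits, by saturation for
 * the SQXTN, UQXTN and SQXTUN forms or by shifting for the SHRN forms;
 * the "B" variants zero the high half of the destination container,
 * while the "T" variants place the result in the high half and keep the
 * low half of the existing destination.
 */
#include <stdint.h>

static inline int64_t ssat_to_half(int64_t n, int halfbits)
{
    int64_t max = (1ll << (halfbits - 1)) - 1;      /* e.g. 127 for halfbits == 8 */
    int64_t min = -max - 1;
    return n < min ? min : n > max ? max : n;       /* signed saturate to halfbits */
}

static inline uint64_t narrow_bot(uint64_t res, int halfbits)
{
    return res & ((1ull << halfbits) - 1);          /* low half set, high half zero */
}

static inline uint64_t narrow_top(uint64_t d, uint64_t res, int halfbits)
{
    uint64_t mask = (1ull << halfbits) - 1;
    return (d & mask) | ((res & mask) << halfbits); /* merge above the kept low half */
}
/*
 * The rounding variants (RSHRNB, RSHRNT and the saturating rounding
 * narrows) also add 1 << (shr - 1) before shifting; in this patch they
 * are handled purely by out-of-line helpers (rshrnb_ops above and the
 * SQRSHR/UQRSHR tables below) with no inline vector expansion.
 */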
+TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops) + +static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrunb_vec_list[] = { + INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i sqshrunb_ops[3] = { + { .fniv = gen_sqshrunb_vec, + .opt_opc = sqshrunb_vec_list, + .fno = gen_helper_sve2_sqshrunb_h, + .vece = MO_16 }, + { .fniv = gen_sqshrunb_vec, + .opt_opc = sqshrunb_vec_list, + .fno = gen_helper_sve2_sqshrunb_s, + .vece = MO_32 }, + { .fniv = gen_sqshrunb_vec, + .opt_opc = sqshrunb_vec_list, + .fno = gen_helper_sve2_sqshrunb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops) + +static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, 0); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrunt_vec_list[] = { + INDEX_op_shli_vec, INDEX_op_sari_vec, + INDEX_op_smax_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i sqshrunt_ops[3] = { + { .fniv = gen_sqshrunt_vec, + .opt_opc = sqshrunt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrunt_h, + .vece = MO_16 }, + { .fniv = gen_sqshrunt_vec, + .opt_opc = sqshrunt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrunt_s, + .vece = MO_32 }, + { .fniv = gen_sqshrunt_vec, + .opt_opc = sqshrunt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrunt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRUNT, aa64_sve2, do_shr_narrow, a, sqshrunt_ops) + +static const GVecGen2i sqrshrunb_ops[3] = { + { .fno = gen_helper_sve2_sqrshrunb_h }, + { .fno = gen_helper_sve2_sqrshrunb_s }, + { .fno = gen_helper_sve2_sqrshrunb_d }, +}; +TRANS_FEAT(SQRSHRUNB, aa64_sve2, do_shr_narrow, a, sqrshrunb_ops) + +static const GVecGen2i sqrshrunt_ops[3] = { + { .fno = gen_helper_sve2_sqrshrunt_h }, + { .fno = gen_helper_sve2_sqrshrunt_s }, + { .fno = gen_helper_sve2_sqrshrunt_d }, +}; +TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops) + +static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = MAKE_64BIT_MASK(0, halfbits - 1); + int64_t min = -max - 1; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_and_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrnb_vec_list[] = { + INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_smin_vec, 0 +}; +static const GVecGen2i sqshrnb_ops[3] = { + { .fniv = gen_sqshrnb_vec, + .opt_opc = sqshrnb_vec_list, + .fno = gen_helper_sve2_sqshrnb_h, + .vece = MO_16 }, + { .fniv = gen_sqshrnb_vec, + .opt_opc = sqshrnb_vec_list, + .fno = 
gen_helper_sve2_sqshrnb_s, + .vece = MO_32 }, + { .fniv = gen_sqshrnb_vec, + .opt_opc = sqshrnb_vec_list, + .fno = gen_helper_sve2_sqshrnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops) + +static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + int64_t max = MAKE_64BIT_MASK(0, halfbits - 1); + int64_t min = -max - 1; + + tcg_gen_sari_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, min); + tcg_gen_smax_vec(vece, n, n, t); + tcg_gen_dupi_vec(vece, t, max); + tcg_gen_smin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode sqshrnt_vec_list[] = { + INDEX_op_shli_vec, INDEX_op_sari_vec, + INDEX_op_smax_vec, INDEX_op_smin_vec, 0 +}; +static const GVecGen2i sqshrnt_ops[3] = { + { .fniv = gen_sqshrnt_vec, + .opt_opc = sqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrnt_h, + .vece = MO_16 }, + { .fniv = gen_sqshrnt_vec, + .opt_opc = sqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrnt_s, + .vece = MO_32 }, + { .fniv = gen_sqshrnt_vec, + .opt_opc = sqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_sqshrnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(SQSHRNT, aa64_sve2, do_shr_narrow, a, sqshrnt_ops) + +static const GVecGen2i sqrshrnb_ops[3] = { + { .fno = gen_helper_sve2_sqrshrnb_h }, + { .fno = gen_helper_sve2_sqrshrnb_s }, + { .fno = gen_helper_sve2_sqrshrnb_d }, +}; +TRANS_FEAT(SQRSHRNB, aa64_sve2, do_shr_narrow, a, sqrshrnb_ops) + +static const GVecGen2i sqrshrnt_ops[3] = { + { .fno = gen_helper_sve2_sqrshrnt_h }, + { .fno = gen_helper_sve2_sqrshrnt_s }, + { .fno = gen_helper_sve2_sqrshrnt_d }, +}; +TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops) + +static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_shri_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, d, n, t); + tcg_temp_free_vec(t); +} + +static const TCGOpcode uqshrnb_vec_list[] = { + INDEX_op_shri_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i uqshrnb_ops[3] = { + { .fniv = gen_uqshrnb_vec, + .opt_opc = uqshrnb_vec_list, + .fno = gen_helper_sve2_uqshrnb_h, + .vece = MO_16 }, + { .fniv = gen_uqshrnb_vec, + .opt_opc = uqshrnb_vec_list, + .fno = gen_helper_sve2_uqshrnb_s, + .vece = MO_32 }, + { .fniv = gen_uqshrnb_vec, + .opt_opc = uqshrnb_vec_list, + .fno = gen_helper_sve2_uqshrnb_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops) + +static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d, + TCGv_vec n, int64_t shr) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + int halfbits = 4 << vece; + + tcg_gen_shri_vec(vece, n, n, shr); + tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); + tcg_gen_umin_vec(vece, n, n, t); + tcg_gen_shli_vec(vece, n, n, halfbits); + tcg_gen_bitsel_vec(vece, d, t, d, n); + tcg_temp_free_vec(t); +} + +static const TCGOpcode uqshrnt_vec_list[] = { + INDEX_op_shli_vec, INDEX_op_shri_vec, INDEX_op_umin_vec, 0 +}; +static const GVecGen2i uqshrnt_ops[3] = { + { .fniv = gen_uqshrnt_vec, + .opt_opc = uqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_uqshrnt_h, + .vece = MO_16 }, + { .fniv = gen_uqshrnt_vec, + .opt_opc = 
uqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_uqshrnt_s, + .vece = MO_32 }, + { .fniv = gen_uqshrnt_vec, + .opt_opc = uqshrnt_vec_list, + .load_dest = true, + .fno = gen_helper_sve2_uqshrnt_d, + .vece = MO_64 }, +}; +TRANS_FEAT(UQSHRNT, aa64_sve2, do_shr_narrow, a, uqshrnt_ops) + +static const GVecGen2i uqrshrnb_ops[3] = { + { .fno = gen_helper_sve2_uqrshrnb_h }, + { .fno = gen_helper_sve2_uqrshrnb_s }, + { .fno = gen_helper_sve2_uqrshrnb_d }, +}; +TRANS_FEAT(UQRSHRNB, aa64_sve2, do_shr_narrow, a, uqrshrnb_ops) + +static const GVecGen2i uqrshrnt_ops[3] = { + { .fno = gen_helper_sve2_uqrshrnt_h }, + { .fno = gen_helper_sve2_uqrshrnt_s }, + { .fno = gen_helper_sve2_uqrshrnt_d }, +}; +TRANS_FEAT(UQRSHRNT, aa64_sve2, do_shr_narrow, a, uqrshrnt_ops) + +#define DO_SVE2_ZZZ_NARROW(NAME, name) \ + static gen_helper_gvec_3 * const name##_fns[4] = { \ + NULL, gen_helper_sve2_##name##_h, \ + gen_helper_sve2_##name##_s, gen_helper_sve2_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzz, \ + name##_fns[a->esz], a, 0) + +DO_SVE2_ZZZ_NARROW(ADDHNB, addhnb) +DO_SVE2_ZZZ_NARROW(ADDHNT, addhnt) +DO_SVE2_ZZZ_NARROW(RADDHNB, raddhnb) +DO_SVE2_ZZZ_NARROW(RADDHNT, raddhnt) + +DO_SVE2_ZZZ_NARROW(SUBHNB, subhnb) +DO_SVE2_ZZZ_NARROW(SUBHNT, subhnt) +DO_SVE2_ZZZ_NARROW(RSUBHNB, rsubhnb) +DO_SVE2_ZZZ_NARROW(RSUBHNT, rsubhnt) + +static gen_helper_gvec_flags_4 * const match_fns[4] = { + gen_helper_sve2_match_ppzz_b, gen_helper_sve2_match_ppzz_h, NULL, NULL +}; +TRANS_FEAT_NONSTREAMING(MATCH, aa64_sve2, do_ppzz_flags, a, match_fns[a->esz]) + +static gen_helper_gvec_flags_4 * const nmatch_fns[4] = { + gen_helper_sve2_nmatch_ppzz_b, gen_helper_sve2_nmatch_ppzz_h, NULL, NULL +}; +TRANS_FEAT_NONSTREAMING(NMATCH, aa64_sve2, do_ppzz_flags, a, nmatch_fns[a->esz]) + +static gen_helper_gvec_4 * const histcnt_fns[4] = { + NULL, NULL, gen_helper_sve2_histcnt_s, gen_helper_sve2_histcnt_d +}; +TRANS_FEAT_NONSTREAMING(HISTCNT, aa64_sve2, gen_gvec_ool_arg_zpzz, + histcnt_fns[a->esz], a, 0) + +TRANS_FEAT_NONSTREAMING(HISTSEG, aa64_sve2, gen_gvec_ool_arg_zzz, + a->esz == 0 ? 
gen_helper_sve2_histseg : NULL, a, 0) + +DO_ZPZZ_FP(FADDP, aa64_sve2, sve2_faddp_zpzz) +DO_ZPZZ_FP(FMAXNMP, aa64_sve2, sve2_fmaxnmp_zpzz) +DO_ZPZZ_FP(FMINNMP, aa64_sve2, sve2_fminnmp_zpzz) +DO_ZPZZ_FP(FMAXP, aa64_sve2, sve2_fmaxp_zpzz) +DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz) + +/* + * SVE Integer Multiply-Add (unpredicated) + */ + +TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz, + gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra, + 0, FPST_FPCR) +TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz, + gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra, + 0, FPST_FPCR) + +static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = { + NULL, gen_helper_sve2_sqdmlal_zzzw_h, + gen_helper_sve2_sqdmlal_zzzw_s, gen_helper_sve2_sqdmlal_zzzw_d, +}; +TRANS_FEAT(SQDMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlal_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SQDMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlal_zzzw_fns[a->esz], a, 3) +TRANS_FEAT(SQDMLALBT, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlal_zzzw_fns[a->esz], a, 2) + +static gen_helper_gvec_4 * const sqdmlsl_zzzw_fns[] = { + NULL, gen_helper_sve2_sqdmlsl_zzzw_h, + gen_helper_sve2_sqdmlsl_zzzw_s, gen_helper_sve2_sqdmlsl_zzzw_d, +}; +TRANS_FEAT(SQDMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlsl_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SQDMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlsl_zzzw_fns[a->esz], a, 3) +TRANS_FEAT(SQDMLSLBT, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqdmlsl_zzzw_fns[a->esz], a, 2) + +static gen_helper_gvec_4 * const sqrdmlah_fns[] = { + gen_helper_sve2_sqrdmlah_b, gen_helper_sve2_sqrdmlah_h, + gen_helper_sve2_sqrdmlah_s, gen_helper_sve2_sqrdmlah_d, +}; +TRANS_FEAT(SQRDMLAH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqrdmlah_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const sqrdmlsh_fns[] = { + gen_helper_sve2_sqrdmlsh_b, gen_helper_sve2_sqrdmlsh_h, + gen_helper_sve2_sqrdmlsh_s, gen_helper_sve2_sqrdmlsh_d, +}; +TRANS_FEAT(SQRDMLSH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz, + sqrdmlsh_fns[a->esz], a, 0) + +static gen_helper_gvec_4 * const smlal_zzzw_fns[] = { + NULL, gen_helper_sve2_smlal_zzzw_h, + gen_helper_sve2_smlal_zzzw_s, gen_helper_sve2_smlal_zzzw_d, +}; +TRANS_FEAT(SMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlal_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlal_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const umlal_zzzw_fns[] = { + NULL, gen_helper_sve2_umlal_zzzw_h, + gen_helper_sve2_umlal_zzzw_s, gen_helper_sve2_umlal_zzzw_d, +}; +TRANS_FEAT(UMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlal_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(UMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlal_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const smlsl_zzzw_fns[] = { + NULL, gen_helper_sve2_smlsl_zzzw_h, + gen_helper_sve2_smlsl_zzzw_s, gen_helper_sve2_smlsl_zzzw_d, +}; +TRANS_FEAT(SMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlsl_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(SMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + smlsl_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const umlsl_zzzw_fns[] = { + NULL, gen_helper_sve2_umlsl_zzzw_h, + gen_helper_sve2_umlsl_zzzw_s, gen_helper_sve2_umlsl_zzzw_d, +}; +TRANS_FEAT(UMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlsl_zzzw_fns[a->esz], a, 0) +TRANS_FEAT(UMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, + umlsl_zzzw_fns[a->esz], a, 1) + +static gen_helper_gvec_4 * const cmla_fns[] = { + gen_helper_sve2_cmla_zzzz_b, gen_helper_sve2_cmla_zzzz_h, + 
gen_helper_sve2_cmla_zzzz_s, gen_helper_sve2_cmla_zzzz_d, +}; +TRANS_FEAT(CMLA_zzzz, aa64_sve2, gen_gvec_ool_zzzz, + cmla_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) + +static gen_helper_gvec_4 * const cdot_fns[] = { + NULL, NULL, gen_helper_sve2_cdot_zzzz_s, gen_helper_sve2_cdot_zzzz_d +}; +TRANS_FEAT(CDOT_zzzz, aa64_sve2, gen_gvec_ool_zzzz, + cdot_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) + +static gen_helper_gvec_4 * const sqrdcmlah_fns[] = { + gen_helper_sve2_sqrdcmlah_zzzz_b, gen_helper_sve2_sqrdcmlah_zzzz_h, + gen_helper_sve2_sqrdcmlah_zzzz_s, gen_helper_sve2_sqrdcmlah_zzzz_d, +}; +TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz, + sqrdcmlah_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) + +TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0) + +TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz, + gen_helper_crypto_aesmc, a->rd, a->rd, a->decrypt) + +TRANS_FEAT_NONSTREAMING(AESE, aa64_sve2_aes, gen_gvec_ool_arg_zzz, + gen_helper_crypto_aese, a, false) +TRANS_FEAT_NONSTREAMING(AESD, aa64_sve2_aes, gen_gvec_ool_arg_zzz, + gen_helper_crypto_aese, a, true) + +TRANS_FEAT_NONSTREAMING(SM4E, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, + gen_helper_crypto_sm4e, a, 0) +TRANS_FEAT_NONSTREAMING(SM4EKEY, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, + gen_helper_crypto_sm4ekey, a, 0) + +TRANS_FEAT_NONSTREAMING(RAX1, aa64_sve2_sha3, gen_gvec_fn_arg_zzz, + gen_gvec_rax1, a) + +TRANS_FEAT(FCVTNT_sh, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtnt_sh, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTNT_ds, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtnt_ds, a, 0, FPST_FPCR) + +TRANS_FEAT(BFCVTNT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, + gen_helper_sve_bfcvtnt, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTLT_hs, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtlt_hs, a, 0, FPST_FPCR) +TRANS_FEAT(FCVTLT_sd, aa64_sve2, gen_gvec_fpst_arg_zpz, + gen_helper_sve2_fcvtlt_sd, a, 0, FPST_FPCR) + +TRANS_FEAT(FCVTX_ds, aa64_sve2, do_frint_mode, a, + float_round_to_odd, gen_helper_sve_fcvt_ds) +TRANS_FEAT(FCVTXNT_ds, aa64_sve2, do_frint_mode, a, + float_round_to_odd, gen_helper_sve2_fcvtnt_ds) + +static gen_helper_gvec_3_ptr * const flogb_fns[] = { + NULL, gen_helper_flogb_h, + gen_helper_flogb_s, gen_helper_flogb_d +}; +TRANS_FEAT(FLOGB, aa64_sve2, gen_gvec_fpst_arg_zpz, flogb_fns[a->esz], + a, 0, a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR) + +static bool do_FMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sub, bool sel) +{ + return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzzw_s, + a->rd, a->rn, a->rm, a->ra, + (sel << 1) | sub, cpu_env); +} + +TRANS_FEAT(FMLALB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, false) +TRANS_FEAT(FMLALT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, true) +TRANS_FEAT(FMLSLB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, false) +TRANS_FEAT(FMLSLT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, true) + +static bool do_FMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sub, bool sel) +{ + return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzxw_s, + a->rd, a->rn, a->rm, a->ra, + (a->index << 2) | (sel << 1) | sub, cpu_env); +} + +TRANS_FEAT(FMLALB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, false) +TRANS_FEAT(FMLALT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, true) +TRANS_FEAT(FMLSLB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, false) +TRANS_FEAT(FMLSLT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, true) + +TRANS_FEAT_NONSTREAMING(SMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_smmla_b, a, 0) +TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_usmmla_b, a, 0) +TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_ummla_b, a, 0) + +TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_bfdot, a, 0) +TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_ool_arg_zzxz, + gen_helper_gvec_bfdot_idx, a) + +TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, + gen_helper_gvec_bfmmla, a, 0) + +static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) +{ + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal, + a->rd, a->rn, a->rm, a->ra, sel, FPST_FPCR); +} + +TRANS_FEAT(BFMLALB_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, false) +TRANS_FEAT(BFMLALT_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, true) + +static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) +{ + return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal_idx, + a->rd, a->rn, a->rm, a->ra, + (a->index << 1) | sel, FPST_FPCR); +} + +TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false) +TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true) + +static bool trans_PSEL(DisasContext *s, arg_psel *a) +{ + int vl = vec_full_reg_size(s); + int pl = pred_gvec_reg_size(s); + int elements = vl >> a->esz; + TCGv_i64 tmp, didx, dbit; + TCGv_ptr ptr; + + if (!dc_isar_feature(aa64_sme, s)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + dbit = tcg_temp_new_i64(); + didx = tcg_temp_new_i64(); + ptr = tcg_temp_new_ptr(); + + /* Compute the predicate element. */ + tcg_gen_addi_i64(tmp, cpu_reg(s, a->rv), a->imm); + if (is_power_of_2(elements)) { + tcg_gen_andi_i64(tmp, tmp, elements - 1); + } else { + tcg_gen_remu_i64(tmp, tmp, tcg_constant_i64(elements)); + } + + /* Extract the predicate byte and bit indices. */ + tcg_gen_shli_i64(tmp, tmp, a->esz); + tcg_gen_andi_i64(dbit, tmp, 7); + tcg_gen_shri_i64(didx, tmp, 3); + if (HOST_BIG_ENDIAN) { + tcg_gen_xori_i64(didx, didx, 7); + } + + /* Load the predicate word. */ + tcg_gen_trunc_i64_ptr(ptr, didx); + tcg_gen_add_ptr(ptr, ptr, cpu_env); + tcg_gen_ld8u_i64(tmp, ptr, pred_full_reg_offset(s, a->pm)); + + /* Extract the predicate bit and replicate to MO_64. */ + tcg_gen_shr_i64(tmp, tmp, dbit); + tcg_gen_andi_i64(tmp, tmp, 1); + tcg_gen_neg_i64(tmp, tmp); + + /* Apply to either copy the source, or write zeros. 
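 * After the andi/neg just above, tmp holds either 0 or all-ones, so the
 * tcg_gen_gvec_ands() below either zeroes Pd or copies Pn into it.
 * (For example, with esz == MO_8, element index 10 becomes bit position
 * 10, i.e. byte 1, bit 2 of the predicate, before the HOST_BIG_ENDIAN
 * adjustment above.)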
*/ + tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd), + pred_full_reg_offset(s, a->pn), tmp, pl, pl); + + tcg_temp_free_i64(tmp); + tcg_temp_free_i64(dbit); + tcg_temp_free_i64(didx); + tcg_temp_free_ptr(ptr); + return true; +} + +static void gen_sclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) +{ + tcg_gen_smax_i32(d, a, n); + tcg_gen_smin_i32(d, d, m); +} + +static void gen_sclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) +{ + tcg_gen_smax_i64(d, a, n); + tcg_gen_smin_i64(d, d, m); +} + +static void gen_sclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec a) +{ + tcg_gen_smax_vec(vece, d, a, n); + tcg_gen_smin_vec(vece, d, d, m); +} + +static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const TCGOpcode vecop[] = { + INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_sclamp_i32, + .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_sclamp_i64, + .fniv = gen_sclamp_vec, + .fno = gen_helper_gvec_sclamp_d, + .opt_opc = vecop, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 } + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); +} + +TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a) + +static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) +{ + tcg_gen_umax_i32(d, a, n); + tcg_gen_umin_i32(d, d, m); +} + +static void gen_uclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) +{ + tcg_gen_umax_i64(d, a, n); + tcg_gen_umin_i64(d, d, m); +} + +static void gen_uclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, + TCGv_vec m, TCGv_vec a) +{ + tcg_gen_umax_vec(vece, d, a, n); + tcg_gen_umin_vec(vece, d, d, m); +} + +static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, + uint32_t a, uint32_t oprsz, uint32_t maxsz) +{ + static const TCGOpcode vecop[] = { + INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_b, + .opt_opc = vecop, + .vece = MO_8 }, + { .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_h, + .opt_opc = vecop, + .vece = MO_16 }, + { .fni4 = gen_uclamp_i32, + .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_s, + .opt_opc = vecop, + .vece = MO_32 }, + { .fni8 = gen_uclamp_i64, + .fniv = gen_uclamp_vec, + .fno = gen_helper_gvec_uclamp_d, + .opt_opc = vecop, + .vece = MO_64, + .prefer_i64 = TCG_TARGET_REG_BITS == 64 } + }; + tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); +} + +TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a) diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c new file mode 100644 index 0000000..5c5d58d --- /dev/null +++ b/target/arm/tcg/translate-vfp.c @@ -0,0 +1,3619 @@ +/* + * ARM translation: AArch32 VFP instructions + * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2005-2007 CodeSourcery + * Copyright (c) 2007 OpenedHand, Ltd. + * Copyright (c) 2019 Linaro, Ltd. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "exec/exec-all.h" +#include "exec/gen-icount.h" +#include "translate.h" +#include "translate-a32.h" + +/* Include the generated VFP decoder */ +#include "decode-vfp.c.inc" +#include "decode-vfp-uncond.c.inc" + +static inline void vfp_load_reg64(TCGv_i64 var, int reg) +{ + tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg)); +} + +static inline void vfp_store_reg64(TCGv_i64 var, int reg) +{ + tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg)); +} + +static inline void vfp_load_reg32(TCGv_i32 var, int reg) +{ + tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg)); +} + +static inline void vfp_store_reg32(TCGv_i32 var, int reg) +{ + tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg)); +} + +/* + * The imm8 encodes the sign bit, enough bits to represent an exponent in + * the range 01....1xx to 10....0xx, and the most significant 4 bits of + * the mantissa; see VFPExpandImm() in the v8 ARM ARM. + */ +uint64_t vfp_expand_imm(int size, uint8_t imm8) +{ + uint64_t imm; + + switch (size) { + case MO_64: + imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | + (extract32(imm8, 6, 1) ? 0x3fc0 : 0x4000) | + extract32(imm8, 0, 6); + imm <<= 48; + break; + case MO_32: + imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | + (extract32(imm8, 6, 1) ? 0x3e00 : 0x4000) | + (extract32(imm8, 0, 6) << 3); + imm <<= 16; + break; + case MO_16: + imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | + (extract32(imm8, 6, 1) ? 0x3000 : 0x4000) | + (extract32(imm8, 0, 6) << 6); + break; + default: + g_assert_not_reached(); + } + return imm; +} + +/* + * Return the offset of a 16-bit half of the specified VFP single-precision + * register. If top is true, returns the top 16 bits; otherwise the bottom + * 16 bits. + */ +static inline long vfp_f16_offset(unsigned reg, bool top) +{ + long offs = vfp_reg_offset(false, reg); +#if HOST_BIG_ENDIAN + if (!top) { + offs += 2; + } +#else + if (top) { + offs += 2; + } +#endif + return offs; +} + +/* + * Generate code for M-profile lazy FP state preservation if needed; + * this corresponds to the pseudocode PreserveFPState() function. + */ +static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update) +{ + if (s->v7m_lspact) { + /* + * Lazy state saving affects external memory and also the NVIC, + * so we must mark it as an IO operation for icount (and cause + * this to be the last insn in the TB). + */ + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + s->base.is_jmp = DISAS_UPDATE_EXIT; + gen_io_start(); + } + gen_helper_v7m_preserve_fp_state(cpu_env); + /* + * If the preserve_fp_state helper doesn't throw an exception + * then it will clear LSPACT; we don't need to repeat this for + * any further FP insns in this TB. 
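 * (Clearing s->v7m_lspact below only records this in the DisasContext;
 * the helper call above is what actually clears FPCCR.LSPACT.)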
+ */ + s->v7m_lspact = false; + /* + * The helper might have zeroed VPR, so we do not know the + * correct value for the MVE_NO_PRED TB flag any more. + * If we're about to create a new fp context then that + * will precisely determine the MVE_NO_PRED value (see + * gen_update_fp_context()). Otherwise, we must: + * - set s->mve_no_pred to false, so this instruction + * is generated to use helper functions + * - end the TB now, without chaining to the next TB + */ + if (skip_context_update || !s->v7m_new_fp_ctxt_needed) { + s->mve_no_pred = false; + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } + } +} + +/* + * Generate code for M-profile FP context handling: update the + * ownership of the FP context, and create a new context if + * necessary. This corresponds to the parts of the pseudocode + * ExecuteFPCheck() after the inital PreserveFPState() call. + */ +static void gen_update_fp_context(DisasContext *s) +{ + /* Update ownership of FP context: set FPCCR.S to match current state */ + if (s->v8m_fpccr_s_wrong) { + TCGv_i32 tmp; + + tmp = load_cpu_field(v7m.fpccr[M_REG_S]); + if (s->v8m_secure) { + tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK); + } else { + tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK); + } + store_cpu_field(tmp, v7m.fpccr[M_REG_S]); + /* Don't need to do this for any further FP insns in this TB */ + s->v8m_fpccr_s_wrong = false; + } + + if (s->v7m_new_fp_ctxt_needed) { + /* + * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, + * the FPSCR, and VPR. + */ + TCGv_i32 control, fpscr; + uint32_t bits = R_V7M_CONTROL_FPCA_MASK; + + fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); + gen_helper_vfp_set_fpscr(cpu_env, fpscr); + tcg_temp_free_i32(fpscr); + if (dc_isar_feature(aa32_mve, s)) { + store_cpu_field(tcg_constant_i32(0), v7m.vpr); + } + /* + * We just updated the FPSCR and VPR. Some of this state is cached + * in the MVE_NO_PRED TB flag. We want to avoid having to end the + * TB here, which means we need the new value of the MVE_NO_PRED + * flag to be exactly known here and the same for all executions. + * Luckily FPDSCR.LTPSIZE is always constant 4 and the VPR is + * always set to 0, so the new MVE_NO_PRED flag is always 1 + * if and only if we have MVE. + * + * (The other FPSCR state cached in TB flags is VECLEN and VECSTRIDE, + * but those do not exist for M-profile, so are not relevant here.) + */ + s->mve_no_pred = dc_isar_feature(aa32_mve, s); + + if (s->v8m_secure) { + bits |= R_V7M_CONTROL_SFPA_MASK; + } + control = load_cpu_field(v7m.control[M_REG_S]); + tcg_gen_ori_i32(control, control, bits); + store_cpu_field(control, v7m.control[M_REG_S]); + /* Don't need to do this for any further FP insns in this TB */ + s->v7m_new_fp_ctxt_needed = false; + } +} + +/* + * Check that VFP access is enabled, A-profile specific version. + * + * If VFP is enabled, return true. If not, emit code to generate an + * appropriate exception and return false. + * The ignore_vfp_enabled argument specifies that we should ignore + * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX + * accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns. + */ +static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled) +{ + if (s->fp_excp_el) { + /* + * The full syndrome is only used for HSR when HCPTR traps: + * For v8, when TA==0, coproc is RES0. 
+ * For v7, any use of a Floating-point instruction or access + * to a Floating-point Extension register that is trapped to + * Hyp mode because of a trap configured in the HCPTR sets + * this field to 0xA. + */ + int coproc = arm_dc_feature(s, ARM_FEATURE_V8) ? 0 : 0xa; + uint32_t syn = syn_fp_access_trap(1, 0xe, false, coproc); + + gen_exception_insn_el(s, 0, EXCP_UDEF, syn, s->fp_excp_el); + return false; + } + + /* + * Note that rebuild_hflags_a32 has already accounted for being in EL0 + * and the higher EL in A64 mode, etc. Unlike A64 mode, there do not + * appear to be any insns which touch VFP which are allowed. + */ + if (s->sme_trap_nonstreaming) { + gen_exception_insn(s, 0, EXCP_UDEF, + syn_smetrap(SME_ET_Streaming, + curr_insn_len(s) == 2)); + return false; + } + + if (!s->vfp_enabled && !ignore_vfp_enabled) { + assert(!arm_dc_feature(s, ARM_FEATURE_M)); + unallocated_encoding(s); + return false; + } + return true; +} + +/* + * Check that VFP access is enabled, M-profile specific version. + * + * If VFP is enabled, do the necessary M-profile lazy-FP handling and then + * return true. If not, emit code to generate an appropriate exception and + * return false. + * skip_context_update is true to skip the "update FP context" part of this. + */ +bool vfp_access_check_m(DisasContext *s, bool skip_context_update) +{ + if (s->fp_excp_el) { + /* + * M-profile mostly catches the "FPU disabled" case early, in + * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) + * which do coprocessor-checks are outside the large ranges of + * the encoding space handled by the patterns in m-nocp.decode, + * and for them we may need to raise NOCP here. + */ + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return false; + } + + /* Handle M-profile lazy FP state mechanics */ + + /* Trigger lazy-state preservation if necessary */ + gen_preserve_fp_state(s, skip_context_update); + + if (!skip_context_update) { + /* Update ownership of FP context and create new FP context if needed */ + gen_update_fp_context(s); + } + + return true; +} + +/* + * The most usual kind of VFP access check, for everything except + * FMXR/FMRX to the always-available special registers. 
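 * (Those FMXR/FMRX accesses call vfp_access_check_a() directly with
 * ignore_vfp_enabled set; see trans_VMSR_VMRS() below.)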
+ */ +bool vfp_access_check(DisasContext *s) +{ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return vfp_access_check_m(s, false); + } else { + return vfp_access_check_a(s, false); + } +} + +static bool trans_VSEL(DisasContext *s, arg_VSEL *a) +{ + uint32_t rd, rn, rm; + int sz = a->sz; + + if (!dc_isar_feature(aa32_vsel, s)) { + return false; + } + + if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && + ((a->vm | a->vn | a->vd) & 0x10)) { + return false; + } + + rd = a->vd; + rn = a->vn; + rm = a->vm; + + if (!vfp_access_check(s)) { + return true; + } + + if (sz == 3) { + TCGv_i64 frn, frm, dest; + TCGv_i64 tmp, zero, zf, nf, vf; + + zero = tcg_constant_i64(0); + + frn = tcg_temp_new_i64(); + frm = tcg_temp_new_i64(); + dest = tcg_temp_new_i64(); + + zf = tcg_temp_new_i64(); + nf = tcg_temp_new_i64(); + vf = tcg_temp_new_i64(); + + tcg_gen_extu_i32_i64(zf, cpu_ZF); + tcg_gen_ext_i32_i64(nf, cpu_NF); + tcg_gen_ext_i32_i64(vf, cpu_VF); + + vfp_load_reg64(frn, rn); + vfp_load_reg64(frm, rm); + switch (a->cc) { + case 0: /* eq: Z */ + tcg_gen_movcond_i64(TCG_COND_EQ, dest, zf, zero, frn, frm); + break; + case 1: /* vs: V */ + tcg_gen_movcond_i64(TCG_COND_LT, dest, vf, zero, frn, frm); + break; + case 2: /* ge: N == V -> N ^ V == 0 */ + tmp = tcg_temp_new_i64(); + tcg_gen_xor_i64(tmp, vf, nf); + tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, frn, frm); + tcg_temp_free_i64(tmp); + break; + case 3: /* gt: !Z && N == V */ + tcg_gen_movcond_i64(TCG_COND_NE, dest, zf, zero, frn, frm); + tmp = tcg_temp_new_i64(); + tcg_gen_xor_i64(tmp, vf, nf); + tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, dest, frm); + tcg_temp_free_i64(tmp); + break; + } + vfp_store_reg64(dest, rd); + tcg_temp_free_i64(frn); + tcg_temp_free_i64(frm); + tcg_temp_free_i64(dest); + + tcg_temp_free_i64(zf); + tcg_temp_free_i64(nf); + tcg_temp_free_i64(vf); + } else { + TCGv_i32 frn, frm, dest; + TCGv_i32 tmp, zero; + + zero = tcg_constant_i32(0); + + frn = tcg_temp_new_i32(); + frm = tcg_temp_new_i32(); + dest = tcg_temp_new_i32(); + vfp_load_reg32(frn, rn); + vfp_load_reg32(frm, rm); + switch (a->cc) { + case 0: /* eq: Z */ + tcg_gen_movcond_i32(TCG_COND_EQ, dest, cpu_ZF, zero, frn, frm); + break; + case 1: /* vs: V */ + tcg_gen_movcond_i32(TCG_COND_LT, dest, cpu_VF, zero, frn, frm); + break; + case 2: /* ge: N == V -> N ^ V == 0 */ + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF); + tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, frn, frm); + tcg_temp_free_i32(tmp); + break; + case 3: /* gt: !Z && N == V */ + tcg_gen_movcond_i32(TCG_COND_NE, dest, cpu_ZF, zero, frn, frm); + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF); + tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, dest, frm); + tcg_temp_free_i32(tmp); + break; + } + /* For fp16 the top half is always zeroes */ + if (sz == 1) { + tcg_gen_andi_i32(dest, dest, 0xffff); + } + vfp_store_reg32(dest, rd); + tcg_temp_free_i32(frn); + tcg_temp_free_i32(frm); + tcg_temp_free_i32(dest); + } + + return true; +} + +/* + * Table for converting the most common AArch32 encoding of + * rounding mode to arm_fprounding order (which matches the + * common AArch64 order); see ARM ARM pseudocode FPDecodeRM(). 
+ */ +static const uint8_t fp_decode_rm[] = { + FPROUNDING_TIEAWAY, + FPROUNDING_TIEEVEN, + FPROUNDING_POSINF, + FPROUNDING_NEGINF, +}; + +static bool trans_VRINT(DisasContext *s, arg_VRINT *a) +{ + uint32_t rd, rm; + int sz = a->sz; + TCGv_ptr fpst; + TCGv_i32 tcg_rmode; + int rounding = fp_decode_rm[a->rm]; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && + ((a->vm | a->vd) & 0x10)) { + return false; + } + + rd = a->vd; + rm = a->vm; + + if (!vfp_access_check(s)) { + return true; + } + + if (sz == 1) { + fpst = fpstatus_ptr(FPST_FPCR_F16); + } else { + fpst = fpstatus_ptr(FPST_FPCR); + } + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + + if (sz == 3) { + TCGv_i64 tcg_op; + TCGv_i64 tcg_res; + tcg_op = tcg_temp_new_i64(); + tcg_res = tcg_temp_new_i64(); + vfp_load_reg64(tcg_op, rm); + gen_helper_rintd(tcg_res, tcg_op, fpst); + vfp_store_reg64(tcg_res, rd); + tcg_temp_free_i64(tcg_op); + tcg_temp_free_i64(tcg_res); + } else { + TCGv_i32 tcg_op; + TCGv_i32 tcg_res; + tcg_op = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + vfp_load_reg32(tcg_op, rm); + if (sz == 1) { + gen_helper_rinth(tcg_res, tcg_op, fpst); + } else { + gen_helper_rints(tcg_res, tcg_op, fpst); + } + vfp_store_reg32(tcg_res, rd); + tcg_temp_free_i32(tcg_op); + tcg_temp_free_i32(tcg_res); + } + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT(DisasContext *s, arg_VCVT *a) +{ + uint32_t rd, rm; + int sz = a->sz; + TCGv_ptr fpst; + TCGv_i32 tcg_rmode, tcg_shift; + int rounding = fp_decode_rm[a->rm]; + bool is_signed = a->op; + + if (!dc_isar_feature(aa32_vcvt_dr, s)) { + return false; + } + + if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + rd = a->vd; + rm = a->vm; + + if (!vfp_access_check(s)) { + return true; + } + + if (sz == 1) { + fpst = fpstatus_ptr(FPST_FPCR_F16); + } else { + fpst = fpstatus_ptr(FPST_FPCR); + } + + tcg_shift = tcg_constant_i32(0); + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + + if (sz == 3) { + TCGv_i64 tcg_double, tcg_res; + TCGv_i32 tcg_tmp; + tcg_double = tcg_temp_new_i64(); + tcg_res = tcg_temp_new_i64(); + tcg_tmp = tcg_temp_new_i32(); + vfp_load_reg64(tcg_double, rm); + if (is_signed) { + gen_helper_vfp_tosld(tcg_res, tcg_double, tcg_shift, fpst); + } else { + gen_helper_vfp_tould(tcg_res, tcg_double, tcg_shift, fpst); + } + tcg_gen_extrl_i64_i32(tcg_tmp, tcg_res); + vfp_store_reg32(tcg_tmp, rd); + tcg_temp_free_i32(tcg_tmp); + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_double); + } else { + TCGv_i32 tcg_single, tcg_res; + tcg_single = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + vfp_load_reg32(tcg_single, rm); + if (sz == 1) { + if (is_signed) { + gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst); + } else { + gen_helper_vfp_toulh(tcg_res, tcg_single, tcg_shift, fpst); + } + } else { + if (is_signed) { + 
gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst); + } else { + gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst); + } + } + vfp_store_reg32(tcg_res, rd); + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_single); + } + + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + tcg_temp_free_i32(tcg_rmode); + + tcg_temp_free_ptr(fpst); + + return true; +} + +bool mve_skip_vmov(DisasContext *s, int vn, int index, int size) +{ + /* + * In a CPU with MVE, the VMOV (vector lane to general-purpose register) + * and VMOV (general-purpose register to vector lane) insns are not + * predicated, but they are subject to beatwise execution if they are + * not in an IT block. + * + * Since our implementation always executes all 4 beats in one tick, + * this means only that if PSR.ECI says we should not be executing + * the beat corresponding to the lane of the vector register being + * accessed then we should skip performing the move, and that we need + * to do the usual check for bad ECI state and advance of ECI state. + * + * Note that if PSR.ECI is non-zero then we cannot be in an IT block. + * + * Return true if this VMOV scalar <-> gpreg should be skipped because + * the MVE PSR.ECI state says we skip the beat where the store happens. + */ + + /* Calculate the byte offset into Qn which we're going to access */ + int ofs = (index << size) + ((vn & 1) * 8); + + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + + switch (s->eci) { + case ECI_NONE: + return false; + case ECI_A0: + return ofs < 4; + case ECI_A0A1: + return ofs < 8; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return ofs < 12; + default: + g_assert_not_reached(); + } +} + +static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a) +{ + /* VMOV scalar to general purpose register */ + TCGv_i32 tmp; + + /* + * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has + * all sizes, whether the CPU has fp or not. + */ + if (!dc_isar_feature(aa32_mve, s)) { + if (a->size == MO_32 + ? !dc_isar_feature(aa32_fpsp_v2, s) + : !arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) { + return false; + } + + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + + if (!vfp_access_check(s)) { + return true; + } + + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = tcg_temp_new_i32(); + read_neon_element32(tmp, a->vn, a->index, + a->size | (a->u ? 0 : MO_SIGN)); + store_reg(s, a->rt, tmp); + } + + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } + return true; +} + +static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a) +{ + /* VMOV general purpose register to scalar */ + TCGv_i32 tmp; + + /* + * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has + * all sizes, whether the CPU has fp or not. + */ + if (!dc_isar_feature(aa32_mve, s)) { + if (a->size == MO_32 + ? 
!dc_isar_feature(aa32_fpsp_v2, s) + : !arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) { + return false; + } + + if (dc_isar_feature(aa32_mve, s)) { + if (!mve_eci_check(s)) { + return true; + } + } + + if (!vfp_access_check(s)) { + return true; + } + + if (!mve_skip_vmov(s, a->vn, a->index, a->size)) { + tmp = load_reg(s, a->rt); + write_neon_element32(tmp, a->vn, a->index, a->size); + tcg_temp_free_i32(tmp); + } + + if (dc_isar_feature(aa32_mve, s)) { + mve_update_and_store_eci(s); + } + return true; +} + +static bool trans_VDUP(DisasContext *s, arg_VDUP *a) +{ + /* VDUP (general purpose register) */ + TCGv_i32 tmp; + int size, vec_size; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) { + return false; + } + + if (a->b && a->e) { + return false; + } + + if (a->q && (a->vn & 1)) { + return false; + } + + vec_size = a->q ? 16 : 8; + if (a->b) { + size = 0; + } else if (a->e) { + size = 1; + } else { + size = 2; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = load_reg(s, a->rt); + tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(a->vn), + vec_size, vec_size, tmp); + tcg_temp_free_i32(tmp); + + return true; +} + +static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) +{ + TCGv_i32 tmp; + bool ignore_vfp_enabled = false; + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + /* M profile version was already handled in m-nocp.decode */ + return false; + } + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + switch (a->reg) { + case ARM_VFP_FPSID: + /* + * VFPv2 allows access to FPSID from userspace; VFPv3 restricts + * all ID registers to privileged access only. + */ + if (IS_USER(s) && dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_MVFR0: + case ARM_VFP_MVFR1: + if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_MVFR)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_MVFR2: + if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_FPSCR: + break; + case ARM_VFP_FPEXC: + if (IS_USER(s)) { + return false; + } + ignore_vfp_enabled = true; + break; + case ARM_VFP_FPINST: + case ARM_VFP_FPINST2: + /* Not present in VFPv3 */ + if (IS_USER(s) || dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + break; + default: + return false; + } + + /* + * Call vfp_access_check_a() directly, because we need to tell + * it to ignore FPEXC.EN for some register accesses. 
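 * (ignore_vfp_enabled was set above for FPSID, MVFR0, MVFR1, MVFR2 and
 * FPEXC; FPSCR and FPINST/FPINST2 accesses still honour FPEXC.EN.)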
+ */ + if (!vfp_access_check_a(s, ignore_vfp_enabled)) { + return true; + } + + if (a->l) { + /* VMRS, move VFP special register to gp register */ + switch (a->reg) { + case ARM_VFP_MVFR0: + case ARM_VFP_MVFR1: + case ARM_VFP_MVFR2: + case ARM_VFP_FPSID: + if (s->current_el == 1) { + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_helper_check_hcr_el2_trap(cpu_env, + tcg_constant_i32(a->rt), + tcg_constant_i32(a->reg)); + } + /* fall through */ + case ARM_VFP_FPEXC: + case ARM_VFP_FPINST: + case ARM_VFP_FPINST2: + tmp = load_cpu_field(vfp.xregs[a->reg]); + break; + case ARM_VFP_FPSCR: + if (a->rt == 15) { + tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); + tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); + } else { + tmp = tcg_temp_new_i32(); + gen_helper_vfp_get_fpscr(tmp, cpu_env); + } + break; + default: + g_assert_not_reached(); + } + + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR. */ + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + } else { + store_reg(s, a->rt, tmp); + } + } else { + /* VMSR, move gp register to VFP special register */ + switch (a->reg) { + case ARM_VFP_FPSID: + case ARM_VFP_MVFR0: + case ARM_VFP_MVFR1: + case ARM_VFP_MVFR2: + /* Writes are ignored. */ + break; + case ARM_VFP_FPSCR: + tmp = load_reg(s, a->rt); + gen_helper_vfp_set_fpscr(cpu_env, tmp); + tcg_temp_free_i32(tmp); + gen_lookup_tb(s); + break; + case ARM_VFP_FPEXC: + /* + * TODO: VFP subarchitecture support. + * For now, keep the EN bit only + */ + tmp = load_reg(s, a->rt); + tcg_gen_andi_i32(tmp, tmp, 1 << 30); + store_cpu_field(tmp, vfp.xregs[a->reg]); + gen_lookup_tb(s); + break; + case ARM_VFP_FPINST: + case ARM_VFP_FPINST2: + tmp = load_reg(s, a->rt); + store_cpu_field(tmp, vfp.xregs[a->reg]); + break; + default: + g_assert_not_reached(); + } + } + + return true; +} + + +static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a) +{ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (a->rt == 15) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->l) { + /* VFP to general purpose register */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vn); + tcg_gen_andi_i32(tmp, tmp, 0xffff); + store_reg(s, a->rt, tmp); + } else { + /* general purpose register to VFP */ + tmp = load_reg(s, a->rt); + tcg_gen_andi_i32(tmp, tmp, 0xffff); + vfp_store_reg32(tmp, a->vn); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a) +{ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->l) { + /* VFP to general purpose register */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vn); + if (a->rt == 15) { + /* Set the 4 flag bits in the CPSR. 
*/ + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + } else { + store_reg(s, a->rt, tmp); + } + } else { + /* general purpose register to VFP */ + tmp = load_reg(s, a->rt); + vfp_store_reg32(tmp, a->vn); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VMOV_64_sp(DisasContext *s, arg_VMOV_64_sp *a) +{ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + /* + * VMOV between two general-purpose registers and two single precision + * floating point registers + */ + if (!vfp_access_check(s)) { + return true; + } + + if (a->op) { + /* fpreg to gpreg */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + store_reg(s, a->rt, tmp); + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm + 1); + store_reg(s, a->rt2, tmp); + } else { + /* gpreg to fpreg */ + tmp = load_reg(s, a->rt); + vfp_store_reg32(tmp, a->vm); + tcg_temp_free_i32(tmp); + tmp = load_reg(s, a->rt2); + vfp_store_reg32(tmp, a->vm + 1); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a) +{ + TCGv_i32 tmp; + + /* + * VMOV between two general-purpose registers and one double precision + * floating point register. Note that this does not require support + * for double precision arithmetic. + */ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->op) { + /* fpreg to gpreg */ + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm * 2); + store_reg(s, a->rt, tmp); + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm * 2 + 1); + store_reg(s, a->rt2, tmp); + } else { + /* gpreg to fpreg */ + tmp = load_reg(s, a->rt); + vfp_store_reg32(tmp, a->vm * 2); + tcg_temp_free_i32(tmp); + tmp = load_reg(s, a->rt2); + vfp_store_reg32(tmp, a->vm * 2 + 1); + tcg_temp_free_i32(tmp); + } + + return true; +} + +static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a) +{ + uint32_t offset; + TCGv_i32 addr, tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* imm8 field is offset/2 for fp16, unlike fp32 and fp64 */ + offset = a->imm << 1; + if (!a->u) { + offset = -offset; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, offset); + tmp = tcg_temp_new_i32(); + if (a->l) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN); + vfp_store_reg32(tmp, a->vd); + } else { + vfp_load_reg32(tmp, a->vd); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN); + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + return true; +} + +static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a) +{ + uint32_t offset; + TCGv_i32 addr, tmp; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + offset = a->imm << 2; + if (!a->u) { + offset = -offset; + } + + /* For thumb, use of PC is UNPREDICTABLE. 
*/ + addr = add_reg_for_lit(s, a->rn, offset); + tmp = tcg_temp_new_i32(); + if (a->l) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + vfp_store_reg32(tmp, a->vd); + } else { + vfp_load_reg32(tmp, a->vd); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + } + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + + return true; +} + +static bool trans_VLDR_VSTR_dp(DisasContext *s, arg_VLDR_VSTR_dp *a) +{ + uint32_t offset; + TCGv_i32 addr; + TCGv_i64 tmp; + + /* Note that this does not require support for double arithmetic. */ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + offset = a->imm << 2; + if (!a->u) { + offset = -offset; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, offset); + tmp = tcg_temp_new_i64(); + if (a->l) { + gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + vfp_store_reg64(tmp, a->vd); + } else { + vfp_load_reg64(tmp, a->vd); + gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + } + tcg_temp_free_i64(tmp); + tcg_temp_free_i32(addr); + + return true; +} + +static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a) +{ + uint32_t offset; + TCGv_i32 addr, tmp; + int i, n; + + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + n = a->imm; + + if (n == 0 || (a->vd + n) > 32) { + /* + * UNPREDICTABLE cases for bad immediates: we choose to + * UNDEF to avoid generating huge numbers of TCG ops + */ + return false; + } + if (a->rn == 15 && a->w) { + /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */ + return false; + } + + s->eci_handled = true; + + if (!vfp_access_check(s)) { + return true; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, 0); + if (a->p) { + /* pre-decrement */ + tcg_gen_addi_i32(addr, addr, -(a->imm << 2)); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * Here 'addr' is the lowest address we will store to, + * and is either the old SP (if post-increment) or + * the new SP (if pre-decrement). For post-increment + * where the old value is below the limit and the new + * value is above, it is UNKNOWN whether the limit check + * triggers; we choose to trigger. + */ + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + offset = 4; + tmp = tcg_temp_new_i32(); + for (i = 0; i < n; i++) { + if (a->l) { + /* load */ + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + vfp_store_reg32(tmp, a->vd + i); + } else { + /* store */ + vfp_load_reg32(tmp, a->vd + i); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + } + tcg_gen_addi_i32(addr, addr, offset); + } + tcg_temp_free_i32(tmp); + if (a->w) { + /* writeback */ + if (a->p) { + offset = -offset * n; + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + + clear_eci_state(s); + return true; +} + +static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a) +{ + uint32_t offset; + TCGv_i32 addr; + TCGv_i64 tmp; + int i, n; + + /* Note that this does not require support for double arithmetic. 
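 * (Hence the feature check below is aa32_fpsp_v2 or aa32_mve rather
 * than aa32_fpdp_v2.)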
*/ + if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { + return false; + } + + n = a->imm >> 1; + + if (n == 0 || (a->vd + n) > 32 || n > 16) { + /* + * UNPREDICTABLE cases for bad immediates: we choose to + * UNDEF to avoid generating huge numbers of TCG ops + */ + return false; + } + if (a->rn == 15 && a->w) { + /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */ + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd + n) > 16) { + return false; + } + + s->eci_handled = true; + + if (!vfp_access_check(s)) { + return true; + } + + /* For thumb, use of PC is UNPREDICTABLE. */ + addr = add_reg_for_lit(s, a->rn, 0); + if (a->p) { + /* pre-decrement */ + tcg_gen_addi_i32(addr, addr, -(a->imm << 2)); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * Here 'addr' is the lowest address we will store to, + * and is either the old SP (if post-increment) or + * the new SP (if pre-decrement). For post-increment + * where the old value is below the limit and the new + * value is above, it is UNKNOWN whether the limit check + * triggers; we choose to trigger. + */ + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + offset = 8; + tmp = tcg_temp_new_i64(); + for (i = 0; i < n; i++) { + if (a->l) { + /* load */ + gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + vfp_store_reg64(tmp, a->vd + i); + } else { + /* store */ + vfp_load_reg64(tmp, a->vd + i); + gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); + } + tcg_gen_addi_i32(addr, addr, offset); + } + tcg_temp_free_i64(tmp); + if (a->w) { + /* writeback */ + if (a->p) { + offset = -offset * n; + } else if (a->imm & 1) { + offset = 4; + } else { + offset = 0; + } + + if (offset != 0) { + tcg_gen_addi_i32(addr, addr, offset); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + + clear_eci_state(s); + return true; +} + +/* + * Types for callbacks for do_vfp_3op_sp() and do_vfp_3op_dp(). + * The callback should emit code to write a value to vd. If + * do_vfp_3op_{sp,dp}() was passed reads_vd then the TCGv vd + * will contain the old value of the relevant VFP register; + * otherwise it must be written to only. + */ +typedef void VFPGen3OpSPFn(TCGv_i32 vd, + TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst); +typedef void VFPGen3OpDPFn(TCGv_i64 vd, + TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst); + +/* + * Types for callbacks for do_vfp_2op_sp() and do_vfp_2op_dp(). + * The callback should emit code to write a value to vd (which + * should be written to only). 
+ */ +typedef void VFPGen2OpSPFn(TCGv_i32 vd, TCGv_i32 vm); +typedef void VFPGen2OpDPFn(TCGv_i64 vd, TCGv_i64 vm); + +/* + * Return true if the specified S reg is in a scalar bank + * (ie if it is s0..s7) + */ +static inline bool vfp_sreg_is_scalar(int reg) +{ + return (reg & 0x18) == 0; +} + +/* + * Return true if the specified D reg is in a scalar bank + * (ie if it is d0..d3 or d16..d19) + */ +static inline bool vfp_dreg_is_scalar(int reg) +{ + return (reg & 0xc) == 0; +} + +/* + * Advance the S reg number forwards by delta within its bank + * (ie increment the low 3 bits but leave the rest the same) + */ +static inline int vfp_advance_sreg(int reg, int delta) +{ + return ((reg + delta) & 0x7) | (reg & ~0x7); +} + +/* + * Advance the D reg number forwards by delta within its bank + * (ie increment the low 2 bits but leave the rest the same) + */ +static inline int vfp_advance_dreg(int reg, int delta) +{ + return ((reg + delta) & 0x3) | (reg & ~0x3); +} + +/* + * Perform a 3-operand VFP data processing instruction. fn is the + * callback to do the actual operation; this function deals with the + * code to handle looping around for VFP vector processing. + */ +static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn, + int vd, int vn, int vm, bool reads_vd) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i32 f0, f1, fd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. */ + if (vfp_sreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = s->vec_stride + 1; + + if (vfp_sreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i32(); + f1 = tcg_temp_new_i32(); + fd = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR); + + vfp_load_reg32(f0, vn); + vfp_load_reg32(f1, vm); + + for (;;) { + if (reads_vd) { + vfp_load_reg32(fd, vd); + } + fn(fd, f0, f1, fpst); + vfp_store_reg32(fd, vd); + + if (veclen == 0) { + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_sreg(vd, delta_d); + vn = vfp_advance_sreg(vn, delta_d); + vfp_load_reg32(f0, vn); + if (delta_m) { + vm = vfp_advance_sreg(vm, delta_m); + vfp_load_reg32(f1, vm); + } + } + + tcg_temp_free_i32(f0); + tcg_temp_free_i32(f1); + tcg_temp_free_i32(fd); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_vfp_3op_hp(DisasContext *s, VFPGen3OpSPFn *fn, + int vd, int vn, int vm, bool reads_vd) +{ + /* + * Do a half-precision operation. 
Functionally this is + * the same as do_vfp_3op_sp(), except: + * - it uses the FPST_FPCR_F16 + * - it doesn't need the VFP vector handling (fp16 is a + * v8 feature, and in v8 VFP vectors don't exist) + * - it does the aa32_fp16_arith feature test + */ + TCGv_i32 f0, f1, fd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + f0 = tcg_temp_new_i32(); + f1 = tcg_temp_new_i32(); + fd = tcg_temp_new_i32(); + fpst = fpstatus_ptr(FPST_FPCR_F16); + + vfp_load_reg32(f0, vn); + vfp_load_reg32(f1, vm); + + if (reads_vd) { + vfp_load_reg32(fd, vd); + } + fn(fd, f0, f1, fpst); + vfp_store_reg32(fd, vd); + + tcg_temp_free_i32(f0); + tcg_temp_free_i32(f1); + tcg_temp_free_i32(fd); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn, + int vd, int vn, int vm, bool reads_vd) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i64 f0, f1, fd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vn | vm) & 0x10)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. */ + if (vfp_dreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = (s->vec_stride >> 1) + 1; + + if (vfp_dreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i64(); + f1 = tcg_temp_new_i64(); + fd = tcg_temp_new_i64(); + fpst = fpstatus_ptr(FPST_FPCR); + + vfp_load_reg64(f0, vn); + vfp_load_reg64(f1, vm); + + for (;;) { + if (reads_vd) { + vfp_load_reg64(fd, vd); + } + fn(fd, f0, f1, fpst); + vfp_store_reg64(fd, vd); + + if (veclen == 0) { + break; + } + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_dreg(vd, delta_d); + vn = vfp_advance_dreg(vn, delta_d); + vfp_load_reg64(f0, vn); + if (delta_m) { + vm = vfp_advance_dreg(vm, delta_m); + vfp_load_reg64(f1, vm); + } + } + + tcg_temp_free_i64(f0); + tcg_temp_free_i64(f1); + tcg_temp_free_i64(fd); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i32 f0, fd; + + /* Note that the caller must check the aa32_fpsp_v2 feature. */ + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. 
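+ * For example, with s->vec_len == 3 and s->vec_stride == 0 the loop below + * makes four passes with a register delta of one: a destination in s0..s7 + * stays scalar, while a destination of s8 writes s8, s9, s10 and s11 in turn.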
*/ + if (vfp_sreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = s->vec_stride + 1; + + if (vfp_sreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i32(); + fd = tcg_temp_new_i32(); + + vfp_load_reg32(f0, vm); + + for (;;) { + fn(fd, f0); + vfp_store_reg32(fd, vd); + + if (veclen == 0) { + break; + } + + if (delta_m == 0) { + /* single source one-many */ + while (veclen--) { + vd = vfp_advance_sreg(vd, delta_d); + vfp_store_reg32(fd, vd); + } + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_sreg(vd, delta_d); + vm = vfp_advance_sreg(vm, delta_m); + vfp_load_reg32(f0, vm); + } + + tcg_temp_free_i32(f0); + tcg_temp_free_i32(fd); + + return true; +} + +static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm) +{ + /* + * Do a half-precision operation. Functionally this is + * the same as do_vfp_2op_sp(), except: + * - it doesn't need the VFP vector handling (fp16 is a + * v8 feature, and in v8 VFP vectors don't exist) + * - it does the aa32_fp16_arith feature test + */ + TCGv_i32 f0; + + /* Note that the caller must check the aa32_fp16_arith feature */ + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + f0 = tcg_temp_new_i32(); + vfp_load_reg32(f0, vm); + fn(f0, f0); + vfp_store_reg32(f0, vd); + tcg_temp_free_i32(f0); + + return true; +} + +static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm) +{ + uint32_t delta_m = 0; + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i64 f0, fd; + + /* Note that the caller must check the aa32_fpdp_v2 feature. */ + + /* UNDEF accesses to D16-D31 if they don't exist */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vm) & 0x10)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. 
*/ + if (vfp_dreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = (s->vec_stride >> 1) + 1; + + if (vfp_dreg_is_scalar(vm)) { + /* mixed scalar/vector */ + delta_m = 0; + } else { + /* vector */ + delta_m = delta_d; + } + } + } + + f0 = tcg_temp_new_i64(); + fd = tcg_temp_new_i64(); + + vfp_load_reg64(f0, vm); + + for (;;) { + fn(fd, f0); + vfp_store_reg64(fd, vd); + + if (veclen == 0) { + break; + } + + if (delta_m == 0) { + /* single source one-many */ + while (veclen--) { + vd = vfp_advance_dreg(vd, delta_d); + vfp_store_reg64(fd, vd); + } + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_dreg(vd, delta_d); + vd = vfp_advance_dreg(vm, delta_m); + vfp_load_reg64(f0, vm); + } + + tcg_temp_free_i64(f0); + tcg_temp_free_i64(fd); + + return true; +} + +static void gen_VMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* Note that order of inputs to the add matters for NaNs */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_mulh(tmp, vn, vm, fpst); + gen_helper_vfp_addh(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VMLA_hp(DisasContext *s, arg_VMLA_sp *a) +{ + return do_vfp_3op_hp(s, gen_VMLA_hp, a->vd, a->vn, a->vm, true); +} + +static void gen_VMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* Note that order of inputs to the add matters for NaNs */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_muls(tmp, vn, vm, fpst); + gen_helper_vfp_adds(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VMLA_sp(DisasContext *s, arg_VMLA_sp *a) +{ + return do_vfp_3op_sp(s, gen_VMLA_sp, a->vd, a->vn, a->vm, true); +} + +static void gen_VMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) +{ + /* Note that order of inputs to the add matters for NaNs */ + TCGv_i64 tmp = tcg_temp_new_i64(); + + gen_helper_vfp_muld(tmp, vn, vm, fpst); + gen_helper_vfp_addd(vd, vd, tmp, fpst); + tcg_temp_free_i64(tmp); +} + +static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_dp *a) +{ + return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true); +} + +static void gen_VMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* + * VMLS: vd = vd + -(vn * vm) + * Note that order of inputs to the add matters for NaNs. + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_mulh(tmp, vn, vm, fpst); + gen_helper_vfp_negh(tmp, tmp); + gen_helper_vfp_addh(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VMLS_hp(DisasContext *s, arg_VMLS_sp *a) +{ + return do_vfp_3op_hp(s, gen_VMLS_hp, a->vd, a->vn, a->vm, true); +} + +static void gen_VMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* + * VMLS: vd = vd + -(vn * vm) + * Note that order of inputs to the add matters for NaNs. + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_muls(tmp, vn, vm, fpst); + gen_helper_vfp_negs(tmp, tmp); + gen_helper_vfp_adds(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VMLS_sp(DisasContext *s, arg_VMLS_sp *a) +{ + return do_vfp_3op_sp(s, gen_VMLS_sp, a->vd, a->vn, a->vm, true); +} + +static void gen_VMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) +{ + /* + * VMLS: vd = vd + -(vn * vm) + * Note that order of inputs to the add matters for NaNs. 
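+ * (If both addends are NaNs the architected result depends on operand order, + * taking a signalling NaN first and otherwise the first quiet NaN, so + * vd + tmp is not interchangeable with tmp + vd.)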
+ */ + TCGv_i64 tmp = tcg_temp_new_i64(); + + gen_helper_vfp_muld(tmp, vn, vm, fpst); + gen_helper_vfp_negd(tmp, tmp); + gen_helper_vfp_addd(vd, vd, tmp, fpst); + tcg_temp_free_i64(tmp); +} + +static bool trans_VMLS_dp(DisasContext *s, arg_VMLS_dp *a) +{ + return do_vfp_3op_dp(s, gen_VMLS_dp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* + * VNMLS: -fd + (fn * fm) + * Note that it isn't valid to replace (-A + B) with (B - A) or similar + * plausible looking simplifications because this will give wrong results + * for NaNs. + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_mulh(tmp, vn, vm, fpst); + gen_helper_vfp_negh(vd, vd); + gen_helper_vfp_addh(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VNMLS_hp(DisasContext *s, arg_VNMLS_sp *a) +{ + return do_vfp_3op_hp(s, gen_VNMLS_hp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* + * VNMLS: -fd + (fn * fm) + * Note that it isn't valid to replace (-A + B) with (B - A) or similar + * plausible looking simplifications because this will give wrong results + * for NaNs. + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_muls(tmp, vn, vm, fpst); + gen_helper_vfp_negs(vd, vd); + gen_helper_vfp_adds(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VNMLS_sp(DisasContext *s, arg_VNMLS_sp *a) +{ + return do_vfp_3op_sp(s, gen_VNMLS_sp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) +{ + /* + * VNMLS: -fd + (fn * fm) + * Note that it isn't valid to replace (-A + B) with (B - A) or similar + * plausible looking simplifications because this will give wrong results + * for NaNs. 
+ */ + TCGv_i64 tmp = tcg_temp_new_i64(); + + gen_helper_vfp_muld(tmp, vn, vm, fpst); + gen_helper_vfp_negd(vd, vd); + gen_helper_vfp_addd(vd, vd, tmp, fpst); + tcg_temp_free_i64(tmp); +} + +static bool trans_VNMLS_dp(DisasContext *s, arg_VNMLS_dp *a) +{ + return do_vfp_3op_dp(s, gen_VNMLS_dp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* VNMLA: -fd + -(fn * fm) */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_mulh(tmp, vn, vm, fpst); + gen_helper_vfp_negh(tmp, tmp); + gen_helper_vfp_negh(vd, vd); + gen_helper_vfp_addh(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VNMLA_hp(DisasContext *s, arg_VNMLA_sp *a) +{ + return do_vfp_3op_hp(s, gen_VNMLA_hp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* VNMLA: -fd + -(fn * fm) */ + TCGv_i32 tmp = tcg_temp_new_i32(); + + gen_helper_vfp_muls(tmp, vn, vm, fpst); + gen_helper_vfp_negs(tmp, tmp); + gen_helper_vfp_negs(vd, vd); + gen_helper_vfp_adds(vd, vd, tmp, fpst); + tcg_temp_free_i32(tmp); +} + +static bool trans_VNMLA_sp(DisasContext *s, arg_VNMLA_sp *a) +{ + return do_vfp_3op_sp(s, gen_VNMLA_sp, a->vd, a->vn, a->vm, true); +} + +static void gen_VNMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) +{ + /* VNMLA: -fd + (fn * fm) */ + TCGv_i64 tmp = tcg_temp_new_i64(); + + gen_helper_vfp_muld(tmp, vn, vm, fpst); + gen_helper_vfp_negd(tmp, tmp); + gen_helper_vfp_negd(vd, vd); + gen_helper_vfp_addd(vd, vd, tmp, fpst); + tcg_temp_free_i64(tmp); +} + +static bool trans_VNMLA_dp(DisasContext *s, arg_VNMLA_dp *a) +{ + return do_vfp_3op_dp(s, gen_VNMLA_dp, a->vd, a->vn, a->vm, true); +} + +static bool trans_VMUL_hp(DisasContext *s, arg_VMUL_sp *a) +{ + return do_vfp_3op_hp(s, gen_helper_vfp_mulh, a->vd, a->vn, a->vm, false); +} + +static bool trans_VMUL_sp(DisasContext *s, arg_VMUL_sp *a) +{ + return do_vfp_3op_sp(s, gen_helper_vfp_muls, a->vd, a->vn, a->vm, false); +} + +static bool trans_VMUL_dp(DisasContext *s, arg_VMUL_dp *a) +{ + return do_vfp_3op_dp(s, gen_helper_vfp_muld, a->vd, a->vn, a->vm, false); +} + +static void gen_VNMUL_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* VNMUL: -(fn * fm) */ + gen_helper_vfp_mulh(vd, vn, vm, fpst); + gen_helper_vfp_negh(vd, vd); +} + +static bool trans_VNMUL_hp(DisasContext *s, arg_VNMUL_sp *a) +{ + return do_vfp_3op_hp(s, gen_VNMUL_hp, a->vd, a->vn, a->vm, false); +} + +static void gen_VNMUL_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) +{ + /* VNMUL: -(fn * fm) */ + gen_helper_vfp_muls(vd, vn, vm, fpst); + gen_helper_vfp_negs(vd, vd); +} + +static bool trans_VNMUL_sp(DisasContext *s, arg_VNMUL_sp *a) +{ + return do_vfp_3op_sp(s, gen_VNMUL_sp, a->vd, a->vn, a->vm, false); +} + +static void gen_VNMUL_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) +{ + /* VNMUL: -(fn * fm) */ + gen_helper_vfp_muld(vd, vn, vm, fpst); + gen_helper_vfp_negd(vd, vd); +} + +static bool trans_VNMUL_dp(DisasContext *s, arg_VNMUL_dp *a) +{ + return do_vfp_3op_dp(s, gen_VNMUL_dp, a->vd, a->vn, a->vm, false); +} + +static bool trans_VADD_hp(DisasContext *s, arg_VADD_sp *a) +{ + return do_vfp_3op_hp(s, gen_helper_vfp_addh, a->vd, a->vn, a->vm, false); +} + +static bool trans_VADD_sp(DisasContext *s, arg_VADD_sp *a) +{ + return do_vfp_3op_sp(s, gen_helper_vfp_adds, a->vd, a->vn, a->vm, false); +} + +static bool trans_VADD_dp(DisasContext *s, arg_VADD_dp *a) +{ + return do_vfp_3op_dp(s, gen_helper_vfp_addd, a->vd, 
a->vn, a->vm, false); +} + +static bool trans_VSUB_hp(DisasContext *s, arg_VSUB_sp *a) +{ + return do_vfp_3op_hp(s, gen_helper_vfp_subh, a->vd, a->vn, a->vm, false); +} + +static bool trans_VSUB_sp(DisasContext *s, arg_VSUB_sp *a) +{ + return do_vfp_3op_sp(s, gen_helper_vfp_subs, a->vd, a->vn, a->vm, false); +} + +static bool trans_VSUB_dp(DisasContext *s, arg_VSUB_dp *a) +{ + return do_vfp_3op_dp(s, gen_helper_vfp_subd, a->vd, a->vn, a->vm, false); +} + +static bool trans_VDIV_hp(DisasContext *s, arg_VDIV_sp *a) +{ + return do_vfp_3op_hp(s, gen_helper_vfp_divh, a->vd, a->vn, a->vm, false); +} + +static bool trans_VDIV_sp(DisasContext *s, arg_VDIV_sp *a) +{ + return do_vfp_3op_sp(s, gen_helper_vfp_divs, a->vd, a->vn, a->vm, false); +} + +static bool trans_VDIV_dp(DisasContext *s, arg_VDIV_dp *a) +{ + return do_vfp_3op_dp(s, gen_helper_vfp_divd, a->vd, a->vn, a->vm, false); +} + +static bool trans_VMINNM_hp(DisasContext *s, arg_VMINNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_hp(s, gen_helper_vfp_minnumh, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMAXNM_hp(DisasContext *s, arg_VMAXNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_hp(s, gen_helper_vfp_maxnumh, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMINNM_sp(DisasContext *s, arg_VMINNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_sp(s, gen_helper_vfp_minnums, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMAXNM_sp(DisasContext *s, arg_VMAXNM_sp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_sp(s, gen_helper_vfp_maxnums, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMINNM_dp(DisasContext *s, arg_VMINNM_dp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_dp(s, gen_helper_vfp_minnumd, + a->vd, a->vn, a->vm, false); +} + +static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a) +{ + if (!dc_isar_feature(aa32_vminmaxnm, s)) { + return false; + } + return do_vfp_3op_dp(s, gen_helper_vfp_maxnumd, + a->vd, a->vn, a->vm, false); +} + +static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) +{ + /* + * VFNMA : fd = muladd(-fd, fn, fm) + * VFNMS : fd = muladd(-fd, -fn, fm) + * VFMA : fd = muladd( fd, fn, fm) + * VFMS : fd = muladd( fd, -fn, fm) + * + * These are fused multiply-add, and must be done as one floating + * point operation with no rounding between the multiplication and + * addition steps. NB that doing the negations here as separate + * steps is correct : an input NaN should come out with its sign + * bit flipped if it is a negated-input. + */ + TCGv_ptr fpst; + TCGv_i32 vn, vm, vd; + + /* + * Present in VFPv4 only, and only with the FP16 extension. + * Note that we can't rely on the SIMDFMAC check alone, because + * in a Neon-no-VFP core that ID register field will be non-zero. 
+ */ + if (!dc_isar_feature(aa32_fp16_arith, s) || + !dc_isar_feature(aa32_simdfmac, s) || + !dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vn = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i32(); + + vfp_load_reg32(vn, a->vn); + vfp_load_reg32(vm, a->vm); + if (neg_n) { + /* VFNMS, VFMS */ + gen_helper_vfp_negh(vn, vn); + } + vfp_load_reg32(vd, a->vd); + if (neg_d) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negh(vd, vd); + } + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst); + vfp_store_reg32(vd, a->vd); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(vn); + tcg_temp_free_i32(vm); + tcg_temp_free_i32(vd); + + return true; +} + +static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) +{ + /* + * VFNMA : fd = muladd(-fd, fn, fm) + * VFNMS : fd = muladd(-fd, -fn, fm) + * VFMA : fd = muladd( fd, fn, fm) + * VFMS : fd = muladd( fd, -fn, fm) + * + * These are fused multiply-add, and must be done as one floating + * point operation with no rounding between the multiplication and + * addition steps. NB that doing the negations here as separate + * steps is correct : an input NaN should come out with its sign + * bit flipped if it is a negated-input. + */ + TCGv_ptr fpst; + TCGv_i32 vn, vm, vd; + + /* + * Present in VFPv4 only. + * Note that we can't rely on the SIMDFMAC check alone, because + * in a Neon-no-VFP core that ID register field will be non-zero. + */ + if (!dc_isar_feature(aa32_simdfmac, s) || + !dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + /* + * In v7A, UNPREDICTABLE with non-zero vector length/stride; from + * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A. + */ + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vn = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i32(); + + vfp_load_reg32(vn, a->vn); + vfp_load_reg32(vm, a->vm); + if (neg_n) { + /* VFNMS, VFMS */ + gen_helper_vfp_negs(vn, vn); + } + vfp_load_reg32(vd, a->vd); + if (neg_d) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negs(vd, vd); + } + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_vfp_muladds(vd, vn, vm, vd, fpst); + vfp_store_reg32(vd, a->vd); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(vn); + tcg_temp_free_i32(vm); + tcg_temp_free_i32(vd); + + return true; +} + +static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d) +{ + /* + * VFNMA : fd = muladd(-fd, fn, fm) + * VFNMS : fd = muladd(-fd, -fn, fm) + * VFMA : fd = muladd( fd, fn, fm) + * VFMS : fd = muladd( fd, -fn, fm) + * + * These are fused multiply-add, and must be done as one floating + * point operation with no rounding between the multiplication and + * addition steps. NB that doing the negations here as separate + * steps is correct : an input NaN should come out with its sign + * bit flipped if it is a negated-input. + */ + TCGv_ptr fpst; + TCGv_i64 vn, vm, vd; + + /* + * Present in VFPv4 only. + * Note that we can't rely on the SIMDFMAC check alone, because + * in a Neon-no-VFP core that ID register field will be non-zero. + */ + if (!dc_isar_feature(aa32_simdfmac, s) || + !dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + /* + * In v7A, UNPREDICTABLE with non-zero vector length/stride; from + * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A. 
+ */ + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vn | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vn = tcg_temp_new_i64(); + vm = tcg_temp_new_i64(); + vd = tcg_temp_new_i64(); + + vfp_load_reg64(vn, a->vn); + vfp_load_reg64(vm, a->vm); + if (neg_n) { + /* VFNMS, VFMS */ + gen_helper_vfp_negd(vn, vn); + } + vfp_load_reg64(vd, a->vd); + if (neg_d) { + /* VFNMA, VFNMS */ + gen_helper_vfp_negd(vd, vd); + } + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_vfp_muladdd(vd, vn, vm, vd, fpst); + vfp_store_reg64(vd, a->vd); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(vn); + tcg_temp_free_i64(vm); + tcg_temp_free_i64(vd); + + return true; +} + +#define MAKE_ONE_VFM_TRANS_FN(INSN, PREC, NEGN, NEGD) \ + static bool trans_##INSN##_##PREC(DisasContext *s, \ + arg_##INSN##_##PREC *a) \ + { \ + return do_vfm_##PREC(s, a, NEGN, NEGD); \ + } + +#define MAKE_VFM_TRANS_FNS(PREC) \ + MAKE_ONE_VFM_TRANS_FN(VFMA, PREC, false, false) \ + MAKE_ONE_VFM_TRANS_FN(VFMS, PREC, true, false) \ + MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \ + MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true) + +MAKE_VFM_TRANS_FNS(hp) +MAKE_VFM_TRANS_FNS(sp) +MAKE_VFM_TRANS_FNS(dp) + +static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a) +{ + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vfp_store_reg32(tcg_constant_i32(vfp_expand_imm(MO_16, a->imm)), a->vd); + return true; +} + +static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a) +{ + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i32 fd; + uint32_t vd; + + vd = a->vd; + + if (!dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. */ + if (vfp_sreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = s->vec_stride + 1; + } + } + + fd = tcg_constant_i32(vfp_expand_imm(MO_32, a->imm)); + + for (;;) { + vfp_store_reg32(fd, vd); + + if (veclen == 0) { + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_sreg(vd, delta_d); + } + + return true; +} + +static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a) +{ + uint32_t delta_d = 0; + int veclen = s->vec_len; + TCGv_i64 fd; + uint32_t vd; + + vd = a->vd; + + if (!dc_isar_feature(aa32_fpdp_v3, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (vd & 0x10)) { + return false; + } + + if (!dc_isar_feature(aa32_fpshvec, s) && + (veclen != 0 || s->vec_stride != 0)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (veclen > 0) { + /* Figure out what type of vector operation this is. 
*/ + if (vfp_dreg_is_scalar(vd)) { + /* scalar */ + veclen = 0; + } else { + delta_d = (s->vec_stride >> 1) + 1; + } + } + + fd = tcg_constant_i64(vfp_expand_imm(MO_64, a->imm)); + + for (;;) { + vfp_store_reg64(fd, vd); + + if (veclen == 0) { + break; + } + + /* Set up the operands for the next iteration */ + veclen--; + vd = vfp_advance_dreg(vd, delta_d); + } + + return true; +} + +#define DO_VFP_2OP(INSN, PREC, FN, CHECK) \ + static bool trans_##INSN##_##PREC(DisasContext *s, \ + arg_##INSN##_##PREC *a) \ + { \ + if (!dc_isar_feature(CHECK, s)) { \ + return false; \ + } \ + return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \ + } + +#define DO_VFP_VMOV(INSN, PREC, FN) \ + static bool trans_##INSN##_##PREC(DisasContext *s, \ + arg_##INSN##_##PREC *a) \ + { \ + if (!dc_isar_feature(aa32_fp##PREC##_v2, s) && \ + !dc_isar_feature(aa32_mve, s)) { \ + return false; \ + } \ + return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \ + } + +DO_VFP_VMOV(VMOV_reg, sp, tcg_gen_mov_i32) +DO_VFP_VMOV(VMOV_reg, dp, tcg_gen_mov_i64) + +DO_VFP_2OP(VABS, hp, gen_helper_vfp_absh, aa32_fp16_arith) +DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss, aa32_fpsp_v2) +DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd, aa32_fpdp_v2) + +DO_VFP_2OP(VNEG, hp, gen_helper_vfp_negh, aa32_fp16_arith) +DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs, aa32_fpsp_v2) +DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd, aa32_fpdp_v2) + +static void gen_VSQRT_hp(TCGv_i32 vd, TCGv_i32 vm) +{ + gen_helper_vfp_sqrth(vd, vm, cpu_env); +} + +static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm) +{ + gen_helper_vfp_sqrts(vd, vm, cpu_env); +} + +static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm) +{ + gen_helper_vfp_sqrtd(vd, vm, cpu_env); +} + +DO_VFP_2OP(VSQRT, hp, gen_VSQRT_hp, aa32_fp16_arith) +DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp, aa32_fpsp_v2) +DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp, aa32_fpdp_v2) + +static bool trans_VCMP_hp(DisasContext *s, arg_VCMP_sp *a) +{ + TCGv_i32 vd, vm; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + /* Vm/M bits must be zero for the Z variant */ + if (a->z && a->vm != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + + vfp_load_reg32(vd, a->vd); + if (a->z) { + tcg_gen_movi_i32(vm, 0); + } else { + vfp_load_reg32(vm, a->vm); + } + + if (a->e) { + gen_helper_vfp_cmpeh(vd, vm, cpu_env); + } else { + gen_helper_vfp_cmph(vd, vm, cpu_env); + } + + tcg_temp_free_i32(vd); + tcg_temp_free_i32(vm); + + return true; +} + +static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a) +{ + TCGv_i32 vd, vm; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + /* Vm/M bits must be zero for the Z variant */ + if (a->z && a->vm != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i32(); + vm = tcg_temp_new_i32(); + + vfp_load_reg32(vd, a->vd); + if (a->z) { + tcg_gen_movi_i32(vm, 0); + } else { + vfp_load_reg32(vm, a->vm); + } + + if (a->e) { + gen_helper_vfp_cmpes(vd, vm, cpu_env); + } else { + gen_helper_vfp_cmps(vd, vm, cpu_env); + } + + tcg_temp_free_i32(vd); + tcg_temp_free_i32(vm); + + return true; +} + +static bool trans_VCMP_dp(DisasContext *s, arg_VCMP_dp *a) +{ + TCGv_i64 vd, vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* Vm/M bits must be zero for the Z variant */ + if (a->z && a->vm != 0) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
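+ * (aa32_simd_r32 is set only when MVFR0.SIMDReg advertises the full bank + * of 32 doubleword registers.)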
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i64(); + vm = tcg_temp_new_i64(); + + vfp_load_reg64(vd, a->vd); + if (a->z) { + tcg_gen_movi_i64(vm, 0); + } else { + vfp_load_reg64(vm, a->vm); + } + + if (a->e) { + gen_helper_vfp_cmped(vd, vm, cpu_env); + } else { + gen_helper_vfp_cmpd(vd, vm, cpu_env); + } + + tcg_temp_free_i64(vd); + tcg_temp_free_i64(vm); + + return true; +} + +static bool trans_VCVT_f32_f16(DisasContext *s, arg_VCVT_f32_f16 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + /* The T bit tells us if we want the low or high 16 bits of Vm */ + tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t)); + gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp_mode); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VCVT_f64_f16(DisasContext *s, arg_VCVT_f64_f16 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + TCGv_i64 vd; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fp16_dpconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + /* The T bit tells us if we want the low or high 16 bits of Vm */ + tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t)); + vd = tcg_temp_new_i64(); + gen_helper_vfp_fcvt_f16_to_f64(vd, tmp, fpst, ahp_mode); + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + tcg_temp_free_i64(vd); + return true; +} + +static bool trans_VCVT_b16_f32(DisasContext *s, arg_VCVT_b16_f32 *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_bf16, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + tmp = tcg_temp_new_i32(); + + vfp_load_reg32(tmp, a->vm); + gen_helper_bfcvt(tmp, tmp, fpst); + tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VCVT_f16_f32(DisasContext *s, arg_VCVT_f16_f32 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + + vfp_load_reg32(tmp, a->vm); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp_mode); + tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp_mode; + TCGv_i32 tmp; + TCGv_i64 vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_fp16_dpconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + ahp_mode = get_ahp_flag(); + tmp = tcg_temp_new_i32(); + vm = tcg_temp_new_i64(); + + vfp_load_reg64(vm, a->vm); + gen_helper_vfp_fcvt_f64_to_f16(tmp, vm, fpst, ahp_mode); + tcg_temp_free_i64(vm); + tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); + tcg_temp_free_i32(ahp_mode); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_rinth(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rints(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + vfp_load_reg64(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rintd(tmp, tmp, fpst); + vfp_store_reg64(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tmp); + return true; +} + +static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + TCGv_i32 tcg_rmode; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + tcg_rmode = tcg_const_i32(float_round_to_zero); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_rinth(tmp, tmp, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + TCGv_i32 tcg_rmode; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_rmode = tcg_const_i32(float_round_to_zero); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_rints(tmp, tmp, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + TCGv_i32 tcg_rmode; + + if 
(!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + vfp_load_reg64(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + tcg_rmode = tcg_const_i32(float_round_to_zero); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + gen_helper_rintd(tmp, tmp, fpst); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); + vfp_store_reg64(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tmp); + tcg_temp_free_i32(tcg_rmode); + return true; +} + +static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + gen_helper_rinth_exact(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a) +{ + TCGv_ptr fpst; + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i32(); + vfp_load_reg32(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rints_exact(tmp, tmp, fpst); + vfp_store_reg32(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tmp); + return true; +} + +static bool trans_VRINTX_dp(DisasContext *s, arg_VRINTX_dp *a) +{ + TCGv_ptr fpst; + TCGv_i64 tmp; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_vrint, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + tmp = tcg_temp_new_i64(); + vfp_load_reg64(tmp, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + gen_helper_rintd_exact(tmp, tmp, fpst); + vfp_store_reg64(tmp, a->vd); + tcg_temp_free_ptr(fpst); + tcg_temp_free_i64(tmp); + return true; +} + +static bool trans_VCVT_sp(DisasContext *s, arg_VCVT_sp *a) +{ + TCGv_i64 vd; + TCGv_i32 vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i64(); + vfp_load_reg32(vm, a->vm); + gen_helper_vfp_fcvtds(vd, vm, cpu_env); + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_i64(vd); + return true; +} + +static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a) +{ + TCGv_i64 vm; + TCGv_i32 vd; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vd = tcg_temp_new_i32(); + vm = tcg_temp_new_i64(); + vfp_load_reg64(vm, a->vm); + gen_helper_vfp_fcvtsd(vd, vm, cpu_env); + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_i64(vm); + return true; +} + +static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + fpst = fpstatus_ptr(FPST_FPCR_F16); + if (a->s) { + /* i32 -> f16 */ + gen_helper_vfp_sitoh(vm, vm, fpst); + } else { + /* u32 -> f16 */ + gen_helper_vfp_uitoh(vm, vm, fpst); + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + if (a->s) { + /* i32 -> f32 */ + gen_helper_vfp_sitos(vm, vm, fpst); + } else { + /* u32 -> f32 */ + gen_helper_vfp_uitos(vm, vm, fpst); + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_int_dp(DisasContext *s, arg_VCVT_int_dp *a) +{ + TCGv_i32 vm; + TCGv_i64 vd; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i32(); + vd = tcg_temp_new_i64(); + vfp_load_reg32(vm, a->vm); + fpst = fpstatus_ptr(FPST_FPCR); + if (a->s) { + /* i32 -> f64 */ + gen_helper_vfp_sitod(vd, vm, fpst); + } else { + /* u32 -> f64 */ + gen_helper_vfp_uitod(vd, vm, fpst); + } + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_i64(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a) +{ + TCGv_i32 vd; + TCGv_i64 vm; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + if (!dc_isar_feature(aa32_jscvt, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + vm = tcg_temp_new_i64(); + vd = tcg_temp_new_i32(); + vfp_load_reg64(vm, a->vm); + gen_helper_vjcvt(vd, vm, cpu_env); + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i64(vm); + tcg_temp_free_i32(vd); + return true; +} + +static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a) +{ + TCGv_i32 vd, shift; + TCGv_ptr fpst; + int frac_bits; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + frac_bits = (a->opc & 1) ? 
(32 - a->imm) : (16 - a->imm); + + vd = tcg_temp_new_i32(); + vfp_load_reg32(vd, a->vd); + + fpst = fpstatus_ptr(FPST_FPCR_F16); + shift = tcg_constant_i32(frac_bits); + + /* Switch on op:U:sx bits */ + switch (a->opc) { + case 0: + gen_helper_vfp_shtoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 1: + gen_helper_vfp_sltoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 2: + gen_helper_vfp_uhtoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 3: + gen_helper_vfp_ultoh_round_to_nearest(vd, vd, shift, fpst); + break; + case 4: + gen_helper_vfp_toshh_round_to_zero(vd, vd, shift, fpst); + break; + case 5: + gen_helper_vfp_toslh_round_to_zero(vd, vd, shift, fpst); + break; + case 6: + gen_helper_vfp_touhh_round_to_zero(vd, vd, shift, fpst); + break; + case 7: + gen_helper_vfp_toulh_round_to_zero(vd, vd, shift, fpst); + break; + default: + g_assert_not_reached(); + } + + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a) +{ + TCGv_i32 vd, shift; + TCGv_ptr fpst; + int frac_bits; + + if (!dc_isar_feature(aa32_fpsp_v3, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm); + + vd = tcg_temp_new_i32(); + vfp_load_reg32(vd, a->vd); + + fpst = fpstatus_ptr(FPST_FPCR); + shift = tcg_constant_i32(frac_bits); + + /* Switch on op:U:sx bits */ + switch (a->opc) { + case 0: + gen_helper_vfp_shtos_round_to_nearest(vd, vd, shift, fpst); + break; + case 1: + gen_helper_vfp_sltos_round_to_nearest(vd, vd, shift, fpst); + break; + case 2: + gen_helper_vfp_uhtos_round_to_nearest(vd, vd, shift, fpst); + break; + case 3: + gen_helper_vfp_ultos_round_to_nearest(vd, vd, shift, fpst); + break; + case 4: + gen_helper_vfp_toshs_round_to_zero(vd, vd, shift, fpst); + break; + case 5: + gen_helper_vfp_tosls_round_to_zero(vd, vd, shift, fpst); + break; + case 6: + gen_helper_vfp_touhs_round_to_zero(vd, vd, shift, fpst); + break; + case 7: + gen_helper_vfp_touls_round_to_zero(vd, vd, shift, fpst); + break; + default: + g_assert_not_reached(); + } + + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a) +{ + TCGv_i64 vd; + TCGv_i32 shift; + TCGv_ptr fpst; + int frac_bits; + + if (!dc_isar_feature(aa32_fpdp_v3, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + frac_bits = (a->opc & 1) ? 
(32 - a->imm) : (16 - a->imm); + + vd = tcg_temp_new_i64(); + vfp_load_reg64(vd, a->vd); + + fpst = fpstatus_ptr(FPST_FPCR); + shift = tcg_constant_i32(frac_bits); + + /* Switch on op:U:sx bits */ + switch (a->opc) { + case 0: + gen_helper_vfp_shtod_round_to_nearest(vd, vd, shift, fpst); + break; + case 1: + gen_helper_vfp_sltod_round_to_nearest(vd, vd, shift, fpst); + break; + case 2: + gen_helper_vfp_uhtod_round_to_nearest(vd, vd, shift, fpst); + break; + case 3: + gen_helper_vfp_ultod_round_to_nearest(vd, vd, shift, fpst); + break; + case 4: + gen_helper_vfp_toshd_round_to_zero(vd, vd, shift, fpst); + break; + case 5: + gen_helper_vfp_tosld_round_to_zero(vd, vd, shift, fpst); + break; + case 6: + gen_helper_vfp_touhd_round_to_zero(vd, vd, shift, fpst); + break; + case 7: + gen_helper_vfp_tould_round_to_zero(vd, vd, shift, fpst); + break; + default: + g_assert_not_reached(); + } + + vfp_store_reg64(vd, a->vd); + tcg_temp_free_i64(vd); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR_F16); + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + + if (a->s) { + if (a->rz) { + gen_helper_vfp_tosizh(vm, vm, fpst); + } else { + gen_helper_vfp_tosih(vm, vm, fpst); + } + } else { + if (a->rz) { + gen_helper_vfp_touizh(vm, vm, fpst); + } else { + gen_helper_vfp_touih(vm, vm, fpst); + } + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a) +{ + TCGv_i32 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpsp_v2, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + vm = tcg_temp_new_i32(); + vfp_load_reg32(vm, a->vm); + + if (a->s) { + if (a->rz) { + gen_helper_vfp_tosizs(vm, vm, fpst); + } else { + gen_helper_vfp_tosis(vm, vm, fpst); + } + } else { + if (a->rz) { + gen_helper_vfp_touizs(vm, vm, fpst); + } else { + gen_helper_vfp_touis(vm, vm, fpst); + } + } + vfp_store_reg32(vm, a->vd); + tcg_temp_free_i32(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a) +{ + TCGv_i32 vd; + TCGv_i64 vm; + TCGv_ptr fpst; + + if (!dc_isar_feature(aa32_fpdp_v2, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = fpstatus_ptr(FPST_FPCR); + vm = tcg_temp_new_i64(); + vd = tcg_temp_new_i32(); + vfp_load_reg64(vm, a->vm); + + if (a->s) { + if (a->rz) { + gen_helper_vfp_tosizd(vd, vm, fpst); + } else { + gen_helper_vfp_tosid(vd, vm, fpst); + } + } else { + if (a->rz) { + gen_helper_vfp_touizd(vd, vm, fpst); + } else { + gen_helper_vfp_touid(vd, vm, fpst); + } + } + vfp_store_reg32(vd, a->vd); + tcg_temp_free_i32(vd); + tcg_temp_free_i64(vm); + tcg_temp_free_ptr(fpst); + return true; +} + +static bool trans_VINS(DisasContext *s, arg_VINS *a) +{ + TCGv_i32 rd, rm; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Insert low half of Vm into high half of Vd */ + rm = tcg_temp_new_i32(); + rd = tcg_temp_new_i32(); + vfp_load_reg32(rm, a->vm); + vfp_load_reg32(rd, a->vd); + tcg_gen_deposit_i32(rd, rd, rm, 16, 16); + vfp_store_reg32(rd, a->vd); + tcg_temp_free_i32(rm); + tcg_temp_free_i32(rd); + return true; +} + +static bool trans_VMOVX(DisasContext *s, arg_VINS *a) +{ + TCGv_i32 rm; + + if (!dc_isar_feature(aa32_fp16_arith, s)) { + return false; + } + + if (s->vec_len != 0 || s->vec_stride != 0) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + /* Set Vd to high half of Vm */ + rm = tcg_temp_new_i32(); + vfp_load_reg32(rm, a->vm); + tcg_gen_shri_i32(rm, rm, 16); + vfp_store_reg32(rm, a->vd); + tcg_temp_free_i32(rm); + return true; +} diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c new file mode 100644 index 0000000..c23a346 --- /dev/null +++ b/target/arm/tcg/translate.c @@ -0,0 +1,9990 @@ +/* + * ARM translation + * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2005-2007 CodeSourcery + * Copyright (c) 2007 OpenedHand, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ +#include "qemu/osdep.h" + +#include "cpu.h" +#include "internals.h" +#include "disas/disas.h" +#include "exec/exec-all.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "qemu/log.h" +#include "qemu/bitops.h" +#include "arm_ldst.h" +#include "semihosting/semihost.h" +#include "exec/helper-proto.h" +#include "exec/helper-gen.h" +#include "exec/log.h" +#include "cpregs.h" + + +#define ENABLE_ARCH_4T arm_dc_feature(s, ARM_FEATURE_V4T) +#define ENABLE_ARCH_5 arm_dc_feature(s, ARM_FEATURE_V5) +/* currently all emulated v5 cores are also v5TE, so don't bother */ +#define ENABLE_ARCH_5TE arm_dc_feature(s, ARM_FEATURE_V5) +#define ENABLE_ARCH_5J dc_isar_feature(aa32_jazelle, s) +#define ENABLE_ARCH_6 arm_dc_feature(s, ARM_FEATURE_V6) +#define ENABLE_ARCH_6K arm_dc_feature(s, ARM_FEATURE_V6K) +#define ENABLE_ARCH_6T2 arm_dc_feature(s, ARM_FEATURE_THUMB2) +#define ENABLE_ARCH_7 arm_dc_feature(s, ARM_FEATURE_V7) +#define ENABLE_ARCH_8 arm_dc_feature(s, ARM_FEATURE_V8) + +#include "translate.h" +#include "translate-a32.h" + +/* These are TCG temporaries used only by the legacy iwMMXt decoder */ +static TCGv_i64 cpu_V0, cpu_V1, cpu_M0; +/* These are TCG globals which alias CPUARMState fields */ +static TCGv_i32 cpu_R[16]; +TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF; +TCGv_i64 cpu_exclusive_addr; +TCGv_i64 cpu_exclusive_val; + +#include "exec/gen-icount.h" + +static const char * const regnames[] = + { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" }; + + +/* initialize TCG globals. */ +void arm_translate_init(void) +{ + int i; + + for (i = 0; i < 16; i++) { + cpu_R[i] = tcg_global_mem_new_i32(cpu_env, + offsetof(CPUARMState, regs[i]), + regnames[i]); + } + cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF"); + cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF"); + cpu_VF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, VF), "VF"); + cpu_ZF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, ZF), "ZF"); + + cpu_exclusive_addr = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, exclusive_addr), "exclusive_addr"); + cpu_exclusive_val = tcg_global_mem_new_i64(cpu_env, + offsetof(CPUARMState, exclusive_val), "exclusive_val"); + + a64_translate_init(); +} + +uint64_t asimd_imm_const(uint32_t imm, int cmode, int op) +{ + /* Expand the encoded constant as per AdvSIMDExpandImm pseudocode */ + switch (cmode) { + case 0: case 1: + /* no-op */ + break; + case 2: case 3: + imm <<= 8; + break; + case 4: case 5: + imm <<= 16; + break; + case 6: case 7: + imm <<= 24; + break; + case 8: case 9: + imm |= imm << 16; + break; + case 10: case 11: + imm = (imm << 8) | (imm << 24); + break; + case 12: + imm = (imm << 8) | 0xff; + break; + case 13: + imm = (imm << 16) | 0xffff; + break; + case 14: + if (op) { + /* + * This and cmode == 15 op == 1 are the only cases where + * the top and bottom 32 bits of the encoded constant differ. 
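+ * For example imm == 0xa5 (bits 7, 5, 2 and 0 set) expands each set bit + * to an 0xff byte, giving the 64-bit constant 0xff00ff0000ff00ff.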
+ */ + uint64_t imm64 = 0; + int n; + + for (n = 0; n < 8; n++) { + if (imm & (1 << n)) { + imm64 |= (0xffULL << (n * 8)); + } + } + return imm64; + } + imm |= (imm << 8) | (imm << 16) | (imm << 24); + break; + case 15: + if (op) { + /* Reserved encoding for AArch32; valid for AArch64 */ + uint64_t imm64 = (uint64_t)(imm & 0x3f) << 48; + if (imm & 0x80) { + imm64 |= 0x8000000000000000ULL; + } + if (imm & 0x40) { + imm64 |= 0x3fc0000000000000ULL; + } else { + imm64 |= 0x4000000000000000ULL; + } + return imm64; + } + imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19) + | ((imm & 0x40) ? (0x1f << 25) : (1 << 30)); + break; + } + if (op) { + imm = ~imm; + } + return dup_const(MO_32, imm); +} + +/* Generate a label used for skipping this instruction */ +void arm_gen_condlabel(DisasContext *s) +{ + if (!s->condjmp) { + s->condlabel = gen_disas_label(s); + s->condjmp = 1; + } +} + +/* Flags for the disas_set_da_iss info argument: + * lower bits hold the Rt register number, higher bits are flags. + */ +typedef enum ISSInfo { + ISSNone = 0, + ISSRegMask = 0x1f, + ISSInvalid = (1 << 5), + ISSIsAcqRel = (1 << 6), + ISSIsWrite = (1 << 7), + ISSIs16Bit = (1 << 8), +} ISSInfo; + +/* + * Store var into env + offset to a member with size bytes. + * Free var after use. + */ +void store_cpu_offset(TCGv_i32 var, int offset, int size) +{ + switch (size) { + case 1: + tcg_gen_st8_i32(var, cpu_env, offset); + break; + case 4: + tcg_gen_st_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(var); +} + +/* Save the syndrome information for a Data Abort */ +static void disas_set_da_iss(DisasContext *s, MemOp memop, ISSInfo issinfo) +{ + uint32_t syn; + int sas = memop & MO_SIZE; + bool sse = memop & MO_SIGN; + bool is_acqrel = issinfo & ISSIsAcqRel; + bool is_write = issinfo & ISSIsWrite; + bool is_16bit = issinfo & ISSIs16Bit; + int srt = issinfo & ISSRegMask; + + if (issinfo & ISSInvalid) { + /* Some callsites want to conditionally provide ISS info, + * eg "only if this was not a writeback" + */ + return; + } + + if (srt == 15) { + /* For AArch32, insns where the src/dest is R15 never generate + * ISS information. Catching that here saves checking at all + * the call sites. + */ + return; + } + + syn = syn_data_abort_with_iss(0, sas, sse, srt, 0, is_acqrel, + 0, 0, 0, is_write, 0, is_16bit); + disas_set_insn_syndrome(s, syn); +} + +static inline int get_a32_user_mem_index(DisasContext *s) +{ + /* Return the core mmu_idx to use for A32/T32 "unprivileged load/store" + * insns: + * if PL2, UNPREDICTABLE (we choose to implement as if PL0) + * otherwise, access as if at PL0. + */ + switch (s->mmu_idx) { + case ARMMMUIdx_E3: + case ARMMMUIdx_E2: /* this one is UNPREDICTABLE */ + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E10_1: + case ARMMMUIdx_E10_1_PAN: + return arm_to_core_mmu_idx(ARMMMUIdx_E10_0); + case ARMMMUIdx_MUser: + case ARMMMUIdx_MPriv: + return arm_to_core_mmu_idx(ARMMMUIdx_MUser); + case ARMMMUIdx_MUserNegPri: + case ARMMMUIdx_MPrivNegPri: + return arm_to_core_mmu_idx(ARMMMUIdx_MUserNegPri); + case ARMMMUIdx_MSUser: + case ARMMMUIdx_MSPriv: + return arm_to_core_mmu_idx(ARMMMUIdx_MSUser); + case ARMMMUIdx_MSUserNegPri: + case ARMMMUIdx_MSPrivNegPri: + return arm_to_core_mmu_idx(ARMMMUIdx_MSUserNegPri); + default: + g_assert_not_reached(); + } +} + +/* The pc_curr difference for an architectural jump. */ +static target_long jmp_diff(DisasContext *s, target_long diff) +{ + return diff + (s->thumb ? 
4 : 8); +} + +static void gen_pc_plus_diff(DisasContext *s, TCGv_i32 var, target_long diff) +{ + assert(s->pc_save != -1); + if (TARGET_TB_PCREL) { + tcg_gen_addi_i32(var, cpu_R[15], (s->pc_curr - s->pc_save) + diff); + } else { + tcg_gen_movi_i32(var, s->pc_curr + diff); + } +} + +/* Set a variable to the value of a CPU register. */ +void load_reg_var(DisasContext *s, TCGv_i32 var, int reg) +{ + if (reg == 15) { + gen_pc_plus_diff(s, var, jmp_diff(s, 0)); + } else { + tcg_gen_mov_i32(var, cpu_R[reg]); + } +} + +/* + * Create a new temp, REG + OFS, except PC is ALIGN(PC, 4). + * This is used for load/store for which use of PC implies (literal), + * or ADD that implies ADR. + */ +TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + + if (reg == 15) { + /* + * This address is computed from an aligned PC: + * subtract off the low bits. + */ + gen_pc_plus_diff(s, tmp, jmp_diff(s, ofs - (s->pc_curr & 3))); + } else { + tcg_gen_addi_i32(tmp, cpu_R[reg], ofs); + } + return tmp; +} + +/* Set a CPU register. The source must be a temporary and will be + marked as dead. */ +void store_reg(DisasContext *s, int reg, TCGv_i32 var) +{ + if (reg == 15) { + /* In Thumb mode, we must ignore bit 0. + * In ARM mode, for ARMv4 and ARMv5, it is UNPREDICTABLE if bits [1:0] + * are not 0b00, but for ARMv6 and above, we must ignore bits [1:0]. + * We choose to ignore [1:0] in ARM mode for all architecture versions. + */ + tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3); + s->base.is_jmp = DISAS_JUMP; + s->pc_save = -1; + } else if (reg == 13 && arm_dc_feature(s, ARM_FEATURE_M)) { + /* For M-profile SP bits [1:0] are always zero */ + tcg_gen_andi_i32(var, var, ~3); + } + tcg_gen_mov_i32(cpu_R[reg], var); + tcg_temp_free_i32(var); +} + +/* + * Variant of store_reg which applies v8M stack-limit checks before updating + * SP. If the check fails this will result in an exception being taken. + * We disable the stack checks for CONFIG_USER_ONLY because we have + * no idea what the stack limits should be in that case. + * If stack checking is not being done this just acts like store_reg(). + */ +static void store_sp_checked(DisasContext *s, TCGv_i32 var) +{ +#ifndef CONFIG_USER_ONLY + if (s->v8m_stackcheck) { + gen_helper_v8m_stackcheck(cpu_env, var); + } +#endif + store_reg(s, 13, var); +} + +/* Value extensions. */ +#define gen_uxtb(var) tcg_gen_ext8u_i32(var, var) +#define gen_uxth(var) tcg_gen_ext16u_i32(var, var) +#define gen_sxtb(var) tcg_gen_ext8s_i32(var, var) +#define gen_sxth(var) tcg_gen_ext16s_i32(var, var) + +#define gen_sxtb16(var) gen_helper_sxtb16(var, var) +#define gen_uxtb16(var) gen_helper_uxtb16(var, var) + +void gen_set_cpsr(TCGv_i32 var, uint32_t mask) +{ + gen_helper_cpsr_write(cpu_env, var, tcg_constant_i32(mask)); +} + +static void gen_rebuild_hflags(DisasContext *s, bool new_el) +{ + bool m_profile = arm_dc_feature(s, ARM_FEATURE_M); + + if (new_el) { + if (m_profile) { + gen_helper_rebuild_hflags_m32_newel(cpu_env); + } else { + gen_helper_rebuild_hflags_a32_newel(cpu_env); + } + } else { + TCGv_i32 tcg_el = tcg_constant_i32(s->current_el); + if (m_profile) { + gen_helper_rebuild_hflags_m32(cpu_env, tcg_el); + } else { + gen_helper_rebuild_hflags_a32(cpu_env, tcg_el); + } + } +} + +static void gen_exception_internal(int excp) +{ + assert(excp_is_internal(excp)); + gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp)); +} + +static void gen_singlestep_exception(DisasContext *s) +{ + /* We just completed step of an insn. 
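/*
 * Sketch of the offset folded in by jmp_diff()/gen_pc_plus_diff() above
 * (illustration only; arch_visible_pc is an invented name): an AArch32
 * instruction at address A that reads the PC sees A + 8 in ARM state and
 * A + 4 in Thumb state, which is why branch targets are computed from
 * pc_curr plus that fixed offset.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static uint32_t arch_visible_pc(uint32_t insn_addr, bool thumb)
{
    return insn_addr + (thumb ? 4 : 8);
}

static void arch_visible_pc_example(void)
{
    assert(arch_visible_pc(0x8000, false) == 0x8008);   /* ARM state */
    assert(arch_visible_pc(0x8000, true) == 0x8004);    /* Thumb state */
}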
Move from Active-not-pending + * to Active-pending, and then also take the swstep exception. + * This corresponds to making the (IMPDEF) choice to prioritize + * swstep exceptions over asynchronous exceptions taken to an exception + * level where debug is disabled. This choice has the advantage that + * we do not need to maintain internal state corresponding to the + * ISV/EX syndrome bits between completion of the step and generation + * of the exception, and our syndrome information is always correct. + */ + gen_ss_advance(s); + gen_swstep_exception(s, 1, s->is_ldex); + s->base.is_jmp = DISAS_NORETURN; +} + +void clear_eci_state(DisasContext *s) +{ + /* + * Clear any ECI/ICI state: used when a load multiple/store + * multiple insn executes. + */ + if (s->eci) { + store_cpu_field_constant(0, condexec_bits); + s->eci = 0; + } +} + +static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 tmp1 = tcg_temp_new_i32(); + TCGv_i32 tmp2 = tcg_temp_new_i32(); + tcg_gen_ext16s_i32(tmp1, a); + tcg_gen_ext16s_i32(tmp2, b); + tcg_gen_mul_i32(tmp1, tmp1, tmp2); + tcg_temp_free_i32(tmp2); + tcg_gen_sari_i32(a, a, 16); + tcg_gen_sari_i32(b, b, 16); + tcg_gen_mul_i32(b, b, a); + tcg_gen_mov_i32(a, tmp1); + tcg_temp_free_i32(tmp1); +} + +/* Byteswap each halfword. */ +void gen_rev16(TCGv_i32 dest, TCGv_i32 var) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + TCGv_i32 mask = tcg_constant_i32(0x00ff00ff); + tcg_gen_shri_i32(tmp, var, 8); + tcg_gen_and_i32(tmp, tmp, mask); + tcg_gen_and_i32(var, var, mask); + tcg_gen_shli_i32(var, var, 8); + tcg_gen_or_i32(dest, var, tmp); + tcg_temp_free_i32(tmp); +} + +/* Byteswap low halfword and sign extend. */ +static void gen_revsh(TCGv_i32 dest, TCGv_i32 var) +{ + tcg_gen_bswap16_i32(var, var, TCG_BSWAP_OS); +} + +/* Dual 16-bit add. Result placed in t0 and t1 is marked as dead. + tmp = (t0 ^ t1) & 0x8000; + t0 &= ~0x8000; + t1 &= ~0x8000; + t0 = (t0 + t1) ^ tmp; + */ + +static void gen_add16(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_andi_i32(tmp, tmp, 0x8000); + tcg_gen_andi_i32(t0, t0, ~0x8000); + tcg_gen_andi_i32(t1, t1, ~0x8000); + tcg_gen_add_i32(t0, t0, t1); + tcg_gen_xor_i32(dest, t0, tmp); + tcg_temp_free_i32(tmp); +} + +/* Set N and Z flags from var. */ +static inline void gen_logic_CC(TCGv_i32 var) +{ + tcg_gen_mov_i32(cpu_NF, var); + tcg_gen_mov_i32(cpu_ZF, var); +} + +/* dest = T0 + T1 + CF. */ +static void gen_add_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + tcg_gen_add_i32(dest, t0, t1); + tcg_gen_add_i32(dest, dest, cpu_CF); +} + +/* dest = T0 - T1 + CF - 1. */ +static void gen_sub_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + tcg_gen_sub_i32(dest, t0, t1); + tcg_gen_add_i32(dest, dest, cpu_CF); + tcg_gen_subi_i32(dest, dest, 1); +} + +/* dest = T0 + T1. Compute C, N, V and Z flags */ +static void gen_add_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_movi_i32(tmp, 0); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, t1, tmp); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_mov_i32(dest, cpu_NF); +} + +/* dest = T0 + T1 + CF. 
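/*
 * Scalar illustration of the carry-blocking trick used by gen_add16()
 * above (sketch only; add16_packed is an invented name): clearing bit 15
 * of both operands stops the low-halfword add from carrying into the
 * high halfword, and the final XOR restores the carry-less sum of the
 * two bit-15 values.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t add16_packed(uint32_t t0, uint32_t t1)
{
    uint32_t tmp = (t0 ^ t1) & 0x8000;   /* carry-less sum of the bit-15 pair */

    t0 &= ~0x8000u;
    t1 &= ~0x8000u;
    return (t0 + t1) ^ tmp;
}

static void add16_packed_example(void)
{
    /* low halfword wraps (0xffff + 0x0001 = 0x0000), high halfword is 1 + 1 */
    assert(add16_packed(0x0001ffff, 0x00010001) == 0x00020000);
}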
Compute C, N, V and Z flags */ +static void gen_adc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + if (TCG_TARGET_HAS_add2_i32) { + tcg_gen_movi_i32(tmp, 0); + tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, cpu_CF, tmp); + tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1, tmp); + } else { + TCGv_i64 q0 = tcg_temp_new_i64(); + TCGv_i64 q1 = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(q0, t0); + tcg_gen_extu_i32_i64(q1, t1); + tcg_gen_add_i64(q0, q0, q1); + tcg_gen_extu_i32_i64(q1, cpu_CF); + tcg_gen_add_i64(q0, q0, q1); + tcg_gen_extr_i64_i32(cpu_NF, cpu_CF, q0); + tcg_temp_free_i64(q0); + tcg_temp_free_i64(q1); + } + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_mov_i32(dest, cpu_NF); +} + +/* dest = T0 - T1. Compute C, N, V and Z flags */ +static void gen_sub_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp; + tcg_gen_sub_i32(cpu_NF, t0, t1); + tcg_gen_mov_i32(cpu_ZF, cpu_NF); + tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0, t1); + tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); + tmp = tcg_temp_new_i32(); + tcg_gen_xor_i32(tmp, t0, t1); + tcg_gen_and_i32(cpu_VF, cpu_VF, tmp); + tcg_temp_free_i32(tmp); + tcg_gen_mov_i32(dest, cpu_NF); +} + +/* dest = T0 + ~T1 + CF. Compute C, N, V and Z flags */ +static void gen_sbc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_not_i32(tmp, t1); + gen_adc_CC(dest, t0, tmp); + tcg_temp_free_i32(tmp); +} + +#define GEN_SHIFT(name) \ +static void gen_##name(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) \ +{ \ + TCGv_i32 tmpd = tcg_temp_new_i32(); \ + TCGv_i32 tmp1 = tcg_temp_new_i32(); \ + TCGv_i32 zero = tcg_constant_i32(0); \ + tcg_gen_andi_i32(tmp1, t1, 0x1f); \ + tcg_gen_##name##_i32(tmpd, t0, tmp1); \ + tcg_gen_andi_i32(tmp1, t1, 0xe0); \ + tcg_gen_movcond_i32(TCG_COND_NE, dest, tmp1, zero, zero, tmpd); \ + tcg_temp_free_i32(tmpd); \ + tcg_temp_free_i32(tmp1); \ +} +GEN_SHIFT(shl) +GEN_SHIFT(shr) +#undef GEN_SHIFT + +static void gen_sar(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 tmp1 = tcg_temp_new_i32(); + + tcg_gen_andi_i32(tmp1, t1, 0xff); + tcg_gen_umin_i32(tmp1, tmp1, tcg_constant_i32(31)); + tcg_gen_sar_i32(dest, t0, tmp1); + tcg_temp_free_i32(tmp1); +} + +static void shifter_out_im(TCGv_i32 var, int shift) +{ + tcg_gen_extract_i32(cpu_CF, var, shift, 1); +} + +/* Shift by immediate. Includes special handling for shift == 0. 
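/*
 * Plain-C reference for the flag computation that gen_add_CC() and
 * gen_adc_CC() above express as TCG ops (sketch only; adc_flags is an
 * invented name): C is the unsigned carry out of bit 31 and V is set
 * when the operands have the same sign but the result's sign differs.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct { uint32_t res; bool n, z, c, v; } adc_result;

static adc_result adc_flags(uint32_t a, uint32_t b, bool carry_in)
{
    uint64_t wide = (uint64_t)a + b + carry_in;
    adc_result r;

    r.res = (uint32_t)wide;
    r.n = r.res >> 31;
    r.z = (r.res == 0);
    r.c = wide >> 32;                          /* carry out of bit 31 */
    r.v = (~(a ^ b) & (a ^ r.res)) >> 31;      /* signed overflow */
    return r;
}

static void adc_flags_example(void)
{
    adc_result r = adc_flags(0x7fffffff, 1, false);    /* INT_MAX + 1 */
    assert(r.res == 0x80000000 && r.n && !r.z && !r.c && r.v);
}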
*/ +static inline void gen_arm_shift_im(TCGv_i32 var, int shiftop, + int shift, int flags) +{ + switch (shiftop) { + case 0: /* LSL */ + if (shift != 0) { + if (flags) + shifter_out_im(var, 32 - shift); + tcg_gen_shli_i32(var, var, shift); + } + break; + case 1: /* LSR */ + if (shift == 0) { + if (flags) { + tcg_gen_shri_i32(cpu_CF, var, 31); + } + tcg_gen_movi_i32(var, 0); + } else { + if (flags) + shifter_out_im(var, shift - 1); + tcg_gen_shri_i32(var, var, shift); + } + break; + case 2: /* ASR */ + if (shift == 0) + shift = 32; + if (flags) + shifter_out_im(var, shift - 1); + if (shift == 32) + shift = 31; + tcg_gen_sari_i32(var, var, shift); + break; + case 3: /* ROR/RRX */ + if (shift != 0) { + if (flags) + shifter_out_im(var, shift - 1); + tcg_gen_rotri_i32(var, var, shift); break; + } else { + TCGv_i32 tmp = tcg_temp_new_i32(); + tcg_gen_shli_i32(tmp, cpu_CF, 31); + if (flags) + shifter_out_im(var, 0); + tcg_gen_shri_i32(var, var, 1); + tcg_gen_or_i32(var, var, tmp); + tcg_temp_free_i32(tmp); + } + } +}; + +static inline void gen_arm_shift_reg(TCGv_i32 var, int shiftop, + TCGv_i32 shift, int flags) +{ + if (flags) { + switch (shiftop) { + case 0: gen_helper_shl_cc(var, cpu_env, var, shift); break; + case 1: gen_helper_shr_cc(var, cpu_env, var, shift); break; + case 2: gen_helper_sar_cc(var, cpu_env, var, shift); break; + case 3: gen_helper_ror_cc(var, cpu_env, var, shift); break; + } + } else { + switch (shiftop) { + case 0: + gen_shl(var, var, shift); + break; + case 1: + gen_shr(var, var, shift); + break; + case 2: + gen_sar(var, var, shift); + break; + case 3: tcg_gen_andi_i32(shift, shift, 0x1f); + tcg_gen_rotr_i32(var, var, shift); break; + } + } + tcg_temp_free_i32(shift); +} + +/* + * Generate a conditional based on ARM condition code cc. + * This is common between ARM and Aarch64 targets. + */ +void arm_test_cc(DisasCompare *cmp, int cc) +{ + TCGv_i32 value; + TCGCond cond; + bool global = true; + + switch (cc) { + case 0: /* eq: Z */ + case 1: /* ne: !Z */ + cond = TCG_COND_EQ; + value = cpu_ZF; + break; + + case 2: /* cs: C */ + case 3: /* cc: !C */ + cond = TCG_COND_NE; + value = cpu_CF; + break; + + case 4: /* mi: N */ + case 5: /* pl: !N */ + cond = TCG_COND_LT; + value = cpu_NF; + break; + + case 6: /* vs: V */ + case 7: /* vc: !V */ + cond = TCG_COND_LT; + value = cpu_VF; + break; + + case 8: /* hi: C && !Z */ + case 9: /* ls: !C || Z -> !(C && !Z) */ + cond = TCG_COND_NE; + value = tcg_temp_new_i32(); + global = false; + /* CF is 1 for C, so -CF is an all-bits-set mask for C; + ZF is non-zero for !Z; so AND the two subexpressions. */ + tcg_gen_neg_i32(value, cpu_CF); + tcg_gen_and_i32(value, value, cpu_ZF); + break; + + case 10: /* ge: N == V -> N ^ V == 0 */ + case 11: /* lt: N != V -> N ^ V != 0 */ + /* Since we're only interested in the sign bit, == 0 is >= 0. */ + cond = TCG_COND_GE; + value = tcg_temp_new_i32(); + global = false; + tcg_gen_xor_i32(value, cpu_VF, cpu_NF); + break; + + case 12: /* gt: !Z && N == V */ + case 13: /* le: Z || N != V */ + cond = TCG_COND_NE; + value = tcg_temp_new_i32(); + global = false; + /* (N == V) is equal to the sign bit of ~(NF ^ VF). Propagate + * the sign bit then AND with ZF to yield the result. */ + tcg_gen_xor_i32(value, cpu_VF, cpu_NF); + tcg_gen_sari_i32(value, value, 31); + tcg_gen_andc_i32(value, cpu_ZF, value); + break; + + case 14: /* always */ + case 15: /* always */ + /* Use the ALWAYS condition, which will fold early. + * It doesn't matter what we use for the value. 
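/*
 * Scalar sketch of the immediate-shift special cases handled by
 * gen_arm_shift_im() above (illustration only; shift_imm is an invented
 * name): LSR #0 and ASR #0 encode a shift by 32, and ROR #0 encodes RRX,
 * a one-bit rotate through the carry flag.
 */
#include <stdint.h>

static uint32_t shift_imm(uint32_t v, int type, int sh, uint32_t carry_in,
                          uint32_t *carry_out)
{
    switch (type) {
    case 0: /* LSL: shift 0 leaves value and carry untouched */
        if (sh == 0) {
            *carry_out = carry_in;
            return v;
        }
        *carry_out = (v >> (32 - sh)) & 1;
        return v << sh;
    case 1: /* LSR: #0 means #32 */
        if (sh == 0) {
            *carry_out = v >> 31;
            return 0;
        }
        *carry_out = (v >> (sh - 1)) & 1;
        return v >> sh;
    case 2: /* ASR: #0 means #32, i.e. every bit becomes the sign bit */
        if (sh == 0) {
            sh = 32;
        }
        *carry_out = (v >> (sh - 1 > 31 ? 31 : sh - 1)) & 1;
        return (uint32_t)((int32_t)v >> (sh > 31 ? 31 : sh));
    default: /* ROR: #0 means RRX */
        if (sh == 0) {
            *carry_out = v & 1;
            return (v >> 1) | (carry_in << 31);
        }
        *carry_out = (v >> (sh - 1)) & 1;
        return (v >> sh) | (v << (32 - sh));
    }
}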
*/ + cond = TCG_COND_ALWAYS; + value = cpu_ZF; + goto no_invert; + + default: + fprintf(stderr, "Bad condition code 0x%x\n", cc); + abort(); + } + + if (cc & 1) { + cond = tcg_invert_cond(cond); + } + + no_invert: + cmp->cond = cond; + cmp->value = value; + cmp->value_global = global; +} + +void arm_free_cc(DisasCompare *cmp) +{ + if (!cmp->value_global) { + tcg_temp_free_i32(cmp->value); + } +} + +void arm_jump_cc(DisasCompare *cmp, TCGLabel *label) +{ + tcg_gen_brcondi_i32(cmp->cond, cmp->value, 0, label); +} + +void arm_gen_test_cc(int cc, TCGLabel *label) +{ + DisasCompare cmp; + arm_test_cc(&cmp, cc); + arm_jump_cc(&cmp, label); + arm_free_cc(&cmp); +} + +void gen_set_condexec(DisasContext *s) +{ + if (s->condexec_mask) { + uint32_t val = (s->condexec_cond << 4) | (s->condexec_mask >> 1); + + store_cpu_field_constant(val, condexec_bits); + } +} + +void gen_update_pc(DisasContext *s, target_long diff) +{ + gen_pc_plus_diff(s, cpu_R[15], diff); + s->pc_save = s->pc_curr + diff; +} + +/* Set PC and Thumb state from var. var is marked as dead. */ +static inline void gen_bx(DisasContext *s, TCGv_i32 var) +{ + s->base.is_jmp = DISAS_JUMP; + tcg_gen_andi_i32(cpu_R[15], var, ~1); + tcg_gen_andi_i32(var, var, 1); + store_cpu_field(var, thumb); + s->pc_save = -1; +} + +/* + * Set PC and Thumb state from var. var is marked as dead. + * For M-profile CPUs, include logic to detect exception-return + * branches and handle them. This is needed for Thumb POP/LDM to PC, LDR to PC, + * and BX reg, and no others, and happens only for code in Handler mode. + * The Security Extension also requires us to check for the FNC_RETURN + * which signals a function return from non-secure state; this can happen + * in both Handler and Thread mode. + * To avoid having to do multiple comparisons in inline generated code, + * we make the check we do here loose, so it will match for EXC_RETURN + * in Thread mode. For system emulation do_v7m_exception_exit() checks + * for these spurious cases and returns without doing anything (giving + * the same behaviour as for a branch to a non-magic address). + * + * In linux-user mode it is unclear what the right behaviour for an + * attempted FNC_RETURN should be, because in real hardware this will go + * directly to Secure code (ie not the Linux kernel) which will then treat + * the error in any way it chooses. For QEMU we opt to make the FNC_RETURN + * attempt behave the way it would on a CPU without the security extension, + * which is to say "like a normal branch". That means we can simply treat + * all branches as normal with no magic address behaviour. + */ +static inline void gen_bx_excret(DisasContext *s, TCGv_i32 var) +{ + /* Generate the same code here as for a simple bx, but flag via + * s->base.is_jmp that we need to do the rest of the work later. + */ + gen_bx(s, var); +#ifndef CONFIG_USER_ONLY + if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY) || + (s->v7m_handler_mode && arm_dc_feature(s, ARM_FEATURE_M))) { + s->base.is_jmp = DISAS_BX_EXCRET; + } +#endif +} + +static inline void gen_bx_excret_final_code(DisasContext *s) +{ + /* Generate the code to finish possible exception return and end the TB */ + DisasLabel excret_label = gen_disas_label(s); + uint32_t min_magic; + + if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY)) { + /* Covers FNC_RETURN and EXC_RETURN magic */ + min_magic = FNC_RETURN_MIN_MAGIC; + } else { + /* EXC_RETURN magic only */ + min_magic = EXC_RETURN_MIN_MAGIC; + } + + /* Is the new PC value in the magic range indicating exception return? 
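/*
 * Plain-C reference for the condition codes that arm_test_cc() above
 * lowers to TCG comparisons (sketch only; cond_holds is an invented
 * name): bits [3:1] of cc select the base test, bit 0 inverts it, and
 * 0b111x is "always".
 */
#include <stdbool.h>

static bool cond_holds(int cc, bool n, bool z, bool c, bool v)
{
    bool r;

    switch (cc >> 1) {
    case 0: r = z; break;                 /* EQ / NE */
    case 1: r = c; break;                 /* CS / CC */
    case 2: r = n; break;                 /* MI / PL */
    case 3: r = v; break;                 /* VS / VC */
    case 4: r = c && !z; break;           /* HI / LS */
    case 5: r = n == v; break;            /* GE / LT */
    case 6: r = !z && n == v; break;      /* GT / LE */
    default: return true;                 /* AL, and the 0b1111 encoding */
    }
    return (cc & 1) ? !r : r;
}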
*/ + tcg_gen_brcondi_i32(TCG_COND_GEU, cpu_R[15], min_magic, excret_label.label); + /* No: end the TB as we would for a DISAS_JMP */ + if (s->ss_active) { + gen_singlestep_exception(s); + } else { + tcg_gen_exit_tb(NULL, 0); + } + set_disas_label(s, excret_label); + /* Yes: this is an exception return. + * At this point in runtime env->regs[15] and env->thumb will hold + * the exception-return magic number, which do_v7m_exception_exit() + * will read. Nothing else will be able to see those values because + * the cpu-exec main loop guarantees that we will always go straight + * from raising the exception to the exception-handling code. + * + * gen_ss_advance(s) does nothing on M profile currently but + * calling it is conceptually the right thing as we have executed + * this instruction (compare SWI, HVC, SMC handling). + */ + gen_ss_advance(s); + gen_exception_internal(EXCP_EXCEPTION_EXIT); +} + +static inline void gen_bxns(DisasContext *s, int rm) +{ + TCGv_i32 var = load_reg(s, rm); + + /* The bxns helper may raise an EXCEPTION_EXIT exception, so in theory + * we need to sync state before calling it, but: + * - we don't need to do gen_update_pc() because the bxns helper will + * always set the PC itself + * - we don't need to do gen_set_condexec() because BXNS is UNPREDICTABLE + * unless it's outside an IT block or the last insn in an IT block, + * so we know that condexec == 0 (already set at the top of the TB) + * is correct in the non-UNPREDICTABLE cases, and we can choose + * "zeroes the IT bits" as our UNPREDICTABLE behaviour otherwise. + */ + gen_helper_v7m_bxns(cpu_env, var); + tcg_temp_free_i32(var); + s->base.is_jmp = DISAS_EXIT; +} + +static inline void gen_blxns(DisasContext *s, int rm) +{ + TCGv_i32 var = load_reg(s, rm); + + /* We don't need to sync condexec state, for the same reason as bxns. + * We do however need to set the PC, because the blxns helper reads it. + * The blxns helper may throw an exception. + */ + gen_update_pc(s, curr_insn_len(s)); + gen_helper_v7m_blxns(cpu_env, var); + tcg_temp_free_i32(var); + s->base.is_jmp = DISAS_EXIT; +} + +/* Variant of store_reg which uses branch&exchange logic when storing + to r15 in ARM architecture v7 and above. The source must be a temporary + and will be marked as dead. */ +static inline void store_reg_bx(DisasContext *s, int reg, TCGv_i32 var) +{ + if (reg == 15 && ENABLE_ARCH_7) { + gen_bx(s, var); + } else { + store_reg(s, reg, var); + } +} + +/* Variant of store_reg which uses branch&exchange logic when storing + * to r15 in ARM architecture v5T and above. This is used for storing + * the results of a LDR/LDM/POP into r15, and corresponds to the cases + * in the ARM ARM which use the LoadWritePC() pseudocode function. */ +static inline void store_reg_from_load(DisasContext *s, int reg, TCGv_i32 var) +{ + if (reg == 15 && ENABLE_ARCH_5) { + gen_bx_excret(s, var); + } else { + store_reg(s, reg, var); + } +} + +#ifdef CONFIG_USER_ONLY +#define IS_USER_ONLY 1 +#else +#define IS_USER_ONLY 0 +#endif + +MemOp pow2_align(unsigned i) +{ + static const MemOp mop_align[] = { + 0, MO_ALIGN_2, MO_ALIGN_4, MO_ALIGN_8, MO_ALIGN_16, + /* + * FIXME: TARGET_PAGE_BITS_MIN affects TLB_FLAGS_MASK such + * that 256-bit alignment (MO_ALIGN_32) cannot be supported: + * see get_alignment_bits(). Enforce only 128-bit alignment for now. 
+ */ + MO_ALIGN_16 + }; + g_assert(i < ARRAY_SIZE(mop_align)); + return mop_align[i]; +} + +/* + * Abstractions of "generate code to do a guest load/store for + * AArch32", where a vaddr is always 32 bits (and is zero + * extended if we're a 64 bit core) and data is also + * 32 bits unless specifically doing a 64 bit access. + * These functions work like tcg_gen_qemu_{ld,st}* except + * that the address argument is TCGv_i32 rather than TCGv. + */ + +static TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, MemOp op) +{ + TCGv addr = tcg_temp_new(); + tcg_gen_extu_i32_tl(addr, a32); + + /* Not needed for user-mode BE32, where we use MO_BE instead. */ + if (!IS_USER_ONLY && s->sctlr_b && (op & MO_SIZE) < MO_32) { + tcg_gen_xori_tl(addr, addr, 4 - (1 << (op & MO_SIZE))); + } + return addr; +} + +/* + * Internal routines are used for NEON cases where the endianness + * and/or alignment has already been taken into account and manipulated. + */ +void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_ld_i32(val, addr, index, opc); + tcg_temp_free(addr); +} + +void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + tcg_gen_qemu_st_i32(val, addr, index, opc); + tcg_temp_free(addr); +} + +void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + + tcg_gen_qemu_ld_i64(val, addr, index, opc); + + /* Not needed for user-mode BE32, where we use MO_BE instead. */ + if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) { + tcg_gen_rotri_i64(val, val, 32); + } + tcg_temp_free(addr); +} + +void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val, + TCGv_i32 a32, int index, MemOp opc) +{ + TCGv addr = gen_aa32_addr(s, a32, opc); + + /* Not needed for user-mode BE32, where we use MO_BE instead. 
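/*
 * Sketch of the BE32 (SCTLR.B) address fixup that gen_aa32_addr() above
 * applies to sub-word accesses (illustration only; be32_adjust is an
 * invented name): XORing the low address bits makes a byte or halfword
 * access hit the big-endian position within the little-endian word.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t be32_adjust(uint32_t addr, unsigned size_log2)
{
    /* size_log2 is 0 for a byte access, 1 for a halfword access */
    return addr ^ (4 - (1u << size_log2));
}

static void be32_adjust_example(void)
{
    assert(be32_adjust(0x1000, 0) == 0x1003);   /* byte 0 -> byte 3 of the word */
    assert(be32_adjust(0x1002, 1) == 0x1000);   /* upper halfword -> lower */
}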
*/ + if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) { + TCGv_i64 tmp = tcg_temp_new_i64(); + tcg_gen_rotri_i64(tmp, val, 32); + tcg_gen_qemu_st_i64(tmp, addr, index, opc); + tcg_temp_free_i64(tmp); + } else { + tcg_gen_qemu_st_i64(val, addr, index, opc); + } + tcg_temp_free(addr); +} + +void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_ld_internal_i32(s, val, a32, index, finalize_memop(s, opc)); +} + +void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_st_internal_i32(s, val, a32, index, finalize_memop(s, opc)); +} + +void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_ld_internal_i64(s, val, a32, index, finalize_memop(s, opc)); +} + +void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, + int index, MemOp opc) +{ + gen_aa32_st_internal_i64(s, val, a32, index, finalize_memop(s, opc)); +} + +#define DO_GEN_LD(SUFF, OPC) \ + static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \ + TCGv_i32 a32, int index) \ + { \ + gen_aa32_ld_i32(s, val, a32, index, OPC); \ + } + +#define DO_GEN_ST(SUFF, OPC) \ + static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \ + TCGv_i32 a32, int index) \ + { \ + gen_aa32_st_i32(s, val, a32, index, OPC); \ + } + +static inline void gen_hvc(DisasContext *s, int imm16) +{ + /* The pre HVC helper handles cases when HVC gets trapped + * as an undefined insn by runtime configuration (ie before + * the insn really executes). + */ + gen_update_pc(s, 0); + gen_helper_pre_hvc(cpu_env); + /* Otherwise we will treat this as a real exception which + * happens after execution of the insn. (The distinction matters + * for the PC value reported to the exception handler and also + * for single stepping.) + */ + s->svc_imm = imm16; + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_HVC; +} + +static inline void gen_smc(DisasContext *s) +{ + /* As with HVC, we may take an exception either before or after + * the insn executes. 
+ */ + gen_update_pc(s, 0); + gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa32_smc())); + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_SMC; +} + +static void gen_exception_internal_insn(DisasContext *s, int excp) +{ + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_exception_internal(excp); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_exception_el_v(int excp, uint32_t syndrome, TCGv_i32 tcg_el) +{ + gen_helper_exception_with_syndrome_el(cpu_env, tcg_constant_i32(excp), + tcg_constant_i32(syndrome), tcg_el); +} + +static void gen_exception_el(int excp, uint32_t syndrome, uint32_t target_el) +{ + gen_exception_el_v(excp, syndrome, tcg_constant_i32(target_el)); +} + +static void gen_exception(int excp, uint32_t syndrome) +{ + gen_helper_exception_with_syndrome(cpu_env, tcg_constant_i32(excp), + tcg_constant_i32(syndrome)); +} + +static void gen_exception_insn_el_v(DisasContext *s, target_long pc_diff, + int excp, uint32_t syn, TCGv_i32 tcg_el) +{ + if (s->aarch64) { + gen_a64_update_pc(s, pc_diff); + } else { + gen_set_condexec(s); + gen_update_pc(s, pc_diff); + } + gen_exception_el_v(excp, syn, tcg_el); + s->base.is_jmp = DISAS_NORETURN; +} + +void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, + uint32_t syn, uint32_t target_el) +{ + gen_exception_insn_el_v(s, pc_diff, excp, syn, + tcg_constant_i32(target_el)); +} + +void gen_exception_insn(DisasContext *s, target_long pc_diff, + int excp, uint32_t syn) +{ + if (s->aarch64) { + gen_a64_update_pc(s, pc_diff); + } else { + gen_set_condexec(s); + gen_update_pc(s, pc_diff); + } + gen_exception(excp, syn); + s->base.is_jmp = DISAS_NORETURN; +} + +static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syn) +{ + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syn)); + s->base.is_jmp = DISAS_NORETURN; +} + +void unallocated_encoding(DisasContext *s) +{ + /* Unallocated and reserved encodings are uncategorized */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized()); +} + +/* Force a TB lookup after an instruction that changes the CPU state. */ +void gen_lookup_tb(DisasContext *s) +{ + gen_pc_plus_diff(s, cpu_R[15], curr_insn_len(s)); + s->base.is_jmp = DISAS_EXIT; +} + +static inline void gen_hlt(DisasContext *s, int imm) +{ + /* HLT. This has two purposes. + * Architecturally, it is an external halting debug instruction. + * Since QEMU doesn't implement external debug, we treat this as + * it is required for halting debug disabled: it will UNDEF. + * Secondly, "HLT 0x3C" is a T32 semihosting trap instruction, + * and "HLT 0xF000" is an A32 semihosting syscall. These traps + * must trigger semihosting even for ARMv7 and earlier, where + * HLT was an undefined encoding. + * In system mode, we don't allow userspace access to + * semihosting, to provide some semblance of security + * (and for consistency with our 32-bit semihosting). + */ + if (semihosting_enabled(s->current_el == 0) && + (imm == (s->thumb ? 0x3c : 0xf000))) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + return; + } + + unallocated_encoding(s); +} + +/* + * Return the offset of a "full" NEON Dreg. + */ +long neon_full_reg_offset(unsigned reg) +{ + return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]); +} + +/* + * Return the offset of a 2**SIZE piece of a NEON register, at index ELE, + * where 0 is the least significant end of the register. 
+ */ +long neon_element_offset(int reg, int element, MemOp memop) +{ + int element_size = 1 << (memop & MO_SIZE); + int ofs = element * element_size; +#if HOST_BIG_ENDIAN + /* + * Calculate the offset assuming fully little-endian, + * then XOR to account for the order of the 8-byte units. + */ + if (element_size < 8) { + ofs ^= 8 - element_size; + } +#endif + return neon_full_reg_offset(reg) + ofs; +} + +/* Return the offset of a VFP Dreg (dp = true) or VFP Sreg (dp = false). */ +long vfp_reg_offset(bool dp, unsigned reg) +{ + if (dp) { + return neon_element_offset(reg, 0, MO_64); + } else { + return neon_element_offset(reg >> 1, reg & 1, MO_32); + } +} + +void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_SB: + tcg_gen_ld8s_i32(dest, cpu_env, off); + break; + case MO_UB: + tcg_gen_ld8u_i32(dest, cpu_env, off); + break; + case MO_SW: + tcg_gen_ld16s_i32(dest, cpu_env, off); + break; + case MO_UW: + tcg_gen_ld16u_i32(dest, cpu_env, off); + break; + case MO_UL: + case MO_SL: + tcg_gen_ld_i32(dest, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_SL: + tcg_gen_ld32s_i64(dest, cpu_env, off); + break; + case MO_UL: + tcg_gen_ld32u_i64(dest, cpu_env, off); + break; + case MO_UQ: + tcg_gen_ld_i64(dest, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_8: + tcg_gen_st8_i32(src, cpu_env, off); + break; + case MO_16: + tcg_gen_st16_i32(src, cpu_env, off); + break; + case MO_32: + tcg_gen_st_i32(src, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop) +{ + long off = neon_element_offset(reg, ele, memop); + + switch (memop) { + case MO_32: + tcg_gen_st32_i64(src, cpu_env, off); + break; + case MO_64: + tcg_gen_st_i64(src, cpu_env, off); + break; + default: + g_assert_not_reached(); + } +} + +#define ARM_CP_RW_BIT (1 << 20) + +static inline void iwmmxt_load_reg(TCGv_i64 var, int reg) +{ + tcg_gen_ld_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg])); +} + +static inline void iwmmxt_store_reg(TCGv_i64 var, int reg) +{ + tcg_gen_st_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg])); +} + +static inline TCGv_i32 iwmmxt_load_creg(int reg) +{ + TCGv_i32 var = tcg_temp_new_i32(); + tcg_gen_ld_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg])); + return var; +} + +static inline void iwmmxt_store_creg(int reg, TCGv_i32 var) +{ + tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg])); + tcg_temp_free_i32(var); +} + +static inline void gen_op_iwmmxt_movq_wRn_M0(int rn) +{ + iwmmxt_store_reg(cpu_M0, rn); +} + +static inline void gen_op_iwmmxt_movq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_M0, rn); +} + +static inline void gen_op_iwmmxt_orq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_or_i64(cpu_M0, cpu_M0, cpu_V1); +} + +static inline void gen_op_iwmmxt_andq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_and_i64(cpu_M0, cpu_M0, cpu_V1); +} + +static inline void gen_op_iwmmxt_xorq_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_xor_i64(cpu_M0, cpu_M0, cpu_V1); +} + +#define IWMMXT_OP(name) \ +static 
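/*
 * Sketch of the host-endian fixup in neon_element_offset() above
 * (illustration only; element_offset is an invented name): the offset is
 * computed as if the host were little-endian and, on a big-endian host,
 * XORed with (8 - element_size) so the element still lands in the right
 * bytes of each 8-byte unit.
 */
#include <stdbool.h>

static long element_offset(int element, int element_size, bool big_endian_host)
{
    long ofs = (long)element * element_size;

    if (big_endian_host && element_size < 8) {
        ofs ^= 8 - element_size;
    }
    /* e.g. 16-bit lane 0 sits at byte 0 on an LE host, byte 6 on a BE host */
    return ofs;
}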
inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \ +{ \ + iwmmxt_load_reg(cpu_V1, rn); \ + gen_helper_iwmmxt_##name(cpu_M0, cpu_M0, cpu_V1); \ +} + +#define IWMMXT_OP_ENV(name) \ +static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \ +{ \ + iwmmxt_load_reg(cpu_V1, rn); \ + gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0, cpu_V1); \ +} + +#define IWMMXT_OP_ENV_SIZE(name) \ +IWMMXT_OP_ENV(name##b) \ +IWMMXT_OP_ENV(name##w) \ +IWMMXT_OP_ENV(name##l) + +#define IWMMXT_OP_ENV1(name) \ +static inline void gen_op_iwmmxt_##name##_M0(void) \ +{ \ + gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0); \ +} + +IWMMXT_OP(maddsq) +IWMMXT_OP(madduq) +IWMMXT_OP(sadb) +IWMMXT_OP(sadw) +IWMMXT_OP(mulslw) +IWMMXT_OP(mulshw) +IWMMXT_OP(mululw) +IWMMXT_OP(muluhw) +IWMMXT_OP(macsw) +IWMMXT_OP(macuw) + +IWMMXT_OP_ENV_SIZE(unpackl) +IWMMXT_OP_ENV_SIZE(unpackh) + +IWMMXT_OP_ENV1(unpacklub) +IWMMXT_OP_ENV1(unpackluw) +IWMMXT_OP_ENV1(unpacklul) +IWMMXT_OP_ENV1(unpackhub) +IWMMXT_OP_ENV1(unpackhuw) +IWMMXT_OP_ENV1(unpackhul) +IWMMXT_OP_ENV1(unpacklsb) +IWMMXT_OP_ENV1(unpacklsw) +IWMMXT_OP_ENV1(unpacklsl) +IWMMXT_OP_ENV1(unpackhsb) +IWMMXT_OP_ENV1(unpackhsw) +IWMMXT_OP_ENV1(unpackhsl) + +IWMMXT_OP_ENV_SIZE(cmpeq) +IWMMXT_OP_ENV_SIZE(cmpgtu) +IWMMXT_OP_ENV_SIZE(cmpgts) + +IWMMXT_OP_ENV_SIZE(mins) +IWMMXT_OP_ENV_SIZE(minu) +IWMMXT_OP_ENV_SIZE(maxs) +IWMMXT_OP_ENV_SIZE(maxu) + +IWMMXT_OP_ENV_SIZE(subn) +IWMMXT_OP_ENV_SIZE(addn) +IWMMXT_OP_ENV_SIZE(subu) +IWMMXT_OP_ENV_SIZE(addu) +IWMMXT_OP_ENV_SIZE(subs) +IWMMXT_OP_ENV_SIZE(adds) + +IWMMXT_OP_ENV(avgb0) +IWMMXT_OP_ENV(avgb1) +IWMMXT_OP_ENV(avgw0) +IWMMXT_OP_ENV(avgw1) + +IWMMXT_OP_ENV(packuw) +IWMMXT_OP_ENV(packul) +IWMMXT_OP_ENV(packuq) +IWMMXT_OP_ENV(packsw) +IWMMXT_OP_ENV(packsl) +IWMMXT_OP_ENV(packsq) + +static void gen_op_iwmmxt_set_mup(void) +{ + TCGv_i32 tmp; + tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]); + tcg_gen_ori_i32(tmp, tmp, 2); + store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]); +} + +static void gen_op_iwmmxt_set_cup(void) +{ + TCGv_i32 tmp; + tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]); + tcg_gen_ori_i32(tmp, tmp, 1); + store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]); +} + +static void gen_op_iwmmxt_setpsr_nz(void) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + gen_helper_iwmmxt_setpsr_nz(tmp, cpu_M0); + store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCASF]); +} + +static inline void gen_op_iwmmxt_addl_M0_wRn(int rn) +{ + iwmmxt_load_reg(cpu_V1, rn); + tcg_gen_ext32u_i64(cpu_V1, cpu_V1); + tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1); +} + +static inline int gen_iwmmxt_address(DisasContext *s, uint32_t insn, + TCGv_i32 dest) +{ + int rd; + uint32_t offset; + TCGv_i32 tmp; + + rd = (insn >> 16) & 0xf; + tmp = load_reg(s, rd); + + offset = (insn & 0xff) << ((insn >> 7) & 2); + if (insn & (1 << 24)) { + /* Pre indexed */ + if (insn & (1 << 23)) + tcg_gen_addi_i32(tmp, tmp, offset); + else + tcg_gen_addi_i32(tmp, tmp, -offset); + tcg_gen_mov_i32(dest, tmp); + if (insn & (1 << 21)) + store_reg(s, rd, tmp); + else + tcg_temp_free_i32(tmp); + } else if (insn & (1 << 21)) { + /* Post indexed */ + tcg_gen_mov_i32(dest, tmp); + if (insn & (1 << 23)) + tcg_gen_addi_i32(tmp, tmp, offset); + else + tcg_gen_addi_i32(tmp, tmp, -offset); + store_reg(s, rd, tmp); + } else if (!(insn & (1 << 23))) + return 1; + return 0; +} + +static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv_i32 dest) +{ + int rd = (insn >> 0) & 0xf; + TCGv_i32 tmp; + + if (insn & (1 << 8)) { + if (rd < ARM_IWMMXT_wCGR0 || rd > ARM_IWMMXT_wCGR3) { + return 1; + } 
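/*
 * Sketch of the offset handling in gen_iwmmxt_address() above
 * (illustration only; both helper names are invented): the 8-bit
 * immediate is scaled by 4 when insn bit 8 is set ((insn >> 7) & 2
 * yields a shift of 0 or 2), and the U bit (23) selects add versus
 * subtract from the base register.
 */
#include <stdint.h>

static uint32_t iwmmxt_offset(uint32_t insn)
{
    return (insn & 0xff) << ((insn >> 7) & 2);
}

static uint32_t iwmmxt_apply_offset(uint32_t base, uint32_t insn)
{
    return (insn & (1u << 23)) ? base + iwmmxt_offset(insn)
                               : base - iwmmxt_offset(insn);
}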
else { + tmp = iwmmxt_load_creg(rd); + } + } else { + tmp = tcg_temp_new_i32(); + iwmmxt_load_reg(cpu_V0, rd); + tcg_gen_extrl_i64_i32(tmp, cpu_V0); + } + tcg_gen_andi_i32(tmp, tmp, mask); + tcg_gen_mov_i32(dest, tmp); + tcg_temp_free_i32(tmp); + return 0; +} + +/* Disassemble an iwMMXt instruction. Returns nonzero if an error occurred + (ie. an undefined instruction). */ +static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn) +{ + int rd, wrd; + int rdhi, rdlo, rd0, rd1, i; + TCGv_i32 addr; + TCGv_i32 tmp, tmp2, tmp3; + + if ((insn & 0x0e000e00) == 0x0c000000) { + if ((insn & 0x0fe00ff0) == 0x0c400000) { + wrd = insn & 0xf; + rdlo = (insn >> 12) & 0xf; + rdhi = (insn >> 16) & 0xf; + if (insn & ARM_CP_RW_BIT) { /* TMRRC */ + iwmmxt_load_reg(cpu_V0, wrd); + tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0); + tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0); + } else { /* TMCRR */ + tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]); + iwmmxt_store_reg(cpu_V0, wrd); + gen_op_iwmmxt_set_mup(); + } + return 0; + } + + wrd = (insn >> 12) & 0xf; + addr = tcg_temp_new_i32(); + if (gen_iwmmxt_address(s, insn, addr)) { + tcg_temp_free_i32(addr); + return 1; + } + if (insn & ARM_CP_RW_BIT) { + if ((insn >> 28) == 0xf) { /* WLDRW wCx */ + tmp = tcg_temp_new_i32(); + gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); + iwmmxt_store_creg(wrd, tmp); + } else { + i = 1; + if (insn & (1 << 8)) { + if (insn & (1 << 22)) { /* WLDRD */ + gen_aa32_ld64(s, cpu_M0, addr, get_mem_index(s)); + i = 0; + } else { /* WLDRW wRd */ + tmp = tcg_temp_new_i32(); + gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); + } + } else { + tmp = tcg_temp_new_i32(); + if (insn & (1 << 22)) { /* WLDRH */ + gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); + } else { /* WLDRB */ + gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); + } + } + if (i) { + tcg_gen_extu_i32_i64(cpu_M0, tmp); + tcg_temp_free_i32(tmp); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + } + } else { + if ((insn >> 28) == 0xf) { /* WSTRW wCx */ + tmp = iwmmxt_load_creg(wrd); + gen_aa32_st32(s, tmp, addr, get_mem_index(s)); + } else { + gen_op_iwmmxt_movq_M0_wRn(wrd); + tmp = tcg_temp_new_i32(); + if (insn & (1 << 8)) { + if (insn & (1 << 22)) { /* WSTRD */ + gen_aa32_st64(s, cpu_M0, addr, get_mem_index(s)); + } else { /* WSTRW wRd */ + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + gen_aa32_st32(s, tmp, addr, get_mem_index(s)); + } + } else { + if (insn & (1 << 22)) { /* WSTRH */ + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + gen_aa32_st16(s, tmp, addr, get_mem_index(s)); + } else { /* WSTRB */ + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + gen_aa32_st8(s, tmp, addr, get_mem_index(s)); + } + } + } + tcg_temp_free_i32(tmp); + } + tcg_temp_free_i32(addr); + return 0; + } + + if ((insn & 0x0f000000) != 0x0e000000) + return 1; + + switch (((insn >> 12) & 0xf00) | ((insn >> 4) & 0xff)) { + case 0x000: /* WOR */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + gen_op_iwmmxt_orq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x011: /* TMCR */ + if (insn & 0xf) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + switch (wrd) { + case ARM_IWMMXT_wCID: + case ARM_IWMMXT_wCASF: + break; + case ARM_IWMMXT_wCon: + gen_op_iwmmxt_set_cup(); + /* Fall through. 
*/ + case ARM_IWMMXT_wCSSF: + tmp = iwmmxt_load_creg(wrd); + tmp2 = load_reg(s, rd); + tcg_gen_andc_i32(tmp, tmp, tmp2); + tcg_temp_free_i32(tmp2); + iwmmxt_store_creg(wrd, tmp); + break; + case ARM_IWMMXT_wCGR0: + case ARM_IWMMXT_wCGR1: + case ARM_IWMMXT_wCGR2: + case ARM_IWMMXT_wCGR3: + gen_op_iwmmxt_set_cup(); + tmp = load_reg(s, rd); + iwmmxt_store_creg(wrd, tmp); + break; + default: + return 1; + } + break; + case 0x100: /* WXOR */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + gen_op_iwmmxt_xorq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x111: /* TMRC */ + if (insn & 0xf) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + tmp = iwmmxt_load_creg(wrd); + store_reg(s, rd, tmp); + break; + case 0x300: /* WANDN */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tcg_gen_neg_i64(cpu_M0, cpu_M0); + gen_op_iwmmxt_andq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x200: /* WAND */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + gen_op_iwmmxt_andq_M0_wRn(rd1); + gen_op_iwmmxt_setpsr_nz(); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x810: case 0xa10: /* WMADD */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 0) & 0xf; + rd1 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 21)) + gen_op_iwmmxt_maddsq_M0_wRn(rd1); + else + gen_op_iwmmxt_madduq_M0_wRn(rd1); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x10e: case 0x50e: case 0x90e: case 0xd0e: /* WUNPCKIL */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_op_iwmmxt_unpacklb_M0_wRn(rd1); + break; + case 1: + gen_op_iwmmxt_unpacklw_M0_wRn(rd1); + break; + case 2: + gen_op_iwmmxt_unpackll_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x10c: case 0x50c: case 0x90c: case 0xd0c: /* WUNPCKIH */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_op_iwmmxt_unpackhb_M0_wRn(rd1); + break; + case 1: + gen_op_iwmmxt_unpackhw_M0_wRn(rd1); + break; + case 2: + gen_op_iwmmxt_unpackhl_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x012: case 0x112: case 0x412: case 0x512: /* WSAD */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 22)) + gen_op_iwmmxt_sadw_M0_wRn(rd1); + else + gen_op_iwmmxt_sadb_M0_wRn(rd1); + if (!(insn & (1 << 20))) + gen_op_iwmmxt_addl_M0_wRn(wrd); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x010: case 0x110: case 0x210: case 0x310: /* WMUL */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 21)) { + if (insn & (1 << 20)) + gen_op_iwmmxt_mulshw_M0_wRn(rd1); + else + 
gen_op_iwmmxt_mulslw_M0_wRn(rd1); + } else { + if (insn & (1 << 20)) + gen_op_iwmmxt_muluhw_M0_wRn(rd1); + else + gen_op_iwmmxt_mululw_M0_wRn(rd1); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x410: case 0x510: case 0x610: case 0x710: /* WMAC */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 21)) + gen_op_iwmmxt_macsw_M0_wRn(rd1); + else + gen_op_iwmmxt_macuw_M0_wRn(rd1); + if (!(insn & (1 << 20))) { + iwmmxt_load_reg(cpu_V1, wrd); + tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x006: case 0x406: case 0x806: case 0xc06: /* WCMPEQ */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_op_iwmmxt_cmpeqb_M0_wRn(rd1); + break; + case 1: + gen_op_iwmmxt_cmpeqw_M0_wRn(rd1); + break; + case 2: + gen_op_iwmmxt_cmpeql_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x800: case 0x900: case 0xc00: case 0xd00: /* WAVG2 */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + if (insn & (1 << 22)) { + if (insn & (1 << 20)) + gen_op_iwmmxt_avgw1_M0_wRn(rd1); + else + gen_op_iwmmxt_avgw0_M0_wRn(rd1); + } else { + if (insn & (1 << 20)) + gen_op_iwmmxt_avgb1_M0_wRn(rd1); + else + gen_op_iwmmxt_avgb0_M0_wRn(rd1); + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x802: case 0x902: case 0xa02: case 0xb02: /* WALIGNR */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCGR0 + ((insn >> 20) & 3)); + tcg_gen_andi_i32(tmp, tmp, 7); + iwmmxt_load_reg(cpu_V1, rd1); + gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, tmp); + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x601: case 0x605: case 0x609: case 0x60d: /* TINSR */ + if (((insn >> 6) & 3) == 3) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + tmp = load_reg(s, rd); + gen_op_iwmmxt_movq_M0_wRn(wrd); + switch ((insn >> 6) & 3) { + case 0: + tmp2 = tcg_constant_i32(0xff); + tmp3 = tcg_constant_i32((insn & 7) << 3); + break; + case 1: + tmp2 = tcg_constant_i32(0xffff); + tmp3 = tcg_constant_i32((insn & 3) << 4); + break; + case 2: + tmp2 = tcg_constant_i32(0xffffffff); + tmp3 = tcg_constant_i32((insn & 1) << 5); + break; + default: + g_assert_not_reached(); + } + gen_helper_iwmmxt_insr(cpu_M0, cpu_M0, tmp, tmp2, tmp3); + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x107: case 0x507: case 0x907: case 0xd07: /* TEXTRM */ + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + if (rd == 15 || ((insn >> 22) & 3) == 3) + return 1; + gen_op_iwmmxt_movq_M0_wRn(wrd); + tmp = tcg_temp_new_i32(); + switch ((insn >> 22) & 3) { + case 0: + tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 7) << 3); + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + if (insn & 8) { + tcg_gen_ext8s_i32(tmp, tmp); + } else { + tcg_gen_andi_i32(tmp, tmp, 0xff); + } + break; + case 1: + tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 3) << 4); + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + if (insn & 8) { + tcg_gen_ext16s_i32(tmp, tmp); + } else { + 
tcg_gen_andi_i32(tmp, tmp, 0xffff); + } + break; + case 2: + tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 1) << 5); + tcg_gen_extrl_i64_i32(tmp, cpu_M0); + break; + } + store_reg(s, rd, tmp); + break; + case 0x117: case 0x517: case 0x917: case 0xd17: /* TEXTRC */ + if ((insn & 0x000ff008) != 0x0003f000 || ((insn >> 22) & 3) == 3) + return 1; + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); + switch ((insn >> 22) & 3) { + case 0: + tcg_gen_shri_i32(tmp, tmp, ((insn & 7) << 2) + 0); + break; + case 1: + tcg_gen_shri_i32(tmp, tmp, ((insn & 3) << 3) + 4); + break; + case 2: + tcg_gen_shri_i32(tmp, tmp, ((insn & 1) << 4) + 12); + break; + } + tcg_gen_shli_i32(tmp, tmp, 28); + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + break; + case 0x401: case 0x405: case 0x409: case 0x40d: /* TBCST */ + if (((insn >> 6) & 3) == 3) + return 1; + rd = (insn >> 12) & 0xf; + wrd = (insn >> 16) & 0xf; + tmp = load_reg(s, rd); + switch ((insn >> 6) & 3) { + case 0: + gen_helper_iwmmxt_bcstb(cpu_M0, tmp); + break; + case 1: + gen_helper_iwmmxt_bcstw(cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_bcstl(cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x113: case 0x513: case 0x913: case 0xd13: /* TANDC */ + if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3) + return 1; + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); + tmp2 = tcg_temp_new_i32(); + tcg_gen_mov_i32(tmp2, tmp); + switch ((insn >> 22) & 3) { + case 0: + for (i = 0; i < 7; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 4); + tcg_gen_and_i32(tmp, tmp, tmp2); + } + break; + case 1: + for (i = 0; i < 3; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 8); + tcg_gen_and_i32(tmp, tmp, tmp2); + } + break; + case 2: + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_and_i32(tmp, tmp, tmp2); + break; + } + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + break; + case 0x01c: case 0x41c: case 0x81c: case 0xc1c: /* WACC */ + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + gen_helper_iwmmxt_addcb(cpu_M0, cpu_M0); + break; + case 1: + gen_helper_iwmmxt_addcw(cpu_M0, cpu_M0); + break; + case 2: + gen_helper_iwmmxt_addcl(cpu_M0, cpu_M0); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x115: case 0x515: case 0x915: case 0xd15: /* TORC */ + if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3) + return 1; + tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); + tmp2 = tcg_temp_new_i32(); + tcg_gen_mov_i32(tmp2, tmp); + switch ((insn >> 22) & 3) { + case 0: + for (i = 0; i < 7; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 4); + tcg_gen_or_i32(tmp, tmp, tmp2); + } + break; + case 1: + for (i = 0; i < 3; i ++) { + tcg_gen_shli_i32(tmp2, tmp2, 8); + tcg_gen_or_i32(tmp, tmp, tmp2); + } + break; + case 2: + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp, tmp, tmp2); + break; + } + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + break; + case 0x103: case 0x503: case 0x903: case 0xd03: /* TMOVMSK */ + rd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + if ((insn & 0xf) != 0 || ((insn >> 22) & 3) == 3) + return 1; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + switch ((insn >> 22) & 3) { + case 0: + gen_helper_iwmmxt_msbb(tmp, cpu_M0); + break; + case 1: + gen_helper_iwmmxt_msbw(tmp, cpu_M0); + break; + case 2: + gen_helper_iwmmxt_msbl(tmp, cpu_M0); + break; + } + store_reg(s, rd, tmp); + break; + 
case 0x106: case 0x306: case 0x506: case 0x706: /* WCMPGT */ + case 0x906: case 0xb06: case 0xd06: case 0xf06: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_cmpgtsb_M0_wRn(rd1); + else + gen_op_iwmmxt_cmpgtub_M0_wRn(rd1); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_cmpgtsw_M0_wRn(rd1); + else + gen_op_iwmmxt_cmpgtuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_cmpgtsl_M0_wRn(rd1); + else + gen_op_iwmmxt_cmpgtul_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x00e: case 0x20e: case 0x40e: case 0x60e: /* WUNPCKEL */ + case 0x80e: case 0xa0e: case 0xc0e: case 0xe0e: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpacklsb_M0(); + else + gen_op_iwmmxt_unpacklub_M0(); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpacklsw_M0(); + else + gen_op_iwmmxt_unpackluw_M0(); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpacklsl_M0(); + else + gen_op_iwmmxt_unpacklul_M0(); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x00c: case 0x20c: case 0x40c: case 0x60c: /* WUNPCKEH */ + case 0x80c: case 0xa0c: case 0xc0c: case 0xe0c: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpackhsb_M0(); + else + gen_op_iwmmxt_unpackhub_M0(); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpackhsw_M0(); + else + gen_op_iwmmxt_unpackhuw_M0(); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_unpackhsl_M0(); + else + gen_op_iwmmxt_unpackhul_M0(); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x204: case 0x604: case 0xa04: case 0xe04: /* WSRL */ + case 0x214: case 0x614: case 0xa14: case 0xe14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + if (gen_iwmmxt_shift(insn, 0xff, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + switch ((insn >> 22) & 3) { + case 1: + gen_helper_iwmmxt_srlw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_srll(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + gen_helper_iwmmxt_srlq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x004: case 0x404: case 0x804: case 0xc04: /* WSRA */ + case 0x014: case 0x414: case 0x814: case 0xc14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + if (gen_iwmmxt_shift(insn, 0xff, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + switch ((insn >> 22) & 3) { + case 1: + gen_helper_iwmmxt_sraw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_sral(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + gen_helper_iwmmxt_sraq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + 
gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x104: case 0x504: case 0x904: case 0xd04: /* WSLL */ + case 0x114: case 0x514: case 0x914: case 0xd14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + if (gen_iwmmxt_shift(insn, 0xff, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + switch ((insn >> 22) & 3) { + case 1: + gen_helper_iwmmxt_sllw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + gen_helper_iwmmxt_slll(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + gen_helper_iwmmxt_sllq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x304: case 0x704: case 0xb04: case 0xf04: /* WROR */ + case 0x314: case 0x714: case 0xb14: case 0xf14: + if (((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_temp_new_i32(); + switch ((insn >> 22) & 3) { + case 1: + if (gen_iwmmxt_shift(insn, 0xf, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + gen_helper_iwmmxt_rorw(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 2: + if (gen_iwmmxt_shift(insn, 0x1f, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + gen_helper_iwmmxt_rorl(cpu_M0, cpu_env, cpu_M0, tmp); + break; + case 3: + if (gen_iwmmxt_shift(insn, 0x3f, tmp)) { + tcg_temp_free_i32(tmp); + return 1; + } + gen_helper_iwmmxt_rorq(cpu_M0, cpu_env, cpu_M0, tmp); + break; + } + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x116: case 0x316: case 0x516: case 0x716: /* WMIN */ + case 0x916: case 0xb16: case 0xd16: case 0xf16: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_minsb_M0_wRn(rd1); + else + gen_op_iwmmxt_minub_M0_wRn(rd1); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_minsw_M0_wRn(rd1); + else + gen_op_iwmmxt_minuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_minsl_M0_wRn(rd1); + else + gen_op_iwmmxt_minul_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x016: case 0x216: case 0x416: case 0x616: /* WMAX */ + case 0x816: case 0xa16: case 0xc16: case 0xe16: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 0: + if (insn & (1 << 21)) + gen_op_iwmmxt_maxsb_M0_wRn(rd1); + else + gen_op_iwmmxt_maxub_M0_wRn(rd1); + break; + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_maxsw_M0_wRn(rd1); + else + gen_op_iwmmxt_maxuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_maxsl_M0_wRn(rd1); + else + gen_op_iwmmxt_maxul_M0_wRn(rd1); + break; + case 3: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x002: case 0x102: case 0x202: case 0x302: /* WALIGNI */ + case 0x402: case 0x502: case 0x602: case 0x702: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + iwmmxt_load_reg(cpu_V1, rd1); + gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, + tcg_constant_i32((insn >> 20) & 3)); + 
gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + case 0x01a: case 0x11a: case 0x21a: case 0x31a: /* WSUB */ + case 0x41a: case 0x51a: case 0x61a: case 0x71a: + case 0x81a: case 0x91a: case 0xa1a: case 0xb1a: + case 0xc1a: case 0xd1a: case 0xe1a: case 0xf1a: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 20) & 0xf) { + case 0x0: + gen_op_iwmmxt_subnb_M0_wRn(rd1); + break; + case 0x1: + gen_op_iwmmxt_subub_M0_wRn(rd1); + break; + case 0x3: + gen_op_iwmmxt_subsb_M0_wRn(rd1); + break; + case 0x4: + gen_op_iwmmxt_subnw_M0_wRn(rd1); + break; + case 0x5: + gen_op_iwmmxt_subuw_M0_wRn(rd1); + break; + case 0x7: + gen_op_iwmmxt_subsw_M0_wRn(rd1); + break; + case 0x8: + gen_op_iwmmxt_subnl_M0_wRn(rd1); + break; + case 0x9: + gen_op_iwmmxt_subul_M0_wRn(rd1); + break; + case 0xb: + gen_op_iwmmxt_subsl_M0_wRn(rd1); + break; + default: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x01e: case 0x11e: case 0x21e: case 0x31e: /* WSHUFH */ + case 0x41e: case 0x51e: case 0x61e: case 0x71e: + case 0x81e: case 0x91e: case 0xa1e: case 0xb1e: + case 0xc1e: case 0xd1e: case 0xe1e: case 0xf1e: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + tmp = tcg_constant_i32(((insn >> 16) & 0xf0) | (insn & 0x0f)); + gen_helper_iwmmxt_shufh(cpu_M0, cpu_env, cpu_M0, tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x018: case 0x118: case 0x218: case 0x318: /* WADD */ + case 0x418: case 0x518: case 0x618: case 0x718: + case 0x818: case 0x918: case 0xa18: case 0xb18: + case 0xc18: case 0xd18: case 0xe18: case 0xf18: + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 20) & 0xf) { + case 0x0: + gen_op_iwmmxt_addnb_M0_wRn(rd1); + break; + case 0x1: + gen_op_iwmmxt_addub_M0_wRn(rd1); + break; + case 0x3: + gen_op_iwmmxt_addsb_M0_wRn(rd1); + break; + case 0x4: + gen_op_iwmmxt_addnw_M0_wRn(rd1); + break; + case 0x5: + gen_op_iwmmxt_adduw_M0_wRn(rd1); + break; + case 0x7: + gen_op_iwmmxt_addsw_M0_wRn(rd1); + break; + case 0x8: + gen_op_iwmmxt_addnl_M0_wRn(rd1); + break; + case 0x9: + gen_op_iwmmxt_addul_M0_wRn(rd1); + break; + case 0xb: + gen_op_iwmmxt_addsl_M0_wRn(rd1); + break; + default: + return 1; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x008: case 0x108: case 0x208: case 0x308: /* WPACK */ + case 0x408: case 0x508: case 0x608: case 0x708: + case 0x808: case 0x908: case 0xa08: case 0xb08: + case 0xc08: case 0xd08: case 0xe08: case 0xf08: + if (!(insn & (1 << 20)) || ((insn >> 22) & 3) == 0) + return 1; + wrd = (insn >> 12) & 0xf; + rd0 = (insn >> 16) & 0xf; + rd1 = (insn >> 0) & 0xf; + gen_op_iwmmxt_movq_M0_wRn(rd0); + switch ((insn >> 22) & 3) { + case 1: + if (insn & (1 << 21)) + gen_op_iwmmxt_packsw_M0_wRn(rd1); + else + gen_op_iwmmxt_packuw_M0_wRn(rd1); + break; + case 2: + if (insn & (1 << 21)) + gen_op_iwmmxt_packsl_M0_wRn(rd1); + else + gen_op_iwmmxt_packul_M0_wRn(rd1); + break; + case 3: + if (insn & (1 << 21)) + gen_op_iwmmxt_packsq_M0_wRn(rd1); + else + gen_op_iwmmxt_packuq_M0_wRn(rd1); + break; + } + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + gen_op_iwmmxt_set_cup(); + break; + case 0x201: case 0x203: case 0x205: case 0x207: + case 0x209: case 0x20b: case 0x20d: case 
0x20f: + case 0x211: case 0x213: case 0x215: case 0x217: + case 0x219: case 0x21b: case 0x21d: case 0x21f: + wrd = (insn >> 5) & 0xf; + rd0 = (insn >> 12) & 0xf; + rd1 = (insn >> 0) & 0xf; + if (rd0 == 0xf || rd1 == 0xf) + return 1; + gen_op_iwmmxt_movq_M0_wRn(wrd); + tmp = load_reg(s, rd0); + tmp2 = load_reg(s, rd1); + switch ((insn >> 16) & 0xf) { + case 0x0: /* TMIA */ + gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0x8: /* TMIAPH */ + gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0xc: case 0xd: case 0xe: case 0xf: /* TMIAxy */ + if (insn & (1 << 16)) + tcg_gen_shri_i32(tmp, tmp, 16); + if (insn & (1 << 17)) + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2); + break; + default: + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + return 1; + } + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + gen_op_iwmmxt_movq_wRn_M0(wrd); + gen_op_iwmmxt_set_mup(); + break; + default: + return 1; + } + + return 0; +} + +/* Disassemble an XScale DSP instruction. Returns nonzero if an error occurred + (ie. an undefined instruction). */ +static int disas_dsp_insn(DisasContext *s, uint32_t insn) +{ + int acc, rd0, rd1, rdhi, rdlo; + TCGv_i32 tmp, tmp2; + + if ((insn & 0x0ff00f10) == 0x0e200010) { + /* Multiply with Internal Accumulate Format */ + rd0 = (insn >> 12) & 0xf; + rd1 = insn & 0xf; + acc = (insn >> 5) & 7; + + if (acc != 0) + return 1; + + tmp = load_reg(s, rd0); + tmp2 = load_reg(s, rd1); + switch ((insn >> 16) & 0xf) { + case 0x0: /* MIA */ + gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0x8: /* MIAPH */ + gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2); + break; + case 0xc: /* MIABB */ + case 0xd: /* MIABT */ + case 0xe: /* MIATB */ + case 0xf: /* MIATT */ + if (insn & (1 << 16)) + tcg_gen_shri_i32(tmp, tmp, 16); + if (insn & (1 << 17)) + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2); + break; + default: + return 1; + } + tcg_temp_free_i32(tmp2); + tcg_temp_free_i32(tmp); + + gen_op_iwmmxt_movq_wRn_M0(acc); + return 0; + } + + if ((insn & 0x0fe00ff8) == 0x0c400000) { + /* Internal Accumulator Access Format */ + rdhi = (insn >> 16) & 0xf; + rdlo = (insn >> 12) & 0xf; + acc = insn & 7; + + if (acc != 0) + return 1; + + if (insn & ARM_CP_RW_BIT) { /* MRA */ + iwmmxt_load_reg(cpu_V0, acc); + tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0); + tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0); + tcg_gen_andi_i32(cpu_R[rdhi], cpu_R[rdhi], (1 << (40 - 32)) - 1); + } else { /* MAR */ + tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]); + iwmmxt_store_reg(cpu_V0, acc); + } + return 0; + } + + return 1; +} + +static void gen_goto_ptr(void) +{ + tcg_gen_lookup_and_goto_ptr(); +} + +/* This will end the TB but doesn't guarantee we'll return to + * cpu_loop_exec. Any live exit_requests will be processed as we + * enter the next TB. + */ +static void gen_goto_tb(DisasContext *s, int n, target_long diff) +{ + if (translator_use_goto_tb(&s->base, s->pc_curr + diff)) { + /* + * For pcrel, the pc must always be up-to-date on entry to + * the linked TB, so that it can use simple additions for all + * further adjustments. For !pcrel, the linked TB is compiled + * to know its full virtual address, so we can delay the + * update to pc to the unlinked path. A long chain of links + * can thus avoid many updates to the PC. 
+ */ + if (TARGET_TB_PCREL) { + gen_update_pc(s, diff); + tcg_gen_goto_tb(n); + } else { + tcg_gen_goto_tb(n); + gen_update_pc(s, diff); + } + tcg_gen_exit_tb(s->base.tb, n); + } else { + gen_update_pc(s, diff); + gen_goto_ptr(); + } + s->base.is_jmp = DISAS_NORETURN; +} + +/* Jump, specifying which TB number to use if we gen_goto_tb() */ +static void gen_jmp_tb(DisasContext *s, target_long diff, int tbno) +{ + if (unlikely(s->ss_active)) { + /* An indirect jump so that we still trigger the debug exception. */ + gen_update_pc(s, diff); + s->base.is_jmp = DISAS_JUMP; + return; + } + switch (s->base.is_jmp) { + case DISAS_NEXT: + case DISAS_TOO_MANY: + case DISAS_NORETURN: + /* + * The normal case: just go to the destination TB. + * NB: NORETURN happens if we generate code like + * gen_brcondi(l); + * gen_jmp(); + * gen_set_label(l); + * gen_jmp(); + * on the second call to gen_jmp(). + */ + gen_goto_tb(s, tbno, diff); + break; + case DISAS_UPDATE_NOCHAIN: + case DISAS_UPDATE_EXIT: + /* + * We already decided we're leaving the TB for some other reason. + * Avoid using goto_tb so we really do exit back to the main loop + * and don't chain to another TB. + */ + gen_update_pc(s, diff); + gen_goto_ptr(); + s->base.is_jmp = DISAS_NORETURN; + break; + default: + /* + * We shouldn't be emitting code for a jump and also have + * is_jmp set to one of the special cases like DISAS_SWI. + */ + g_assert_not_reached(); + } +} + +static inline void gen_jmp(DisasContext *s, target_long diff) +{ + gen_jmp_tb(s, diff, 0); +} + +static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y) +{ + if (x) + tcg_gen_sari_i32(t0, t0, 16); + else + gen_sxth(t0); + if (y) + tcg_gen_sari_i32(t1, t1, 16); + else + gen_sxth(t1); + tcg_gen_mul_i32(t0, t0, t1); +} + +/* Return the mask of PSR bits set by a MSR instruction. */ +static uint32_t msr_mask(DisasContext *s, int flags, int spsr) +{ + uint32_t mask = 0; + + if (flags & (1 << 0)) { + mask |= 0xff; + } + if (flags & (1 << 1)) { + mask |= 0xff00; + } + if (flags & (1 << 2)) { + mask |= 0xff0000; + } + if (flags & (1 << 3)) { + mask |= 0xff000000; + } + + /* Mask out undefined and reserved bits. */ + mask &= aarch32_cpsr_valid_mask(s->features, s->isar); + + /* Mask out execution state. */ + if (!spsr) { + mask &= ~CPSR_EXEC; + } + + /* Mask out privileged bits. */ + if (IS_USER(s)) { + mask &= CPSR_USER; + } + return mask; +} + +/* Returns nonzero if access to the PSR is not permitted. Marks t0 as dead. */ +static int gen_set_psr(DisasContext *s, uint32_t mask, int spsr, TCGv_i32 t0) +{ + TCGv_i32 tmp; + if (spsr) { + /* ??? This is also undefined in system mode. */ + if (IS_USER(s)) + return 1; + + tmp = load_cpu_field(spsr); + tcg_gen_andi_i32(tmp, tmp, ~mask); + tcg_gen_andi_i32(t0, t0, mask); + tcg_gen_or_i32(tmp, tmp, t0); + store_cpu_field(tmp, spsr); + } else { + gen_set_cpsr(t0, mask); + } + tcg_temp_free_i32(t0); + gen_lookup_tb(s); + return 0; +} + +/* Returns nonzero if access to the PSR is not permitted. */ +static int gen_set_psr_im(DisasContext *s, uint32_t mask, int spsr, uint32_t val) +{ + TCGv_i32 tmp; + tmp = tcg_temp_new_i32(); + tcg_gen_movi_i32(tmp, val); + return gen_set_psr(s, mask, spsr, tmp); +} + +static bool msr_banked_access_decode(DisasContext *s, int r, int sysm, int rn, + int *tgtmode, int *regno) +{ + /* Decode the r and sysm fields of MSR/MRS banked accesses into + * the target mode and register number, and identify the various + * unpredictable cases. 
+ * MSR (banked) and MRS (banked) are CONSTRAINED UNPREDICTABLE if: + * + executed in user mode + * + using R15 as the src/dest register + * + accessing an unimplemented register + * + accessing a register that's inaccessible at current PL/security state* + * + accessing a register that you could access with a different insn + * We choose to UNDEF in all these cases. + * Since we don't know which of the various AArch32 modes we are in + * we have to defer some checks to runtime. + * Accesses to Monitor mode registers from Secure EL1 (which implies + * that EL3 is AArch64) must trap to EL3. + * + * If the access checks fail this function will emit code to take + * an exception and return false. Otherwise it will return true, + * and set *tgtmode and *regno appropriately. + */ + /* These instructions are present only in ARMv8, or in ARMv7 with the + * Virtualization Extensions. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V8) && + !arm_dc_feature(s, ARM_FEATURE_EL2)) { + goto undef; + } + + if (IS_USER(s) || rn == 15) { + goto undef; + } + + /* The table in the v8 ARM ARM section F5.2.3 describes the encoding + * of registers into (r, sysm). + */ + if (r) { + /* SPSRs for other modes */ + switch (sysm) { + case 0xe: /* SPSR_fiq */ + *tgtmode = ARM_CPU_MODE_FIQ; + break; + case 0x10: /* SPSR_irq */ + *tgtmode = ARM_CPU_MODE_IRQ; + break; + case 0x12: /* SPSR_svc */ + *tgtmode = ARM_CPU_MODE_SVC; + break; + case 0x14: /* SPSR_abt */ + *tgtmode = ARM_CPU_MODE_ABT; + break; + case 0x16: /* SPSR_und */ + *tgtmode = ARM_CPU_MODE_UND; + break; + case 0x1c: /* SPSR_mon */ + *tgtmode = ARM_CPU_MODE_MON; + break; + case 0x1e: /* SPSR_hyp */ + *tgtmode = ARM_CPU_MODE_HYP; + break; + default: /* unallocated */ + goto undef; + } + /* We arbitrarily assign SPSR a register number of 16. */ + *regno = 16; + } else { + /* general purpose registers for other modes */ + switch (sysm) { + case 0x0 ... 0x6: /* 0b00xxx : r8_usr ... r14_usr */ + *tgtmode = ARM_CPU_MODE_USR; + *regno = sysm + 8; + break; + case 0x8 ... 0xe: /* 0b01xxx : r8_fiq ... r14_fiq */ + *tgtmode = ARM_CPU_MODE_FIQ; + *regno = sysm; + break; + case 0x10 ... 0x11: /* 0b1000x : r14_irq, r13_irq */ + *tgtmode = ARM_CPU_MODE_IRQ; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x12 ... 0x13: /* 0b1001x : r14_svc, r13_svc */ + *tgtmode = ARM_CPU_MODE_SVC; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x14 ... 0x15: /* 0b1010x : r14_abt, r13_abt */ + *tgtmode = ARM_CPU_MODE_ABT; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x16 ... 0x17: /* 0b1011x : r14_und, r13_und */ + *tgtmode = ARM_CPU_MODE_UND; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x1c ... 0x1d: /* 0b1110x : r14_mon, r13_mon */ + *tgtmode = ARM_CPU_MODE_MON; + *regno = sysm & 1 ? 13 : 14; + break; + case 0x1e ... 0x1f: /* 0b1111x : elr_hyp, r13_hyp */ + *tgtmode = ARM_CPU_MODE_HYP; + /* Arbitrarily pick 17 for ELR_Hyp (which is not a banked LR!) */ + *regno = sysm & 1 ? 13 : 17; + break; + default: /* unallocated */ + goto undef; + } + } + + /* Catch the 'accessing inaccessible register' cases we can detect + * at translate time. + */ + switch (*tgtmode) { + case ARM_CPU_MODE_MON: + if (!arm_dc_feature(s, ARM_FEATURE_EL3) || s->ns) { + goto undef; + } + if (s->current_el == 1) { + /* If we're in Secure EL1 (which implies that EL3 is AArch64) + * then accesses to Mon registers trap to Secure EL2, if it exists, + * otherwise EL3. 
+ */ + TCGv_i32 tcg_el; + + if (arm_dc_feature(s, ARM_FEATURE_AARCH64) && + dc_isar_feature(aa64_sel2, s)) { + /* Target EL is EL<3 minus SCR_EL3.EEL2> */ + tcg_el = load_cpu_field(cp15.scr_el3); + tcg_gen_sextract_i32(tcg_el, tcg_el, ctz32(SCR_EEL2), 1); + tcg_gen_addi_i32(tcg_el, tcg_el, 3); + } else { + tcg_el = tcg_constant_i32(3); + } + + gen_exception_insn_el_v(s, 0, EXCP_UDEF, + syn_uncategorized(), tcg_el); + tcg_temp_free_i32(tcg_el); + return false; + } + break; + case ARM_CPU_MODE_HYP: + /* + * SPSR_hyp and r13_hyp can only be accessed from Monitor mode + * (and so we can forbid accesses from EL2 or below). elr_hyp + * can be accessed also from Hyp mode, so forbid accesses from + * EL0 or EL1. + */ + if (!arm_dc_feature(s, ARM_FEATURE_EL2) || s->current_el < 2 || + (s->current_el < 3 && *regno != 17)) { + goto undef; + } + break; + default: + break; + } + + return true; + +undef: + /* If we get here then some access check did not pass */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized()); + return false; +} + +static void gen_msr_banked(DisasContext *s, int r, int sysm, int rn) +{ + TCGv_i32 tcg_reg; + int tgtmode = 0, regno = 0; + + if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, ®no)) { + return; + } + + /* Sync state because msr_banked() can raise exceptions */ + gen_set_condexec(s); + gen_update_pc(s, 0); + tcg_reg = load_reg(s, rn); + gen_helper_msr_banked(cpu_env, tcg_reg, + tcg_constant_i32(tgtmode), + tcg_constant_i32(regno)); + tcg_temp_free_i32(tcg_reg); + s->base.is_jmp = DISAS_UPDATE_EXIT; +} + +static void gen_mrs_banked(DisasContext *s, int r, int sysm, int rn) +{ + TCGv_i32 tcg_reg; + int tgtmode = 0, regno = 0; + + if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, ®no)) { + return; + } + + /* Sync state because mrs_banked() can raise exceptions */ + gen_set_condexec(s); + gen_update_pc(s, 0); + tcg_reg = tcg_temp_new_i32(); + gen_helper_mrs_banked(tcg_reg, cpu_env, + tcg_constant_i32(tgtmode), + tcg_constant_i32(regno)); + store_reg(s, rn, tcg_reg); + s->base.is_jmp = DISAS_UPDATE_EXIT; +} + +/* Store value to PC as for an exception return (ie don't + * mask bits). The subsequent call to gen_helper_cpsr_write_eret() + * will do the masking based on the new value of the Thumb bit. + */ +static void store_pc_exc_ret(DisasContext *s, TCGv_i32 pc) +{ + tcg_gen_mov_i32(cpu_R[15], pc); + tcg_temp_free_i32(pc); +} + +/* Generate a v6 exception return. Marks both values as dead. */ +static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr) +{ + store_pc_exc_ret(s, pc); + /* The cpsr_write_eret helper will mask the low bits of PC + * appropriately depending on the new Thumb bit, so it must + * be called after storing the new PC. + */ + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } + gen_helper_cpsr_write_eret(cpu_env, cpsr); + tcg_temp_free_i32(cpsr); + /* Must exit loop to check un-masked IRQs */ + s->base.is_jmp = DISAS_EXIT; +} + +/* Generate an old-style exception return. Marks pc as dead. 
*/ +static void gen_exception_return(DisasContext *s, TCGv_i32 pc) +{ + gen_rfe(s, pc, load_cpu_field(spsr)); +} + +static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz, + gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr qc_ptr = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc)); + tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr, + opr_sz, max_sz, 0, fn); + tcg_temp_free_ptr(qc_ptr); +} + +void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3_ptr * const fns[2] = { + gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32 + }; + tcg_debug_assert(vece >= 1 && vece <= 2); + gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); +} + +void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static gen_helper_gvec_3_ptr * const fns[2] = { + gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32 + }; + tcg_debug_assert(vece >= 1 && vece <= 2); + gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); +} + +#define GEN_CMP0(NAME, COND) \ + static void gen_##NAME##0_i32(TCGv_i32 d, TCGv_i32 a) \ + { \ + tcg_gen_setcondi_i32(COND, d, a, 0); \ + tcg_gen_neg_i32(d, d); \ + } \ + static void gen_##NAME##0_i64(TCGv_i64 d, TCGv_i64 a) \ + { \ + tcg_gen_setcondi_i64(COND, d, a, 0); \ + tcg_gen_neg_i64(d, d); \ + } \ + static void gen_##NAME##0_vec(unsigned vece, TCGv_vec d, TCGv_vec a) \ + { \ + TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0); \ + tcg_gen_cmp_vec(COND, vece, d, a, zero); \ + } \ + void gen_gvec_##NAME##0(unsigned vece, uint32_t d, uint32_t m, \ + uint32_t opr_sz, uint32_t max_sz) \ + { \ + const GVecGen2 op[4] = { \ + { .fno = gen_helper_gvec_##NAME##0_b, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .vece = MO_8 }, \ + { .fno = gen_helper_gvec_##NAME##0_h, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .vece = MO_16 }, \ + { .fni4 = gen_##NAME##0_i32, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .vece = MO_32 }, \ + { .fni8 = gen_##NAME##0_i64, \ + .fniv = gen_##NAME##0_vec, \ + .opt_opc = vecop_list_cmp, \ + .prefer_i64 = TCG_TARGET_REG_BITS == 64, \ + .vece = MO_64 }, \ + }; \ + tcg_gen_gvec_2(d, m, opr_sz, max_sz, &op[vece]); \ + } + +static const TCGOpcode vecop_list_cmp[] = { + INDEX_op_cmp_vec, 0 +}; + +GEN_CMP0(ceq, TCG_COND_EQ) +GEN_CMP0(cle, TCG_COND_LE) +GEN_CMP0(cge, TCG_COND_GE) +GEN_CMP0(clt, TCG_COND_LT) +GEN_CMP0(cgt, TCG_COND_GT) + +#undef GEN_CMP0 + +static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_sari_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_sari_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_sari_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode 
vecop_list[] = { + INDEX_op_sari_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_ssra8_i64, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_ssra16_i64, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_ssra32_i32, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_ssra64_i64, + .fniv = gen_ssra_vec, + .fno = gen_helper_gvec_ssra_b, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize]. */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* + * Shifts larger than the element size are architecturally valid. + * Signed results in all sign bits. + */ + shift = MIN(shift, (8 << vece) - 1); + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); +} + +static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_shri_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_usra8_i64, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8, }, + { .fni8 = gen_usra16_i64, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16, }, + { .fni4 = gen_usra32_i32, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32, }, + { .fni8 = gen_usra64_i64, + .fniv = gen_usra_vec, + .fno = gen_helper_gvec_usra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64, }, + }; + + /* tszimm encoding produces immediates in the range [1..esize]. */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* + * Shifts larger than the element size are architecturally valid. + * Unsigned results in all zeros as input to accumulate: nop. + */ + if (shift < (8 << vece)) { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } else { + /* Nop, but we do need to clear the tail. */ + tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); + } +} + +/* + * Shift one less than the requested amount, and the low bit is + * the rounding bit. For the 8 and 16-bit operations, because we + * mask the low bit, we can perform a normal integer shift instead + * of a vector shift. 
+ */ +static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); + tcg_gen_vec_sar8i_i64(d, a, sh); + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); + tcg_gen_vec_sar16i_i64(d, a, sh); + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t; + + /* Handle shift by the input size for the benefit of trans_SRSHR_ri */ + if (sh == 32) { + tcg_gen_movi_i32(d, 0); + return; + } + t = tcg_temp_new_i32(); + tcg_gen_extract_i32(t, a, sh - 1, 1); + tcg_gen_sari_i32(d, a, sh); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t, a, sh - 1, 1); + tcg_gen_sari_i64(d, a, sh); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec ones = tcg_temp_new_vec_matching(d); + + tcg_gen_shri_vec(vece, t, a, sh - 1); + tcg_gen_dupi_vec(vece, ones, 1); + tcg_gen_and_vec(vece, t, t, ones); + tcg_gen_sari_vec(vece, d, a, sh); + tcg_gen_add_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(ones); +} + +void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_srshr8_i64, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_srshr16_i64, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_srshr32_i32, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_srshr64_i64, + .fniv = gen_srshr_vec, + .fno = gen_helper_gvec_srshr_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + if (shift == (8 << vece)) { + /* + * Shifts larger than the element size are architecturally valid. + * Signed results in all sign bits. With rounding, this produces + * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. + * I.e. always zero. 
+ */ + tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + gen_srshr8_i64(t, a, sh); + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + gen_srshr16_i64(t, a, sh); + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + gen_srshr32_i32(t, a, sh); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + gen_srshr64_i64(t, a, sh); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + gen_srshr_vec(vece, t, a, sh); + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_srsra8_i64, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fni8 = gen_srsra16_i64, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_srsra32_i32, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_srsra64_i64, + .fniv = gen_srsra_vec, + .fno = gen_helper_gvec_srsra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* + * Shifts larger than the element size are architecturally valid. + * Signed results in all sign bits. With rounding, this produces + * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. + * I.e. always zero. With accumulation, this leaves D unchanged. + */ + if (shift == (8 << vece)) { + /* Nop, but we do need to clear the tail. 
*/ + tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); + tcg_gen_vec_shr8i_i64(d, a, sh); + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, sh - 1); + tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); + tcg_gen_vec_shr16i_i64(d, a, sh); + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t; + + /* Handle shift by the input size for the benefit of trans_URSHR_ri */ + if (sh == 32) { + tcg_gen_extract_i32(d, a, sh - 1, 1); + return; + } + t = tcg_temp_new_i32(); + tcg_gen_extract_i32(t, a, sh - 1, 1); + tcg_gen_shri_i32(d, a, sh); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t, a, sh - 1, 1); + tcg_gen_shri_i64(d, a, sh); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec ones = tcg_temp_new_vec_matching(d); + + tcg_gen_shri_vec(vece, t, a, shift - 1); + tcg_gen_dupi_vec(vece, ones, 1); + tcg_gen_and_vec(vece, t, t, ones); + tcg_gen_shri_vec(vece, d, a, shift); + tcg_gen_add_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(ones); +} + +void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_urshr8_i64, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_urshr16_i64, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_urshr32_i32, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_urshr64_i64, + .fniv = gen_urshr_vec, + .fno = gen_helper_gvec_urshr_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + if (shift == (8 << vece)) { + /* + * Shifts larger than the element size are architecturally valid. + * Unsigned results in zero. With rounding, this produces a + * copy of the most significant bit. 
+ */ + tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + if (sh == 8) { + tcg_gen_vec_shr8i_i64(t, a, 7); + } else { + gen_urshr8_i64(t, a, sh); + } + tcg_gen_vec_add8_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + if (sh == 16) { + tcg_gen_vec_shr16i_i64(t, a, 15); + } else { + gen_urshr16_i64(t, a, sh); + } + tcg_gen_vec_add16_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + if (sh == 32) { + tcg_gen_shri_i32(t, a, 31); + } else { + gen_urshr32_i32(t, a, sh); + } + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + if (sh == 64) { + tcg_gen_shri_i64(t, a, 63); + } else { + gen_urshr64_i64(t, a, sh); + } + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + if (sh == (8 << vece)) { + tcg_gen_shri_vec(vece, t, a, sh - 1); + } else { + gen_urshr_vec(vece, t, a, sh); + } + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_shri_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen2i ops[4] = { + { .fni8 = gen_ursra8_i64, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fni8 = gen_ursra16_i64, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_ursra32_i32, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_ursra64_i64, + .fniv = gen_ursra_vec, + .fno = gen_helper_gvec_ursra_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize] */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); +} + +static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_deposit_i32(d, d, a, 0, 32 - shift); +} + +static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_deposit_i64(d, d, 
a, 0, 64 - shift); +} + +static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh)); + tcg_gen_shri_vec(vece, t, a, sh); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); +} + +void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; + const GVecGen2i ops[4] = { + { .fni8 = gen_shr8_ins_i64, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_shr16_ins_i64, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_shr32_ins_i32, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_shr64_ins_i64, + .fniv = gen_shr_ins_vec, + .fno = gen_helper_gvec_sri_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [1..esize]. */ + tcg_debug_assert(shift > 0); + tcg_debug_assert(shift <= (8 << vece)); + + /* Shift of esize leaves destination unchanged. */ + if (shift < (8 << vece)) { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } else { + /* Nop, but we do need to clear the tail. */ + tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); + } +} + +static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_deposit_i32(d, d, a, shift, 32 - shift); +} + +static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_deposit_i64(d, d, a, shift, 64 - shift); +} + +static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_shli_vec(vece, t, a, sh); + tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh)); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); +} + +void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; + const GVecGen2i ops[4] = { + { .fni8 = gen_shl8_ins_i64, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_b, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni8 = gen_shl16_ins_i64, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_h, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = 
gen_shl32_ins_i32, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_s, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_shl64_ins_i64, + .fniv = gen_shl_ins_vec, + .fno = gen_helper_gvec_sli_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + + /* tszimm encoding produces immediates in the range [0..esize-1]. */ + tcg_debug_assert(shift >= 0); + tcg_debug_assert(shift < (8 << vece)); + + if (shift == 0) { + tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz); + } else { + tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); + } +} + +static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_add_u8(d, d, a); +} + +static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_sub_u8(d, d, a); +} + +static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_add_u16(d, d, a); +} + +static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_sub_u16(d, d, a); +} + +static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_add_i32(d, d, a); +} + +static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_sub_i32(d, d, a); +} + +static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_add_i64(d, d, a); +} + +static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_sub_i64(d, d, a); +} + +static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_sub_vec(vece, d, d, a); +} + +/* Note that while NEON does not support VMLA and VMLS as 64-bit ops, + * these tables are shared with AArch64 which does support them. 
+ */ +void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_mul_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fni4 = gen_mla8_i32, + .fniv = gen_mla_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni4 = gen_mla16_i32, + .fniv = gen_mla_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_mla32_i32, + .fniv = gen_mla_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_mla64_i64, + .fniv = gen_mla_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_mul_vec, INDEX_op_sub_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fni4 = gen_mls8_i32, + .fniv = gen_mls_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni4 = gen_mls16_i32, + .fniv = gen_mls_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_mls32_i32, + .fniv = gen_mls_vec, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_mls64_i64, + .fniv = gen_mls_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +/* CMTST : test is "if (X & Y != 0)". */ +static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_and_i32(d, a, b); + tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i32(d, d); +} + +void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_and_i64(d, a, b); + tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i64(d, d); +} + +static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_and_vec(vece, d, a, b); + tcg_gen_dupi_vec(vece, a, 0); + tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a); +} + +void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 }; + static const GVecGen3 ops[4] = { + { .fni4 = gen_helper_neon_tst_u8, + .fniv = gen_cmtst_vec, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fni4 = gen_helper_neon_tst_u16, + .fniv = gen_cmtst_vec, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_cmtst_i32, + .fniv = gen_cmtst_vec, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_cmtst_i64, + .fniv = gen_cmtst_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) +{ + TCGv_i32 lval = tcg_temp_new_i32(); + TCGv_i32 rval = tcg_temp_new_i32(); + TCGv_i32 lsh = tcg_temp_new_i32(); + TCGv_i32 rsh = tcg_temp_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + TCGv_i32 max = tcg_constant_i32(32); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. 
+ */ + tcg_gen_ext8s_i32(lsh, shift); + tcg_gen_neg_i32(rsh, lsh); + tcg_gen_shl_i32(lval, src, lsh); + tcg_gen_shr_i32(rval, src, rsh); + tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero); + tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst); + + tcg_temp_free_i32(lval); + tcg_temp_free_i32(rval); + tcg_temp_free_i32(lsh); + tcg_temp_free_i32(rsh); +} + +void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) +{ + TCGv_i64 lval = tcg_temp_new_i64(); + TCGv_i64 rval = tcg_temp_new_i64(); + TCGv_i64 lsh = tcg_temp_new_i64(); + TCGv_i64 rsh = tcg_temp_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + TCGv_i64 max = tcg_constant_i64(64); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_ext8s_i64(lsh, shift); + tcg_gen_neg_i64(rsh, lsh); + tcg_gen_shl_i64(lval, src, lsh); + tcg_gen_shr_i64(rval, src, rsh); + tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero); + tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst); + + tcg_temp_free_i64(lval); + tcg_temp_free_i64(rval); + tcg_temp_free_i64(lsh); + tcg_temp_free_i64(rsh); +} + +static void gen_ushl_vec(unsigned vece, TCGv_vec dst, + TCGv_vec src, TCGv_vec shift) +{ + TCGv_vec lval = tcg_temp_new_vec_matching(dst); + TCGv_vec rval = tcg_temp_new_vec_matching(dst); + TCGv_vec lsh = tcg_temp_new_vec_matching(dst); + TCGv_vec rsh = tcg_temp_new_vec_matching(dst); + TCGv_vec msk, max; + + tcg_gen_neg_vec(vece, rsh, shift); + if (vece == MO_8) { + tcg_gen_mov_vec(lsh, shift); + } else { + msk = tcg_temp_new_vec_matching(dst); + tcg_gen_dupi_vec(vece, msk, 0xff); + tcg_gen_and_vec(vece, lsh, shift, msk); + tcg_gen_and_vec(vece, rsh, rsh, msk); + tcg_temp_free_vec(msk); + } + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_shlv_vec(vece, lval, src, lsh); + tcg_gen_shrv_vec(vece, rval, src, rsh); + + max = tcg_temp_new_vec_matching(dst); + tcg_gen_dupi_vec(vece, max, 8 << vece); + + /* + * The choice of LT (signed) and GEU (unsigned) are biased toward + * the instructions of the x86_64 host. For MO_8, the whole byte + * is significant so we must use an unsigned compare; otherwise we + * have already masked to a byte and so a signed compare works. + * Other tcg hosts have a full set of comparisons and do not care. 
+ */ + if (vece == MO_8) { + tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max); + tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max); + tcg_gen_andc_vec(vece, lval, lval, lsh); + tcg_gen_andc_vec(vece, rval, rval, rsh); + } else { + tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max); + tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max); + tcg_gen_and_vec(vece, lval, lval, lsh); + tcg_gen_and_vec(vece, rval, rval, rsh); + } + tcg_gen_or_vec(vece, dst, lval, rval); + + tcg_temp_free_vec(max); + tcg_temp_free_vec(lval); + tcg_temp_free_vec(rval); + tcg_temp_free_vec(lsh); + tcg_temp_free_vec(rsh); +} + +void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_neg_vec, INDEX_op_shlv_vec, + INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_ushl_vec, + .fno = gen_helper_gvec_ushl_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_ushl_vec, + .fno = gen_helper_gvec_ushl_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_ushl_i32, + .fniv = gen_ushl_vec, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_ushl_i64, + .fniv = gen_ushl_vec, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) +{ + TCGv_i32 lval = tcg_temp_new_i32(); + TCGv_i32 rval = tcg_temp_new_i32(); + TCGv_i32 lsh = tcg_temp_new_i32(); + TCGv_i32 rsh = tcg_temp_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + TCGv_i32 max = tcg_constant_i32(31); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_ext8s_i32(lsh, shift); + tcg_gen_neg_i32(rsh, lsh); + tcg_gen_shl_i32(lval, src, lsh); + tcg_gen_umin_i32(rsh, rsh, max); + tcg_gen_sar_i32(rval, src, rsh); + tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero); + tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval); + + tcg_temp_free_i32(lval); + tcg_temp_free_i32(rval); + tcg_temp_free_i32(lsh); + tcg_temp_free_i32(rsh); +} + +void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) +{ + TCGv_i64 lval = tcg_temp_new_i64(); + TCGv_i64 rval = tcg_temp_new_i64(); + TCGv_i64 lsh = tcg_temp_new_i64(); + TCGv_i64 rsh = tcg_temp_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + TCGv_i64 max = tcg_constant_i64(63); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. 
+ */ + tcg_gen_ext8s_i64(lsh, shift); + tcg_gen_neg_i64(rsh, lsh); + tcg_gen_shl_i64(lval, src, lsh); + tcg_gen_umin_i64(rsh, rsh, max); + tcg_gen_sar_i64(rval, src, rsh); + tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero); + tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval); + + tcg_temp_free_i64(lval); + tcg_temp_free_i64(rval); + tcg_temp_free_i64(lsh); + tcg_temp_free_i64(rsh); +} + +static void gen_sshl_vec(unsigned vece, TCGv_vec dst, + TCGv_vec src, TCGv_vec shift) +{ + TCGv_vec lval = tcg_temp_new_vec_matching(dst); + TCGv_vec rval = tcg_temp_new_vec_matching(dst); + TCGv_vec lsh = tcg_temp_new_vec_matching(dst); + TCGv_vec rsh = tcg_temp_new_vec_matching(dst); + TCGv_vec tmp = tcg_temp_new_vec_matching(dst); + + /* + * Rely on the TCG guarantee that out of range shifts produce + * unspecified results, not undefined behaviour (i.e. no trap). + * Discard out-of-range results after the fact. + */ + tcg_gen_neg_vec(vece, rsh, shift); + if (vece == MO_8) { + tcg_gen_mov_vec(lsh, shift); + } else { + tcg_gen_dupi_vec(vece, tmp, 0xff); + tcg_gen_and_vec(vece, lsh, shift, tmp); + tcg_gen_and_vec(vece, rsh, rsh, tmp); + } + + /* Bound rsh so out of bound right shift gets -1. */ + tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1); + tcg_gen_umin_vec(vece, rsh, rsh, tmp); + tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp); + + tcg_gen_shlv_vec(vece, lval, src, lsh); + tcg_gen_sarv_vec(vece, rval, src, rsh); + + /* Select in-bound left shift. */ + tcg_gen_andc_vec(vece, lval, lval, tmp); + + /* Select between left and right shift. */ + if (vece == MO_8) { + tcg_gen_dupi_vec(vece, tmp, 0); + tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval); + } else { + tcg_gen_dupi_vec(vece, tmp, 0x80); + tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval); + } + + tcg_temp_free_vec(lval); + tcg_temp_free_vec(rval); + tcg_temp_free_vec(lsh); + tcg_temp_free_vec(rsh); + tcg_temp_free_vec(tmp); +} + +void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec, + INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_sshl_vec, + .fno = gen_helper_gvec_sshl_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_sshl_vec, + .fno = gen_helper_gvec_sshl_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_sshl_i32, + .fniv = gen_sshl_vec, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_sshl_i64, + .fniv = gen_sshl_vec, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_add_vec(vece, x, a, b); + tcg_gen_usadd_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_b, + .write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_h, + 
.write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_s, + .write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fniv = gen_uqadd_vec, + .fno = gen_helper_gvec_uqadd_d, + .write_aofs = true, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_add_vec(vece, x, a, b); + tcg_gen_ssadd_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_b, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_8 }, + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_h, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_16 }, + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_s, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_32 }, + { .fniv = gen_sqadd_vec, + .fno = gen_helper_gvec_sqadd_d, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_sub_vec(vece, x, a, b); + tcg_gen_ussub_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_b, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_8 }, + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_h, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_16 }, + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_s, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_32 }, + { .fniv = gen_uqsub_vec, + .fno = gen_helper_gvec_uqsub_d, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, + TCGv_vec a, TCGv_vec b) +{ + TCGv_vec x = tcg_temp_new_vec_matching(t); + tcg_gen_sub_vec(vece, x, a, b); + tcg_gen_sssub_vec(vece, t, a, b); + tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); + tcg_gen_or_vec(vece, sat, sat, x); + tcg_temp_free_vec(x); +} + +void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0 + }; + static const GVecGen4 ops[4] = { + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_b, + .opt_opc = vecop_list, + 
.write_aofs = true, + .vece = MO_8 }, + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_h, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_16 }, + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_s, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_32 }, + { .fniv = gen_sqsub_vec, + .fno = gen_helper_gvec_sqsub_d, + .opt_opc = vecop_list, + .write_aofs = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), + rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + tcg_gen_sub_i32(t, a, b); + tcg_gen_sub_i32(d, b, a); + tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t); + tcg_temp_free_i32(t); +} + +static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_sub_i64(t, a, b); + tcg_gen_sub_i64(d, b, a); + tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t); + tcg_temp_free_i64(t); +} + +static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + tcg_gen_smin_vec(vece, t, a, b); + tcg_gen_smax_vec(vece, d, a, b); + tcg_gen_sub_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_sabd_i32, + .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_sabd_i64, + .fniv = gen_sabd_vec, + .fno = gen_helper_gvec_sabd_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + + tcg_gen_sub_i32(t, a, b); + tcg_gen_sub_i32(d, b, a); + tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t); + tcg_temp_free_i32(t); +} + +static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_sub_i64(t, a, b); + tcg_gen_sub_i64(d, b, a); + tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t); + tcg_temp_free_i64(t); +} + +static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + + tcg_gen_umin_vec(vece, t, a, b); + tcg_gen_umax_vec(vece, d, a, b); + tcg_gen_sub_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_b, + .opt_opc = vecop_list, + .vece = MO_8 }, + { .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_h, + .opt_opc = vecop_list, + .vece = MO_16 }, + { .fni4 = gen_uabd_i32, + .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_s, + .opt_opc = vecop_list, + .vece = MO_32 }, + { .fni8 = gen_uabd_i64, + .fniv = gen_uabd_vec, + .fno = gen_helper_gvec_uabd_d, + .prefer_i64 = 
TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + gen_sabd_i32(t, a, b); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + gen_sabd_i64(t, a, b); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + gen_sabd_vec(vece, t, a, b); + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_add_vec, + INDEX_op_smin_vec, INDEX_op_smax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_saba_i32, + .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_saba_i64, + .fniv = gen_saba_vec, + .fno = gen_helper_gvec_saba_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 t = tcg_temp_new_i32(); + gen_uabd_i32(t, a, b); + tcg_gen_add_i32(d, d, t); + tcg_temp_free_i32(t); +} + +static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t = tcg_temp_new_i64(); + gen_uabd_i64(t, a, b); + tcg_gen_add_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + TCGv_vec t = tcg_temp_new_vec_matching(d); + gen_uabd_vec(vece, t, a, b); + tcg_gen_add_vec(vece, d, d, t); + tcg_temp_free_vec(t); +} + +void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) +{ + static const TCGOpcode vecop_list[] = { + INDEX_op_sub_vec, INDEX_op_add_vec, + INDEX_op_umin_vec, INDEX_op_umax_vec, 0 + }; + static const GVecGen3 ops[4] = { + { .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_b, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_8 }, + { .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_h, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_uaba_i32, + .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_s, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_uaba_i64, + .fniv = gen_uaba_vec, + .fno = gen_helper_gvec_uaba_d, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .opt_opc = vecop_list, + .load_dest = true, + .vece = MO_64 }, + }; + tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); +} + +static void do_coproc_insn(DisasContext *s, int cpnum, int is64, + int opc1, int crn, int crm, int opc2, + bool isread, int rt, int rt2) +{ + uint32_t key = ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key); + TCGv_ptr tcg_ri = NULL; + bool need_exit_tb; + 
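    /*
     * Summary of the code below: the (cp, is64, ns, crn, crm, opc1, opc2)
     * tuple is turned into a key and looked up in the cpreg table; a trap
     * syndrome is precomputed; static permissions are checked with
     * cp_access_ok(); any remaining checks (accessfn, HSTR from EL0, FGT,
     * XScale CPAR) are emitted as a runtime helper call; finally the
     * access is performed either through the readfn/writefn helpers or
     * directly via ri->fieldoffset.
     */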
uint32_t syndrome; + + /* + * Note that since we are an implementation which takes an + * exception on a trapped conditional instruction only if the + * instruction passes its condition code check, we can take + * advantage of the clause in the ARM ARM that allows us to set + * the COND field in the instruction to 0xE in all cases. + * We could fish the actual condition out of the insn (ARM) + * or the condexec bits (Thumb) but it isn't necessary. + */ + switch (cpnum) { + case 14: + if (is64) { + syndrome = syn_cp14_rrt_trap(1, 0xe, opc1, crm, rt, rt2, + isread, false); + } else { + syndrome = syn_cp14_rt_trap(1, 0xe, opc1, opc2, crn, crm, + rt, isread, false); + } + break; + case 15: + if (is64) { + syndrome = syn_cp15_rrt_trap(1, 0xe, opc1, crm, rt, rt2, + isread, false); + } else { + syndrome = syn_cp15_rt_trap(1, 0xe, opc1, opc2, crn, crm, + rt, isread, false); + } + break; + default: + /* + * ARMv8 defines that only coprocessors 14 and 15 exist, + * so this can only happen if this is an ARMv7 or earlier CPU, + * in which case the syndrome information won't actually be + * guest visible. + */ + assert(!arm_dc_feature(s, ARM_FEATURE_V8)); + syndrome = syn_uncategorized(); + break; + } + + if (s->hstr_active && cpnum == 15 && s->current_el == 1) { + /* + * At EL1, check for a HSTR_EL2 trap, which must take precedence + * over the UNDEF for "no such register" or the UNDEF for "access + * permissions forbid this EL1 access". HSTR_EL2 traps from EL0 + * only happen if the cpreg doesn't UNDEF at EL0, so we do those in + * access_check_cp_reg(), after the checks for whether the access + * configurably trapped to EL1. + */ + uint32_t maskbit = is64 ? crm : crn; + + if (maskbit != 4 && maskbit != 14) { + /* T4 and T14 are RES0 so never cause traps */ + TCGv_i32 t; + DisasLabel over = gen_disas_label(s); + + t = load_cpu_offset(offsetoflow32(CPUARMState, cp15.hstr_el2)); + tcg_gen_andi_i32(t, t, 1u << maskbit); + tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); + tcg_temp_free_i32(t); + + gen_exception_insn(s, 0, EXCP_UDEF, syndrome); + set_disas_label(s, over); + } + } + + if (!ri) { + /* + * Unknown register; this might be a guest error or a QEMU + * unimplemented feature. + */ + if (is64) { + qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 " + "64 bit system register cp:%d opc1: %d crm:%d " + "(%s)\n", + isread ? "read" : "write", cpnum, opc1, crm, + s->ns ? "non-secure" : "secure"); + } else { + qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 " + "system register cp:%d opc1:%d crn:%d crm:%d " + "opc2:%d (%s)\n", + isread ? "read" : "write", cpnum, opc1, crn, + crm, opc2, s->ns ? "non-secure" : "secure"); + } + unallocated_encoding(s); + return; + } + + /* Check access permissions */ + if (!cp_access_ok(s->current_el, ri, isread)) { + unallocated_encoding(s); + return; + } + + if ((s->hstr_active && s->current_el == 0) || ri->accessfn || + (ri->fgt && s->fgt_active) || + (arm_dc_feature(s, ARM_FEATURE_XSCALE) && cpnum < 14)) { + /* + * Emit code to perform further access permissions checks at + * runtime; this may result in an exception. + * Note that on XScale all cp0..c13 registers do an access check + * call in order to handle c15_cpar. 
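 * The helper also returns the looked-up ARMCPRegInfo pointer in
 * tcg_ri, which lets the read/write code below skip its
 * gen_lookup_cp_reg() call when a runtime check was emitted.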
+ */ + gen_set_condexec(s); + gen_update_pc(s, 0); + tcg_ri = tcg_temp_new_ptr(); + gen_helper_access_check_cp_reg(tcg_ri, cpu_env, + tcg_constant_i32(key), + tcg_constant_i32(syndrome), + tcg_constant_i32(isread)); + } else if (ri->type & ARM_CP_RAISES_EXC) { + /* + * The readfn or writefn might raise an exception; + * synchronize the CPU state in case it does. + */ + gen_set_condexec(s); + gen_update_pc(s, 0); + } + + /* Handle special cases first */ + switch (ri->type & ARM_CP_SPECIAL_MASK) { + case 0: + break; + case ARM_CP_NOP: + goto exit; + case ARM_CP_WFI: + if (isread) { + unallocated_encoding(s); + } else { + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_WFI; + } + goto exit; + default: + g_assert_not_reached(); + } + + if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { + gen_io_start(); + } + + if (isread) { + /* Read */ + if (is64) { + TCGv_i64 tmp64; + TCGv_i32 tmp; + if (ri->type & ARM_CP_CONST) { + tmp64 = tcg_constant_i64(ri->resetvalue); + } else if (ri->readfn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + tmp64 = tcg_temp_new_i64(); + gen_helper_get_cp_reg64(tmp64, cpu_env, tcg_ri); + } else { + tmp64 = tcg_temp_new_i64(); + tcg_gen_ld_i64(tmp64, cpu_env, ri->fieldoffset); + } + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, tmp64); + store_reg(s, rt, tmp); + tmp = tcg_temp_new_i32(); + tcg_gen_extrh_i64_i32(tmp, tmp64); + tcg_temp_free_i64(tmp64); + store_reg(s, rt2, tmp); + } else { + TCGv_i32 tmp; + if (ri->type & ARM_CP_CONST) { + tmp = tcg_constant_i32(ri->resetvalue); + } else if (ri->readfn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + tmp = tcg_temp_new_i32(); + gen_helper_get_cp_reg(tmp, cpu_env, tcg_ri); + } else { + tmp = load_cpu_offset(ri->fieldoffset); + } + if (rt == 15) { + /* Destination register of r15 for 32 bit loads sets + * the condition codes from the high 4 bits of the value + */ + gen_set_nzcv(tmp); + tcg_temp_free_i32(tmp); + } else { + store_reg(s, rt, tmp); + } + } + } else { + /* Write */ + if (ri->type & ARM_CP_CONST) { + /* If not forbidden by access permissions, treat as WI */ + goto exit; + } + + if (is64) { + TCGv_i32 tmplo, tmphi; + TCGv_i64 tmp64 = tcg_temp_new_i64(); + tmplo = load_reg(s, rt); + tmphi = load_reg(s, rt2); + tcg_gen_concat_i32_i64(tmp64, tmplo, tmphi); + tcg_temp_free_i32(tmplo); + tcg_temp_free_i32(tmphi); + if (ri->writefn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + gen_helper_set_cp_reg64(cpu_env, tcg_ri, tmp64); + } else { + tcg_gen_st_i64(tmp64, cpu_env, ri->fieldoffset); + } + tcg_temp_free_i64(tmp64); + } else { + TCGv_i32 tmp = load_reg(s, rt); + if (ri->writefn) { + if (!tcg_ri) { + tcg_ri = gen_lookup_cp_reg(key); + } + gen_helper_set_cp_reg(cpu_env, tcg_ri, tmp); + tcg_temp_free_i32(tmp); + } else { + store_cpu_offset(tmp, ri->fieldoffset, 4); + } + } + } + + /* I/O operations must end the TB here (whether read or write) */ + need_exit_tb = ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && + (ri->type & ARM_CP_IO)); + + if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) { + /* + * A write to any coprocessor register that ends a TB + * must rebuild the hflags for the next TB. + */ + gen_rebuild_hflags(s, ri->type & ARM_CP_NEWEL); + /* + * We default to ending the TB on a coprocessor register write, + * but allow this to be suppressed by the register definition + * (usually only necessary to work around guest bugs). 
+ */ + need_exit_tb = true; + } + if (need_exit_tb) { + gen_lookup_tb(s); + } + + exit: + if (tcg_ri) { + tcg_temp_free_ptr(tcg_ri); + } +} + +/* Decode XScale DSP or iWMMXt insn (in the copro space, cp=0 or 1) */ +static void disas_xscale_insn(DisasContext *s, uint32_t insn) +{ + int cpnum = (insn >> 8) & 0xf; + + if (extract32(s->c15_cpar, cpnum, 1) == 0) { + unallocated_encoding(s); + } else if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { + if (disas_iwmmxt_insn(s, insn)) { + unallocated_encoding(s); + } + } else if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) { + if (disas_dsp_insn(s, insn)) { + unallocated_encoding(s); + } + } +} + +/* Store a 64-bit value to a register pair. Clobbers val. */ +static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val) +{ + TCGv_i32 tmp; + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, val); + store_reg(s, rlow, tmp); + tmp = tcg_temp_new_i32(); + tcg_gen_extrh_i64_i32(tmp, val); + store_reg(s, rhigh, tmp); +} + +/* load and add a 64-bit value from a register pair. */ +static void gen_addq(DisasContext *s, TCGv_i64 val, int rlow, int rhigh) +{ + TCGv_i64 tmp; + TCGv_i32 tmpl; + TCGv_i32 tmph; + + /* Load 64-bit value rd:rn. */ + tmpl = load_reg(s, rlow); + tmph = load_reg(s, rhigh); + tmp = tcg_temp_new_i64(); + tcg_gen_concat_i32_i64(tmp, tmpl, tmph); + tcg_temp_free_i32(tmpl); + tcg_temp_free_i32(tmph); + tcg_gen_add_i64(val, val, tmp); + tcg_temp_free_i64(tmp); +} + +/* Set N and Z flags from hi|lo. */ +static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi) +{ + tcg_gen_mov_i32(cpu_NF, hi); + tcg_gen_or_i32(cpu_ZF, lo, hi); +} + +/* Load/Store exclusive instructions are implemented by remembering + the value/address loaded, and seeing if these are the same + when the store is performed. This should be sufficient to implement + the architecturally mandated semantics, and avoids having to monitor + regular stores. The compare vs the remembered value is done during + the cmpxchg operation, but we must compare the addresses manually. */ +static void gen_load_exclusive(DisasContext *s, int rt, int rt2, + TCGv_i32 addr, int size) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + MemOp opc = size | MO_ALIGN | s->be_data; + + s->is_ldex = true; + + if (size == 3) { + TCGv_i32 tmp2 = tcg_temp_new_i32(); + TCGv_i64 t64 = tcg_temp_new_i64(); + + /* + * For AArch32, architecturally the 32-bit word at the lowest + * address is always Rt and the one at addr+4 is Rt2, even if + * the CPU is big-endian. That means we don't want to do a + * gen_aa32_ld_i64(), which checks SCTLR_B as if for an + * architecturally 64-bit access, but instead do a 64-bit access + * using MO_BE if appropriate and then split the two halves. 
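 * For example, with MO_BE the loaded 64-bit value is
 * <word at addr>:<word at addr + 4> in its high:low halves, so the
 * high half is extracted into Rt and the low half into Rt2; for
 * little-endian data the extraction is simply the other way round.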
+ */ + TCGv taddr = gen_aa32_addr(s, addr, opc); + + tcg_gen_qemu_ld_i64(t64, taddr, get_mem_index(s), opc); + tcg_temp_free(taddr); + tcg_gen_mov_i64(cpu_exclusive_val, t64); + if (s->be_data == MO_BE) { + tcg_gen_extr_i64_i32(tmp2, tmp, t64); + } else { + tcg_gen_extr_i64_i32(tmp, tmp2, t64); + } + tcg_temp_free_i64(t64); + + store_reg(s, rt2, tmp2); + } else { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc); + tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp); + } + + store_reg(s, rt, tmp); + tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr); +} + +static void gen_clrex(DisasContext *s) +{ + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, + TCGv_i32 addr, int size) +{ + TCGv_i32 t0, t1, t2; + TCGv_i64 extaddr; + TCGv taddr; + TCGLabel *done_label; + TCGLabel *fail_label; + MemOp opc = size | MO_ALIGN | s->be_data; + + /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) { + [addr] = {Rt}; + {Rd} = 0; + } else { + {Rd} = 1; + } */ + fail_label = gen_new_label(); + done_label = gen_new_label(); + extaddr = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(extaddr, addr); + tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label); + tcg_temp_free_i64(extaddr); + + taddr = gen_aa32_addr(s, addr, opc); + t0 = tcg_temp_new_i32(); + t1 = load_reg(s, rt); + if (size == 3) { + TCGv_i64 o64 = tcg_temp_new_i64(); + TCGv_i64 n64 = tcg_temp_new_i64(); + + t2 = load_reg(s, rt2); + + /* + * For AArch32, architecturally the 32-bit word at the lowest + * address is always Rt and the one at addr+4 is Rt2, even if + * the CPU is big-endian. Since we're going to treat this as a + * single 64-bit BE store, we need to put the two halves in the + * opposite order for BE to LE, so that they end up in the right + * places. We don't want gen_aa32_st_i64, because that checks + * SCTLR_B as if for an architectural 64-bit access. + */ + if (s->be_data == MO_BE) { + tcg_gen_concat_i32_i64(n64, t2, t1); + } else { + tcg_gen_concat_i32_i64(n64, t1, t2); + } + tcg_temp_free_i32(t2); + + tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64, + get_mem_index(s), opc); + tcg_temp_free_i64(n64); + + tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val); + tcg_gen_extrl_i64_i32(t0, o64); + + tcg_temp_free_i64(o64); + } else { + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val); + tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc); + tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2); + tcg_temp_free_i32(t2); + } + tcg_temp_free_i32(t1); + tcg_temp_free(taddr); + tcg_gen_mov_i32(cpu_R[rd], t0); + tcg_temp_free_i32(t0); + tcg_gen_br(done_label); + + gen_set_label(fail_label); + tcg_gen_movi_i32(cpu_R[rd], 1); + gen_set_label(done_label); + tcg_gen_movi_i64(cpu_exclusive_addr, -1); +} + +/* gen_srs: + * @env: CPUARMState + * @s: DisasContext + * @mode: mode field from insn (which stack to store to) + * @amode: addressing mode (DA/IA/DB/IB), encoded as per P,U bits in ARM insn + * @writeback: true if writeback bit set + * + * Generate code for the SRS (Store Return State) insn. 
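 * The first word (the LR) is stored at SP_<mode> plus an amode-dependent
 * offset (DA: -4, IA: 0, DB: -8, IB: +4), the SPSR goes 4 bytes above
 * that, and if writeback is requested a further offset
 * (DA: -8, IA: +4, DB: -4, IB: 0) is applied before the banked SP is
 * written back, matching the two switches below.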
+ */ +static void gen_srs(DisasContext *s, + uint32_t mode, uint32_t amode, bool writeback) +{ + int32_t offset; + TCGv_i32 addr, tmp; + bool undef = false; + + /* SRS is: + * - trapped to EL3 if EL3 is AArch64 and we are at Secure EL1 + * and specified mode is monitor mode + * - UNDEFINED in Hyp mode + * - UNPREDICTABLE in User or System mode + * - UNPREDICTABLE if the specified mode is: + * -- not implemented + * -- not a valid mode number + * -- a mode that's at a higher exception level + * -- Monitor, if we are Non-secure + * For the UNPREDICTABLE cases we choose to UNDEF. + */ + if (s->current_el == 1 && !s->ns && mode == ARM_CPU_MODE_MON) { + gen_exception_insn_el(s, 0, EXCP_UDEF, syn_uncategorized(), 3); + return; + } + + if (s->current_el == 0 || s->current_el == 2) { + undef = true; + } + + switch (mode) { + case ARM_CPU_MODE_USR: + case ARM_CPU_MODE_FIQ: + case ARM_CPU_MODE_IRQ: + case ARM_CPU_MODE_SVC: + case ARM_CPU_MODE_ABT: + case ARM_CPU_MODE_UND: + case ARM_CPU_MODE_SYS: + break; + case ARM_CPU_MODE_HYP: + if (s->current_el == 1 || !arm_dc_feature(s, ARM_FEATURE_EL2)) { + undef = true; + } + break; + case ARM_CPU_MODE_MON: + /* No need to check specifically for "are we non-secure" because + * we've already made EL0 UNDEF and handled the trap for S-EL1; + * so if this isn't EL3 then we must be non-secure. + */ + if (s->current_el != 3) { + undef = true; + } + break; + default: + undef = true; + } + + if (undef) { + unallocated_encoding(s); + return; + } + + addr = tcg_temp_new_i32(); + /* get_r13_banked() will raise an exception if called from System mode */ + gen_set_condexec(s); + gen_update_pc(s, 0); + gen_helper_get_r13_banked(addr, cpu_env, tcg_constant_i32(mode)); + switch (amode) { + case 0: /* DA */ + offset = -4; + break; + case 1: /* IA */ + offset = 0; + break; + case 2: /* DB */ + offset = -8; + break; + case 3: /* IB */ + offset = 4; + break; + default: + g_assert_not_reached(); + } + tcg_gen_addi_i32(addr, addr, offset); + tmp = load_reg(s, 14); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + tmp = load_cpu_field(spsr); + tcg_gen_addi_i32(addr, addr, 4); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + if (writeback) { + switch (amode) { + case 0: + offset = -8; + break; + case 1: + offset = 4; + break; + case 2: + offset = -4; + break; + case 3: + offset = 0; + break; + default: + g_assert_not_reached(); + } + tcg_gen_addi_i32(addr, addr, offset); + gen_helper_set_r13_banked(cpu_env, tcg_constant_i32(mode), addr); + } + tcg_temp_free_i32(addr); + s->base.is_jmp = DISAS_UPDATE_EXIT; +} + +/* Skip this instruction if the ARM condition is false */ +static void arm_skip_unless(DisasContext *s, uint32_t cond) +{ + arm_gen_condlabel(s); + arm_gen_test_cc(cond ^ 1, s->condlabel.label); +} + + +/* + * Constant expanders used by T16/T32 decode + */ + +/* Return only the rotation part of T32ExpandImm. */ +static int t32_expandimm_rot(DisasContext *s, int x) +{ + return x & 0xc00 ? extract32(x, 7, 5) : 0; +} + +/* Return the unrotated immediate from T32ExpandImm. */ +static int t32_expandimm_imm(DisasContext *s, int x) +{ + int imm = extract32(x, 0, 8); + + switch (extract32(x, 8, 4)) { + case 0: /* XY */ + /* Nothing to do. */ + break; + case 1: /* 00XY00XY */ + imm *= 0x00010001; + break; + case 2: /* XY00XY00 */ + imm *= 0x01000100; + break; + case 3: /* XYXYXYXY */ + imm *= 0x01010101; + break; + default: + /* Rotated constant. 
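 * For example imm12 = 0x4ff has bits [11:7] = 0b01001 = 9 and
 * bits [6:0] = 0x7f, so the expanded constant is
 * ror32(0x7f | 0x80, 9) = ror32(0xff, 9) = 0x7f800000; the rotation
 * amount itself is returned by t32_expandimm_rot() above, while this
 * function only supplies the unrotated 0xff.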
*/ + imm |= 0x80; + break; + } + return imm; +} + +static int t32_branch24(DisasContext *s, int x) +{ + /* Convert J1:J2 at x[22:21] to I2:I1, which involves I=J^~S. */ + x ^= !(x < 0) * (3 << 21); + /* Append the final zero. */ + return x << 1; +} + +static int t16_setflags(DisasContext *s) +{ + return s->condexec_mask == 0; +} + +static int t16_push_list(DisasContext *s, int x) +{ + return (x & 0xff) | (x & 0x100) << (14 - 8); +} + +static int t16_pop_list(DisasContext *s, int x) +{ + return (x & 0xff) | (x & 0x100) << (15 - 8); +} + +/* + * Include the generated decoders. + */ + +#include "decode-a32.c.inc" +#include "decode-a32-uncond.c.inc" +#include "decode-t32.c.inc" +#include "decode-t16.c.inc" + +static bool valid_cp(DisasContext *s, int cp) +{ + /* + * Return true if this coprocessor field indicates something + * that's really a possible coprocessor. + * For v7 and earlier, coprocessors 8..15 were reserved for Arm use, + * and of those only cp14 and cp15 were used for registers. + * cp10 and cp11 were used for VFP and Neon, whose decode is + * dealt with elsewhere. With the advent of fp16, cp9 is also + * now part of VFP. + * For v8A and later, the encoding has been tightened so that + * only cp14 and cp15 are valid, and other values aren't considered + * to be in the coprocessor-instruction space at all. v8M still + * permits coprocessors 0..7. + * For XScale, we must not decode the XScale cp0, cp1 space as + * a standard coprocessor insn, because we want to fall through to + * the legacy disas_xscale_insn() decoder after decodetree is done. + */ + if (arm_dc_feature(s, ARM_FEATURE_XSCALE) && (cp == 0 || cp == 1)) { + return false; + } + + if (arm_dc_feature(s, ARM_FEATURE_V8) && + !arm_dc_feature(s, ARM_FEATURE_M)) { + return cp >= 14; + } + return cp < 8 || cp >= 14; +} + +static bool trans_MCR(DisasContext *s, arg_MCR *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2, + false, a->rt, 0); + return true; +} + +static bool trans_MRC(DisasContext *s, arg_MRC *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2, + true, a->rt, 0); + return true; +} + +static bool trans_MCRR(DisasContext *s, arg_MCRR *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0, + false, a->rt, a->rt2); + return true; +} + +static bool trans_MRRC(DisasContext *s, arg_MRRC *a) +{ + if (!valid_cp(s, a->cp)) { + return false; + } + do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0, + true, a->rt, a->rt2); + return true; +} + +/* Helpers to swap operands for reverse-subtract. */ +static void gen_rsb(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_sub_i32(dst, b, a); +} + +static void gen_rsb_CC(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b) +{ + gen_sub_CC(dst, b, a); +} + +static void gen_rsc(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b) +{ + gen_sub_carry(dest, b, a); +} + +static void gen_rsc_CC(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b) +{ + gen_sbc_CC(dest, b, a); +} + +/* + * Helpers for the data processing routines. + * + * After the computation store the results back. + * This may be suppressed altogether (STREG_NONE), require a runtime + * check against the stack limits (STREG_SP_CHECK), or generate an + * exception return. Oh, or store into a register. + * + * Always return true, indicating success for a trans_* function. 
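 * Concretely: STREG_NONE is used by the compare ops (TST/TEQ/CMN/CMP),
 * STREG_SP_CHECK when the destination is SP and the v8M stack-limit
 * check may apply, STREG_EXC_RET for the SUBS PC, LR style exception
 * returns, and STREG_NORMAL for everything else.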
+ */ +typedef enum { + STREG_NONE, + STREG_NORMAL, + STREG_SP_CHECK, + STREG_EXC_RET, +} StoreRegKind; + +static bool store_reg_kind(DisasContext *s, int rd, + TCGv_i32 val, StoreRegKind kind) +{ + switch (kind) { + case STREG_NONE: + tcg_temp_free_i32(val); + return true; + case STREG_NORMAL: + /* See ALUWritePC: Interworking only from a32 mode. */ + if (s->thumb) { + store_reg(s, rd, val); + } else { + store_reg_bx(s, rd, val); + } + return true; + case STREG_SP_CHECK: + store_sp_checked(s, val); + return true; + case STREG_EXC_RET: + gen_exception_return(s, val); + return true; + } + g_assert_not_reached(); +} + +/* + * Data Processing (register) + * + * Operate, with set flags, one register source, + * one immediate shifted register source, and a destination. + */ +static bool op_s_rrr_shi(DisasContext *s, arg_s_rrr_shi *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1, tmp2; + + tmp2 = load_reg(s, a->rm); + gen_arm_shift_im(tmp2, a->shty, a->shim, logic_cc); + tmp1 = load_reg(s, a->rn); + + gen(tmp1, tmp1, tmp2); + tcg_temp_free_i32(tmp2); + + if (logic_cc) { + gen_logic_CC(tmp1); + } + return store_reg_kind(s, a->rd, tmp1, kind); +} + +static bool op_s_rxr_shi(DisasContext *s, arg_s_rrr_shi *a, + void (*gen)(TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp; + + tmp = load_reg(s, a->rm); + gen_arm_shift_im(tmp, a->shty, a->shim, logic_cc); + + gen(tmp, tmp); + if (logic_cc) { + gen_logic_CC(tmp); + } + return store_reg_kind(s, a->rd, tmp, kind); +} + +/* + * Data-processing (register-shifted register) + * + * Operate, with set flags, one register source, + * one register shifted register source, and a destination. + */ +static bool op_s_rrr_shr(DisasContext *s, arg_s_rrr_shr *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1, tmp2; + + tmp1 = load_reg(s, a->rs); + tmp2 = load_reg(s, a->rm); + gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc); + tmp1 = load_reg(s, a->rn); + + gen(tmp1, tmp1, tmp2); + tcg_temp_free_i32(tmp2); + + if (logic_cc) { + gen_logic_CC(tmp1); + } + return store_reg_kind(s, a->rd, tmp1, kind); +} + +static bool op_s_rxr_shr(DisasContext *s, arg_s_rrr_shr *a, + void (*gen)(TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1, tmp2; + + tmp1 = load_reg(s, a->rs); + tmp2 = load_reg(s, a->rm); + gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc); + + gen(tmp2, tmp2); + if (logic_cc) { + gen_logic_CC(tmp2); + } + return store_reg_kind(s, a->rd, tmp2, kind); +} + +/* + * Data-processing (immediate) + * + * Operate, with set flags, one register source, + * one rotated immediate, and a destination. + * + * Note that logic_cc && a->rot setting CF based on the msb of the + * immediate is the reason why we must pass in the unrotated form + * of the immediate. 
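 * For example, for a flag-setting logical op with imm12 = 0x4ff the
 * decoder supplies rot = 9 and imm = 0xff; the rotated constant is
 * 0x7f800000, so CF is set from its bit 31, i.e. cleared.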
+ */ +static bool op_s_rri_rot(DisasContext *s, arg_s_rri_rot *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp1; + uint32_t imm; + + imm = ror32(a->imm, a->rot); + if (logic_cc && a->rot) { + tcg_gen_movi_i32(cpu_CF, imm >> 31); + } + tmp1 = load_reg(s, a->rn); + + gen(tmp1, tmp1, tcg_constant_i32(imm)); + + if (logic_cc) { + gen_logic_CC(tmp1); + } + return store_reg_kind(s, a->rd, tmp1, kind); +} + +static bool op_s_rxi_rot(DisasContext *s, arg_s_rri_rot *a, + void (*gen)(TCGv_i32, TCGv_i32), + int logic_cc, StoreRegKind kind) +{ + TCGv_i32 tmp; + uint32_t imm; + + imm = ror32(a->imm, a->rot); + if (logic_cc && a->rot) { + tcg_gen_movi_i32(cpu_CF, imm >> 31); + } + + tmp = tcg_temp_new_i32(); + gen(tmp, tcg_constant_i32(imm)); + + if (logic_cc) { + gen_logic_CC(tmp); + } + return store_reg_kind(s, a->rd, tmp, kind); +} + +#define DO_ANY3(NAME, OP, L, K) \ + static bool trans_##NAME##_rrri(DisasContext *s, arg_s_rrr_shi *a) \ + { StoreRegKind k = (K); return op_s_rrr_shi(s, a, OP, L, k); } \ + static bool trans_##NAME##_rrrr(DisasContext *s, arg_s_rrr_shr *a) \ + { StoreRegKind k = (K); return op_s_rrr_shr(s, a, OP, L, k); } \ + static bool trans_##NAME##_rri(DisasContext *s, arg_s_rri_rot *a) \ + { StoreRegKind k = (K); return op_s_rri_rot(s, a, OP, L, k); } + +#define DO_ANY2(NAME, OP, L, K) \ + static bool trans_##NAME##_rxri(DisasContext *s, arg_s_rrr_shi *a) \ + { StoreRegKind k = (K); return op_s_rxr_shi(s, a, OP, L, k); } \ + static bool trans_##NAME##_rxrr(DisasContext *s, arg_s_rrr_shr *a) \ + { StoreRegKind k = (K); return op_s_rxr_shr(s, a, OP, L, k); } \ + static bool trans_##NAME##_rxi(DisasContext *s, arg_s_rri_rot *a) \ + { StoreRegKind k = (K); return op_s_rxi_rot(s, a, OP, L, k); } + +#define DO_CMP2(NAME, OP, L) \ + static bool trans_##NAME##_xrri(DisasContext *s, arg_s_rrr_shi *a) \ + { return op_s_rrr_shi(s, a, OP, L, STREG_NONE); } \ + static bool trans_##NAME##_xrrr(DisasContext *s, arg_s_rrr_shr *a) \ + { return op_s_rrr_shr(s, a, OP, L, STREG_NONE); } \ + static bool trans_##NAME##_xri(DisasContext *s, arg_s_rri_rot *a) \ + { return op_s_rri_rot(s, a, OP, L, STREG_NONE); } + +DO_ANY3(AND, tcg_gen_and_i32, a->s, STREG_NORMAL) +DO_ANY3(EOR, tcg_gen_xor_i32, a->s, STREG_NORMAL) +DO_ANY3(ORR, tcg_gen_or_i32, a->s, STREG_NORMAL) +DO_ANY3(BIC, tcg_gen_andc_i32, a->s, STREG_NORMAL) + +DO_ANY3(RSB, a->s ? gen_rsb_CC : gen_rsb, false, STREG_NORMAL) +DO_ANY3(ADC, a->s ? gen_adc_CC : gen_add_carry, false, STREG_NORMAL) +DO_ANY3(SBC, a->s ? gen_sbc_CC : gen_sub_carry, false, STREG_NORMAL) +DO_ANY3(RSC, a->s ? gen_rsc_CC : gen_rsc, false, STREG_NORMAL) + +DO_CMP2(TST, tcg_gen_and_i32, true) +DO_CMP2(TEQ, tcg_gen_xor_i32, true) +DO_CMP2(CMN, gen_add_CC, false) +DO_CMP2(CMP, gen_sub_CC, false) + +DO_ANY3(ADD, a->s ? gen_add_CC : tcg_gen_add_i32, false, + a->rd == 13 && a->rn == 13 ? STREG_SP_CHECK : STREG_NORMAL) + +/* + * Note for the computation of StoreRegKind we return out of the + * middle of the functions that are expanded by DO_ANY3, and that + * we modify a->s via that parameter before it is used by OP. + */ +DO_ANY3(SUB, a->s ? gen_sub_CC : tcg_gen_sub_i32, false, + ({ + StoreRegKind ret = STREG_NORMAL; + if (a->rd == 15 && a->s) { + /* + * See ALUExceptionReturn: + * In User mode, UNPREDICTABLE; we choose UNDEF. + * In Hyp mode, UNDEFINED. + */ + if (IS_USER(s) || s->current_el == 2) { + unallocated_encoding(s); + return true; + } + /* There is no writeback of nzcv to PSTATE. 
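 * (This is the SUBS PC, LR exception-return form: CPSR, including the
 * flags, is restored from SPSR by the exception-return path in
 * gen_exception_return(), so the ALU NZCV result is simply not
 * computed; hence a->s is cleared below.)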
*/ + a->s = 0; + ret = STREG_EXC_RET; + } else if (a->rd == 13 && a->rn == 13) { + ret = STREG_SP_CHECK; + } + ret; + })) + +DO_ANY2(MOV, tcg_gen_mov_i32, a->s, + ({ + StoreRegKind ret = STREG_NORMAL; + if (a->rd == 15 && a->s) { + /* + * See ALUExceptionReturn: + * In User mode, UNPREDICTABLE; we choose UNDEF. + * In Hyp mode, UNDEFINED. + */ + if (IS_USER(s) || s->current_el == 2) { + unallocated_encoding(s); + return true; + } + /* There is no writeback of nzcv to PSTATE. */ + a->s = 0; + ret = STREG_EXC_RET; + } else if (a->rd == 13) { + ret = STREG_SP_CHECK; + } + ret; + })) + +DO_ANY2(MVN, tcg_gen_not_i32, a->s, STREG_NORMAL) + +/* + * ORN is only available with T32, so there is no register-shifted-register + * form of the insn. Using the DO_ANY3 macro would create an unused function. + */ +static bool trans_ORN_rrri(DisasContext *s, arg_s_rrr_shi *a) +{ + return op_s_rrr_shi(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL); +} + +static bool trans_ORN_rri(DisasContext *s, arg_s_rri_rot *a) +{ + return op_s_rri_rot(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL); +} + +#undef DO_ANY3 +#undef DO_ANY2 +#undef DO_CMP2 + +static bool trans_ADR(DisasContext *s, arg_ri *a) +{ + store_reg_bx(s, a->rd, add_reg_for_lit(s, 15, a->imm)); + return true; +} + +static bool trans_MOVW(DisasContext *s, arg_MOVW *a) +{ + if (!ENABLE_ARCH_6T2) { + return false; + } + + store_reg(s, a->rd, tcg_constant_i32(a->imm)); + return true; +} + +static bool trans_MOVT(DisasContext *s, arg_MOVW *a) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_6T2) { + return false; + } + + tmp = load_reg(s, a->rd); + tcg_gen_ext16u_i32(tmp, tmp); + tcg_gen_ori_i32(tmp, tmp, a->imm << 16); + store_reg(s, a->rd, tmp); + return true; +} + +/* + * v8.1M MVE wide-shifts + */ +static bool do_mve_shl_ri(DisasContext *s, arg_mve_shl_ri *a, + WideShiftImmFn *fn) +{ + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (a->rdahi == 15) { + /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rdahi == 13) { + /* RdaHi == 13 is UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + if (a->shim == 0) { + a->shim = 32; + } + + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + + fn(rda, rda, a->shim); + + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + + return true; +} + +static bool trans_ASRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, tcg_gen_sari_i64); +} + +static bool trans_LSLL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, tcg_gen_shli_i64); +} + +static bool trans_LSRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, tcg_gen_shri_i64); +} + +static void gen_mve_sqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift) +{ + gen_helper_mve_sqshll(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_SQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, gen_mve_sqshll); +} + +static void gen_mve_uqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift) +{ + gen_helper_mve_uqshll(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_UQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + 
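    /*
     * UQSHLL/SQSHLL are the saturating wide shifts: unlike the plain
     * tcg_gen_shli_i64/sari_i64/shri_i64 used for LSLL/ASRL/LSRL above,
     * their helpers take cpu_env, presumably because saturation needs
     * to update the sticky FPSCR.QC bit.
     */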
return do_mve_shl_ri(s, a, gen_mve_uqshll); +} + +static bool trans_SRSHRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, gen_srshr64_i64); +} + +static bool trans_URSHRL_ri(DisasContext *s, arg_mve_shl_ri *a) +{ + return do_mve_shl_ri(s, a, gen_urshr64_i64); +} + +static bool do_mve_shl_rr(DisasContext *s, arg_mve_shl_rr *a, WideShiftFn *fn) +{ + TCGv_i64 rda; + TCGv_i32 rdalo, rdahi; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (a->rdahi == 15) { + /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rdahi == 13 || a->rm == 13 || a->rm == 15 || + a->rm == a->rdahi || a->rm == a->rdalo) { + /* These rdahi/rdalo/rm cases are UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + rda = tcg_temp_new_i64(); + rdalo = load_reg(s, a->rdalo); + rdahi = load_reg(s, a->rdahi); + tcg_gen_concat_i32_i64(rda, rdalo, rdahi); + + /* The helper takes care of the sign-extension of the low 8 bits of Rm */ + fn(rda, cpu_env, rda, cpu_R[a->rm]); + + tcg_gen_extrl_i64_i32(rdalo, rda); + tcg_gen_extrh_i64_i32(rdahi, rda); + store_reg(s, a->rdalo, rdalo); + store_reg(s, a->rdahi, rdahi); + tcg_temp_free_i64(rda); + + return true; +} + +static bool trans_LSLL_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_ushll); +} + +static bool trans_ASRL_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_sshrl); +} + +static bool trans_UQRSHLL64_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll); +} + +static bool trans_SQRSHRL64_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl); +} + +static bool trans_UQRSHLL48_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll48); +} + +static bool trans_SQRSHRL48_rr(DisasContext *s, arg_mve_shl_rr *a) +{ + return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl48); +} + +static bool do_mve_sh_ri(DisasContext *s, arg_mve_sh_ri *a, ShiftImmFn *fn) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rda == 13 || a->rda == 15) { + /* These rda cases are UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + if (a->shim == 0) { + a->shim = 32; + } + fn(cpu_R[a->rda], cpu_R[a->rda], a->shim); + + return true; +} + +static bool trans_URSHR_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_urshr32_i32); +} + +static bool trans_SRSHR_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_srshr32_i32); +} + +static void gen_mve_sqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift) +{ + gen_helper_mve_sqshl(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_SQSHL_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_mve_sqshl); +} + +static void gen_mve_uqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift) +{ + gen_helper_mve_uqshl(r, cpu_env, n, tcg_constant_i32(shift)); +} + +static bool trans_UQSHL_ri(DisasContext *s, arg_mve_sh_ri *a) +{ + return do_mve_sh_ri(s, a, gen_mve_uqshl); +} + +static bool do_mve_sh_rr(DisasContext *s, arg_mve_sh_rr *a, ShiftFn *fn) +{ + if 
(!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ + return false; + } + if (!dc_isar_feature(aa32_mve, s) || + !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || + a->rda == 13 || a->rda == 15 || a->rm == 13 || a->rm == 15 || + a->rm == a->rda) { + /* These rda/rm cases are UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + /* The helper takes care of the sign-extension of the low 8 bits of Rm */ + fn(cpu_R[a->rda], cpu_env, cpu_R[a->rda], cpu_R[a->rm]); + return true; +} + +static bool trans_SQRSHR_rr(DisasContext *s, arg_mve_sh_rr *a) +{ + return do_mve_sh_rr(s, a, gen_helper_mve_sqrshr); +} + +static bool trans_UQRSHL_rr(DisasContext *s, arg_mve_sh_rr *a) +{ + return do_mve_sh_rr(s, a, gen_helper_mve_uqrshl); +} + +/* + * Multiply and multiply accumulate + */ + +static bool op_mla(DisasContext *s, arg_s_rrrr *a, bool add) +{ + TCGv_i32 t1, t2; + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + tcg_gen_mul_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + if (add) { + t2 = load_reg(s, a->ra); + tcg_gen_add_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + } + if (a->s) { + gen_logic_CC(t1); + } + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_MUL(DisasContext *s, arg_MUL *a) +{ + return op_mla(s, a, false); +} + +static bool trans_MLA(DisasContext *s, arg_MLA *a) +{ + return op_mla(s, a, true); +} + +static bool trans_MLS(DisasContext *s, arg_MLS *a) +{ + TCGv_i32 t1, t2; + + if (!ENABLE_ARCH_6T2) { + return false; + } + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + tcg_gen_mul_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + t2 = load_reg(s, a->ra); + tcg_gen_sub_i32(t1, t2, t1); + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool op_mlal(DisasContext *s, arg_s_rrrr *a, bool uns, bool add) +{ + TCGv_i32 t0, t1, t2, t3; + + t0 = load_reg(s, a->rm); + t1 = load_reg(s, a->rn); + if (uns) { + tcg_gen_mulu2_i32(t0, t1, t0, t1); + } else { + tcg_gen_muls2_i32(t0, t1, t0, t1); + } + if (add) { + t2 = load_reg(s, a->ra); + t3 = load_reg(s, a->rd); + tcg_gen_add2_i32(t0, t1, t0, t1, t2, t3); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t3); + } + if (a->s) { + gen_logicq_cc(t0, t1); + } + store_reg(s, a->ra, t0); + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_UMULL(DisasContext *s, arg_UMULL *a) +{ + return op_mlal(s, a, true, false); +} + +static bool trans_SMULL(DisasContext *s, arg_SMULL *a) +{ + return op_mlal(s, a, false, false); +} + +static bool trans_UMLAL(DisasContext *s, arg_UMLAL *a) +{ + return op_mlal(s, a, true, true); +} + +static bool trans_SMLAL(DisasContext *s, arg_SMLAL *a) +{ + return op_mlal(s, a, false, true); +} + +static bool trans_UMAAL(DisasContext *s, arg_UMAAL *a) +{ + TCGv_i32 t0, t1, t2, zero; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t0 = load_reg(s, a->rm); + t1 = load_reg(s, a->rn); + tcg_gen_mulu2_i32(t0, t1, t0, t1); + zero = tcg_constant_i32(0); + t2 = load_reg(s, a->ra); + tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero); + tcg_temp_free_i32(t2); + t2 = load_reg(s, a->rd); + tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero); + tcg_temp_free_i32(t2); + store_reg(s, a->ra, t0); + store_reg(s, a->rd, t1); + return true; +} + +/* + * Saturating addition and subtraction + */ + +static bool op_qaddsub(DisasContext *s, arg_rrr *a, bool add, bool doub) +{ + TCGv_i32 t0, t1; + + if (s->thumb + ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_5TE) { + return false; + } + + t0 = load_reg(s, a->rm); + t1 = load_reg(s, a->rn); + if (doub) { + gen_helper_add_saturate(t1, cpu_env, t1, t1); + } + if (add) { + gen_helper_add_saturate(t0, cpu_env, t0, t1); + } else { + gen_helper_sub_saturate(t0, cpu_env, t0, t1); + } + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + return true; +} + +#define DO_QADDSUB(NAME, ADD, DOUB) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ +{ \ + return op_qaddsub(s, a, ADD, DOUB); \ +} + +DO_QADDSUB(QADD, true, false) +DO_QADDSUB(QSUB, false, false) +DO_QADDSUB(QDADD, true, true) +DO_QADDSUB(QDSUB, false, true) + +#undef DO_QADDSUB + +/* + * Halfword multiply and multiply accumulate + */ + +static bool op_smlaxxx(DisasContext *s, arg_rrrr *a, + int add_long, bool nt, bool mt) +{ + TCGv_i32 t0, t1, tl, th; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_5TE) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + gen_mulxy(t0, t1, nt, mt); + tcg_temp_free_i32(t1); + + switch (add_long) { + case 0: + store_reg(s, a->rd, t0); + break; + case 1: + t1 = load_reg(s, a->ra); + gen_helper_add_setq(t0, cpu_env, t0, t1); + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + break; + case 2: + tl = load_reg(s, a->ra); + th = load_reg(s, a->rd); + /* Sign-extend the 32-bit product to 64 bits. */ + t1 = tcg_temp_new_i32(); + tcg_gen_sari_i32(t1, t0, 31); + tcg_gen_add2_i32(tl, th, tl, th, t0, t1); + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); + store_reg(s, a->ra, tl); + store_reg(s, a->rd, th); + break; + default: + g_assert_not_reached(); + } + return true; +} + +#define DO_SMLAX(NAME, add, nt, mt) \ +static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \ +{ \ + return op_smlaxxx(s, a, add, nt, mt); \ +} + +DO_SMLAX(SMULBB, 0, 0, 0) +DO_SMLAX(SMULBT, 0, 0, 1) +DO_SMLAX(SMULTB, 0, 1, 0) +DO_SMLAX(SMULTT, 0, 1, 1) + +DO_SMLAX(SMLABB, 1, 0, 0) +DO_SMLAX(SMLABT, 1, 0, 1) +DO_SMLAX(SMLATB, 1, 1, 0) +DO_SMLAX(SMLATT, 1, 1, 1) + +DO_SMLAX(SMLALBB, 2, 0, 0) +DO_SMLAX(SMLALBT, 2, 0, 1) +DO_SMLAX(SMLALTB, 2, 1, 0) +DO_SMLAX(SMLALTT, 2, 1, 1) + +#undef DO_SMLAX + +static bool op_smlawx(DisasContext *s, arg_rrrr *a, bool add, bool mt) +{ + TCGv_i32 t0, t1; + + if (!ENABLE_ARCH_5TE) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + /* + * Since the nominal result is product<47:16>, shift the 16-bit + * input up by 16 bits, so that the result is at product<63:32>. + */ + if (mt) { + tcg_gen_andi_i32(t1, t1, 0xffff0000); + } else { + tcg_gen_shli_i32(t1, t1, 16); + } + tcg_gen_muls2_i32(t0, t1, t0, t1); + tcg_temp_free_i32(t0); + if (add) { + t0 = load_reg(s, a->ra); + gen_helper_add_setq(t1, cpu_env, t1, t0); + tcg_temp_free_i32(t0); + } + store_reg(s, a->rd, t1); + return true; +} + +#define DO_SMLAWX(NAME, add, mt) \ +static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \ +{ \ + return op_smlawx(s, a, add, mt); \ +} + +DO_SMLAWX(SMULWB, 0, 0) +DO_SMLAWX(SMULWT, 0, 1) +DO_SMLAWX(SMLAWB, 1, 0) +DO_SMLAWX(SMLAWT, 1, 1) + +#undef DO_SMLAWX + +/* + * MSR (immediate) and hints + */ + +static bool trans_YIELD(DisasContext *s, arg_YIELD *a) +{ + /* + * When running single-threaded TCG code, use the helper to ensure that + * the next round-robin scheduled vCPU gets a crack. When running in + * MTTCG we don't generate jumps to the helper as it won't affect the + * scheduling of other vCPUs. 
+ */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_YIELD; + } + return true; +} + +static bool trans_WFE(DisasContext *s, arg_WFE *a) +{ + /* + * When running single-threaded TCG code, use the helper to ensure that + * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we + * just skip this instruction. Currently the SEV/SEVL instructions, + * which are *one* of many ways to wake the CPU from WFE, are not + * implemented so we can't sleep like WFI does. + */ + if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_WFE; + } + return true; +} + +static bool trans_WFI(DisasContext *s, arg_WFI *a) +{ + /* For WFI, halt the vCPU until an IRQ. */ + gen_update_pc(s, curr_insn_len(s)); + s->base.is_jmp = DISAS_WFI; + return true; +} + +static bool trans_ESB(DisasContext *s, arg_ESB *a) +{ + /* + * For M-profile, minimal-RAS ESB can be a NOP. + * Without RAS, we must implement this as NOP. + */ + if (!arm_dc_feature(s, ARM_FEATURE_M) && dc_isar_feature(aa32_ras, s)) { + /* + * QEMU does not have a source of physical SErrors, + * so we are only concerned with virtual SErrors. + * The pseudocode in the ARM for this case is + * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then + * AArch32.vESBOperation(); + * Most of the condition can be evaluated at translation time. + * Test for EL2 present, and defer test for SEL2 to runtime. + */ + if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) { + gen_helper_vesb(cpu_env); + } + } + return true; +} + +static bool trans_NOP(DisasContext *s, arg_NOP *a) +{ + return true; +} + +static bool trans_MSR_imm(DisasContext *s, arg_MSR_imm *a) +{ + uint32_t val = ror32(a->imm, a->rot * 2); + uint32_t mask = msr_mask(s, a->mask, a->r); + + if (gen_set_psr_im(s, mask, a->r, val)) { + unallocated_encoding(s); + } + return true; +} + +/* + * Cyclic Redundancy Check + */ + +static bool op_crc32(DisasContext *s, arg_rrr *a, bool c, MemOp sz) +{ + TCGv_i32 t1, t2, t3; + + if (!dc_isar_feature(aa32_crc32, s)) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + switch (sz) { + case MO_8: + gen_uxtb(t2); + break; + case MO_16: + gen_uxth(t2); + break; + case MO_32: + break; + default: + g_assert_not_reached(); + } + t3 = tcg_constant_i32(1 << sz); + if (c) { + gen_helper_crc32c(t1, t1, t2, t3); + } else { + gen_helper_crc32(t1, t1, t2, t3); + } + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +#define DO_CRC32(NAME, c, sz) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ + { return op_crc32(s, a, c, sz); } + +DO_CRC32(CRC32B, false, MO_8) +DO_CRC32(CRC32H, false, MO_16) +DO_CRC32(CRC32W, false, MO_32) +DO_CRC32(CRC32CB, true, MO_8) +DO_CRC32(CRC32CH, true, MO_16) +DO_CRC32(CRC32CW, true, MO_32) + +#undef DO_CRC32 + +/* + * Miscellaneous instructions + */ + +static bool trans_MRS_bank(DisasContext *s, arg_MRS_bank *a) +{ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + gen_mrs_banked(s, a->r, a->sysm, a->rd); + return true; +} + +static bool trans_MSR_bank(DisasContext *s, arg_MSR_bank *a) +{ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + gen_msr_banked(s, a->r, a->sysm, a->rn); + return true; +} + +static bool trans_MRS_reg(DisasContext *s, arg_MRS_reg *a) +{ + TCGv_i32 tmp; + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (a->r) { + if (IS_USER(s)) { + unallocated_encoding(s); + return true; + } + tmp = load_cpu_field(spsr); + } 
else { + tmp = tcg_temp_new_i32(); + gen_helper_cpsr_read(tmp, cpu_env); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_MSR_reg(DisasContext *s, arg_MSR_reg *a) +{ + TCGv_i32 tmp; + uint32_t mask = msr_mask(s, a->mask, a->r); + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + tmp = load_reg(s, a->rn); + if (gen_set_psr(s, mask, a->r, tmp)) { + unallocated_encoding(s); + } + return true; +} + +static bool trans_MRS_v7m(DisasContext *s, arg_MRS_v7m *a) +{ + TCGv_i32 tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + tmp = tcg_temp_new_i32(); + gen_helper_v7m_mrs(tmp, cpu_env, tcg_constant_i32(a->sysm)); + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_MSR_v7m(DisasContext *s, arg_MSR_v7m *a) +{ + TCGv_i32 addr, reg; + + if (!arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + addr = tcg_constant_i32((a->mask << 10) | a->sysm); + reg = load_reg(s, a->rn); + gen_helper_v7m_msr(cpu_env, addr, reg); + tcg_temp_free_i32(reg); + /* If we wrote to CONTROL, the EL might have changed */ + gen_rebuild_hflags(s, true); + gen_lookup_tb(s); + return true; +} + +static bool trans_BX(DisasContext *s, arg_BX *a) +{ + if (!ENABLE_ARCH_4T) { + return false; + } + gen_bx_excret(s, load_reg(s, a->rm)); + return true; +} + +static bool trans_BXJ(DisasContext *s, arg_BXJ *a) +{ + if (!ENABLE_ARCH_5J || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + /* + * v7A allows BXJ to be trapped via HSTR.TJDBX. We don't waste a + * TBFLAGS bit on a basically-never-happens case, so call a helper + * function to check for the trap and raise the exception if needed + * (passing it the register number for the syndrome value). + * v8A doesn't have this HSTR bit. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V8) && + arm_dc_feature(s, ARM_FEATURE_EL2) && + s->current_el < 2 && s->ns) { + gen_helper_check_bxj_trap(cpu_env, tcg_constant_i32(a->rm)); + } + /* Trivial implementation equivalent to bx. */ + gen_bx(s, load_reg(s, a->rm)); + return true; +} + +static bool trans_BLX_r(DisasContext *s, arg_BLX_r *a) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_5) { + return false; + } + tmp = load_reg(s, a->rm); + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); + gen_bx(s, tmp); + return true; +} + +/* + * BXNS/BLXNS: only exist for v8M with the security extensions, + * and always UNDEF if NonSecure. We don't implement these in + * the user-only mode either (in theory you can use them from + * Secure User mode but they are too tied in to system emulation). 
+ */ +static bool trans_BXNS(DisasContext *s, arg_BXNS *a) +{ + if (!s->v8m_secure || IS_USER_ONLY) { + unallocated_encoding(s); + } else { + gen_bxns(s, a->rm); + } + return true; +} + +static bool trans_BLXNS(DisasContext *s, arg_BLXNS *a) +{ + if (!s->v8m_secure || IS_USER_ONLY) { + unallocated_encoding(s); + } else { + gen_blxns(s, a->rm); + } + return true; +} + +static bool trans_CLZ(DisasContext *s, arg_CLZ *a) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_5) { + return false; + } + tmp = load_reg(s, a->rm); + tcg_gen_clzi_i32(tmp, tmp, 32); + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_ERET(DisasContext *s, arg_ERET *a) +{ + TCGv_i32 tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_V7VE)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + return true; + } + if (s->current_el == 2) { + /* ERET from Hyp uses ELR_Hyp, not LR */ + tmp = load_cpu_field(elr_el[2]); + } else { + tmp = load_reg(s, 14); + } + gen_exception_return(s, tmp); + return true; +} + +static bool trans_HLT(DisasContext *s, arg_HLT *a) +{ + gen_hlt(s, a->imm); + return true; +} + +static bool trans_BKPT(DisasContext *s, arg_BKPT *a) +{ + if (!ENABLE_ARCH_5) { + return false; + } + /* BKPT is OK with ECI set and leaves it untouched */ + s->eci_handled = true; + if (arm_dc_feature(s, ARM_FEATURE_M) && + semihosting_enabled(s->current_el == 0) && + (a->imm == 0xab)) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + } else { + gen_exception_bkpt_insn(s, syn_aa32_bkpt(a->imm, false)); + } + return true; +} + +static bool trans_HVC(DisasContext *s, arg_HVC *a) +{ + if (!ENABLE_ARCH_7 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + } else { + gen_hvc(s, a->imm); + } + return true; +} + +static bool trans_SMC(DisasContext *s, arg_SMC *a) +{ + if (!ENABLE_ARCH_6K || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + } else { + gen_smc(s); + } + return true; +} + +static bool trans_SG(DisasContext *s, arg_SG *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_M) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + /* + * SG (v8M only) + * The bulk of the behaviour for this instruction is implemented + * in v7m_handle_execute_nsc(), which deals with the insn when + * it is executed by a CPU in non-secure state from memory + * which is Secure & NonSecure-Callable. + * Here we only need to handle the remaining cases: + * * in NS memory (including the "security extension not + * implemented" case) : NOP + * * in S memory but CPU already secure (clear IT bits) + * We know that the attribute for the memory this insn is + * in must match the current CPU state, because otherwise + * get_phys_addr_pmsav8 would have generated an exception. + */ + if (s->v8m_secure) { + /* Like the IT insn, we don't need to generate any code */ + s->condexec_cond = 0; + s->condexec_mask = 0; + } + return true; +} + +static bool trans_TT(DisasContext *s, arg_TT *a) +{ + TCGv_i32 addr, tmp; + + if (!arm_dc_feature(s, ARM_FEATURE_M) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + if (a->rd == 13 || a->rd == 15 || a->rn == 15) { + /* We UNDEF for these UNPREDICTABLE cases */ + unallocated_encoding(s); + return true; + } + if (a->A && !s->v8m_secure) { + /* This case is UNDEFINED. 
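 * (TTA/TTAT, the A == 1 forms that query the alternate security state,
 * are only available from Secure state; the (a->A << 1) | a->T value
 * passed to the helper below distinguishes TT, TTT, TTA and TTAT.)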
*/ + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tmp = tcg_temp_new_i32(); + gen_helper_v7m_tt(tmp, cpu_env, addr, tcg_constant_i32((a->A << 1) | a->T)); + tcg_temp_free_i32(addr); + store_reg(s, a->rd, tmp); + return true; +} + +/* + * Load/store register index + */ + +static ISSInfo make_issinfo(DisasContext *s, int rd, bool p, bool w) +{ + ISSInfo ret; + + /* ISS not valid if writeback */ + if (p && !w) { + ret = rd; + if (curr_insn_len(s) == 2) { + ret |= ISSIs16Bit; + } + } else { + ret = ISSInvalid; + } + return ret; +} + +static TCGv_i32 op_addr_rr_pre(DisasContext *s, arg_ldst_rr *a) +{ + TCGv_i32 addr = load_reg(s, a->rn); + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + if (a->p) { + TCGv_i32 ofs = load_reg(s, a->rm); + gen_arm_shift_im(ofs, a->shtype, a->shimm, 0); + if (a->u) { + tcg_gen_add_i32(addr, addr, ofs); + } else { + tcg_gen_sub_i32(addr, addr, ofs); + } + tcg_temp_free_i32(ofs); + } + return addr; +} + +static void op_addr_rr_post(DisasContext *s, arg_ldst_rr *a, + TCGv_i32 addr, int address_offset) +{ + if (!a->p) { + TCGv_i32 ofs = load_reg(s, a->rm); + gen_arm_shift_im(ofs, a->shtype, a->shimm, 0); + if (a->u) { + tcg_gen_add_i32(addr, addr, ofs); + } else { + tcg_gen_sub_i32(addr, addr, ofs); + } + tcg_temp_free_i32(ofs); + } else if (!a->w) { + tcg_temp_free_i32(addr); + return; + } + tcg_gen_addi_i32(addr, addr, address_offset); + store_reg(s, a->rn, addr); +} + +static bool op_load_rr(DisasContext *s, arg_ldst_rr *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w); + TCGv_i32 addr, tmp; + + addr = op_addr_rr_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + + /* + * Perform base writeback before the loaded value to + * ensure correct behavior with overlapping index registers. + */ + op_addr_rr_post(s, a, addr, 0); + store_reg_from_load(s, a->rt, tmp); + return true; +} + +static bool op_store_rr(DisasContext *s, arg_ldst_rr *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite; + TCGv_i32 addr, tmp; + + /* + * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it + * is either UNPREDICTABLE or has defined behaviour + */ + if (s->thumb && a->rn == 15) { + return false; + } + + addr = op_addr_rr_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + tcg_temp_free_i32(tmp); + + op_addr_rr_post(s, a, addr, 0); + return true; +} + +static bool trans_LDRD_rr(DisasContext *s, arg_ldst_rr *a) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_5TE) { + return false; + } + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + addr = op_addr_rr_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, a->rt, tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, a->rt + 1, tmp); + + /* LDRD w/ base writeback is undefined if the registers overlap. 
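 * The -4 passed to op_addr_rr_post() undoes the addr += 4 done between
 * the two loads, so any writeback is computed relative to the address
 * of the first word.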
*/ + op_addr_rr_post(s, a, addr, -4); + return true; +} + +static bool trans_STRD_rr(DisasContext *s, arg_ldst_rr *a) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_5TE) { + return false; + } + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + addr = op_addr_rr_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = load_reg(s, a->rt + 1); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + op_addr_rr_post(s, a, addr, -4); + return true; +} + +/* + * Load/store immediate index + */ + +static TCGv_i32 op_addr_ri_pre(DisasContext *s, arg_ldst_ri *a) +{ + int ofs = a->imm; + + if (!a->u) { + ofs = -ofs; + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * Stackcheck. Here we know 'addr' is the current SP; + * U is set if we're moving SP up, else down. It is + * UNKNOWN whether the limit check triggers when SP starts + * below the limit and ends up above it; we chose to do so. + */ + if (!a->u) { + TCGv_i32 newsp = tcg_temp_new_i32(); + tcg_gen_addi_i32(newsp, cpu_R[13], ofs); + gen_helper_v8m_stackcheck(cpu_env, newsp); + tcg_temp_free_i32(newsp); + } else { + gen_helper_v8m_stackcheck(cpu_env, cpu_R[13]); + } + } + + return add_reg_for_lit(s, a->rn, a->p ? ofs : 0); +} + +static void op_addr_ri_post(DisasContext *s, arg_ldst_ri *a, + TCGv_i32 addr, int address_offset) +{ + if (!a->p) { + if (a->u) { + address_offset += a->imm; + } else { + address_offset -= a->imm; + } + } else if (!a->w) { + tcg_temp_free_i32(addr); + return; + } + tcg_gen_addi_i32(addr, addr, address_offset); + store_reg(s, a->rn, addr); +} + +static bool op_load_ri(DisasContext *s, arg_ldst_ri *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w); + TCGv_i32 addr, tmp; + + addr = op_addr_ri_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + + /* + * Perform base writeback before the loaded value to + * ensure correct behavior with overlapping index registers. + */ + op_addr_ri_post(s, a, addr, 0); + store_reg_from_load(s, a->rt, tmp); + return true; +} + +static bool op_store_ri(DisasContext *s, arg_ldst_ri *a, + MemOp mop, int mem_idx) +{ + ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite; + TCGv_i32 addr, tmp; + + /* + * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it + * is either UNPREDICTABLE or has defined behaviour + */ + if (s->thumb && a->rn == 15) { + return false; + } + + addr = op_addr_ri_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, mop); + disas_set_da_iss(s, mop, issinfo); + tcg_temp_free_i32(tmp); + + op_addr_ri_post(s, a, addr, 0); + return true; +} + +static bool op_ldrd_ri(DisasContext *s, arg_ldst_ri *a, int rt2) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + addr = op_addr_ri_pre(s, a); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, a->rt, tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + store_reg(s, rt2, tmp); + + /* LDRD w/ base writeback is undefined if the registers overlap. 
*/ + op_addr_ri_post(s, a, addr, -4); + return true; +} + +static bool trans_LDRD_ri_a32(DisasContext *s, arg_ldst_ri *a) +{ + if (!ENABLE_ARCH_5TE || (a->rt & 1)) { + return false; + } + return op_ldrd_ri(s, a, a->rt + 1); +} + +static bool trans_LDRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a) +{ + arg_ldst_ri b = { + .u = a->u, .w = a->w, .p = a->p, + .rn = a->rn, .rt = a->rt, .imm = a->imm + }; + return op_ldrd_ri(s, &b, a->rt2); +} + +static bool op_strd_ri(DisasContext *s, arg_ldst_ri *a, int rt2) +{ + int mem_idx = get_mem_index(s); + TCGv_i32 addr, tmp; + + addr = op_addr_ri_pre(s, a); + + tmp = load_reg(s, a->rt); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + tcg_gen_addi_i32(addr, addr, 4); + + tmp = load_reg(s, rt2); + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + op_addr_ri_post(s, a, addr, -4); + return true; +} + +static bool trans_STRD_ri_a32(DisasContext *s, arg_ldst_ri *a) +{ + if (!ENABLE_ARCH_5TE || (a->rt & 1)) { + return false; + } + return op_strd_ri(s, a, a->rt + 1); +} + +static bool trans_STRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a) +{ + arg_ldst_ri b = { + .u = a->u, .w = a->w, .p = a->p, + .rn = a->rn, .rt = a->rt, .imm = a->imm + }; + return op_strd_ri(s, &b, a->rt2); +} + +#define DO_LDST(NAME, WHICH, MEMOP) \ +static bool trans_##NAME##_ri(DisasContext *s, arg_ldst_ri *a) \ +{ \ + return op_##WHICH##_ri(s, a, MEMOP, get_mem_index(s)); \ +} \ +static bool trans_##NAME##T_ri(DisasContext *s, arg_ldst_ri *a) \ +{ \ + return op_##WHICH##_ri(s, a, MEMOP, get_a32_user_mem_index(s)); \ +} \ +static bool trans_##NAME##_rr(DisasContext *s, arg_ldst_rr *a) \ +{ \ + return op_##WHICH##_rr(s, a, MEMOP, get_mem_index(s)); \ +} \ +static bool trans_##NAME##T_rr(DisasContext *s, arg_ldst_rr *a) \ +{ \ + return op_##WHICH##_rr(s, a, MEMOP, get_a32_user_mem_index(s)); \ +} + +DO_LDST(LDR, load, MO_UL) +DO_LDST(LDRB, load, MO_UB) +DO_LDST(LDRH, load, MO_UW) +DO_LDST(LDRSB, load, MO_SB) +DO_LDST(LDRSH, load, MO_SW) + +DO_LDST(STR, store, MO_UL) +DO_LDST(STRB, store, MO_UB) +DO_LDST(STRH, store, MO_UW) + +#undef DO_LDST + +/* + * Synchronization primitives + */ + +static bool op_swp(DisasContext *s, arg_SWP *a, MemOp opc) +{ + TCGv_i32 addr, tmp; + TCGv taddr; + + opc |= s->be_data; + addr = load_reg(s, a->rn); + taddr = gen_aa32_addr(s, addr, opc); + tcg_temp_free_i32(addr); + + tmp = load_reg(s, a->rt2); + tcg_gen_atomic_xchg_i32(tmp, taddr, tmp, get_mem_index(s), opc); + tcg_temp_free(taddr); + + store_reg(s, a->rt, tmp); + return true; +} + +static bool trans_SWP(DisasContext *s, arg_SWP *a) +{ + return op_swp(s, a, MO_UL | MO_ALIGN); +} + +static bool trans_SWPB(DisasContext *s, arg_SWP *a) +{ + return op_swp(s, a, MO_UB); +} + +/* + * Load/Store Exclusive and Load-Acquire/Store-Release + */ + +static bool op_strex(DisasContext *s, arg_STREX *a, MemOp mop, bool rel) +{ + TCGv_i32 addr; + /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */ + bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M); + + /* We UNDEF for these UNPREDICTABLE cases. 
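For reference, the DO_LDST macro above stamps out four nearly identical trans functions per mnemonic; for example DO_LDST(LDRB, load, MO_UB) expands to roughly the following (reformatted for readability, illustrative only):

    static bool trans_LDRB_ri(DisasContext *s, arg_ldst_ri *a)
    {
        return op_load_ri(s, a, MO_UB, get_mem_index(s));
    }
    static bool trans_LDRBT_ri(DisasContext *s, arg_ldst_ri *a)
    {
        return op_load_ri(s, a, MO_UB, get_a32_user_mem_index(s));
    }
    static bool trans_LDRB_rr(DisasContext *s, arg_ldst_rr *a)
    {
        return op_load_rr(s, a, MO_UB, get_mem_index(s));
    }
    static bool trans_LDRBT_rr(DisasContext *s, arg_ldst_rr *a)
    {
        return op_load_rr(s, a, MO_UB, get_a32_user_mem_index(s));
    }
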
*/ + if (a->rd == 15 || a->rn == 15 || a->rt == 15 + || a->rd == a->rn || a->rd == a->rt + || (!v8a && s->thumb && (a->rd == 13 || a->rt == 13)) + || (mop == MO_64 + && (a->rt2 == 15 + || a->rd == a->rt2 + || (!v8a && s->thumb && a->rt2 == 13)))) { + unallocated_encoding(s); + return true; + } + + if (rel) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + } + + addr = tcg_temp_local_new_i32(); + load_reg_var(s, addr, a->rn); + tcg_gen_addi_i32(addr, addr, a->imm); + + gen_store_exclusive(s, a->rd, a->rt, a->rt2, addr, mop); + tcg_temp_free_i32(addr); + return true; +} + +static bool trans_STREX(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_strex(s, a, MO_32, false); +} + +static bool trans_STREXD_a32(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_6K) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_strex(s, a, MO_64, false); +} + +static bool trans_STREXD_t32(DisasContext *s, arg_STREX *a) +{ + return op_strex(s, a, MO_64, false); +} + +static bool trans_STREXB(DisasContext *s, arg_STREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_strex(s, a, MO_8, false); +} + +static bool trans_STREXH(DisasContext *s, arg_STREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_strex(s, a, MO_16, false); +} + +static bool trans_STLEX(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_32, true); +} + +static bool trans_STLEXD_a32(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_strex(s, a, MO_64, true); +} + +static bool trans_STLEXD_t32(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_64, true); +} + +static bool trans_STLEXB(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_8, true); +} + +static bool trans_STLEXH(DisasContext *s, arg_STREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_strex(s, a, MO_16, true); +} + +static bool op_stl(DisasContext *s, arg_STL *a, MemOp mop) +{ + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rn == 15 || a->rt == 15) { + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tmp = load_reg(s, a->rt); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN); + disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel | ISSIsWrite); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(addr); + return true; +} + +static bool trans_STL(DisasContext *s, arg_STL *a) +{ + return op_stl(s, a, MO_UL); +} + +static bool trans_STLB(DisasContext *s, arg_STL *a) +{ + return op_stl(s, a, MO_UB); +} + +static bool trans_STLH(DisasContext *s, arg_STL *a) +{ + return op_stl(s, a, MO_UW); +} + +static bool op_ldrex(DisasContext *s, arg_LDREX *a, MemOp mop, bool acq) +{ + TCGv_i32 addr; + /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */ + bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M); + + /* We UNDEF for these UNPREDICTABLE cases. 
*/ + if (a->rn == 15 || a->rt == 15 + || (!v8a && s->thumb && a->rt == 13) + || (mop == MO_64 + && (a->rt2 == 15 || a->rt == a->rt2 + || (!v8a && s->thumb && a->rt2 == 13)))) { + unallocated_encoding(s); + return true; + } + + addr = tcg_temp_local_new_i32(); + load_reg_var(s, addr, a->rn); + tcg_gen_addi_i32(addr, addr, a->imm); + + gen_load_exclusive(s, a->rt, a->rt2, addr, mop); + tcg_temp_free_i32(addr); + + if (acq) { + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); + } + return true; +} + +static bool trans_LDREX(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_ldrex(s, a, MO_32, false); +} + +static bool trans_LDREXD_a32(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_6K) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_ldrex(s, a, MO_64, false); +} + +static bool trans_LDREXD_t32(DisasContext *s, arg_LDREX *a) +{ + return op_ldrex(s, a, MO_64, false); +} + +static bool trans_LDREXB(DisasContext *s, arg_LDREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_ldrex(s, a, MO_8, false); +} + +static bool trans_LDREXH(DisasContext *s, arg_LDREX *a) +{ + if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { + return false; + } + return op_ldrex(s, a, MO_16, false); +} + +static bool trans_LDAEX(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_32, true); +} + +static bool trans_LDAEXD_a32(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. */ + if (a->rt & 1) { + unallocated_encoding(s); + return true; + } + a->rt2 = a->rt + 1; + return op_ldrex(s, a, MO_64, true); +} + +static bool trans_LDAEXD_t32(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_64, true); +} + +static bool trans_LDAEXB(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_8, true); +} + +static bool trans_LDAEXH(DisasContext *s, arg_LDREX *a) +{ + if (!ENABLE_ARCH_8) { + return false; + } + return op_ldrex(s, a, MO_16, true); +} + +static bool op_lda(DisasContext *s, arg_LDA *a, MemOp mop) +{ + TCGv_i32 addr, tmp; + + if (!ENABLE_ARCH_8) { + return false; + } + /* We UNDEF for these UNPREDICTABLE cases. 
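The barrier placement in these helpers (TCG_BAR_STRL emitted before a store-release, TCG_BAR_LDAQ emitted after a load-acquire) follows the usual acquire/release discipline; a loosely analogous standalone C11 sketch, purely for illustration and not QEMU code:

    #include <stdatomic.h>

    static _Atomic int flag;
    static int payload;

    static void publisher(void)
    {
        payload = 42;                                           /* plain write */
        atomic_store_explicit(&flag, 1, memory_order_release);  /* ordered after it */
    }

    static int consumer(void)
    {
        while (atomic_load_explicit(&flag, memory_order_acquire) == 0) {
            /* spin: the acquire orders the payload read after the flag read */
        }
        return payload;
    }
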
*/ + if (a->rn == 15 || a->rt == 15) { + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN); + disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel); + tcg_temp_free_i32(addr); + + store_reg(s, a->rt, tmp); + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + return true; +} + +static bool trans_LDA(DisasContext *s, arg_LDA *a) +{ + return op_lda(s, a, MO_UL); +} + +static bool trans_LDAB(DisasContext *s, arg_LDA *a) +{ + return op_lda(s, a, MO_UB); +} + +static bool trans_LDAH(DisasContext *s, arg_LDA *a) +{ + return op_lda(s, a, MO_UW); +} + +/* + * Media instructions + */ + +static bool trans_USADA8(DisasContext *s, arg_USADA8 *a) +{ + TCGv_i32 t1, t2; + + if (!ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + gen_helper_usad8(t1, t1, t2); + tcg_temp_free_i32(t2); + if (a->ra != 15) { + t2 = load_reg(s, a->ra); + tcg_gen_add_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + } + store_reg(s, a->rd, t1); + return true; +} + +static bool op_bfx(DisasContext *s, arg_UBFX *a, bool u) +{ + TCGv_i32 tmp; + int width = a->widthm1 + 1; + int shift = a->lsb; + + if (!ENABLE_ARCH_6T2) { + return false; + } + if (shift + width > 32) { + /* UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + tmp = load_reg(s, a->rn); + if (u) { + tcg_gen_extract_i32(tmp, tmp, shift, width); + } else { + tcg_gen_sextract_i32(tmp, tmp, shift, width); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_SBFX(DisasContext *s, arg_SBFX *a) +{ + return op_bfx(s, a, false); +} + +static bool trans_UBFX(DisasContext *s, arg_UBFX *a) +{ + return op_bfx(s, a, true); +} + +static bool trans_BFCI(DisasContext *s, arg_BFCI *a) +{ + TCGv_i32 tmp; + int msb = a->msb, lsb = a->lsb; + int width; + + if (!ENABLE_ARCH_6T2) { + return false; + } + if (msb < lsb) { + /* UNPREDICTABLE; we choose to UNDEF */ + unallocated_encoding(s); + return true; + } + + width = msb + 1 - lsb; + if (a->rn == 15) { + /* BFC */ + tmp = tcg_const_i32(0); + } else { + /* BFI */ + tmp = load_reg(s, a->rn); + } + if (width != 32) { + TCGv_i32 tmp2 = load_reg(s, a->rd); + tcg_gen_deposit_i32(tmp, tmp2, tmp, lsb, width); + tcg_temp_free_i32(tmp2); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_UDF(DisasContext *s, arg_UDF *a) +{ + unallocated_encoding(s); + return true; +} + +/* + * Parallel addition and subtraction + */ + +static bool op_par_addsub(DisasContext *s, arg_rrr *a, + void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0, t1; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + + gen(t0, t0, t1); + + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + return true; +} + +static bool op_par_addsub_ge(DisasContext *s, arg_rrr *a, + void (*gen)(TCGv_i32, TCGv_i32, + TCGv_i32, TCGv_ptr)) +{ + TCGv_i32 t0, t1; + TCGv_ptr ge; + + if (s->thumb + ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t0 = load_reg(s, a->rn); + t1 = load_reg(s, a->rm); + + ge = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ge, cpu_env, offsetof(CPUARMState, GE)); + gen(t0, t0, t1, ge); + + tcg_temp_free_ptr(ge); + tcg_temp_free_i32(t1); + store_reg(s, a->rd, t0); + return true; +} + +#define DO_PAR_ADDSUB(NAME, helper) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ +{ \ + return op_par_addsub(s, a, helper); \ +} + +#define DO_PAR_ADDSUB_GE(NAME, helper) \ +static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ +{ \ + return op_par_addsub_ge(s, a, helper); \ +} + +DO_PAR_ADDSUB_GE(SADD16, gen_helper_sadd16) +DO_PAR_ADDSUB_GE(SASX, gen_helper_saddsubx) +DO_PAR_ADDSUB_GE(SSAX, gen_helper_ssubaddx) +DO_PAR_ADDSUB_GE(SSUB16, gen_helper_ssub16) +DO_PAR_ADDSUB_GE(SADD8, gen_helper_sadd8) +DO_PAR_ADDSUB_GE(SSUB8, gen_helper_ssub8) + +DO_PAR_ADDSUB_GE(UADD16, gen_helper_uadd16) +DO_PAR_ADDSUB_GE(UASX, gen_helper_uaddsubx) +DO_PAR_ADDSUB_GE(USAX, gen_helper_usubaddx) +DO_PAR_ADDSUB_GE(USUB16, gen_helper_usub16) +DO_PAR_ADDSUB_GE(UADD8, gen_helper_uadd8) +DO_PAR_ADDSUB_GE(USUB8, gen_helper_usub8) + +DO_PAR_ADDSUB(QADD16, gen_helper_qadd16) +DO_PAR_ADDSUB(QASX, gen_helper_qaddsubx) +DO_PAR_ADDSUB(QSAX, gen_helper_qsubaddx) +DO_PAR_ADDSUB(QSUB16, gen_helper_qsub16) +DO_PAR_ADDSUB(QADD8, gen_helper_qadd8) +DO_PAR_ADDSUB(QSUB8, gen_helper_qsub8) + +DO_PAR_ADDSUB(UQADD16, gen_helper_uqadd16) +DO_PAR_ADDSUB(UQASX, gen_helper_uqaddsubx) +DO_PAR_ADDSUB(UQSAX, gen_helper_uqsubaddx) +DO_PAR_ADDSUB(UQSUB16, gen_helper_uqsub16) +DO_PAR_ADDSUB(UQADD8, gen_helper_uqadd8) +DO_PAR_ADDSUB(UQSUB8, gen_helper_uqsub8) + +DO_PAR_ADDSUB(SHADD16, gen_helper_shadd16) +DO_PAR_ADDSUB(SHASX, gen_helper_shaddsubx) +DO_PAR_ADDSUB(SHSAX, gen_helper_shsubaddx) +DO_PAR_ADDSUB(SHSUB16, gen_helper_shsub16) +DO_PAR_ADDSUB(SHADD8, gen_helper_shadd8) +DO_PAR_ADDSUB(SHSUB8, gen_helper_shsub8) + +DO_PAR_ADDSUB(UHADD16, gen_helper_uhadd16) +DO_PAR_ADDSUB(UHASX, gen_helper_uhaddsubx) +DO_PAR_ADDSUB(UHSAX, gen_helper_uhsubaddx) +DO_PAR_ADDSUB(UHSUB16, gen_helper_uhsub16) +DO_PAR_ADDSUB(UHADD8, gen_helper_uhadd8) +DO_PAR_ADDSUB(UHSUB8, gen_helper_uhsub8) + +#undef DO_PAR_ADDSUB +#undef DO_PAR_ADDSUB_GE + +/* + * Packing, unpacking, saturation, and reversal + */ + +static bool trans_PKH(DisasContext *s, arg_PKH *a) +{ + TCGv_i32 tn, tm; + int shift = a->imm; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + tn = load_reg(s, a->rn); + tm = load_reg(s, a->rm); + if (a->tb) { + /* PKHTB */ + if (shift == 0) { + shift = 31; + } + tcg_gen_sari_i32(tm, tm, shift); + tcg_gen_deposit_i32(tn, tn, tm, 0, 16); + } else { + /* PKHBT */ + tcg_gen_shli_i32(tm, tm, shift); + tcg_gen_deposit_i32(tn, tm, tn, 0, 16); + } + tcg_temp_free_i32(tm); + store_reg(s, a->rd, tn); + return true; +} + +static bool op_sat(DisasContext *s, arg_sat *a, + void (*gen)(TCGv_i32, TCGv_env, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 tmp; + int shift = a->imm; + + if (!ENABLE_ARCH_6) { + return false; + } + + tmp = load_reg(s, a->rn); + if (a->sh) { + tcg_gen_sari_i32(tmp, tmp, shift ? 
shift : 31); + } else { + tcg_gen_shli_i32(tmp, tmp, shift); + } + + gen(tmp, cpu_env, tmp, tcg_constant_i32(a->satimm)); + + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_SSAT(DisasContext *s, arg_sat *a) +{ + return op_sat(s, a, gen_helper_ssat); +} + +static bool trans_USAT(DisasContext *s, arg_sat *a) +{ + return op_sat(s, a, gen_helper_usat); +} + +static bool trans_SSAT16(DisasContext *s, arg_sat *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_sat(s, a, gen_helper_ssat16); +} + +static bool trans_USAT16(DisasContext *s, arg_sat *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_sat(s, a, gen_helper_usat16); +} + +static bool op_xta(DisasContext *s, arg_rrr_rot *a, + void (*gen_extract)(TCGv_i32, TCGv_i32), + void (*gen_add)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 tmp; + + if (!ENABLE_ARCH_6) { + return false; + } + + tmp = load_reg(s, a->rm); + /* + * TODO: In many cases we could do a shift instead of a rotate. + * Combined with a simple extend, that becomes an extract. + */ + tcg_gen_rotri_i32(tmp, tmp, a->rot * 8); + gen_extract(tmp, tmp); + + if (a->rn != 15) { + TCGv_i32 tmp2 = load_reg(s, a->rn); + gen_add(tmp, tmp, tmp2); + tcg_temp_free_i32(tmp2); + } + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_SXTAB(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext8s_i32, tcg_gen_add_i32); +} + +static bool trans_SXTAH(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext16s_i32, tcg_gen_add_i32); +} + +static bool trans_SXTAB16(DisasContext *s, arg_rrr_rot *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_xta(s, a, gen_helper_sxtb16, gen_add16); +} + +static bool trans_UXTAB(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext8u_i32, tcg_gen_add_i32); +} + +static bool trans_UXTAH(DisasContext *s, arg_rrr_rot *a) +{ + return op_xta(s, a, tcg_gen_ext16u_i32, tcg_gen_add_i32); +} + +static bool trans_UXTAB16(DisasContext *s, arg_rrr_rot *a) +{ + if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { + return false; + } + return op_xta(s, a, gen_helper_uxtb16, gen_add16); +} + +static bool trans_SEL(DisasContext *s, arg_rrr *a) +{ + TCGv_i32 t1, t2, t3; + + if (s->thumb + ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + t3 = tcg_temp_new_i32(); + tcg_gen_ld_i32(t3, cpu_env, offsetof(CPUARMState, GE)); + gen_helper_sel_flags(t1, t3, t1, t2); + tcg_temp_free_i32(t3); + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool op_rr(DisasContext *s, arg_rr *a, + void (*gen)(TCGv_i32, TCGv_i32)) +{ + TCGv_i32 tmp; + + tmp = load_reg(s, a->rm); + gen(tmp, tmp); + store_reg(s, a->rd, tmp); + return true; +} + +static bool trans_REV(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_rr(s, a, tcg_gen_bswap32_i32); +} + +static bool trans_REV16(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_rr(s, a, gen_rev16); +} + +static bool trans_REVSH(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + return op_rr(s, a, gen_revsh); +} + +static bool trans_RBIT(DisasContext *s, arg_rr *a) +{ + if (!ENABLE_ARCH_6T2) { + return false; + } + return op_rr(s, a, gen_helper_rbit); +} + +/* + * Signed multiply, signed and unsigned divide + */ + +static bool op_smlad(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) +{ + TCGv_i32 t1, t2; + + if (!ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + if (m_swap) { + gen_swap_half(t2, t2); + } + gen_smul_dual(t1, t2); + + if (sub) { + /* + * This subtraction cannot overflow, so we can do a simple + * 32-bit subtraction and then a possible 32-bit saturating + * addition of Ra. + */ + tcg_gen_sub_i32(t1, t1, t2); + tcg_temp_free_i32(t2); + + if (a->ra != 15) { + t2 = load_reg(s, a->ra); + gen_helper_add_setq(t1, cpu_env, t1, t2); + tcg_temp_free_i32(t2); + } + } else if (a->ra == 15) { + /* Single saturation-checking addition */ + gen_helper_add_setq(t1, cpu_env, t1, t2); + tcg_temp_free_i32(t2); + } else { + /* + * We need to add the products and Ra together and then + * determine whether the final result overflowed. Doing + * this as two separate add-and-check-overflow steps incorrectly + * sets Q for cases like (-32768 * -32768) + (-32768 * -32768) + -1. + * Do all the arithmetic at 64-bits and then check for overflow. + */ + TCGv_i64 p64, q64; + TCGv_i32 t3, qf, one; + + p64 = tcg_temp_new_i64(); + q64 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(p64, t1); + tcg_gen_ext_i32_i64(q64, t2); + tcg_gen_add_i64(p64, p64, q64); + load_reg_var(s, t2, a->ra); + tcg_gen_ext_i32_i64(q64, t2); + tcg_gen_add_i64(p64, p64, q64); + tcg_temp_free_i64(q64); + + tcg_gen_extr_i64_i32(t1, t2, p64); + tcg_temp_free_i64(p64); + /* + * t1 is the low half of the result which goes into Rd. + * We have overflow and must set Q if the high half (t2) + * is different from the sign-extension of t1. 
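A standalone C sketch of the overflow test just described (hypothetical helper, not the TCG code): the whole accumulation is done at 64 bits and Q is set when the high word differs from the sign-extension of the low word, i.e. when the result does not fit in 32 bits:

    #include <stdint.h>
    #include <stdbool.h>

    static int32_t smlad_model(int16_t nl, int16_t nh, int16_t ml, int16_t mh,
                               int32_t ra, bool *q)
    {
        int64_t  sum = (int64_t)nl * ml + (int64_t)nh * mh + ra;
        uint32_t lo  = (uint32_t)sum;                    /* low half -> Rd */
        uint32_t hi  = (uint32_t)((uint64_t)sum >> 32);  /* high half */
        uint32_t ext = (lo & 0x80000000u) ? 0xffffffffu : 0;

        if (hi != ext) {        /* high half != sign-extension of low half */
            *q = true;          /* result does not fit: set the sticky Q flag */
        }
        return (int32_t)lo;
    }
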
+ */ + t3 = tcg_temp_new_i32(); + tcg_gen_sari_i32(t3, t1, 31); + qf = load_cpu_field(QF); + one = tcg_constant_i32(1); + tcg_gen_movcond_i32(TCG_COND_NE, qf, t2, t3, one, qf); + store_cpu_field(qf, QF); + tcg_temp_free_i32(t3); + tcg_temp_free_i32(t2); + } + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_SMLAD(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, false, false); +} + +static bool trans_SMLADX(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, true, false); +} + +static bool trans_SMLSD(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, false, true); +} + +static bool trans_SMLSDX(DisasContext *s, arg_rrrr *a) +{ + return op_smlad(s, a, true, true); +} + +static bool op_smlald(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) +{ + TCGv_i32 t1, t2; + TCGv_i64 l1, l2; + + if (!ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + if (m_swap) { + gen_swap_half(t2, t2); + } + gen_smul_dual(t1, t2); + + l1 = tcg_temp_new_i64(); + l2 = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(l1, t1); + tcg_gen_ext_i32_i64(l2, t2); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t2); + + if (sub) { + tcg_gen_sub_i64(l1, l1, l2); + } else { + tcg_gen_add_i64(l1, l1, l2); + } + tcg_temp_free_i64(l2); + + gen_addq(s, l1, a->ra, a->rd); + gen_storeq_reg(s, a->ra, a->rd, l1); + tcg_temp_free_i64(l1); + return true; +} + +static bool trans_SMLALD(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, false, false); +} + +static bool trans_SMLALDX(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, true, false); +} + +static bool trans_SMLSLD(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, false, true); +} + +static bool trans_SMLSLDX(DisasContext *s, arg_rrrr *a) +{ + return op_smlald(s, a, true, true); +} + +static bool op_smmla(DisasContext *s, arg_rrrr *a, bool round, bool sub) +{ + TCGv_i32 t1, t2; + + if (s->thumb + ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) + : !ENABLE_ARCH_6) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + tcg_gen_muls2_i32(t2, t1, t1, t2); + + if (a->ra != 15) { + TCGv_i32 t3 = load_reg(s, a->ra); + if (sub) { + /* + * For SMMLS, we need a 64-bit subtract. Borrow caused by + * a non-zero multiplicand lowpart, and the correct result + * lowpart for rounding. + */ + tcg_gen_sub2_i32(t2, t1, tcg_constant_i32(0), t3, t2, t1); + } else { + tcg_gen_add_i32(t1, t1, t3); + } + tcg_temp_free_i32(t3); + } + if (round) { + /* + * Adding 0x80000000 to the 64-bit quantity means that we have + * carry in to the high word when the low word has the msb set. + */ + tcg_gen_shri_i32(t2, t2, 31); + tcg_gen_add_i32(t1, t1, t2); + } + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_SMMLA(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, false, false); +} + +static bool trans_SMMLAR(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, true, false); +} + +static bool trans_SMMLS(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, false, true); +} + +static bool trans_SMMLSR(DisasContext *s, arg_rrrr *a) +{ + return op_smmla(s, a, true, true); +} + +static bool op_div(DisasContext *s, arg_rrr *a, bool u) +{ + TCGv_i32 t1, t2; + + if (s->thumb + ? 
!dc_isar_feature(aa32_thumb_div, s) + : !dc_isar_feature(aa32_arm_div, s)) { + return false; + } + + t1 = load_reg(s, a->rn); + t2 = load_reg(s, a->rm); + if (u) { + gen_helper_udiv(t1, cpu_env, t1, t2); + } else { + gen_helper_sdiv(t1, cpu_env, t1, t2); + } + tcg_temp_free_i32(t2); + store_reg(s, a->rd, t1); + return true; +} + +static bool trans_SDIV(DisasContext *s, arg_rrr *a) +{ + return op_div(s, a, false); +} + +static bool trans_UDIV(DisasContext *s, arg_rrr *a) +{ + return op_div(s, a, true); +} + +/* + * Block data transfer + */ + +static TCGv_i32 op_addr_block_pre(DisasContext *s, arg_ldst_block *a, int n) +{ + TCGv_i32 addr = load_reg(s, a->rn); + + if (a->b) { + if (a->i) { + /* pre increment */ + tcg_gen_addi_i32(addr, addr, 4); + } else { + /* pre decrement */ + tcg_gen_addi_i32(addr, addr, -(n * 4)); + } + } else if (!a->i && n != 1) { + /* post decrement */ + tcg_gen_addi_i32(addr, addr, -((n - 1) * 4)); + } + + if (s->v8m_stackcheck && a->rn == 13 && a->w) { + /* + * If the writeback is incrementing SP rather than + * decrementing it, and the initial SP is below the + * stack limit but the final written-back SP would + * be above, then we must not perform any memory + * accesses, but it is IMPDEF whether we generate + * an exception. We choose to do so in this case. + * At this point 'addr' is the lowest address, so + * either the original SP (if incrementing) or our + * final SP (if decrementing), so that's what we check. + */ + gen_helper_v8m_stackcheck(cpu_env, addr); + } + + return addr; +} + +static void op_addr_block_post(DisasContext *s, arg_ldst_block *a, + TCGv_i32 addr, int n) +{ + if (a->w) { + /* write back */ + if (!a->b) { + if (a->i) { + /* post increment */ + tcg_gen_addi_i32(addr, addr, 4); + } else { + /* post decrement */ + tcg_gen_addi_i32(addr, addr, -(n * 4)); + } + } else if (!a->i && n != 1) { + /* pre decrement */ + tcg_gen_addi_i32(addr, addr, -((n - 1) * 4)); + } + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } +} + +static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n) +{ + int i, j, n, list, mem_idx; + bool user = a->u; + TCGv_i32 addr, tmp; + + if (user) { + /* STM (user) */ + if (IS_USER(s)) { + /* Only usable in supervisor mode. */ + unallocated_encoding(s); + return true; + } + } + + list = a->list; + n = ctpop16(list); + if (n < min_n || a->rn == 15) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + addr = op_addr_block_pre(s, a, n); + mem_idx = get_mem_index(s); + + for (i = j = 0; i < 16; i++) { + if (!(list & (1 << i))) { + continue; + } + + if (user && i != 15) { + tmp = tcg_temp_new_i32(); + gen_helper_get_user_reg(tmp, cpu_env, tcg_constant_i32(i)); + } else { + tmp = load_reg(s, i); + } + gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + tcg_temp_free_i32(tmp); + + /* No need to add after the last transfer. */ + if (++j != n) { + tcg_gen_addi_i32(addr, addr, 4); + } + } + + op_addr_block_post(s, a, addr, n); + clear_eci_state(s); + return true; +} + +static bool trans_STM(DisasContext *s, arg_ldst_block *a) +{ + /* BitCount(list) < 1 is UNPREDICTABLE */ + return op_stm(s, a, 1); +} + +static bool trans_STM_t32(DisasContext *s, arg_ldst_block *a) +{ + /* Writeback register in register list is UNPREDICTABLE for T32. 
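The address bookkeeping in op_addr_block_pre()/op_addr_block_post() above reduces to the classic LDM/STM start-address and writeback formulas; a minimal standalone sketch with hypothetical names (p is the "before" bit, u the "increment" bit, n the number of registers in the list):

    #include <stdint.h>
    #include <stdbool.h>

    static void block_addresses(uint32_t rn, bool p, bool u, int n,
                                uint32_t *first, uint32_t *writeback)
    {
        if (u) {                                /* increment: IA / IB */
            *first     = p ? rn + 4 : rn;
            *writeback = rn + 4 * n;
        } else {                                /* decrement: DA / DB */
            *first     = p ? rn - 4 * n : rn - 4 * n + 4;
            *writeback = rn - 4 * n;
        }
    }
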
*/ + if (a->w && (a->list & (1 << a->rn))) { + unallocated_encoding(s); + return true; + } + /* BitCount(list) < 2 is UNPREDICTABLE */ + return op_stm(s, a, 2); +} + +static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n) +{ + int i, j, n, list, mem_idx; + bool loaded_base; + bool user = a->u; + bool exc_return = false; + TCGv_i32 addr, tmp, loaded_var; + + if (user) { + /* LDM (user), LDM (exception return) */ + if (IS_USER(s)) { + /* Only usable in supervisor mode. */ + unallocated_encoding(s); + return true; + } + if (extract32(a->list, 15, 1)) { + exc_return = true; + user = false; + } else { + /* LDM (user) does not allow writeback. */ + if (a->w) { + unallocated_encoding(s); + return true; + } + } + } + + list = a->list; + n = ctpop16(list); + if (n < min_n || a->rn == 15) { + unallocated_encoding(s); + return true; + } + + s->eci_handled = true; + + addr = op_addr_block_pre(s, a, n); + mem_idx = get_mem_index(s); + loaded_base = false; + loaded_var = NULL; + + for (i = j = 0; i < 16; i++) { + if (!(list & (1 << i))) { + continue; + } + + tmp = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); + if (user) { + gen_helper_set_user_reg(cpu_env, tcg_constant_i32(i), tmp); + tcg_temp_free_i32(tmp); + } else if (i == a->rn) { + loaded_var = tmp; + loaded_base = true; + } else if (i == 15 && exc_return) { + store_pc_exc_ret(s, tmp); + } else { + store_reg_from_load(s, i, tmp); + } + + /* No need to add after the last transfer. */ + if (++j != n) { + tcg_gen_addi_i32(addr, addr, 4); + } + } + + op_addr_block_post(s, a, addr, n); + + if (loaded_base) { + /* Note that we reject base == pc above. */ + store_reg(s, a->rn, loaded_var); + } + + if (exc_return) { + /* Restore CPSR from SPSR. */ + tmp = load_cpu_field(spsr); + if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + gen_io_start(); + } + gen_helper_cpsr_write_eret(cpu_env, tmp); + tcg_temp_free_i32(tmp); + /* Must exit loop to check un-masked IRQs */ + s->base.is_jmp = DISAS_EXIT; + } + clear_eci_state(s); + return true; +} + +static bool trans_LDM_a32(DisasContext *s, arg_ldst_block *a) +{ + /* + * Writeback register in register list is UNPREDICTABLE + * for ArchVersion() >= 7. Prior to v7, A32 would write + * an UNKNOWN value to the base register. + */ + if (ENABLE_ARCH_7 && a->w && (a->list & (1 << a->rn))) { + unallocated_encoding(s); + return true; + } + /* BitCount(list) < 1 is UNPREDICTABLE */ + return do_ldm(s, a, 1); +} + +static bool trans_LDM_t32(DisasContext *s, arg_ldst_block *a) +{ + /* Writeback register in register list is UNPREDICTABLE for T32. */ + if (a->w && (a->list & (1 << a->rn))) { + unallocated_encoding(s); + return true; + } + /* BitCount(list) < 2 is UNPREDICTABLE */ + return do_ldm(s, a, 2); +} + +static bool trans_LDM_t16(DisasContext *s, arg_ldst_block *a) +{ + /* Writeback is conditional on the base register not being loaded. 
*/ + a->w = !(a->list & (1 << a->rn)); + /* BitCount(list) < 1 is UNPREDICTABLE */ + return do_ldm(s, a, 1); +} + +static bool trans_CLRM(DisasContext *s, arg_CLRM *a) +{ + int i; + TCGv_i32 zero; + + if (!dc_isar_feature(aa32_m_sec_state, s)) { + return false; + } + + if (extract32(a->list, 13, 1)) { + return false; + } + + if (!a->list) { + /* UNPREDICTABLE; we choose to UNDEF */ + return false; + } + + s->eci_handled = true; + + zero = tcg_constant_i32(0); + for (i = 0; i < 15; i++) { + if (extract32(a->list, i, 1)) { + /* Clear R[i] */ + tcg_gen_mov_i32(cpu_R[i], zero); + } + } + if (extract32(a->list, 15, 1)) { + /* + * Clear APSR (by calling the MSR helper with the same argument + * as for "MSR APSR_nzcvqg, Rn": mask = 0b1100, SYSM=0) + */ + gen_helper_v7m_msr(cpu_env, tcg_constant_i32(0xc00), zero); + } + clear_eci_state(s); + return true; +} + +/* + * Branch, branch with link + */ + +static bool trans_B(DisasContext *s, arg_i *a) +{ + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_B_cond_thumb(DisasContext *s, arg_ci *a) +{ + /* This has cond from encoding, required to be outside IT block. */ + if (a->cond >= 0xe) { + return false; + } + if (s->condexec_mask) { + unallocated_encoding(s); + return true; + } + arm_skip_unless(s, a->cond); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_BL(DisasContext *s, arg_i *a) +{ + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_BLX_i(DisasContext *s, arg_BLX_i *a) +{ + /* + * BLX would be useless on M-profile; the encoding space + * is used for other insns from v8.1M onward, and UNDEFs before that. + */ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + + /* For A32, ARM_FEATURE_V5 is checked near the start of the uncond block. */ + if (s->thumb && (a->imm & 2)) { + return false; + } + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); + store_cpu_field_constant(!s->thumb, thumb); + /* This jump is computed from an aligned PC: subtract off the low bits. */ + gen_jmp(s, jmp_diff(s, a->imm - (s->pc_curr & 3))); + return true; +} + +static bool trans_BL_BLX_prefix(DisasContext *s, arg_BL_BLX_prefix *a) +{ + assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); + gen_pc_plus_diff(s, cpu_R[14], jmp_diff(s, a->imm << 12)); + return true; +} + +static bool trans_BL_suffix(DisasContext *s, arg_BL_suffix *a) +{ + TCGv_i32 tmp = tcg_temp_new_i32(); + + assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); + tcg_gen_addi_i32(tmp, cpu_R[14], (a->imm << 1) | 1); + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1); + gen_bx(s, tmp); + return true; +} + +static bool trans_BLX_suffix(DisasContext *s, arg_BLX_suffix *a) +{ + TCGv_i32 tmp; + + assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); + if (!ENABLE_ARCH_5) { + return false; + } + tmp = tcg_temp_new_i32(); + tcg_gen_addi_i32(tmp, cpu_R[14], a->imm << 1); + tcg_gen_andi_i32(tmp, tmp, 0xfffffffc); + gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1); + gen_bx(s, tmp); + return true; +} + +static bool trans_BF(DisasContext *s, arg_BF *a) +{ + /* + * M-profile branch future insns. The architecture permits an + * implementation to implement these as NOPs (equivalent to + * discarding the LO_BRANCH_INFO cache immediately), and we + * take that IMPDEF option because for QEMU a "real" implementation + * would be complicated and wouldn't execute any faster. 
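For the split Thumb-1 BL encoding handled by trans_BL_BLX_prefix() and trans_BL_suffix() above, the target composition can be summarised by this standalone sketch (illustrative only; it models BL, ignoring the extra alignment BLX applies and the Thumb bit kept in the branch target):

    #include <stdint.h>

    /* Prefix half: LR <- address-of-prefix + 4 + (sign-extended imm11 << 12) */
    static uint32_t bl_prefix(uint32_t prefix_addr, int32_t imm11_hi)
    {
        return prefix_addr + 4 + ((uint32_t)imm11_hi << 12);
    }

    /* Suffix half: branch to LR + (imm11 << 1), leave return address | 1 in LR */
    static uint32_t bl_suffix(uint32_t suffix_addr, uint32_t lr,
                              uint32_t imm11_lo, uint32_t *new_lr)
    {
        *new_lr = (suffix_addr + 2) | 1;   /* next insn, Thumb bit set */
        return lr + (imm11_lo << 1);
    }
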
+ */ + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->boff == 0) { + /* SEE "Related encodings" (loop insns) */ + return false; + } + /* Handle as NOP */ + return true; +} + +static bool trans_DLS(DisasContext *s, arg_DLS *a) +{ + /* M-profile low-overhead loop start */ + TCGv_i32 tmp; + + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->rn == 13 || a->rn == 15) { + /* + * For DLSTP rn == 15 is a related encoding (LCTP); the + * other cases caught by this condition are all + * CONSTRAINED UNPREDICTABLE: we choose to UNDEF + */ + return false; + } + + if (a->size != 4) { + /* DLSTP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!vfp_access_check(s)) { + return true; + } + } + + /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */ + tmp = load_reg(s, a->rn); + store_reg(s, 14, tmp); + if (a->size != 4) { + /* DLSTP: set FPSCR.LTPSIZE */ + store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize); + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } + return true; +} + +static bool trans_WLS(DisasContext *s, arg_WLS *a) +{ + /* M-profile low-overhead while-loop start */ + TCGv_i32 tmp; + DisasLabel nextlabel; + + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->rn == 13 || a->rn == 15) { + /* + * For WLSTP rn == 15 is a related encoding (LE); the + * other cases caught by this condition are all + * CONSTRAINED UNPREDICTABLE: we choose to UNDEF + */ + return false; + } + if (s->condexec_mask) { + /* + * WLS in an IT block is CONSTRAINED UNPREDICTABLE; + * we choose to UNDEF, because otherwise our use of + * gen_goto_tb(1) would clash with the use of TB exit 1 + * in the dc->condjmp condition-failed codepath in + * arm_tr_tb_stop() and we'd get an assertion. + */ + return false; + } + if (a->size != 4) { + /* WLSTP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + /* + * We need to check that the FPU is enabled here, but mustn't + * call vfp_access_check() to do that because we don't want to + * do the lazy state preservation in the "loop count is zero" case. + * Do the check-and-raise-exception by hand. + */ + if (s->fp_excp_el) { + gen_exception_insn_el(s, 0, EXCP_NOCP, + syn_uncategorized(), s->fp_excp_el); + return true; + } + } + + nextlabel = gen_disas_label(s); + tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel.label); + tmp = load_reg(s, a->rn); + store_reg(s, 14, tmp); + if (a->size != 4) { + /* + * WLSTP: set FPSCR.LTPSIZE. This requires that we do the + * lazy state preservation, new FP context creation, etc, + * that vfp_access_check() does. We know that the actual + * access check will succeed (ie it won't generate code that + * throws an exception) because we did that check by hand earlier. + */ + bool ok = vfp_access_check(s); + assert(ok); + store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize); + /* + * LTPSIZE updated, but MVE_NO_PRED will always be the same thing (0) + * when we take this upcoming exit from this TB, so gen_jmp_tb() is OK. + */ + } + gen_jmp_tb(s, curr_insn_len(s), 1); + + set_disas_label(s, nextlabel); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +static bool trans_LE(DisasContext *s, arg_LE *a) +{ + /* + * M-profile low-overhead loop end. 
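A minimal model of the loop-start behaviour implemented above: DLS arms LR with the iteration count, while WLS additionally skips the loop body when the count is zero, which is what the branch to nextlabel does in the generated code. The LTPSIZE update performed by DLSTP/WLSTP is ignored and the names are hypothetical:

    #include <stdint.h>
    #include <stdbool.h>

    static bool loop_start(uint32_t rn, uint32_t *lr, bool while_form)
    {
        if (while_form && rn == 0) {
            return false;          /* WLS: branch past the loop body */
        }
        *lr = rn;                  /* iteration count lives in LR */
        return true;               /* fall into the loop body */
    }
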
The architecture permits an + * implementation to discard the LO_BRANCH_INFO cache at any time, + * and we take the IMPDEF option to never set it in the first place + * (equivalent to always discarding it immediately), because for QEMU + * a "real" implementation would be complicated and wouldn't execute + * any faster. + */ + TCGv_i32 tmp; + DisasLabel loopend; + bool fpu_active; + + if (!dc_isar_feature(aa32_lob, s)) { + return false; + } + if (a->f && a->tp) { + return false; + } + if (s->condexec_mask) { + /* + * LE in an IT block is CONSTRAINED UNPREDICTABLE; + * we choose to UNDEF, because otherwise our use of + * gen_goto_tb(1) would clash with the use of TB exit 1 + * in the dc->condjmp condition-failed codepath in + * arm_tr_tb_stop() and we'd get an assertion. + */ + return false; + } + if (a->tp) { + /* LETP */ + if (!dc_isar_feature(aa32_mve, s)) { + return false; + } + if (!vfp_access_check(s)) { + s->eci_handled = true; + return true; + } + } + + /* LE/LETP is OK with ECI set and leaves it untouched */ + s->eci_handled = true; + + /* + * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE + * UsageFault exception for the LE insn in that case. Note that we + * are not directly checking FPSCR.LTPSIZE but instead check the + * pseudocode LTPSIZE() function, which returns 4 if the FPU is + * not currently active (ie ActiveFPState() returns false). We + * can identify not-active purely from our TB state flags, as the + * FPU is active only if: + * the FPU is enabled + * AND lazy state preservation is not active + * AND we do not need a new fp context (this is the ASPEN/FPCA check) + * + * Usually we don't need to care about this distinction between + * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check() + * will either take an exception or clear the conditions that make + * the FPU not active. But LE is an unusual case of a non-FP insn + * that looks at LTPSIZE. + */ + fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed; + + if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) { + /* Need to do a runtime check for LTPSIZE != 4 */ + DisasLabel skipexc = gen_disas_label(s); + tmp = load_cpu_field(v7m.ltpsize); + tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc.label); + tcg_temp_free_i32(tmp); + gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); + set_disas_label(s, skipexc); + } + + if (a->f) { + /* Loop-forever: just jump back to the loop start */ + gen_jmp(s, jmp_diff(s, -a->imm)); + return true; + } + + /* + * Not loop-forever. If LR <= loop-decrement-value this is the last loop. + * For LE, we know at this point that LTPSIZE must be 4 and the + * loop decrement value is 1. For LETP we need to calculate the decrement + * value from LTPSIZE. + */ + loopend = gen_disas_label(s); + if (!a->tp) { + tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend.label); + tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1); + } else { + /* + * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local + * so that decr stays live after the brcondi. 
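The generated bottom-of-loop test can be summarised by this standalone sketch (hypothetical helper, mirroring the generated code rather than the architectural pseudocode): LE always steps LR by 1, LETP by 1 << (4 - LTPSIZE), and a tail-predicated exit resets LTPSIZE to 4:

    #include <stdint.h>
    #include <stdbool.h>

    static bool loop_end(uint32_t *lr, unsigned *ltpsize, bool tp)
    {
        uint32_t decr = tp ? 1u << (4 - *ltpsize) : 1;   /* LE steps by 1 */

        if (*lr <= decr) {
            if (tp) {
                *ltpsize = 4;   /* tail-predicated exits reset FPSCR.LTPSIZE */
            }
            return false;       /* finished: continue after the LE insn */
        }
        *lr -= decr;
        return true;            /* branch back to the loop start */
    }
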
+ */ + TCGv_i32 decr = tcg_temp_local_new_i32(); + TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize); + tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize); + tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr); + tcg_temp_free_i32(ltpsize); + + tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend.label); + + tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr); + tcg_temp_free_i32(decr); + } + /* Jump back to the loop start */ + gen_jmp(s, jmp_diff(s, -a->imm)); + + set_disas_label(s, loopend); + if (a->tp) { + /* Exits from tail-pred loops must reset LTPSIZE to 4 */ + store_cpu_field(tcg_constant_i32(4), v7m.ltpsize); + } + /* End TB, continuing to following insn */ + gen_jmp_tb(s, curr_insn_len(s), 1); + return true; +} + +static bool trans_LCTP(DisasContext *s, arg_LCTP *a) +{ + /* + * M-profile Loop Clear with Tail Predication. Since our implementation + * doesn't cache branch information, all we need to do is reset + * FPSCR.LTPSIZE to 4. + */ + + if (!dc_isar_feature(aa32_lob, s) || + !dc_isar_feature(aa32_mve, s)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + store_cpu_field_constant(4, v7m.ltpsize); + return true; +} + +static bool trans_VCTP(DisasContext *s, arg_VCTP *a) +{ + /* + * M-profile Create Vector Tail Predicate. This insn is itself + * predicated and is subject to beatwise execution. + */ + TCGv_i32 rn_shifted, masklen; + + if (!dc_isar_feature(aa32_mve, s) || a->rn == 13 || a->rn == 15) { + return false; + } + + if (!mve_eci_check(s) || !vfp_access_check(s)) { + return true; + } + + /* + * We pre-calculate the mask length here to avoid having + * to have multiple helpers specialized for size. + * We pass the helper "rn <= (1 << (4 - size)) ? (rn << size) : 16". + */ + rn_shifted = tcg_temp_new_i32(); + masklen = load_reg(s, a->rn); + tcg_gen_shli_i32(rn_shifted, masklen, a->size); + tcg_gen_movcond_i32(TCG_COND_LEU, masklen, + masklen, tcg_constant_i32(1 << (4 - a->size)), + rn_shifted, tcg_constant_i32(16)); + gen_helper_mve_vctp(cpu_env, masklen); + tcg_temp_free_i32(masklen); + tcg_temp_free_i32(rn_shifted); + /* This insn updates predication bits */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + mve_update_eci(s); + return true; +} + +static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half) +{ + TCGv_i32 addr, tmp; + + tmp = load_reg(s, a->rm); + if (half) { + tcg_gen_add_i32(tmp, tmp, tmp); + } + addr = load_reg(s, a->rn); + tcg_gen_add_i32(addr, addr, tmp); + + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), half ? MO_UW : MO_UB); + + tcg_gen_add_i32(tmp, tmp, tmp); + gen_pc_plus_diff(s, addr, jmp_diff(s, 0)); + tcg_gen_add_i32(tmp, tmp, addr); + tcg_temp_free_i32(addr); + store_reg(s, 15, tmp); + return true; +} + +static bool trans_TBB(DisasContext *s, arg_tbranch *a) +{ + return op_tbranch(s, a, false); +} + +static bool trans_TBH(DisasContext *s, arg_tbranch *a) +{ + return op_tbranch(s, a, true); +} + +static bool trans_CBZ(DisasContext *s, arg_CBZ *a) +{ + TCGv_i32 tmp = load_reg(s, a->rn); + + arm_gen_condlabel(s); + tcg_gen_brcondi_i32(a->nz ? TCG_COND_EQ : TCG_COND_NE, + tmp, 0, s->condlabel.label); + tcg_temp_free_i32(tmp); + gen_jmp(s, jmp_diff(s, a->imm)); + return true; +} + +/* + * Supervisor call - both T32 & A32 come here so we need to check + * which mode we are in when checking for semihosting. + */ + +static bool trans_SVC(DisasContext *s, arg_SVC *a) +{ + const uint32_t semihost_imm = s->thumb ? 
0xab : 0x123456; + + if (!arm_dc_feature(s, ARM_FEATURE_M) && + semihosting_enabled(s->current_el == 0) && + (a->imm == semihost_imm)) { + gen_exception_internal_insn(s, EXCP_SEMIHOST); + } else { + if (s->fgt_svc) { + uint32_t syndrome = syn_aa32_svc(a->imm, s->thumb); + gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); + } else { + gen_update_pc(s, curr_insn_len(s)); + s->svc_imm = a->imm; + s->base.is_jmp = DISAS_SWI; + } + } + return true; +} + +/* + * Unconditional system instructions + */ + +static bool trans_RFE(DisasContext *s, arg_RFE *a) +{ + static const int8_t pre_offset[4] = { + /* DA */ -4, /* IA */ 0, /* DB */ -8, /* IB */ 4 + }; + static const int8_t post_offset[4] = { + /* DA */ -8, /* IA */ 4, /* DB */ -4, /* IB */ 0 + }; + TCGv_i32 addr, t1, t2; + + if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + unallocated_encoding(s); + return true; + } + + addr = load_reg(s, a->rn); + tcg_gen_addi_i32(addr, addr, pre_offset[a->pu]); + + /* Load PC into tmp and CPSR into tmp2. */ + t1 = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, t1, addr, get_mem_index(s), MO_UL | MO_ALIGN); + tcg_gen_addi_i32(addr, addr, 4); + t2 = tcg_temp_new_i32(); + gen_aa32_ld_i32(s, t2, addr, get_mem_index(s), MO_UL | MO_ALIGN); + + if (a->w) { + /* Base writeback. */ + tcg_gen_addi_i32(addr, addr, post_offset[a->pu]); + store_reg(s, a->rn, addr); + } else { + tcg_temp_free_i32(addr); + } + gen_rfe(s, t1, t2); + return true; +} + +static bool trans_SRS(DisasContext *s, arg_SRS *a) +{ + if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + gen_srs(s, a->mode, a->pu, a->w); + return true; +} + +static bool trans_CPS(DisasContext *s, arg_CPS *a) +{ + uint32_t mask, val; + + if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + /* Implemented as NOP in user mode. */ + return true; + } + /* TODO: There are quite a lot of UNPREDICTABLE argument combinations. */ + + mask = val = 0; + if (a->imod & 2) { + if (a->A) { + mask |= CPSR_A; + } + if (a->I) { + mask |= CPSR_I; + } + if (a->F) { + mask |= CPSR_F; + } + if (a->imod & 1) { + val |= mask; + } + } + if (a->M) { + mask |= CPSR_M; + val |= a->mode; + } + if (mask) { + gen_set_psr_im(s, mask, 0, val); + } + return true; +} + +static bool trans_CPS_v7m(DisasContext *s, arg_CPS_v7m *a) +{ + TCGv_i32 tmp, addr; + + if (!arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + if (IS_USER(s)) { + /* Implemented as NOP in user mode. */ + return true; + } + + tmp = tcg_constant_i32(a->im); + /* FAULTMASK */ + if (a->F) { + addr = tcg_constant_i32(19); + gen_helper_v7m_msr(cpu_env, addr, tmp); + } + /* PRIMASK */ + if (a->I) { + addr = tcg_constant_i32(16); + gen_helper_v7m_msr(cpu_env, addr, tmp); + } + gen_rebuild_hflags(s, false); + gen_lookup_tb(s); + return true; +} + +/* + * Clear-Exclusive, Barriers + */ + +static bool trans_CLREX(DisasContext *s, arg_CLREX *a) +{ + if (s->thumb + ? 
!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M) + : !ENABLE_ARCH_6K) { + return false; + } + gen_clrex(s); + return true; +} + +static bool trans_DSB(DisasContext *s, arg_DSB *a) +{ + if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); + return true; +} + +static bool trans_DMB(DisasContext *s, arg_DMB *a) +{ + return trans_DSB(s, NULL); +} + +static bool trans_ISB(DisasContext *s, arg_ISB *a) +{ + if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) { + return false; + } + /* + * We need to break the TB after this insn to execute + * self-modifying code correctly and also to take + * any pending interrupts immediately. + */ + s->base.is_jmp = DISAS_TOO_MANY; + return true; +} + +static bool trans_SB(DisasContext *s, arg_SB *a) +{ + if (!dc_isar_feature(aa32_sb, s)) { + return false; + } + /* + * TODO: There is no speculation barrier opcode + * for TCG; MB and end the TB instead. + */ + tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); + s->base.is_jmp = DISAS_TOO_MANY; + return true; +} + +static bool trans_SETEND(DisasContext *s, arg_SETEND *a) +{ + if (!ENABLE_ARCH_6) { + return false; + } + if (a->E != (s->be_data == MO_BE)) { + gen_helper_setend(cpu_env); + s->base.is_jmp = DISAS_UPDATE_EXIT; + } + return true; +} + +/* + * Preload instructions + * All are nops, contingent on the appropriate arch level. + */ + +static bool trans_PLD(DisasContext *s, arg_PLD *a) +{ + return ENABLE_ARCH_5TE; +} + +static bool trans_PLDW(DisasContext *s, arg_PLD *a) +{ + return arm_dc_feature(s, ARM_FEATURE_V7MP); +} + +static bool trans_PLI(DisasContext *s, arg_PLD *a) +{ + return ENABLE_ARCH_7; +} + +/* + * If-then + */ + +static bool trans_IT(DisasContext *s, arg_IT *a) +{ + int cond_mask = a->cond_mask; + + /* + * No actual code generated for this insn, just setup state. + * + * Combinations of firstcond and mask which set up an 0b1111 + * condition are UNPREDICTABLE; we take the CONSTRAINED + * UNPREDICTABLE choice to treat 0b1111 the same as 0b1110, + * i.e. both meaning "execute always". + */ + s->condexec_cond = (cond_mask >> 4) & 0xe; + s->condexec_mask = cond_mask & 0x1f; + return true; +} + +/* v8.1M CSEL/CSINC/CSNEG/CSINV */ +static bool trans_CSEL(DisasContext *s, arg_CSEL *a) +{ + TCGv_i32 rn, rm, zero; + DisasCompare c; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { + return false; + } + + if (a->rm == 13) { + /* SEE "Related encodings" (MVE shifts) */ + return false; + } + + if (a->rd == 13 || a->rd == 15 || a->rn == 13 || a->fcond >= 14) { + /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ + return false; + } + + /* In this insn input reg fields of 0b1111 mean "zero", not "PC" */ + zero = tcg_constant_i32(0); + if (a->rn == 15) { + rn = zero; + } else { + rn = load_reg(s, a->rn); + } + if (a->rm == 15) { + rm = zero; + } else { + rm = load_reg(s, a->rm); + } + + switch (a->op) { + case 0: /* CSEL */ + break; + case 1: /* CSINC */ + tcg_gen_addi_i32(rm, rm, 1); + break; + case 2: /* CSINV */ + tcg_gen_not_i32(rm, rm); + break; + case 3: /* CSNEG */ + tcg_gen_neg_i32(rm, rm); + break; + default: + g_assert_not_reached(); + } + + arm_test_cc(&c, a->fcond); + tcg_gen_movcond_i32(c.cond, rn, c.value, zero, rn, rm); + arm_free_cc(&c); + + store_reg(s, a->rd, rn); + tcg_temp_free_i32(rm); + + return true; +} + +/* + * Legacy decoder. 
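The CSEL/CSINC/CSINV/CSNEG handling just above boils down to transforming the second operand and then picking between it and Rn on the condition; a standalone sketch (illustrative only; register fields of 0b1111 read as zero in these encodings):

    #include <stdint.h>
    #include <stdbool.h>

    static uint32_t csel_model(bool cond_passed, uint32_t rn, uint32_t rm, int op)
    {
        switch (op) {
        case 1: rm += 1;  break;          /* CSINC */
        case 2: rm = ~rm; break;          /* CSINV */
        case 3: rm = -rm; break;          /* CSNEG */
        default:          break;          /* CSEL: Rm unchanged */
        }
        return cond_passed ? rn : rm;
    }
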
+ */ + +static void disas_arm_insn(DisasContext *s, unsigned int insn) +{ + unsigned int cond = insn >> 28; + + /* M variants do not implement ARM mode; this must raise the INVSTATE + * UsageFault exception. + */ + if (arm_dc_feature(s, ARM_FEATURE_M)) { + gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); + return; + } + + if (s->pstate_il) { + /* + * Illegal execution state. This has priority over BTI + * exceptions, but comes after instruction abort exceptions. + */ + gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate()); + return; + } + + if (cond == 0xf) { + /* In ARMv3 and v4 the NV condition is UNPREDICTABLE; we + * choose to UNDEF. In ARMv5 and above the space is used + * for miscellaneous unconditional instructions. + */ + if (!arm_dc_feature(s, ARM_FEATURE_V5)) { + unallocated_encoding(s); + return; + } + + /* Unconditional instructions. */ + /* TODO: Perhaps merge these into one decodetree output file. */ + if (disas_a32_uncond(s, insn) || + disas_vfp_uncond(s, insn) || + disas_neon_dp(s, insn) || + disas_neon_ls(s, insn) || + disas_neon_shared(s, insn)) { + return; + } + /* fall back to legacy decoder */ + + if ((insn & 0x0e000f00) == 0x0c000100) { + if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { + /* iWMMXt register transfer. */ + if (extract32(s->c15_cpar, 1, 1)) { + if (!disas_iwmmxt_insn(s, insn)) { + return; + } + } + } + } + goto illegal_op; + } + if (cond != 0xe) { + /* if not always execute, we generate a conditional jump to + next instruction */ + arm_skip_unless(s, cond); + } + + /* TODO: Perhaps merge these into one decodetree output file. */ + if (disas_a32(s, insn) || + disas_vfp(s, insn)) { + return; + } + /* fall back to legacy decoder */ + /* TODO: convert xscale/iwmmxt decoder to decodetree ?? */ + if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) { + if (((insn & 0x0c000e00) == 0x0c000000) + && ((insn & 0x03000000) != 0x03000000)) { + /* Coprocessor insn, coprocessor 0 or 1 */ + disas_xscale_insn(s, insn); + return; + } + } + +illegal_op: + unallocated_encoding(s); +} + +static bool thumb_insn_is_16bit(DisasContext *s, uint32_t pc, uint32_t insn) +{ + /* + * Return true if this is a 16 bit instruction. We must be precise + * about this (matching the decode). + */ + if ((insn >> 11) < 0x1d) { + /* Definitely a 16-bit instruction */ + return true; + } + + /* Top five bits 0b11101 / 0b11110 / 0b11111 : this is the + * first half of a 32-bit Thumb insn. Thumb-1 cores might + * end up actually treating this as two 16-bit insns, though, + * if it's half of a bl/blx pair that might span a page boundary. + */ + if (arm_dc_feature(s, ARM_FEATURE_THUMB2) || + arm_dc_feature(s, ARM_FEATURE_M)) { + /* Thumb2 cores (including all M profile ones) always treat + * 32-bit insns as 32-bit. + */ + return false; + } + + if ((insn >> 11) == 0x1e && pc - s->page_start < TARGET_PAGE_SIZE - 3) { + /* 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix, and the suffix + * is not on the next page; we merge this into a 32-bit + * insn. + */ + return false; + } + /* 0b1110_1xxx_xxxx_xxxx : BLX suffix (or UNDEF); + * 0b1111_1xxx_xxxx_xxxx : BL suffix; + * 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix on the end of a page + * -- handle as single 16 bit insn + */ + return true; +} + +/* Translate a 32-bit thumb instruction. */ +static void disas_thumb2_insn(DisasContext *s, uint32_t insn) +{ + /* + * ARMv6-M supports a limited subset of Thumb2 instructions. + * Other Thumb1 architectures allow only 32-bit + * combined BL/BLX prefix and suffix. 
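The length test in thumb_insn_is_16bit() above can be restated as a one-line predicate on the first halfword (ignoring the Thumb-1 BL/BLX page-crossing special case that the real function also handles):

    #include <stdint.h>
    #include <stdbool.h>

    static bool first_halfword_is_32bit(uint16_t hw)
    {
        /* Top five bits 0b11101, 0b11110, 0b11111 introduce a 32-bit insn */
        return (hw >> 11) >= 0x1d;
    }
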
+ */ + if (arm_dc_feature(s, ARM_FEATURE_M) && + !arm_dc_feature(s, ARM_FEATURE_V7)) { + int i; + bool found = false; + static const uint32_t armv6m_insn[] = {0xf3808000 /* msr */, + 0xf3b08040 /* dsb */, + 0xf3b08050 /* dmb */, + 0xf3b08060 /* isb */, + 0xf3e08000 /* mrs */, + 0xf000d000 /* bl */}; + static const uint32_t armv6m_mask[] = {0xffe0d000, + 0xfff0d0f0, + 0xfff0d0f0, + 0xfff0d0f0, + 0xffe0d000, + 0xf800d000}; + + for (i = 0; i < ARRAY_SIZE(armv6m_insn); i++) { + if ((insn & armv6m_mask[i]) == armv6m_insn[i]) { + found = true; + break; + } + } + if (!found) { + goto illegal_op; + } + } else if ((insn & 0xf800e800) != 0xf000e800) { + if (!arm_dc_feature(s, ARM_FEATURE_THUMB2)) { + unallocated_encoding(s); + return; + } + } + + if (arm_dc_feature(s, ARM_FEATURE_M)) { + /* + * NOCP takes precedence over any UNDEF for (almost) the + * entire wide range of coprocessor-space encodings, so check + * for it first before proceeding to actually decode eg VFP + * insns. This decode also handles the few insns which are + * in copro space but do not have NOCP checks (eg VLLDM, VLSTM). + */ + if (disas_m_nocp(s, insn)) { + return; + } + } + + if ((insn & 0xef000000) == 0xef000000) { + /* + * T32 encodings 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq + * transform into + * A32 encodings 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq + */ + uint32_t a32_insn = (insn & 0xe2ffffff) | + ((insn & (1 << 28)) >> 4) | (1 << 28); + + if (disas_neon_dp(s, a32_insn)) { + return; + } + } + + if ((insn & 0xff100000) == 0xf9000000) { + /* + * T32 encodings 0b1111_1001_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq + * transform into + * A32 encodings 0b1111_0100_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq + */ + uint32_t a32_insn = (insn & 0x00ffffff) | 0xf4000000; + + if (disas_neon_ls(s, a32_insn)) { + return; + } + } + + /* + * TODO: Perhaps merge these into one decodetree output file. + * Note disas_vfp is written for a32 with cond field in the + * top nibble. The t32 encoding requires 0xe in the top nibble. + */ + if (disas_t32(s, insn) || + disas_vfp_uncond(s, insn) || + disas_neon_shared(s, insn) || + disas_mve(s, insn) || + ((insn >> 28) == 0xe && disas_vfp(s, insn))) { + return; + } + +illegal_op: + unallocated_encoding(s); +} + +static void disas_thumb_insn(DisasContext *s, uint32_t insn) +{ + if (!disas_t16(s, insn)) { + unallocated_encoding(s); + } +} + +static bool insn_crosses_page(CPUARMState *env, DisasContext *s) +{ + /* Return true if the insn at dc->base.pc_next might cross a page boundary. + * (False positives are OK, false negatives are not.) + * We know this is a Thumb insn, and our caller ensures we are + * only called if dc->base.pc_next is less than 4 bytes from the page + * boundary, so we cross the page if the first 16 bits indicate + * that this is a 32 bit insn. + */ + uint16_t insn = arm_lduw_code(env, &s->base, s->base.pc_next, s->sctlr_b); + + return !thumb_insn_is_16bit(s, s->base.pc_next, insn); +} + +static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cs->env_ptr; + ARMCPU *cpu = env_archcpu(env); + CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb); + uint32_t condexec, core_mmu_idx; + + dc->isar = &cpu->isar; + dc->condjmp = 0; + dc->pc_save = dc->base.pc_first; + dc->aarch64 = false; + dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB); + dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? 
MO_BE : MO_LE; + condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC); + /* + * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this + * is always the IT bits. On M-profile, some of the reserved encodings + * of IT are used instead to indicate either ICI or ECI, which + * indicate partial progress of a restartable insn that was interrupted + * partway through by an exception: + * * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits + * * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits + * In all cases CONDEXEC == 0 means "not in IT block or restartable + * insn, behave normally". + */ + dc->eci = dc->condexec_mask = dc->condexec_cond = 0; + dc->eci_handled = false; + if (condexec & 0xf) { + dc->condexec_mask = (condexec & 0xf) << 1; + dc->condexec_cond = condexec >> 4; + } else { + if (arm_feature(env, ARM_FEATURE_M)) { + dc->eci = condexec >> 4; + } + } + + core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); + dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx); + dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); +#if !defined(CONFIG_USER_ONLY) + dc->user = (dc->current_el == 0); +#endif + dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL); + dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM); + dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL); + dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE); + dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC); + + if (arm_feature(env, ARM_FEATURE_M)) { + dc->vfp_enabled = 1; + dc->be_data = MO_TE; + dc->v7m_handler_mode = EX_TBFLAG_M32(tb_flags, HANDLER); + dc->v8m_secure = EX_TBFLAG_M32(tb_flags, SECURE); + dc->v8m_stackcheck = EX_TBFLAG_M32(tb_flags, STACKCHECK); + dc->v8m_fpccr_s_wrong = EX_TBFLAG_M32(tb_flags, FPCCR_S_WRONG); + dc->v7m_new_fp_ctxt_needed = + EX_TBFLAG_M32(tb_flags, NEW_FP_CTXT_NEEDED); + dc->v7m_lspact = EX_TBFLAG_M32(tb_flags, LSPACT); + dc->mve_no_pred = EX_TBFLAG_M32(tb_flags, MVE_NO_PRED); + } else { + dc->sctlr_b = EX_TBFLAG_A32(tb_flags, SCTLR__B); + dc->hstr_active = EX_TBFLAG_A32(tb_flags, HSTR_ACTIVE); + dc->ns = EX_TBFLAG_A32(tb_flags, NS); + dc->vfp_enabled = EX_TBFLAG_A32(tb_flags, VFPEN); + if (arm_feature(env, ARM_FEATURE_XSCALE)) { + dc->c15_cpar = EX_TBFLAG_A32(tb_flags, XSCALE_CPAR); + } else { + dc->vec_len = EX_TBFLAG_A32(tb_flags, VECLEN); + dc->vec_stride = EX_TBFLAG_A32(tb_flags, VECSTRIDE); + } + dc->sme_trap_nonstreaming = + EX_TBFLAG_A32(tb_flags, SME_TRAP_NONSTREAMING); + } + dc->cp_regs = cpu->cp_regs; + dc->features = env->features; + + /* Single step state. The code-generation logic here is: + * SS_ACTIVE == 0: + * generate code with no special handling for single-stepping (except + * that anything that can make us go to SS_ACTIVE == 1 must end the TB; + * this happens anyway because those changes are all system register or + * PSTATE writes). + * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending) + * emit code for one insn + * emit code to clear PSTATE.SS + * emit code to generate software step exception for completed step + * end TB (as usual for having generated an exception) + * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending) + * emit code to generate a software step exception + * end the TB + */ + dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE); + dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS); + dc->is_ldex = false; + + dc->page_start = dc->base.pc_first & TARGET_PAGE_MASK; + + /* If architectural single step active, limit to 1. */ + if (dc->ss_active) { + dc->base.max_insns = 1; + } + + /* ARM is a fixed-length ISA. 
Bound the number of insns to execute + to those left on the page. */ + if (!dc->thumb) { + int bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4; + dc->base.max_insns = MIN(dc->base.max_insns, bound); + } + + cpu_V0 = tcg_temp_new_i64(); + cpu_V1 = tcg_temp_new_i64(); + cpu_M0 = tcg_temp_new_i64(); +} + +static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + /* A note on handling of the condexec (IT) bits: + * + * We want to avoid the overhead of having to write the updated condexec + * bits back to the CPUARMState for every instruction in an IT block. So: + * (1) if the condexec bits are not already zero then we write + * zero back into the CPUARMState now. This avoids complications trying + * to do it at the end of the block. (For example if we don't do this + * it's hard to identify whether we can safely skip writing condexec + * at the end of the TB, which we definitely want to do for the case + * where a TB doesn't do anything with the IT state at all.) + * (2) if we are going to leave the TB then we call gen_set_condexec() + * which will write the correct value into CPUARMState if zero is wrong. + * This is done both for leaving the TB at the end, and for leaving + * it because of an exception we know will happen, which is done in + * gen_exception_insn(). The latter is necessary because we need to + * leave the TB with the PC/IT state just prior to execution of the + * instruction which caused the exception. + * (3) if we leave the TB unexpectedly (eg a data abort on a load) + * then the CPUARMState will be wrong and we need to reset it. + * This is handled in the same way as restoration of the + * PC in these situations; we save the value of the condexec bits + * for each PC via tcg_gen_insn_start(), and restore_state_to_opc() + * then uses this to restore them after an exception. + * + * Note that there are no instructions which can read the condexec + * bits, and none which can write non-static values to them, so + * we don't need to care about whether CPUARMState is correct in the + * middle of a TB. + */ + + /* Reset the conditional execution bits immediately. This avoids + complications trying to do it at the end of the block. */ + if (dc->condexec_mask || dc->condexec_cond) { + store_cpu_field_constant(0, condexec_bits); + } +} + +static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + /* + * The ECI/ICI bits share PSR bits with the IT bits, so we + * need to reconstitute the bits from the split-out DisasContext + * fields here. + */ + uint32_t condexec_bits; + target_ulong pc_arg = dc->base.pc_next; + + if (TARGET_TB_PCREL) { + pc_arg &= ~TARGET_PAGE_MASK; + } + if (dc->eci) { + condexec_bits = dc->eci << 4; + } else { + condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1); + } + tcg_gen_insn_start(pc_arg, condexec_bits, 0); + dc->insn_start = tcg_last_op(); +} + +static bool arm_check_kernelpage(DisasContext *dc) +{ +#ifdef CONFIG_USER_ONLY + /* Intercept jump to the magic kernel page. */ + if (dc->base.pc_next >= 0xffff0000) { + /* We always get here via a jump, so know we are not in a + conditional execution block. */ + gen_exception_internal(EXCP_KERNEL_TRAP); + dc->base.is_jmp = DISAS_NORETURN; + return true; + } +#endif + return false; +} + +static bool arm_check_ss_active(DisasContext *dc) +{ + if (dc->ss_active && !dc->pstate_ss) { + /* Singlestep state is Active-pending. 
+ * If we're in this state at the start of a TB then either + * a) we just took an exception to an EL which is being debugged + * and this is the first insn in the exception handler + * b) debug exceptions were masked and we just unmasked them + * without changing EL (eg by clearing PSTATE.D) + * In either case we're going to take a swstep exception in the + * "did not step an insn" case, and so the syndrome ISV and EX + * bits should be zero. + */ + assert(dc->base.num_insns == 1); + gen_swstep_exception(dc, 0, 0); + dc->base.is_jmp = DISAS_NORETURN; + return true; + } + + return false; +} + +static void arm_post_translate_insn(DisasContext *dc) +{ + if (dc->condjmp && dc->base.is_jmp == DISAS_NEXT) { + if (dc->pc_save != dc->condlabel.pc_save) { + gen_update_pc(dc, dc->condlabel.pc_save - dc->pc_save); + } + gen_set_label(dc->condlabel.label); + dc->condjmp = 0; + } + translator_loop_temp_check(&dc->base); +} + +static void arm_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + uint32_t pc = dc->base.pc_next; + unsigned int insn; + + /* Singlestep exceptions have the highest priority. */ + if (arm_check_ss_active(dc)) { + dc->base.pc_next = pc + 4; + return; + } + + if (pc & 3) { + /* + * PC alignment fault. This has priority over the instruction abort + * that we would receive from a translation fault via arm_ldl_code + * (or the execution of the kernelpage entrypoint). This should only + * be possible after an indirect branch, at the start of the TB. + */ + assert(dc->base.num_insns == 1); + gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc)); + dc->base.is_jmp = DISAS_NORETURN; + dc->base.pc_next = QEMU_ALIGN_UP(pc, 4); + return; + } + + if (arm_check_kernelpage(dc)) { + dc->base.pc_next = pc + 4; + return; + } + + dc->pc_curr = pc; + insn = arm_ldl_code(env, &dc->base, pc, dc->sctlr_b); + dc->insn = insn; + dc->base.pc_next = pc + 4; + disas_arm_insn(dc, insn); + + arm_post_translate_insn(dc); + + /* ARM is a fixed-length ISA. We performed the cross-page check + in init_disas_context by adjusting max_insns. */ +} + +static bool thumb_insn_is_unconditional(DisasContext *s, uint32_t insn) +{ + /* Return true if this Thumb insn is always unconditional, + * even inside an IT block. This is true of only a very few + * instructions: BKPT, HLT, and SG. + * + * A larger class of instructions are UNPREDICTABLE if used + * inside an IT block; we do not need to detect those here, because + * what we do by default (perform the cc check and update the IT + * bits state machine) is a permitted CONSTRAINED UNPREDICTABLE + * choice for those situations. + * + * insn is either a 16-bit or a 32-bit instruction; the two are + * distinguishable because for the 16-bit case the top 16 bits + * are zeroes, and that isn't a valid 32-bit encoding. + */ + if ((insn & 0xffffff00) == 0xbe00) { + /* BKPT */ + return true; + } + + if ((insn & 0xffffffc0) == 0xba80 && arm_dc_feature(s, ARM_FEATURE_V8) && + !arm_dc_feature(s, ARM_FEATURE_M)) { + /* HLT: v8A only. This is unconditional even when it is going to + * UNDEF; see the v8A ARM ARM DDI0487B.a H3.3. + * For v7 cores this was a plain old undefined encoding and so + * honours its cc check. 
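As an aside on the 16-bit/32-bit distinction that insn_crosses_page() and thumb_insn_is_16bit() rely on: for Thumb-2 the split depends only on the top five bits of the leading halfword. A simplified sketch (it ignores the pre-Thumb-2 special cases the real helper has to handle):

    #include <stdint.h>
    #include <stdbool.h>

    /* Bits [15:11] of 0b11101, 0b11110 or 0b11111 introduce a 32-bit
     * T32 encoding; any other leading halfword is a complete 16-bit insn.
     */
    static bool halfword_starts_32bit_insn(uint16_t hw)
    {
        return (hw >> 11) >= 0x1d;
    }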
(We might be using the encoding as + * a semihosting trap, but we don't change the cc check behaviour + * on that account, because a debugger connected to a real v7A + * core and emulating semihosting traps by catching the UNDEF + * exception would also only see cases where the cc check passed. + * No guest code should be trying to do a HLT semihosting trap + * in an IT block anyway. + */ + return true; + } + + if (insn == 0xe97fe97f && arm_dc_feature(s, ARM_FEATURE_V8) && + arm_dc_feature(s, ARM_FEATURE_M)) { + /* SG: v8M only */ + return true; + } + + return false; +} + +static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + CPUARMState *env = cpu->env_ptr; + uint32_t pc = dc->base.pc_next; + uint32_t insn; + bool is_16bit; + /* TCG op to rewind to if this turns out to be an invalid ECI state */ + TCGOp *insn_eci_rewind = NULL; + target_ulong insn_eci_pc_save = -1; + + /* Misaligned thumb PC is architecturally impossible. */ + assert((dc->base.pc_next & 1) == 0); + + if (arm_check_ss_active(dc) || arm_check_kernelpage(dc)) { + dc->base.pc_next = pc + 2; + return; + } + + dc->pc_curr = pc; + insn = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b); + is_16bit = thumb_insn_is_16bit(dc, dc->base.pc_next, insn); + pc += 2; + if (!is_16bit) { + uint32_t insn2 = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b); + insn = insn << 16 | insn2; + pc += 2; + } + dc->base.pc_next = pc; + dc->insn = insn; + + if (dc->pstate_il) { + /* + * Illegal execution state. This has priority over BTI + * exceptions, but comes after instruction abort exceptions. + */ + gen_exception_insn(dc, 0, EXCP_UDEF, syn_illegalstate()); + return; + } + + if (dc->eci) { + /* + * For M-profile continuable instructions, ECI/ICI handling + * falls into these cases: + * - interrupt-continuable instructions + * These are the various load/store multiple insns (both + * integer and fp). The ICI bits indicate the register + * where the load/store can resume. We make the IMPDEF + * choice to always do "instruction restart", ie ignore + * the ICI value and always execute the ldm/stm from the + * start. So all we need to do is zero PSR.ICI if the + * insn executes. + * - MVE instructions subject to beat-wise execution + * Here the ECI bits indicate which beats have already been + * executed, and we must honour this. Each insn of this + * type will handle it correctly. We will update PSR.ECI + * in the helper function for the insn (some ECI values + * mean that the following insn also has been partially + * executed). + * - Special cases which don't advance ECI + * The insns LE, LETP and BKPT leave the ECI/ICI state + * bits untouched. + * - all other insns (the common case) + * Non-zero ECI/ICI means an INVSTATE UsageFault. + * We place a rewind-marker here. Insns in the previous + * three categories will set a flag in the DisasContext. + * If the flag isn't set after we call disas_thumb_insn() + * or disas_thumb2_insn() then we know we have a "some other + * insn" case. We will rewind to the marker (ie throwing away + * all the generated code) and instead emit "take exception". + */ + insn_eci_rewind = tcg_last_op(); + insn_eci_pc_save = dc->pc_save; + } + + if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) { + uint32_t cond = dc->condexec_cond; + + /* + * Conditionally skip the insn. Note that both 0xe and 0xf mean + * "always"; 0xf is not "never". 
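The IT-state advance performed after each insn in the block (further down in this function) follows the architectural ITAdvance() rule; a standalone model of that state machine, using the same cond/mask split as DisasContext, looks like this:

    /* Illustrative model only: mask holds ITSTATE[3:0] shifted left one
     * bit and cond holds ITSTATE[7:4], matching condexec_mask/condexec_cond.
     */
    static void it_advance(int *cond, int *mask)
    {
        /* The next condition's low bit comes from the top of the mask. */
        *cond = (*cond & 0xe) | ((*mask >> 4) & 1);
        *mask = (*mask << 1) & 0x1f;
        if (*mask == 0) {
            *cond = 0;          /* IT block finished */
        }
    }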
+ */ + if (cond < 0x0e) { + arm_skip_unless(dc, cond); + } + } + + if (is_16bit) { + disas_thumb_insn(dc, insn); + } else { + disas_thumb2_insn(dc, insn); + } + + /* Advance the Thumb condexec condition. */ + if (dc->condexec_mask) { + dc->condexec_cond = ((dc->condexec_cond & 0xe) | + ((dc->condexec_mask >> 4) & 1)); + dc->condexec_mask = (dc->condexec_mask << 1) & 0x1f; + if (dc->condexec_mask == 0) { + dc->condexec_cond = 0; + } + } + + if (dc->eci && !dc->eci_handled) { + /* + * Insn wasn't valid for ECI/ICI at all: undo what we + * just generated and instead emit an exception + */ + tcg_remove_ops_after(insn_eci_rewind); + dc->pc_save = insn_eci_pc_save; + dc->condjmp = 0; + gen_exception_insn(dc, 0, EXCP_INVSTATE, syn_uncategorized()); + } + + arm_post_translate_insn(dc); + + /* Thumb is a variable-length ISA. Stop translation when the next insn + * will touch a new page. This ensures that prefetch aborts occur at + * the right place. + * + * We want to stop the TB if the next insn starts in a new page, + * or if it spans between this page and the next. This means that + * if we're looking at the last halfword in the page we need to + * see if it's a 16-bit Thumb insn (which will fit in this TB) + * or a 32-bit Thumb insn (which won't). + * This is to avoid generating a silly TB with a single 16-bit insn + * in it at the end of this page (which would execute correctly + * but isn't very efficient). + */ + if (dc->base.is_jmp == DISAS_NEXT + && (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE + || (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE - 3 + && insn_crosses_page(env, dc)))) { + dc->base.is_jmp = DISAS_TOO_MANY; + } +} + +static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + /* At this stage dc->condjmp will only be set when the skipped + instruction was a conditional branch or trap, and the PC has + already been written. */ + gen_set_condexec(dc); + if (dc->base.is_jmp == DISAS_BX_EXCRET) { + /* Exception return branches need some special case code at the + * end of the TB, which is complex enough that it has to + * handle the single-step vs not and the condition-failed + * insn codepath itself. + */ + gen_bx_excret_final_code(dc); + } else if (unlikely(dc->ss_active)) { + /* Unconditional and "condition passed" instruction codepath. */ + switch (dc->base.is_jmp) { + case DISAS_SWI: + gen_ss_advance(dc); + gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb)); + break; + case DISAS_HVC: + gen_ss_advance(dc); + gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2); + break; + case DISAS_SMC: + gen_ss_advance(dc); + gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3); + break; + case DISAS_NEXT: + case DISAS_TOO_MANY: + case DISAS_UPDATE_EXIT: + case DISAS_UPDATE_NOCHAIN: + gen_update_pc(dc, curr_insn_len(dc)); + /* fall through */ + default: + /* FIXME: Single stepping a WFI insn will not halt the CPU. */ + gen_singlestep_exception(dc); + break; + case DISAS_NORETURN: + break; + } + } else { + /* While branches must always occur at the end of an IT block, + there are a few other things that can cause us to terminate + the TB in the middle of an IT block: + - Exception generating instructions (bkpt, swi, undefined). + - Page boundaries. + - Hardware watchpoints. + Hardware breakpoints have already been handled and skip this code. 
+ */ + switch (dc->base.is_jmp) { + case DISAS_NEXT: + case DISAS_TOO_MANY: + gen_goto_tb(dc, 1, curr_insn_len(dc)); + break; + case DISAS_UPDATE_NOCHAIN: + gen_update_pc(dc, curr_insn_len(dc)); + /* fall through */ + case DISAS_JUMP: + gen_goto_ptr(); + break; + case DISAS_UPDATE_EXIT: + gen_update_pc(dc, curr_insn_len(dc)); + /* fall through */ + default: + /* indicate that the hash table must be used to find the next TB */ + tcg_gen_exit_tb(NULL, 0); + break; + case DISAS_NORETURN: + /* nothing more to generate */ + break; + case DISAS_WFI: + gen_helper_wfi(cpu_env, tcg_constant_i32(curr_insn_len(dc))); + /* + * The helper doesn't necessarily throw an exception, but we + * must go back to the main loop to check for interrupts anyway. + */ + tcg_gen_exit_tb(NULL, 0); + break; + case DISAS_WFE: + gen_helper_wfe(cpu_env); + break; + case DISAS_YIELD: + gen_helper_yield(cpu_env); + break; + case DISAS_SWI: + gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb)); + break; + case DISAS_HVC: + gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2); + break; + case DISAS_SMC: + gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3); + break; + } + } + + if (dc->condjmp) { + /* "Condition failed" instruction codepath for the branch/trap insn */ + set_disas_label(dc, dc->condlabel); + gen_set_condexec(dc); + if (unlikely(dc->ss_active)) { + gen_update_pc(dc, curr_insn_len(dc)); + gen_singlestep_exception(dc); + } else { + gen_goto_tb(dc, 1, curr_insn_len(dc)); + } + } +} + +static void arm_tr_disas_log(const DisasContextBase *dcbase, + CPUState *cpu, FILE *logfile) +{ + DisasContext *dc = container_of(dcbase, DisasContext, base); + + fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first)); + target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size); +} + +static const TranslatorOps arm_translator_ops = { + .init_disas_context = arm_tr_init_disas_context, + .tb_start = arm_tr_tb_start, + .insn_start = arm_tr_insn_start, + .translate_insn = arm_tr_translate_insn, + .tb_stop = arm_tr_tb_stop, + .disas_log = arm_tr_disas_log, +}; + +static const TranslatorOps thumb_translator_ops = { + .init_disas_context = arm_tr_init_disas_context, + .tb_start = arm_tr_tb_start, + .insn_start = arm_tr_insn_start, + .translate_insn = thumb_tr_translate_insn, + .tb_stop = arm_tr_tb_stop, + .disas_log = arm_tr_disas_log, +}; + +/* generate intermediate code for basic block 'tb'. */ +void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns, + target_ulong pc, void *host_pc) +{ + DisasContext dc = { }; + const TranslatorOps *ops = &arm_translator_ops; + CPUARMTBFlags tb_flags = arm_tbflags_from_tb(tb); + + if (EX_TBFLAG_AM32(tb_flags, THUMB)) { + ops = &thumb_translator_ops; + } +#ifdef TARGET_AARCH64 + if (EX_TBFLAG_ANY(tb_flags, AARCH64_STATE)) { + ops = &aarch64_translator_ops; + } +#endif + + translator_loop(cpu, tb, max_insns, pc, host_pc, ops, &dc.base); +} diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h new file mode 100644 index 0000000..3717824 --- /dev/null +++ b/target/arm/tcg/translate.h @@ -0,0 +1,644 @@ +#ifndef TARGET_ARM_TRANSLATE_H +#define TARGET_ARM_TRANSLATE_H + +#include "exec/translator.h" +#include "internals.h" + + +/* internal defines */ + +/* + * Save pc_save across a branch, so that we may restore the value from + * before the branch at the point the label is emitted. 
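pc_save is the key to the lazy PC tracking described in the DisasContext comment below; a rough non-TCG model of the bookkeeping (structure and names invented for the sketch):

    #include <stdint.h>

    struct pc_track {
        uint64_t cpu_pc;    /* value currently held in the PC global       */
        uint64_t pc_curr;   /* address of the insn being translated        */
        uint64_t pc_save;   /* pc_curr at the time of the last write-back  */
    };

    /* Bring the emulated PC up to date for a target of pc_curr + diff:
     * a single addition covers everything since the last write-back.
     */
    static void track_update_pc(struct pc_track *t, int64_t diff)
    {
        t->cpu_pc += (t->pc_curr - t->pc_save) + diff;
        t->pc_save = t->pc_curr + diff;
    }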
+ */ +typedef struct DisasLabel { + TCGLabel *label; + target_ulong pc_save; +} DisasLabel; + +typedef struct DisasContext { + DisasContextBase base; + const ARMISARegisters *isar; + + /* The address of the current instruction being translated. */ + target_ulong pc_curr; + /* + * For TARGET_TB_PCREL, the full value of cpu_pc is not known + * (although the page offset is known). For convenience, the + * translation loop uses the full virtual address that triggered + * the translation, from base.pc_start through pc_curr. + * For efficiency, we do not update cpu_pc for every instruction. + * Instead, pc_save has the value of pc_curr at the time of the + * last update to cpu_pc, which allows us to compute the addend + * needed to bring cpu_pc current: pc_curr - pc_save. + * If cpu_pc now contains the destination of an indirect branch, + * pc_save contains -1 to indicate that relative updates are no + * longer possible. + */ + target_ulong pc_save; + target_ulong page_start; + uint32_t insn; + /* Nonzero if this instruction has been conditionally skipped. */ + int condjmp; + /* The label that will be jumped to when the instruction is skipped. */ + DisasLabel condlabel; + /* Thumb-2 conditional execution bits. */ + int condexec_mask; + int condexec_cond; + /* M-profile ECI/ICI exception-continuable instruction state */ + int eci; + /* + * trans_ functions for insns which are continuable should set this true + * after decode (ie after any UNDEF checks) + */ + bool eci_handled; + int sctlr_b; + MemOp be_data; +#if !defined(CONFIG_USER_ONLY) + int user; +#endif + ARMMMUIdx mmu_idx; /* MMU index to use for normal loads/stores */ + uint8_t tbii; /* TBI1|TBI0 for insns */ + uint8_t tbid; /* TBI1|TBI0 for data */ + uint8_t tcma; /* TCMA1|TCMA0 for MTE */ + bool ns; /* Use non-secure CPREG bank on access */ + int fp_excp_el; /* FP exception EL or 0 if enabled */ + int sve_excp_el; /* SVE exception EL or 0 if enabled */ + int sme_excp_el; /* SME exception EL or 0 if enabled */ + int vl; /* current vector length in bytes */ + int svl; /* current streaming vector length in bytes */ + bool vfp_enabled; /* FP enabled via FPSCR.EN */ + int vec_len; + int vec_stride; + bool v7m_handler_mode; + bool v8m_secure; /* true if v8M and we're in Secure mode */ + bool v8m_stackcheck; /* true if we need to perform v8M stack limit checks */ + bool v8m_fpccr_s_wrong; /* true if v8M FPCCR.S != v8m_secure */ + bool v7m_new_fp_ctxt_needed; /* ASPEN set but no active FP context */ + bool v7m_lspact; /* FPCCR.LSPACT set */ + /* Immediate value in AArch32 SVC insn; must be set if is_jmp == DISAS_SWI + * so that top level loop can generate correct syndrome information. + */ + uint32_t svc_imm; + int current_el; + GHashTable *cp_regs; + uint64_t features; /* CPU features bits */ + bool aarch64; + bool thumb; + /* Because unallocated encodings generate different exception syndrome + * information from traps due to FP being disabled, we can't do a single + * "is fp access disabled" check at a high level in the decode tree. + * To help in catching bugs where the access check was forgotten in some + * code path, we set this flag when the access check is done, and assert + * that it is set at the point where we actually touch the FP regs. + */ + bool fp_access_checked; + bool sve_access_checked; + /* ARMv8 single-step state (this is distinct from the QEMU gdbstub + * single-step support). 
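These two flags select between the three cases spelled out in the arm_tr_init_disas_context() comment; roughly (an illustrative decision table, not QEMU code):

    #include <stdbool.h>

    enum ss_action {
        SS_NONE,            /* no special single-step handling           */
        SS_STEP_THEN_TRAP,  /* active-not-pending: one insn, then trap   */
        SS_TRAP_NOW,        /* active-pending: trap before any insn      */
    };

    static enum ss_action ss_decide(bool ss_active, bool pstate_ss)
    {
        if (!ss_active) {
            return SS_NONE;
        }
        return pstate_ss ? SS_STEP_THEN_TRAP : SS_TRAP_NOW;
    }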
+ */ + bool ss_active; + bool pstate_ss; + /* True if the insn just emitted was a load-exclusive instruction + * (necessary for syndrome information for single step exceptions), + * ie A64 LDX*, LDAX*, A32/T32 LDREX*, LDAEX*. + */ + bool is_ldex; + /* True if AccType_UNPRIV should be used for LDTR et al */ + bool unpriv; + /* True if v8.3-PAuth is active. */ + bool pauth_active; + /* True if v8.5-MTE access to tags is enabled. */ + bool ata; + /* True if v8.5-MTE tag checks affect the PE; index with is_unpriv. */ + bool mte_active[2]; + /* True with v8.5-BTI and SCTLR_ELx.BT* set. */ + bool bt; + /* True if any CP15 access is trapped by HSTR_EL2 */ + bool hstr_active; + /* True if memory operations require alignment */ + bool align_mem; + /* True if PSTATE.IL is set */ + bool pstate_il; + /* True if PSTATE.SM is set. */ + bool pstate_sm; + /* True if PSTATE.ZA is set. */ + bool pstate_za; + /* True if non-streaming insns should raise an SME Streaming exception. */ + bool sme_trap_nonstreaming; + /* True if the current instruction is non-streaming. */ + bool is_nonstreaming; + /* True if MVE insns are definitely not predicated by VPR or LTPSIZE */ + bool mve_no_pred; + /* True if fine-grained traps are active */ + bool fgt_active; + /* True if fine-grained trap on ERET is enabled */ + bool fgt_eret; + /* True if fine-grained trap on SVC is enabled */ + bool fgt_svc; + /* + * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI. + * < 0, set by the current instruction. + */ + int8_t btype; + /* A copy of cpu->dcz_blocksize. */ + uint8_t dcz_blocksize; + /* True if this page is guarded. */ + bool guarded_page; + /* Bottom two bits of XScale c15_cpar coprocessor access control reg */ + int c15_cpar; + /* TCG op of the current insn_start. */ + TCGOp *insn_start; +#define TMP_A64_MAX 16 + int tmp_a64_count; + TCGv_i64 tmp_a64[TMP_A64_MAX]; +} DisasContext; + +typedef struct DisasCompare { + TCGCond cond; + TCGv_i32 value; + bool value_global; +} DisasCompare; + +/* Share the TCG temporaries common between 32 and 64 bit modes. */ +extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF; +extern TCGv_i64 cpu_exclusive_addr; +extern TCGv_i64 cpu_exclusive_val; + +/* + * Constant expanders for the decoders. 
+ */ + +static inline int negate(DisasContext *s, int x) +{ + return -x; +} + +static inline int plus_1(DisasContext *s, int x) +{ + return x + 1; +} + +static inline int plus_2(DisasContext *s, int x) +{ + return x + 2; +} + +static inline int plus_12(DisasContext *s, int x) +{ + return x + 12; +} + +static inline int times_2(DisasContext *s, int x) +{ + return x * 2; +} + +static inline int times_4(DisasContext *s, int x) +{ + return x * 4; +} + +static inline int times_2_plus_1(DisasContext *s, int x) +{ + return x * 2 + 1; +} + +static inline int rsub_64(DisasContext *s, int x) +{ + return 64 - x; +} + +static inline int rsub_32(DisasContext *s, int x) +{ + return 32 - x; +} + +static inline int rsub_16(DisasContext *s, int x) +{ + return 16 - x; +} + +static inline int rsub_8(DisasContext *s, int x) +{ + return 8 - x; +} + +static inline int neon_3same_fp_size(DisasContext *s, int x) +{ + /* Convert 0==fp32, 1==fp16 into a MO_* value */ + return MO_32 - x; +} + +static inline int arm_dc_feature(DisasContext *dc, int feature) +{ + return (dc->features & (1ULL << feature)) != 0; +} + +static inline int get_mem_index(DisasContext *s) +{ + return arm_to_core_mmu_idx(s->mmu_idx); +} + +static inline void disas_set_insn_syndrome(DisasContext *s, uint32_t syn) +{ + /* We don't need to save all of the syndrome so we mask and shift + * out unneeded bits to help the sleb128 encoder do a better job. + */ + syn &= ARM_INSN_START_WORD2_MASK; + syn >>= ARM_INSN_START_WORD2_SHIFT; + + /* We check and clear insn_start_idx to catch multiple updates. */ + assert(s->insn_start != NULL); + tcg_set_insn_start_param(s->insn_start, 2, syn); + s->insn_start = NULL; +} + +static inline int curr_insn_len(DisasContext *s) +{ + return s->base.pc_next - s->pc_curr; +} + +/* is_jmp field values */ +#define DISAS_JUMP DISAS_TARGET_0 /* only pc was modified dynamically */ +/* CPU state was modified dynamically; exit to main loop for interrupts. */ +#define DISAS_UPDATE_EXIT DISAS_TARGET_1 +/* These instructions trap after executing, so the A32/T32 decoder must + * defer them until after the conditional execution state has been updated. + * WFI also needs special handling when single-stepping. + */ +#define DISAS_WFI DISAS_TARGET_2 +#define DISAS_SWI DISAS_TARGET_3 +/* WFE */ +#define DISAS_WFE DISAS_TARGET_4 +#define DISAS_HVC DISAS_TARGET_5 +#define DISAS_SMC DISAS_TARGET_6 +#define DISAS_YIELD DISAS_TARGET_7 +/* M profile branch which might be an exception return (and so needs + * custom end-of-TB code) + */ +#define DISAS_BX_EXCRET DISAS_TARGET_8 +/* + * For instructions which want an immediate exit to the main loop, as opposed + * to attempting to use lookup_and_goto_ptr. Unlike DISAS_UPDATE_EXIT, this + * doesn't write the PC on exiting the translation loop so you need to ensure + * something (gen_a64_update_pc or runtime helper) has done so before we reach + * return from cpu_tb_exec. + */ +#define DISAS_EXIT DISAS_TARGET_9 +/* CPU state was modified dynamically; no need to exit, but do not chain. 
*/ +#define DISAS_UPDATE_NOCHAIN DISAS_TARGET_10 + +#ifdef TARGET_AARCH64 +void a64_translate_init(void); +void gen_a64_update_pc(DisasContext *s, target_long diff); +extern const TranslatorOps aarch64_translator_ops; +#else +static inline void a64_translate_init(void) +{ +} + +static inline void gen_a64_update_pc(DisasContext *s, target_long diff) +{ +} +#endif + +void arm_test_cc(DisasCompare *cmp, int cc); +void arm_free_cc(DisasCompare *cmp); +void arm_jump_cc(DisasCompare *cmp, TCGLabel *label); +void arm_gen_test_cc(int cc, TCGLabel *label); +MemOp pow2_align(unsigned i); +void unallocated_encoding(DisasContext *s); +void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, + uint32_t syn, uint32_t target_el); +void gen_exception_insn(DisasContext *s, target_long pc_diff, + int excp, uint32_t syn); + +/* Return state of Alternate Half-precision flag, caller frees result */ +static inline TCGv_i32 get_ahp_flag(void) +{ + TCGv_i32 ret = tcg_temp_new_i32(); + + tcg_gen_ld_i32(ret, cpu_env, + offsetof(CPUARMState, vfp.xregs[ARM_VFP_FPSCR])); + tcg_gen_extract_i32(ret, ret, 26, 1); + + return ret; +} + +/* Set bits within PSTATE. */ +static inline void set_pstate_bits(uint32_t bits) +{ + TCGv_i32 p = tcg_temp_new_i32(); + + tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); + + tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_gen_ori_i32(p, p, bits); + tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_temp_free_i32(p); +} + +/* Clear bits within PSTATE. */ +static inline void clear_pstate_bits(uint32_t bits) +{ + TCGv_i32 p = tcg_temp_new_i32(); + + tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); + + tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_gen_andi_i32(p, p, ~bits); + tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate)); + tcg_temp_free_i32(p); +} + +/* If the singlestep state is Active-not-pending, advance to Active-pending. */ +static inline void gen_ss_advance(DisasContext *s) +{ + if (s->ss_active) { + s->pstate_ss = 0; + clear_pstate_bits(PSTATE_SS); + } +} + +/* Generate an architectural singlestep exception */ +static inline void gen_swstep_exception(DisasContext *s, int isv, int ex) +{ + /* Fill in the same_el field of the syndrome in the helper. */ + uint32_t syn = syn_swstep(false, isv, ex); + gen_helper_exception_swstep(cpu_env, tcg_constant_i32(syn)); +} + +/* + * Given a VFP floating point constant encoded into an 8 bit immediate in an + * instruction, expand it to the actual constant value of the specified + * size, as per the VFPExpandImm() pseudocode in the Arm ARM. + */ +uint64_t vfp_expand_imm(int size, uint8_t imm8); + +/* Vector operations shared between ARM and AArch64. 
*/ +void gen_gvec_ceq0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_clt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_cgt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_cle0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_cge0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void gen_ushl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b); +void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b); +void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); + +void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, + int64_t shift, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); +void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); 
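All of the three-operand expanders above share the GVecGen3Fn shape (element size plus register-file offsets and byte sizes), so callers can dispatch on them uniformly. A hypothetical helper, with placeholder offsets standing in for the usual register-offset calculations:

    /* MO_32 selects 32-bit elements; opr_sz == max_sz == 16 bytes gives a
     * full 128-bit Q-register operation.  The offsets are placeholders.
     */
    static void emit_q_vector_op(GVecGen3Fn *fn,
                                 uint32_t dofs, uint32_t nofs, uint32_t mofs)
    {
        fn(MO_32, dofs, nofs, mofs, 16, 16);
    }

    /* e.g.  emit_q_vector_op(gen_gvec_sabd, dofs, nofs, mofs); */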
+void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, + uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); + +/* + * Forward to the isar_feature_* tests given a DisasContext pointer. + */ +#define dc_isar_feature(name, ctx) \ + ({ DisasContext *ctx_ = (ctx); isar_feature_##name(ctx_->isar); }) + +/* Note that the gvec expanders operate on offsets + sizes. */ +typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t); +typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t, + uint32_t, uint32_t); +typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); +typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); + +/* Function prototype for gen_ functions for calling Neon helpers */ +typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32); +typedef void NeonGenOneOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32); +typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32); +typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); +typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32, + TCGv_i32, TCGv_i32); +typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64); +typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64); +typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64); +typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64); +typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32); +typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32); +typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr); +typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); +typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); +typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64); +typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr); +typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp); +typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift); +typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t shift); +typedef void ShiftFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); + +/** + * arm_tbflags_from_tb: + * @tb: the TranslationBlock + * + * Extract the flag values from @tb. + */ +static inline CPUARMTBFlags arm_tbflags_from_tb(const TranslationBlock *tb) +{ + return (CPUARMTBFlags){ tb->flags, tb->cs_base }; +} + +/* + * Enum for argument to fpstatus_ptr(). + */ +typedef enum ARMFPStatusFlavour { + FPST_FPCR, + FPST_FPCR_F16, + FPST_STD, + FPST_STD_F16, +} ARMFPStatusFlavour; + +/** + * fpstatus_ptr: return TCGv_ptr to the specified fp_status field + * + * We have multiple softfloat float_status fields in the Arm CPU state struct + * (see the comment in cpu.h for details). Return a TCGv_ptr which has + * been set up to point to the requested field in the CPU state struct. 
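A minimal sketch of a typical caller, assuming one of the flavours listed below and the usual three-operand VFP helper signature (as with gen_helper_vfp_adds):

    /* Add two single-precision values under FPCR control: fetch the
     * matching float_status pointer, then hand it to the helper.
     */
    static void do_fadd_sp_sketch(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm)
    {
        TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
        gen_helper_vfp_adds(vd, vn, vm, fpst);
    }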
+ * The options are: + * + * FPST_FPCR + * for non-FP16 operations controlled by the FPCR + * FPST_FPCR_F16 + * for operations controlled by the FPCR where FPCR.FZ16 is to be used + * FPST_STD + * for A32/T32 Neon operations using the "standard FPSCR value" + * FPST_STD_F16 + * as FPST_STD, but where FPCR.FZ16 is to be used + */ +static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) +{ + TCGv_ptr statusptr = tcg_temp_new_ptr(); + int offset; + + switch (flavour) { + case FPST_FPCR: + offset = offsetof(CPUARMState, vfp.fp_status); + break; + case FPST_FPCR_F16: + offset = offsetof(CPUARMState, vfp.fp_status_f16); + break; + case FPST_STD: + offset = offsetof(CPUARMState, vfp.standard_fp_status); + break; + case FPST_STD_F16: + offset = offsetof(CPUARMState, vfp.standard_fp_status_f16); + break; + default: + g_assert_not_reached(); + } + tcg_gen_addi_ptr(statusptr, cpu_env, offset); + return statusptr; +} + +/** + * finalize_memop: + * @s: DisasContext + * @opc: size+sign+align of the memory operation + * + * Build the complete MemOp for a memory operation, including alignment + * and endianness. + * + * If (op & MO_AMASK) then the operation already contains the required + * alignment, e.g. for AccType_ATOMIC. Otherwise, this an optionally + * unaligned operation, e.g. for AccType_NORMAL. + * + * In the latter case, there are configuration bits that require alignment, + * and this is applied here. Note that there is no way to indicate that + * no alignment should ever be enforced; this must be handled manually. + */ +static inline MemOp finalize_memop(DisasContext *s, MemOp opc) +{ + if (s->align_mem && !(opc & MO_AMASK)) { + opc |= MO_ALIGN; + } + return opc | s->be_data; +} + +/** + * asimd_imm_const: Expand an encoded SIMD constant value + * + * Expand a SIMD constant value. This is essentially the pseudocode + * AdvSIMDExpandImm, except that we also perform the boolean NOT needed for + * VMVN and VBIC (when cmode < 14 && op == 1). + * + * The combination cmode == 15 op == 1 is a reserved encoding for AArch32; + * callers must catch this; we return the 64-bit constant value defined + * for AArch64. + * + * cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 was UNPREDICTABLE in v7A but + * is either not unpredictable or merely CONSTRAINED UNPREDICTABLE in v8A; + * we produce an immediate constant value of 0 in these cases. + */ +uint64_t asimd_imm_const(uint32_t imm, int cmode, int op); + +/* + * gen_disas_label: + * Create a label and cache a copy of pc_save. + */ +static inline DisasLabel gen_disas_label(DisasContext *s) +{ + return (DisasLabel){ + .label = gen_new_label(), + .pc_save = s->pc_save, + }; +} + +/* + * set_disas_label: + * Emit a label and restore the cached copy of pc_save. + */ +static inline void set_disas_label(DisasContext *s, DisasLabel l) +{ + gen_set_label(l.label); + s->pc_save = l.pc_save; +} + +static inline TCGv_ptr gen_lookup_cp_reg(uint32_t key) +{ + TCGv_ptr ret = tcg_temp_new_ptr(); + gen_helper_lookup_cp_reg(ret, cpu_env, tcg_constant_i32(key)); + return ret; +} + +/* + * Helpers for implementing sets of trans_* functions. + * Defer the implementation of NAME to FUNC, with optional extra arguments. + */ +#define TRANS(NAME, FUNC, ...) \ + static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ + { return FUNC(s, __VA_ARGS__); } +#define TRANS_FEAT(NAME, FEAT, FUNC, ...) \ + static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ + { return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); } + +#define TRANS_FEAT_NONSTREAMING(NAME, FEAT, FUNC, ...) 
\ + static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ + { \ + s->is_nonstreaming = true; \ + return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); \ + } + +#endif /* TARGET_ARM_TRANSLATE_H */ diff --git a/target/arm/tcg/vfp-uncond.decode b/target/arm/tcg/vfp-uncond.decode new file mode 100644 index 0000000..5c50447 --- /dev/null +++ b/target/arm/tcg/vfp-uncond.decode @@ -0,0 +1,82 @@ +# AArch32 VFP instruction descriptions (unconditional insns) +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# +# Encodings for the unconditional VFP instructions are here: +# generally anything matching A32 +# 1111 1110 .... .... .... 101. ...0 .... +# and T32 +# 1111 110. .... .... .... 101. .... .... +# 1111 1110 .... .... .... 101. .... .... +# (but those patterns might also cover some Neon instructions, +# which do not live in this file.) + +# VFP registers have an odd encoding with a four-bit field +# and a one-bit field which are assembled in different orders +# depending on whether the register is double or single precision. +# Each individual instruction function must do the checks for +# "double register selected but CPU does not have double support" +# and "double register number has bit 4 set but CPU does not +# support D16-D31" (which should UNDEF). +%vm_dp 5:1 0:4 +%vm_sp 0:4 5:1 +%vn_dp 7:1 16:4 +%vn_sp 16:4 7:1 +%vd_dp 22:1 12:4 +%vd_sp 12:4 22:1 + +@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp +@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp + +VSEL 1111 1110 0. cc:2 .... .... 1001 .0.0 .... \ + vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=1 +VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \ + vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=2 +VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \ + vm=%vm_dp vn=%vn_dp vd=%vd_dp sz=3 + +VMAXNM_hp 1111 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s +VMINNM_hp 1111 1110 1.00 .... .... 1001 .1.0 .... @vfp_dnm_s + +VMAXNM_sp 1111 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s +VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s + +VMAXNM_dp 1111 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d +VMINNM_dp 1111 1110 1.00 .... .... 1011 .1.0 .... @vfp_dnm_d + +VRINT 1111 1110 1.11 10 rm:2 .... 1001 01.0 .... \ + vm=%vm_sp vd=%vd_sp sz=1 +VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \ + vm=%vm_sp vd=%vd_sp sz=2 +VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \ + vm=%vm_dp vd=%vd_dp sz=3 + +# VCVT float to int with specified rounding mode; Vd is always single-precision +VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \ + vm=%vm_sp vd=%vd_sp sz=1 +VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \ + vm=%vm_sp vd=%vd_sp sz=2 +VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \ + vm=%vm_dp vd=%vd_sp sz=3 + +VMOVX 1111 1110 1.11 0000 .... 1010 01 . 0 .... \ + vd=%vd_sp vm=%vm_sp + +VINS 1111 1110 1.11 0000 .... 1010 11 . 0 .... 
\ + vd=%vd_sp vm=%vm_sp diff --git a/target/arm/tcg/vfp.decode b/target/arm/tcg/vfp.decode new file mode 100644 index 0000000..5405e80 --- /dev/null +++ b/target/arm/tcg/vfp.decode @@ -0,0 +1,247 @@ +# AArch32 VFP instruction descriptions (conditional insns) +# +# Copyright (c) 2019 Linaro, Ltd +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, see . + +# +# This file is processed by scripts/decodetree.py +# +# Encodings for the conditional VFP instructions are here: +# generally anything matching A32 +# cccc 11.. .... .... .... 101. .... .... +# and T32 +# 1110 110. .... .... .... 101. .... .... +# 1110 1110 .... .... .... 101. .... .... +# (but those patterns might also cover some Neon instructions, +# which do not live in this file.) + +# VFP registers have an odd encoding with a four-bit field +# and a one-bit field which are assembled in different orders +# depending on whether the register is double or single precision. +# Each individual instruction function must do the checks for +# "double register selected but CPU does not have double support" +# and "double register number has bit 4 set but CPU does not +# support D16-D31" (which should UNDEF). +%vm_dp 5:1 0:4 +%vm_sp 0:4 5:1 +%vn_dp 7:1 16:4 +%vn_sp 16:4 7:1 +%vd_dp 22:1 12:4 +%vd_sp 12:4 22:1 + +%vmov_idx_b 21:1 5:2 +%vmov_idx_h 21:1 6:1 + +%vmov_imm 16:4 0:4 + +@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp +@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp + +@vfp_dm_ss ................................ vm=%vm_sp vd=%vd_sp +@vfp_dm_dd ................................ vm=%vm_dp vd=%vd_dp +@vfp_dm_ds ................................ vm=%vm_sp vd=%vd_dp +@vfp_dm_sd ................................ vm=%vm_dp vd=%vd_sp + +# VMOV scalar to general-purpose register; note that this does +# include some Neon cases. +VMOV_to_gp ---- 1110 u:1 1. 1 .... rt:4 1011 ... 1 0000 \ + vn=%vn_dp size=0 index=%vmov_idx_b +VMOV_to_gp ---- 1110 u:1 0. 1 .... rt:4 1011 ..1 1 0000 \ + vn=%vn_dp size=1 index=%vmov_idx_h +VMOV_to_gp ---- 1110 0 0 index:1 1 .... rt:4 1011 .00 1 0000 \ + vn=%vn_dp size=2 u=0 + +VMOV_from_gp ---- 1110 0 1. 0 .... rt:4 1011 ... 1 0000 \ + vn=%vn_dp size=0 index=%vmov_idx_b +VMOV_from_gp ---- 1110 0 0. 0 .... rt:4 1011 ..1 1 0000 \ + vn=%vn_dp size=1 index=%vmov_idx_h +VMOV_from_gp ---- 1110 0 0 index:1 0 .... rt:4 1011 .00 1 0000 \ + vn=%vn_dp size=2 + +VDUP ---- 1110 1 b:1 q:1 0 .... rt:4 1011 . 0 e:1 1 0000 \ + vn=%vn_dp + +VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 +VMOV_half ---- 1110 000 l:1 .... rt:4 1001 . 001 0000 vn=%vn_sp +VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 vn=%vn_sp + +VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... vm=%vm_sp +VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... vm=%vm_dp + +VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp +VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 
1010 imm:8 vd=%vd_sp +VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp + +# We split the load/store multiple up into two patterns to avoid +# overlap with other insns in the "Advanced SIMD load/store and 64-bit move" +# grouping: +# P=0 U=0 W=0 is 64-bit VMOV +# P=1 W=0 is VLDR/VSTR +# P=U W=1 is UNDEF +# leaving P=0 U=1 W=x and P=1 U=0 W=1 for load/store multiple. +# These include FSTM/FLDM. +VLDM_VSTM_sp ---- 1100 1 . w:1 l:1 rn:4 .... 1010 imm:8 \ + vd=%vd_sp p=0 u=1 +VLDM_VSTM_dp ---- 1100 1 . w:1 l:1 rn:4 .... 1011 imm:8 \ + vd=%vd_dp p=0 u=1 + +VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \ + vd=%vd_sp p=1 u=0 w=1 +VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \ + vd=%vd_dp p=1 u=0 w=1 + +# 3-register VFP data-processing; bits [23,21:20,6] identify the operation. +VMLA_hp ---- 1110 0.00 .... .... 1001 .0.0 .... @vfp_dnm_s +VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... @vfp_dnm_s +VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... @vfp_dnm_d + +VMLS_hp ---- 1110 0.00 .... .... 1001 .1.0 .... @vfp_dnm_s +VMLS_sp ---- 1110 0.00 .... .... 1010 .1.0 .... @vfp_dnm_s +VMLS_dp ---- 1110 0.00 .... .... 1011 .1.0 .... @vfp_dnm_d + +VNMLS_hp ---- 1110 0.01 .... .... 1001 .0.0 .... @vfp_dnm_s +VNMLS_sp ---- 1110 0.01 .... .... 1010 .0.0 .... @vfp_dnm_s +VNMLS_dp ---- 1110 0.01 .... .... 1011 .0.0 .... @vfp_dnm_d + +VNMLA_hp ---- 1110 0.01 .... .... 1001 .1.0 .... @vfp_dnm_s +VNMLA_sp ---- 1110 0.01 .... .... 1010 .1.0 .... @vfp_dnm_s +VNMLA_dp ---- 1110 0.01 .... .... 1011 .1.0 .... @vfp_dnm_d + +VMUL_hp ---- 1110 0.10 .... .... 1001 .0.0 .... @vfp_dnm_s +VMUL_sp ---- 1110 0.10 .... .... 1010 .0.0 .... @vfp_dnm_s +VMUL_dp ---- 1110 0.10 .... .... 1011 .0.0 .... @vfp_dnm_d + +VNMUL_hp ---- 1110 0.10 .... .... 1001 .1.0 .... @vfp_dnm_s +VNMUL_sp ---- 1110 0.10 .... .... 1010 .1.0 .... @vfp_dnm_s +VNMUL_dp ---- 1110 0.10 .... .... 1011 .1.0 .... @vfp_dnm_d + +VADD_hp ---- 1110 0.11 .... .... 1001 .0.0 .... @vfp_dnm_s +VADD_sp ---- 1110 0.11 .... .... 1010 .0.0 .... @vfp_dnm_s +VADD_dp ---- 1110 0.11 .... .... 1011 .0.0 .... @vfp_dnm_d + +VSUB_hp ---- 1110 0.11 .... .... 1001 .1.0 .... @vfp_dnm_s +VSUB_sp ---- 1110 0.11 .... .... 1010 .1.0 .... @vfp_dnm_s +VSUB_dp ---- 1110 0.11 .... .... 1011 .1.0 .... @vfp_dnm_d + +VDIV_hp ---- 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s +VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s +VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d + +VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s +VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s +VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s +VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s + +VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s +VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s +VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s +VFNMS_sp ---- 1110 1.01 .... .... 1010 .1. 0 .... @vfp_dnm_s + +VFMA_dp ---- 1110 1.10 .... .... 1011 .0.0 .... @vfp_dnm_d +VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d +VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d +VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d + +VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \ + vd=%vd_sp imm=%vmov_imm +VMOV_imm_sp ---- 1110 1.11 .... .... 1010 0000 .... \ + vd=%vd_sp imm=%vmov_imm +VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \ + vd=%vd_dp imm=%vmov_imm + +VMOV_reg_sp ---- 1110 1.11 0000 .... 1010 01.0 .... @vfp_dm_ss +VMOV_reg_dp ---- 1110 1.11 0000 .... 
1011 01.0 .... @vfp_dm_dd + +VABS_hp ---- 1110 1.11 0000 .... 1001 11.0 .... @vfp_dm_ss +VABS_sp ---- 1110 1.11 0000 .... 1010 11.0 .... @vfp_dm_ss +VABS_dp ---- 1110 1.11 0000 .... 1011 11.0 .... @vfp_dm_dd + +VNEG_hp ---- 1110 1.11 0001 .... 1001 01.0 .... @vfp_dm_ss +VNEG_sp ---- 1110 1.11 0001 .... 1010 01.0 .... @vfp_dm_ss +VNEG_dp ---- 1110 1.11 0001 .... 1011 01.0 .... @vfp_dm_dd + +VSQRT_hp ---- 1110 1.11 0001 .... 1001 11.0 .... @vfp_dm_ss +VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... @vfp_dm_ss +VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... @vfp_dm_dd + +VCMP_hp ---- 1110 1.11 010 z:1 .... 1001 e:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCMP_sp ---- 1110 1.11 010 z:1 .... 1010 e:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCMP_dp ---- 1110 1.11 010 z:1 .... 1011 e:1 1.0 .... \ + vd=%vd_dp vm=%vm_dp + +# VCVTT and VCVTB from f16: Vd format depends on size bit; Vm is always vm_sp +VCVT_f32_f16 ---- 1110 1.11 0010 .... 1010 t:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_f64_f16 ---- 1110 1.11 0010 .... 1011 t:1 1.0 .... \ + vd=%vd_dp vm=%vm_sp + +# VCVTB and VCVTT to f16: Vd format is always vd_sp; +# Vm format depends on size bit +VCVT_b16_f32 ---- 1110 1.11 0011 .... 1001 t:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \ + vd=%vd_sp vm=%vm_dp + +VRINTR_hp ---- 1110 1.11 0110 .... 1001 01.0 .... @vfp_dm_ss +VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... @vfp_dm_ss +VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... @vfp_dm_dd + +VRINTZ_hp ---- 1110 1.11 0110 .... 1001 11.0 .... @vfp_dm_ss +VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... @vfp_dm_ss +VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... @vfp_dm_dd + +VRINTX_hp ---- 1110 1.11 0111 .... 1001 01.0 .... @vfp_dm_ss +VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... @vfp_dm_ss +VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... @vfp_dm_dd + +# VCVT between single and double: +# Vm precision depends on size; Vd is its reverse +VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... @vfp_dm_ds +VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... @vfp_dm_sd + +# VCVT from integer to floating point: Vm always single; Vd depends on size +VCVT_int_hp ---- 1110 1.11 1000 .... 1001 s:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \ + vd=%vd_dp vm=%vm_sp + +# VJCVT is always dp to sp +VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... @vfp_dm_sd + +# VCVT between floating-point and fixed-point. The immediate value +# is in the same format as a Vm single-precision register number. +# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field +# for the convenience of the trans_VCVT_fix functions. +%vcvt_fix_op 18:1 16:1 7:1 +VCVT_fix_hp ---- 1110 1.11 1.1. .... 1001 .1.0 .... \ + vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op +VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \ + vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op +VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \ + vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op + +# VCVT float to integer (VCVT and VCVTR): Vd always single; Vd depends on size +VCVT_hp_int ---- 1110 1.11 110 s:1 .... 1001 rz:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \ + vd=%vd_sp vm=%vm_sp +VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... 
\ + vd=%vd_sp vm=%vm_dp diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c deleted file mode 100644 index da9f877..0000000 --- a/target/arm/translate-a64.c +++ /dev/null @@ -1,15054 +0,0 @@ -/* - * AArch64 translation - * - * Copyright (c) 2013 Alexander Graf - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/exec-all.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "qemu/log.h" -#include "arm_ldst.h" -#include "translate.h" -#include "internals.h" -#include "qemu/host-utils.h" -#include "semihosting/semihost.h" -#include "exec/gen-icount.h" -#include "exec/helper-proto.h" -#include "exec/helper-gen.h" -#include "exec/log.h" -#include "cpregs.h" -#include "translate-a64.h" -#include "qemu/atomic128.h" - -static TCGv_i64 cpu_X[32]; -static TCGv_i64 cpu_pc; - -/* Load/store exclusive handling */ -static TCGv_i64 cpu_exclusive_high; - -static const char *regnames[] = { - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", - "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", - "x24", "x25", "x26", "x27", "x28", "x29", "lr", "sp" -}; - -enum a64_shift_type { - A64_SHIFT_TYPE_LSL = 0, - A64_SHIFT_TYPE_LSR = 1, - A64_SHIFT_TYPE_ASR = 2, - A64_SHIFT_TYPE_ROR = 3 -}; - -/* Table based decoder typedefs - used when the relevant bits for decode - * are too awkwardly scattered across the instruction (eg SIMD). - */ -typedef void AArch64DecodeFn(DisasContext *s, uint32_t insn); - -typedef struct AArch64DecodeTable { - uint32_t pattern; - uint32_t mask; - AArch64DecodeFn *disas_fn; -} AArch64DecodeTable; - -/* initialize TCG globals. */ -void a64_translate_init(void) -{ - int i; - - cpu_pc = tcg_global_mem_new_i64(cpu_env, - offsetof(CPUARMState, pc), - "pc"); - for (i = 0; i < 32; i++) { - cpu_X[i] = tcg_global_mem_new_i64(cpu_env, - offsetof(CPUARMState, xregs[i]), - regnames[i]); - } - - cpu_exclusive_high = tcg_global_mem_new_i64(cpu_env, - offsetof(CPUARMState, exclusive_high), "exclusive_high"); -} - -/* - * Return the core mmu_idx to use for A64 "unprivileged load/store" insns - */ -static int get_a64_user_mem_index(DisasContext *s) -{ - /* - * If AccType_UNPRIV is not used, the insn uses AccType_NORMAL, - * which is the usual mmu_idx for this cpu state. - */ - ARMMMUIdx useridx = s->mmu_idx; - - if (s->unpriv) { - /* - * We have pre-computed the condition for AccType_UNPRIV. - * Therefore we should never get here with a mmu_idx for - * which we do not know the corresponding user mmu_idx. 
- */ - switch (useridx) { - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - useridx = ARMMMUIdx_E10_0; - break; - case ARMMMUIdx_E20_2: - case ARMMMUIdx_E20_2_PAN: - useridx = ARMMMUIdx_E20_0; - break; - default: - g_assert_not_reached(); - } - } - return arm_to_core_mmu_idx(useridx); -} - -static void set_btype_raw(int val) -{ - tcg_gen_st_i32(tcg_constant_i32(val), cpu_env, - offsetof(CPUARMState, btype)); -} - -static void set_btype(DisasContext *s, int val) -{ - /* BTYPE is a 2-bit field, and 0 should be done with reset_btype. */ - tcg_debug_assert(val >= 1 && val <= 3); - set_btype_raw(val); - s->btype = -1; -} - -static void reset_btype(DisasContext *s) -{ - if (s->btype != 0) { - set_btype_raw(0); - s->btype = 0; - } -} - -static void gen_pc_plus_diff(DisasContext *s, TCGv_i64 dest, target_long diff) -{ - assert(s->pc_save != -1); - if (TARGET_TB_PCREL) { - tcg_gen_addi_i64(dest, cpu_pc, (s->pc_curr - s->pc_save) + diff); - } else { - tcg_gen_movi_i64(dest, s->pc_curr + diff); - } -} - -void gen_a64_update_pc(DisasContext *s, target_long diff) -{ - gen_pc_plus_diff(s, cpu_pc, diff); - s->pc_save = s->pc_curr + diff; -} - -/* - * Handle Top Byte Ignore (TBI) bits. - * - * If address tagging is enabled via the TCR TBI bits: - * + for EL2 and EL3 there is only one TBI bit, and if it is set - * then the address is zero-extended, clearing bits [63:56] - * + for EL0 and EL1, TBI0 controls addresses with bit 55 == 0 - * and TBI1 controls addressses with bit 55 == 1. - * If the appropriate TBI bit is set for the address then - * the address is sign-extended from bit 55 into bits [63:56] - * - * Here We have concatenated TBI{1,0} into tbi. - */ -static void gen_top_byte_ignore(DisasContext *s, TCGv_i64 dst, - TCGv_i64 src, int tbi) -{ - if (tbi == 0) { - /* Load unmodified address */ - tcg_gen_mov_i64(dst, src); - } else if (!regime_has_2_ranges(s->mmu_idx)) { - /* Force tag byte to all zero */ - tcg_gen_extract_i64(dst, src, 0, 56); - } else { - /* Sign-extend from bit 55. */ - tcg_gen_sextract_i64(dst, src, 0, 56); - - switch (tbi) { - case 1: - /* tbi0 but !tbi1: only use the extension if positive */ - tcg_gen_and_i64(dst, dst, src); - break; - case 2: - /* !tbi0 but tbi1: only use the extension if negative */ - tcg_gen_or_i64(dst, dst, src); - break; - case 3: - /* tbi0 and tbi1: always use the extension */ - break; - default: - g_assert_not_reached(); - } - } -} - -static void gen_a64_set_pc(DisasContext *s, TCGv_i64 src) -{ - /* - * If address tagging is enabled for instructions via the TCR TBI bits, - * then loading an address into the PC will clear out any tag. - */ - gen_top_byte_ignore(s, cpu_pc, src, s->tbii); - s->pc_save = -1; -} - -/* - * Handle MTE and/or TBI. - * - * For TBI, ideally, we would do nothing. Proper behaviour on fault is - * for the tag to be present in the FAR_ELx register. But for user-only - * mode we do not have a TLB with which to implement this, so we must - * remove the top byte now. - * - * Always return a fresh temporary that we can increment independently - * of the write-back address. - */ - -TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr) -{ - TCGv_i64 clean = new_tmp_a64(s); -#ifdef CONFIG_USER_ONLY - gen_top_byte_ignore(s, clean, addr, s->tbid); -#else - tcg_gen_mov_i64(clean, addr); -#endif - return clean; -} - -/* Insert a zero tag into src, with the result at dst. 
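The four TBI cases that gen_top_byte_ignore() above implements with TCG ops can be written out in plain C as follows (an illustrative model, not the emitted code):

    #include <stdint.h>
    #include <stdbool.h>

    /* tbi is the concatenation TBI1:TBI0 for the current regime. */
    static uint64_t tbi_apply(uint64_t addr, int tbi, bool two_ranges)
    {
        if (tbi == 0) {
            return addr;                           /* tags not ignored       */
        }
        if (!two_ranges) {
            return addr & 0x00ffffffffffffffULL;   /* EL2/EL3: clear [63:56] */
        }
        /* EL0/EL1: the extension is a sign extension from bit 55. */
        uint64_t ext = (uint64_t)((int64_t)(addr << 8) >> 8);
        bool top = (addr >> 55) & 1;
        switch (tbi) {
        case 1:  return top ? addr : ext;   /* TBI0 only: bit 55 == 0 tagged */
        case 2:  return top ? ext : addr;   /* TBI1 only: bit 55 == 1 tagged */
        default: return ext;                /* both ranges tagged            */
        }
    }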
*/ -static void gen_address_with_allocation_tag0(TCGv_i64 dst, TCGv_i64 src) -{ - tcg_gen_andi_i64(dst, src, ~MAKE_64BIT_MASK(56, 4)); -} - -static void gen_probe_access(DisasContext *s, TCGv_i64 ptr, - MMUAccessType acc, int log2_size) -{ - gen_helper_probe_access(cpu_env, ptr, - tcg_constant_i32(acc), - tcg_constant_i32(get_mem_index(s)), - tcg_constant_i32(1 << log2_size)); -} - -/* - * For MTE, check a single logical or atomic access. This probes a single - * address, the exact one specified. The size and alignment of the access - * is not relevant to MTE, per se, but watchpoints do require the size, - * and we want to recognize those before making any other changes to state. - */ -static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, - bool is_write, bool tag_checked, - int log2_size, bool is_unpriv, - int core_idx) -{ - if (tag_checked && s->mte_active[is_unpriv]) { - TCGv_i64 ret; - int desc = 0; - - desc = FIELD_DP32(desc, MTEDESC, MIDX, core_idx); - desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); - desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); - desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << log2_size) - 1); - - ret = new_tmp_a64(s); - gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); - - return ret; - } - return clean_data_tbi(s, addr); -} - -TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int log2_size) -{ - return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, log2_size, - false, get_mem_index(s)); -} - -/* - * For MTE, check multiple logical sequential accesses. - */ -TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int size) -{ - if (tag_checked && s->mte_active[0]) { - TCGv_i64 ret; - int desc = 0; - - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); - desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); - desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); - desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, size - 1); - - ret = new_tmp_a64(s); - gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); - - return ret; - } - return clean_data_tbi(s, addr); -} - -typedef struct DisasCompare64 { - TCGCond cond; - TCGv_i64 value; -} DisasCompare64; - -static void a64_test_cc(DisasCompare64 *c64, int cc) -{ - DisasCompare c32; - - arm_test_cc(&c32, cc); - - /* Sign-extend the 32-bit value so that the GE/LT comparisons work - * properly. The NE/EQ comparisons are also fine with this choice. 
*/ - c64->cond = c32.cond; - c64->value = tcg_temp_new_i64(); - tcg_gen_ext_i32_i64(c64->value, c32.value); - - arm_free_cc(&c32); -} - -static void a64_free_cc(DisasCompare64 *c64) -{ - tcg_temp_free_i64(c64->value); -} - -static void gen_rebuild_hflags(DisasContext *s) -{ - gen_helper_rebuild_hflags_a64(cpu_env, tcg_constant_i32(s->current_el)); -} - -static void gen_exception_internal(int excp) -{ - assert(excp_is_internal(excp)); - gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp)); -} - -static void gen_exception_internal_insn(DisasContext *s, int excp) -{ - gen_a64_update_pc(s, 0); - gen_exception_internal(excp); - s->base.is_jmp = DISAS_NORETURN; -} - -static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syndrome) -{ - gen_a64_update_pc(s, 0); - gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syndrome)); - s->base.is_jmp = DISAS_NORETURN; -} - -static void gen_step_complete_exception(DisasContext *s) -{ - /* We just completed step of an insn. Move from Active-not-pending - * to Active-pending, and then also take the swstep exception. - * This corresponds to making the (IMPDEF) choice to prioritize - * swstep exceptions over asynchronous exceptions taken to an exception - * level where debug is disabled. This choice has the advantage that - * we do not need to maintain internal state corresponding to the - * ISV/EX syndrome bits between completion of the step and generation - * of the exception, and our syndrome information is always correct. - */ - gen_ss_advance(s); - gen_swstep_exception(s, 1, s->is_ldex); - s->base.is_jmp = DISAS_NORETURN; -} - -static inline bool use_goto_tb(DisasContext *s, uint64_t dest) -{ - if (s->ss_active) { - return false; - } - return translator_use_goto_tb(&s->base, dest); -} - -static void gen_goto_tb(DisasContext *s, int n, int64_t diff) -{ - if (use_goto_tb(s, s->pc_curr + diff)) { - /* - * For pcrel, the pc must always be up-to-date on entry to - * the linked TB, so that it can use simple additions for all - * further adjustments. For !pcrel, the linked TB is compiled - * to know its full virtual address, so we can delay the - * update to pc to the unlinked path. A long chain of links - * can thus avoid many updates to the PC. 
- */ - if (TARGET_TB_PCREL) { - gen_a64_update_pc(s, diff); - tcg_gen_goto_tb(n); - } else { - tcg_gen_goto_tb(n); - gen_a64_update_pc(s, diff); - } - tcg_gen_exit_tb(s->base.tb, n); - s->base.is_jmp = DISAS_NORETURN; - } else { - gen_a64_update_pc(s, diff); - if (s->ss_active) { - gen_step_complete_exception(s); - } else { - tcg_gen_lookup_and_goto_ptr(); - s->base.is_jmp = DISAS_NORETURN; - } - } -} - -static void init_tmp_a64_array(DisasContext *s) -{ -#ifdef CONFIG_DEBUG_TCG - memset(s->tmp_a64, 0, sizeof(s->tmp_a64)); -#endif - s->tmp_a64_count = 0; -} - -static void free_tmp_a64(DisasContext *s) -{ - int i; - for (i = 0; i < s->tmp_a64_count; i++) { - tcg_temp_free_i64(s->tmp_a64[i]); - } - init_tmp_a64_array(s); -} - -TCGv_i64 new_tmp_a64(DisasContext *s) -{ - assert(s->tmp_a64_count < TMP_A64_MAX); - return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_new_i64(); -} - -TCGv_i64 new_tmp_a64_local(DisasContext *s) -{ - assert(s->tmp_a64_count < TMP_A64_MAX); - return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_local_new_i64(); -} - -TCGv_i64 new_tmp_a64_zero(DisasContext *s) -{ - TCGv_i64 t = new_tmp_a64(s); - tcg_gen_movi_i64(t, 0); - return t; -} - -/* - * Register access functions - * - * These functions are used for directly accessing a register in where - * changes to the final register value are likely to be made. If you - * need to use a register for temporary calculation (e.g. index type - * operations) use the read_* form. - * - * B1.2.1 Register mappings - * - * In instruction register encoding 31 can refer to ZR (zero register) or - * the SP (stack pointer) depending on context. In QEMU's case we map SP - * to cpu_X[31] and ZR accesses to a temporary which can be discarded. - * This is the point of the _sp forms. - */ -TCGv_i64 cpu_reg(DisasContext *s, int reg) -{ - if (reg == 31) { - return new_tmp_a64_zero(s); - } else { - return cpu_X[reg]; - } -} - -/* register access for when 31 == SP */ -TCGv_i64 cpu_reg_sp(DisasContext *s, int reg) -{ - return cpu_X[reg]; -} - -/* read a cpu register in 32bit/64bit mode. Returns a TCGv_i64 - * representing the register contents. This TCGv is an auto-freed - * temporary so it need not be explicitly freed, and may be modified. - */ -TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf) -{ - TCGv_i64 v = new_tmp_a64(s); - if (reg != 31) { - if (sf) { - tcg_gen_mov_i64(v, cpu_X[reg]); - } else { - tcg_gen_ext32u_i64(v, cpu_X[reg]); - } - } else { - tcg_gen_movi_i64(v, 0); - } - return v; -} - -TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf) -{ - TCGv_i64 v = new_tmp_a64(s); - if (sf) { - tcg_gen_mov_i64(v, cpu_X[reg]); - } else { - tcg_gen_ext32u_i64(v, cpu_X[reg]); - } - return v; -} - -/* Return the offset into CPUARMState of a slice (from - * the least significant end) of FP register Qn (ie - * Dn, Sn, Hn or Bn). - * (Note that this is not the same mapping as for A32; see cpu.h) - */ -static inline int fp_reg_offset(DisasContext *s, int regno, MemOp size) -{ - return vec_reg_offset(s, regno, 0, size); -} - -/* Offset of the high half of the 128 bit vector Qn */ -static inline int fp_reg_hi_offset(DisasContext *s, int regno) -{ - return vec_reg_offset(s, regno, 1, MO_64); -} - -/* Convenience accessors for reading and writing single and double - * FP registers. Writing clears the upper parts of the associated - * 128 bit vector register, as required by the architecture. - * Note that unlike the GP register accessors, the values returned - * by the read functions must be manually freed. 
- */ -static TCGv_i64 read_fp_dreg(DisasContext *s, int reg) -{ - TCGv_i64 v = tcg_temp_new_i64(); - - tcg_gen_ld_i64(v, cpu_env, fp_reg_offset(s, reg, MO_64)); - return v; -} - -static TCGv_i32 read_fp_sreg(DisasContext *s, int reg) -{ - TCGv_i32 v = tcg_temp_new_i32(); - - tcg_gen_ld_i32(v, cpu_env, fp_reg_offset(s, reg, MO_32)); - return v; -} - -static TCGv_i32 read_fp_hreg(DisasContext *s, int reg) -{ - TCGv_i32 v = tcg_temp_new_i32(); - - tcg_gen_ld16u_i32(v, cpu_env, fp_reg_offset(s, reg, MO_16)); - return v; -} - -/* Clear the bits above an N-bit vector, for N = (is_q ? 128 : 64). - * If SVE is not enabled, then there are only 128 bits in the vector. - */ -static void clear_vec_high(DisasContext *s, bool is_q, int rd) -{ - unsigned ofs = fp_reg_offset(s, rd, MO_64); - unsigned vsz = vec_full_reg_size(s); - - /* Nop move, with side effect of clearing the tail. */ - tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz); -} - -void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v) -{ - unsigned ofs = fp_reg_offset(s, reg, MO_64); - - tcg_gen_st_i64(v, cpu_env, ofs); - clear_vec_high(s, false, reg); -} - -static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v) -{ - TCGv_i64 tmp = tcg_temp_new_i64(); - - tcg_gen_extu_i32_i64(tmp, v); - write_fp_dreg(s, reg, tmp); - tcg_temp_free_i64(tmp); -} - -/* Expand a 2-operand AdvSIMD vector operation using an expander function. */ -static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, - GVecGen2Fn *gvec_fn, int vece) -{ - gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), - is_q ? 16 : 8, vec_full_reg_size(s)); -} - -/* Expand a 2-operand + immediate AdvSIMD vector operation using - * an expander function. - */ -static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn, - int64_t imm, GVecGen2iFn *gvec_fn, int vece) -{ - gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), - imm, is_q ? 16 : 8, vec_full_reg_size(s)); -} - -/* Expand a 3-operand AdvSIMD vector operation using an expander function. */ -static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm, - GVecGen3Fn *gvec_fn, int vece) -{ - gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s)); -} - -/* Expand a 4-operand AdvSIMD vector operation using an expander function. */ -static void gen_gvec_fn4(DisasContext *s, bool is_q, int rd, int rn, int rm, - int rx, GVecGen4Fn *gvec_fn, int vece) -{ - gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), vec_full_reg_offset(s, rx), - is_q ? 16 : 8, vec_full_reg_size(s)); -} - -/* Expand a 2-operand operation using an out-of-line helper. */ -static void gen_gvec_op2_ool(DisasContext *s, bool is_q, int rd, - int rn, int data, gen_helper_gvec_2 *fn) -{ - tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - is_q ? 16 : 8, vec_full_reg_size(s), data, fn); -} - -/* Expand a 3-operand operation using an out-of-line helper. */ -static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd, - int rn, int rm, int data, gen_helper_gvec_3 *fn) -{ - tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - is_q ? 16 : 8, vec_full_reg_size(s), data, fn); -} - -/* Expand a 3-operand + fpstatus pointer + simd data value operation using - * an out-of-line helper. 
- */ -static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn, - int rm, bool is_fp16, int data, - gen_helper_gvec_3_ptr *fn) -{ - TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), fpst, - is_q ? 16 : 8, vec_full_reg_size(s), data, fn); - tcg_temp_free_ptr(fpst); -} - -/* Expand a 3-operand + qc + operation using an out-of-line helper. */ -static void gen_gvec_op3_qc(DisasContext *s, bool is_q, int rd, int rn, - int rm, gen_helper_gvec_3_ptr *fn) -{ - TCGv_ptr qc_ptr = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc)); - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), qc_ptr, - is_q ? 16 : 8, vec_full_reg_size(s), 0, fn); - tcg_temp_free_ptr(qc_ptr); -} - -/* Expand a 4-operand operation using an out-of-line helper. */ -static void gen_gvec_op4_ool(DisasContext *s, bool is_q, int rd, int rn, - int rm, int ra, int data, gen_helper_gvec_4 *fn) -{ - tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, ra), - is_q ? 16 : 8, vec_full_reg_size(s), data, fn); -} - -/* - * Expand a 4-operand + fpstatus pointer + simd data value operation using - * an out-of-line helper. - */ -static void gen_gvec_op4_fpst(DisasContext *s, bool is_q, int rd, int rn, - int rm, int ra, bool is_fp16, int data, - gen_helper_gvec_4_ptr *fn) -{ - TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); - tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, ra), fpst, - is_q ? 16 : 8, vec_full_reg_size(s), data, fn); - tcg_temp_free_ptr(fpst); -} - -/* Set ZF and NF based on a 64 bit result. This is alas fiddlier - * than the 32 bit equivalent. - */ -static inline void gen_set_NZ64(TCGv_i64 result) -{ - tcg_gen_extr_i64_i32(cpu_ZF, cpu_NF, result); - tcg_gen_or_i32(cpu_ZF, cpu_ZF, cpu_NF); -} - -/* Set NZCV as for a logical operation: NZ as per result, CV cleared. 
*/ -static inline void gen_logic_CC(int sf, TCGv_i64 result) -{ - if (sf) { - gen_set_NZ64(result); - } else { - tcg_gen_extrl_i64_i32(cpu_ZF, result); - tcg_gen_mov_i32(cpu_NF, cpu_ZF); - } - tcg_gen_movi_i32(cpu_CF, 0); - tcg_gen_movi_i32(cpu_VF, 0); -} - -/* dest = T0 + T1; compute C, N, V and Z flags */ -static void gen_add_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) -{ - if (sf) { - TCGv_i64 result, flag, tmp; - result = tcg_temp_new_i64(); - flag = tcg_temp_new_i64(); - tmp = tcg_temp_new_i64(); - - tcg_gen_movi_i64(tmp, 0); - tcg_gen_add2_i64(result, flag, t0, tmp, t1, tmp); - - tcg_gen_extrl_i64_i32(cpu_CF, flag); - - gen_set_NZ64(result); - - tcg_gen_xor_i64(flag, result, t0); - tcg_gen_xor_i64(tmp, t0, t1); - tcg_gen_andc_i64(flag, flag, tmp); - tcg_temp_free_i64(tmp); - tcg_gen_extrh_i64_i32(cpu_VF, flag); - - tcg_gen_mov_i64(dest, result); - tcg_temp_free_i64(result); - tcg_temp_free_i64(flag); - } else { - /* 32 bit arithmetic */ - TCGv_i32 t0_32 = tcg_temp_new_i32(); - TCGv_i32 t1_32 = tcg_temp_new_i32(); - TCGv_i32 tmp = tcg_temp_new_i32(); - - tcg_gen_movi_i32(tmp, 0); - tcg_gen_extrl_i64_i32(t0_32, t0); - tcg_gen_extrl_i64_i32(t1_32, t1); - tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, tmp, t1_32, tmp); - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); - tcg_gen_xor_i32(tmp, t0_32, t1_32); - tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); - tcg_gen_extu_i32_i64(dest, cpu_NF); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(t0_32); - tcg_temp_free_i32(t1_32); - } -} - -/* dest = T0 - T1; compute C, N, V and Z flags */ -static void gen_sub_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) -{ - if (sf) { - /* 64 bit arithmetic */ - TCGv_i64 result, flag, tmp; - - result = tcg_temp_new_i64(); - flag = tcg_temp_new_i64(); - tcg_gen_sub_i64(result, t0, t1); - - gen_set_NZ64(result); - - tcg_gen_setcond_i64(TCG_COND_GEU, flag, t0, t1); - tcg_gen_extrl_i64_i32(cpu_CF, flag); - - tcg_gen_xor_i64(flag, result, t0); - tmp = tcg_temp_new_i64(); - tcg_gen_xor_i64(tmp, t0, t1); - tcg_gen_and_i64(flag, flag, tmp); - tcg_temp_free_i64(tmp); - tcg_gen_extrh_i64_i32(cpu_VF, flag); - tcg_gen_mov_i64(dest, result); - tcg_temp_free_i64(flag); - tcg_temp_free_i64(result); - } else { - /* 32 bit arithmetic */ - TCGv_i32 t0_32 = tcg_temp_new_i32(); - TCGv_i32 t1_32 = tcg_temp_new_i32(); - TCGv_i32 tmp; - - tcg_gen_extrl_i64_i32(t0_32, t0); - tcg_gen_extrl_i64_i32(t1_32, t1); - tcg_gen_sub_i32(cpu_NF, t0_32, t1_32); - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0_32, t1_32); - tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); - tmp = tcg_temp_new_i32(); - tcg_gen_xor_i32(tmp, t0_32, t1_32); - tcg_temp_free_i32(t0_32); - tcg_temp_free_i32(t1_32); - tcg_gen_and_i32(cpu_VF, cpu_VF, tmp); - tcg_temp_free_i32(tmp); - tcg_gen_extu_i32_i64(dest, cpu_NF); - } -} - -/* dest = T0 + T1 + CF; do not compute flags. */ -static void gen_adc(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) -{ - TCGv_i64 flag = tcg_temp_new_i64(); - tcg_gen_extu_i32_i64(flag, cpu_CF); - tcg_gen_add_i64(dest, t0, t1); - tcg_gen_add_i64(dest, dest, flag); - tcg_temp_free_i64(flag); - - if (!sf) { - tcg_gen_ext32u_i64(dest, dest); - } -} - -/* dest = T0 + T1 + CF; compute C, N, V and Z flags. 
*/ -static void gen_adc_CC(int sf, TCGv_i64 dest, TCGv_i64 t0, TCGv_i64 t1) -{ - if (sf) { - TCGv_i64 result = tcg_temp_new_i64(); - TCGv_i64 cf_64 = tcg_temp_new_i64(); - TCGv_i64 vf_64 = tcg_temp_new_i64(); - TCGv_i64 tmp = tcg_temp_new_i64(); - TCGv_i64 zero = tcg_constant_i64(0); - - tcg_gen_extu_i32_i64(cf_64, cpu_CF); - tcg_gen_add2_i64(result, cf_64, t0, zero, cf_64, zero); - tcg_gen_add2_i64(result, cf_64, result, cf_64, t1, zero); - tcg_gen_extrl_i64_i32(cpu_CF, cf_64); - gen_set_NZ64(result); - - tcg_gen_xor_i64(vf_64, result, t0); - tcg_gen_xor_i64(tmp, t0, t1); - tcg_gen_andc_i64(vf_64, vf_64, tmp); - tcg_gen_extrh_i64_i32(cpu_VF, vf_64); - - tcg_gen_mov_i64(dest, result); - - tcg_temp_free_i64(tmp); - tcg_temp_free_i64(vf_64); - tcg_temp_free_i64(cf_64); - tcg_temp_free_i64(result); - } else { - TCGv_i32 t0_32 = tcg_temp_new_i32(); - TCGv_i32 t1_32 = tcg_temp_new_i32(); - TCGv_i32 tmp = tcg_temp_new_i32(); - TCGv_i32 zero = tcg_constant_i32(0); - - tcg_gen_extrl_i64_i32(t0_32, t0); - tcg_gen_extrl_i64_i32(t1_32, t1); - tcg_gen_add2_i32(cpu_NF, cpu_CF, t0_32, zero, cpu_CF, zero); - tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1_32, zero); - - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_xor_i32(cpu_VF, cpu_NF, t0_32); - tcg_gen_xor_i32(tmp, t0_32, t1_32); - tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); - tcg_gen_extu_i32_i64(dest, cpu_NF); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(t1_32); - tcg_temp_free_i32(t0_32); - } -} - -/* - * Load/Store generators - */ - -/* - * Store from GPR register to memory. - */ -static void do_gpr_st_memidx(DisasContext *s, TCGv_i64 source, - TCGv_i64 tcg_addr, MemOp memop, int memidx, - bool iss_valid, - unsigned int iss_srt, - bool iss_sf, bool iss_ar) -{ - memop = finalize_memop(s, memop); - tcg_gen_qemu_st_i64(source, tcg_addr, memidx, memop); - - if (iss_valid) { - uint32_t syn; - - syn = syn_data_abort_with_iss(0, - (memop & MO_SIZE), - false, - iss_srt, - iss_sf, - iss_ar, - 0, 0, 0, 0, 0, false); - disas_set_insn_syndrome(s, syn); - } -} - -static void do_gpr_st(DisasContext *s, TCGv_i64 source, - TCGv_i64 tcg_addr, MemOp memop, - bool iss_valid, - unsigned int iss_srt, - bool iss_sf, bool iss_ar) -{ - do_gpr_st_memidx(s, source, tcg_addr, memop, get_mem_index(s), - iss_valid, iss_srt, iss_sf, iss_ar); -} - -/* - * Load from memory to GPR register - */ -static void do_gpr_ld_memidx(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, - MemOp memop, bool extend, int memidx, - bool iss_valid, unsigned int iss_srt, - bool iss_sf, bool iss_ar) -{ - memop = finalize_memop(s, memop); - tcg_gen_qemu_ld_i64(dest, tcg_addr, memidx, memop); - - if (extend && (memop & MO_SIGN)) { - g_assert((memop & MO_SIZE) <= MO_32); - tcg_gen_ext32u_i64(dest, dest); - } - - if (iss_valid) { - uint32_t syn; - - syn = syn_data_abort_with_iss(0, - (memop & MO_SIZE), - (memop & MO_SIGN) != 0, - iss_srt, - iss_sf, - iss_ar, - 0, 0, 0, 0, 0, false); - disas_set_insn_syndrome(s, syn); - } -} - -static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, - MemOp memop, bool extend, - bool iss_valid, unsigned int iss_srt, - bool iss_sf, bool iss_ar) -{ - do_gpr_ld_memidx(s, dest, tcg_addr, memop, extend, get_mem_index(s), - iss_valid, iss_srt, iss_sf, iss_ar); -} - -/* - * Store from FP register to memory - */ -static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size) -{ - /* This writes the bottom N bits of a 128 bit wide vector to memory */ - TCGv_i64 tmplo = tcg_temp_new_i64(); - MemOp mop; - - tcg_gen_ld_i64(tmplo, cpu_env, 
fp_reg_offset(s, srcidx, MO_64)); - - if (size < 4) { - mop = finalize_memop(s, size); - tcg_gen_qemu_st_i64(tmplo, tcg_addr, get_mem_index(s), mop); - } else { - bool be = s->be_data == MO_BE; - TCGv_i64 tcg_hiaddr = tcg_temp_new_i64(); - TCGv_i64 tmphi = tcg_temp_new_i64(); - - tcg_gen_ld_i64(tmphi, cpu_env, fp_reg_hi_offset(s, srcidx)); - - mop = s->be_data | MO_UQ; - tcg_gen_qemu_st_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), - mop | (s->align_mem ? MO_ALIGN_16 : 0)); - tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); - tcg_gen_qemu_st_i64(be ? tmplo : tmphi, tcg_hiaddr, - get_mem_index(s), mop); - - tcg_temp_free_i64(tcg_hiaddr); - tcg_temp_free_i64(tmphi); - } - - tcg_temp_free_i64(tmplo); -} - -/* - * Load from memory to FP register - */ -static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size) -{ - /* This always zero-extends and writes to a full 128 bit wide vector */ - TCGv_i64 tmplo = tcg_temp_new_i64(); - TCGv_i64 tmphi = NULL; - MemOp mop; - - if (size < 4) { - mop = finalize_memop(s, size); - tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), mop); - } else { - bool be = s->be_data == MO_BE; - TCGv_i64 tcg_hiaddr; - - tmphi = tcg_temp_new_i64(); - tcg_hiaddr = tcg_temp_new_i64(); - - mop = s->be_data | MO_UQ; - tcg_gen_qemu_ld_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), - mop | (s->align_mem ? MO_ALIGN_16 : 0)); - tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); - tcg_gen_qemu_ld_i64(be ? tmplo : tmphi, tcg_hiaddr, - get_mem_index(s), mop); - tcg_temp_free_i64(tcg_hiaddr); - } - - tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64)); - tcg_temp_free_i64(tmplo); - - if (tmphi) { - tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx)); - tcg_temp_free_i64(tmphi); - } - clear_vec_high(s, tmphi != NULL, destidx); -} - -/* - * Vector load/store helpers. - * - * The principal difference between this and a FP load is that we don't - * zero extend as we are filling a partial chunk of the vector register. - * These functions don't support 128 bit loads/stores, which would be - * normal load/store operations. - * - * The _i32 versions are useful when operating on 32 bit quantities - * (eg for floating point single or using Neon helper functions). 
- */ - -/* Get value of an element within a vector register */ -static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx, - int element, MemOp memop) -{ - int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE); - switch ((unsigned)memop) { - case MO_8: - tcg_gen_ld8u_i64(tcg_dest, cpu_env, vect_off); - break; - case MO_16: - tcg_gen_ld16u_i64(tcg_dest, cpu_env, vect_off); - break; - case MO_32: - tcg_gen_ld32u_i64(tcg_dest, cpu_env, vect_off); - break; - case MO_8|MO_SIGN: - tcg_gen_ld8s_i64(tcg_dest, cpu_env, vect_off); - break; - case MO_16|MO_SIGN: - tcg_gen_ld16s_i64(tcg_dest, cpu_env, vect_off); - break; - case MO_32|MO_SIGN: - tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off); - break; - case MO_64: - case MO_64|MO_SIGN: - tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off); - break; - default: - g_assert_not_reached(); - } -} - -static void read_vec_element_i32(DisasContext *s, TCGv_i32 tcg_dest, int srcidx, - int element, MemOp memop) -{ - int vect_off = vec_reg_offset(s, srcidx, element, memop & MO_SIZE); - switch (memop) { - case MO_8: - tcg_gen_ld8u_i32(tcg_dest, cpu_env, vect_off); - break; - case MO_16: - tcg_gen_ld16u_i32(tcg_dest, cpu_env, vect_off); - break; - case MO_8|MO_SIGN: - tcg_gen_ld8s_i32(tcg_dest, cpu_env, vect_off); - break; - case MO_16|MO_SIGN: - tcg_gen_ld16s_i32(tcg_dest, cpu_env, vect_off); - break; - case MO_32: - case MO_32|MO_SIGN: - tcg_gen_ld_i32(tcg_dest, cpu_env, vect_off); - break; - default: - g_assert_not_reached(); - } -} - -/* Set value of an element within a vector register */ -static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx, - int element, MemOp memop) -{ - int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE); - switch (memop) { - case MO_8: - tcg_gen_st8_i64(tcg_src, cpu_env, vect_off); - break; - case MO_16: - tcg_gen_st16_i64(tcg_src, cpu_env, vect_off); - break; - case MO_32: - tcg_gen_st32_i64(tcg_src, cpu_env, vect_off); - break; - case MO_64: - tcg_gen_st_i64(tcg_src, cpu_env, vect_off); - break; - default: - g_assert_not_reached(); - } -} - -static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src, - int destidx, int element, MemOp memop) -{ - int vect_off = vec_reg_offset(s, destidx, element, memop & MO_SIZE); - switch (memop) { - case MO_8: - tcg_gen_st8_i32(tcg_src, cpu_env, vect_off); - break; - case MO_16: - tcg_gen_st16_i32(tcg_src, cpu_env, vect_off); - break; - case MO_32: - tcg_gen_st_i32(tcg_src, cpu_env, vect_off); - break; - default: - g_assert_not_reached(); - } -} - -/* Store from vector register to memory */ -static void do_vec_st(DisasContext *s, int srcidx, int element, - TCGv_i64 tcg_addr, MemOp mop) -{ - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - - read_vec_element(s, tcg_tmp, srcidx, element, mop & MO_SIZE); - tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop); - - tcg_temp_free_i64(tcg_tmp); -} - -/* Load from memory to vector register */ -static void do_vec_ld(DisasContext *s, int destidx, int element, - TCGv_i64 tcg_addr, MemOp mop) -{ - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - - tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), mop); - write_vec_element(s, tcg_tmp, destidx, element, mop & MO_SIZE); - - tcg_temp_free_i64(tcg_tmp); -} - -/* Check that FP/Neon access is enabled. If it is, return - * true. If not, emit code to generate an appropriate exception, - * and return false; the caller should not emit any code for - * the instruction. 
Note that this check must happen after all - * unallocated-encoding checks (otherwise the syndrome information - * for the resulting exception will be incorrect). - */ -static bool fp_access_check_only(DisasContext *s) -{ - if (s->fp_excp_el) { - assert(!s->fp_access_checked); - s->fp_access_checked = true; - - gen_exception_insn_el(s, 0, EXCP_UDEF, - syn_fp_access_trap(1, 0xe, false, 0), - s->fp_excp_el); - return false; - } - s->fp_access_checked = true; - return true; -} - -static bool fp_access_check(DisasContext *s) -{ - if (!fp_access_check_only(s)) { - return false; - } - if (s->sme_trap_nonstreaming && s->is_nonstreaming) { - gen_exception_insn(s, 0, EXCP_UDEF, - syn_smetrap(SME_ET_Streaming, false)); - return false; - } - return true; -} - -/* - * Check that SVE access is enabled. If it is, return true. - * If not, emit code to generate an appropriate exception and return false. - * This function corresponds to CheckSVEEnabled(). - */ -bool sve_access_check(DisasContext *s) -{ - if (s->pstate_sm || !dc_isar_feature(aa64_sve, s)) { - assert(dc_isar_feature(aa64_sme, s)); - if (!sme_sm_enabled_check(s)) { - goto fail_exit; - } - } else if (s->sve_excp_el) { - gen_exception_insn_el(s, 0, EXCP_UDEF, - syn_sve_access_trap(), s->sve_excp_el); - goto fail_exit; - } - s->sve_access_checked = true; - return fp_access_check(s); - - fail_exit: - /* Assert that we only raise one exception per instruction. */ - assert(!s->sve_access_checked); - s->sve_access_checked = true; - return false; -} - -/* - * Check that SME access is enabled, raise an exception if not. - * Note that this function corresponds to CheckSMEAccess and is - * only used directly for cpregs. - */ -static bool sme_access_check(DisasContext *s) -{ - if (s->sme_excp_el) { - gen_exception_insn_el(s, 0, EXCP_UDEF, - syn_smetrap(SME_ET_AccessTrap, false), - s->sme_excp_el); - return false; - } - return true; -} - -/* This function corresponds to CheckSMEEnabled. */ -bool sme_enabled_check(DisasContext *s) -{ - /* - * Note that unlike sve_excp_el, we have not constrained sme_excp_el - * to be zero when fp_excp_el has priority. This is because we need - * sme_excp_el by itself for cpregs access checks. - */ - if (!s->fp_excp_el || s->sme_excp_el < s->fp_excp_el) { - s->fp_access_checked = true; - return sme_access_check(s); - } - return fp_access_check_only(s); -} - -/* Common subroutine for CheckSMEAnd*Enabled. */ -bool sme_enabled_check_with_svcr(DisasContext *s, unsigned req) -{ - if (!sme_enabled_check(s)) { - return false; - } - if (FIELD_EX64(req, SVCR, SM) && !s->pstate_sm) { - gen_exception_insn(s, 0, EXCP_UDEF, - syn_smetrap(SME_ET_NotStreaming, false)); - return false; - } - if (FIELD_EX64(req, SVCR, ZA) && !s->pstate_za) { - gen_exception_insn(s, 0, EXCP_UDEF, - syn_smetrap(SME_ET_InactiveZA, false)); - return false; - } - return true; -} - -/* - * This utility function is for doing register extension with an - * optional shift. You will likely want to pass a temporary for the - * destination register. See DecodeRegExtend() in the ARM ARM. 
- */ -static void ext_and_shift_reg(TCGv_i64 tcg_out, TCGv_i64 tcg_in, - int option, unsigned int shift) -{ - int extsize = extract32(option, 0, 2); - bool is_signed = extract32(option, 2, 1); - - if (is_signed) { - switch (extsize) { - case 0: - tcg_gen_ext8s_i64(tcg_out, tcg_in); - break; - case 1: - tcg_gen_ext16s_i64(tcg_out, tcg_in); - break; - case 2: - tcg_gen_ext32s_i64(tcg_out, tcg_in); - break; - case 3: - tcg_gen_mov_i64(tcg_out, tcg_in); - break; - } - } else { - switch (extsize) { - case 0: - tcg_gen_ext8u_i64(tcg_out, tcg_in); - break; - case 1: - tcg_gen_ext16u_i64(tcg_out, tcg_in); - break; - case 2: - tcg_gen_ext32u_i64(tcg_out, tcg_in); - break; - case 3: - tcg_gen_mov_i64(tcg_out, tcg_in); - break; - } - } - - if (shift) { - tcg_gen_shli_i64(tcg_out, tcg_out, shift); - } -} - -static inline void gen_check_sp_alignment(DisasContext *s) -{ - /* The AArch64 architecture mandates that (if enabled via PSTATE - * or SCTLR bits) there is a check that SP is 16-aligned on every - * SP-relative load or store (with an exception generated if it is not). - * In line with general QEMU practice regarding misaligned accesses, - * we omit these checks for the sake of guest program performance. - * This function is provided as a hook so we can more easily add these - * checks in future (possibly as a "favour catching guest program bugs - * over speed" user selectable option). - */ -} - -/* - * This provides a simple table based table lookup decoder. It is - * intended to be used when the relevant bits for decode are too - * awkwardly placed and switch/if based logic would be confusing and - * deeply nested. Since it's a linear search through the table, tables - * should be kept small. - * - * It returns the first handler where insn & mask == pattern, or - * NULL if there is no match. - * The table is terminated by an empty mask (i.e. 0) - */ -static inline AArch64DecodeFn *lookup_disas_fn(const AArch64DecodeTable *table, - uint32_t insn) -{ - const AArch64DecodeTable *tptr = table; - - while (tptr->mask) { - if ((insn & tptr->mask) == tptr->pattern) { - return tptr->disas_fn; - } - tptr++; - } - return NULL; -} - -/* - * The instruction disassembly implemented here matches - * the instruction encoding classifications in chapter C4 - * of the ARM Architecture Reference Manual (DDI0487B_a); - * classification names and decode diagrams here should generally - * match up with those in the manual. 
- */ - -/* Unconditional branch (immediate) - * 31 30 26 25 0 - * +----+-----------+-------------------------------------+ - * | op | 0 0 1 0 1 | imm26 | - * +----+-----------+-------------------------------------+ - */ -static void disas_uncond_b_imm(DisasContext *s, uint32_t insn) -{ - int64_t diff = sextract32(insn, 0, 26) * 4; - - if (insn & (1U << 31)) { - /* BL Branch with link */ - gen_pc_plus_diff(s, cpu_reg(s, 30), curr_insn_len(s)); - } - - /* B Branch / BL Branch with link */ - reset_btype(s); - gen_goto_tb(s, 0, diff); -} - -/* Compare and branch (immediate) - * 31 30 25 24 23 5 4 0 - * +----+-------------+----+---------------------+--------+ - * | sf | 0 1 1 0 1 0 | op | imm19 | Rt | - * +----+-------------+----+---------------------+--------+ - */ -static void disas_comp_b_imm(DisasContext *s, uint32_t insn) -{ - unsigned int sf, op, rt; - int64_t diff; - DisasLabel match; - TCGv_i64 tcg_cmp; - - sf = extract32(insn, 31, 1); - op = extract32(insn, 24, 1); /* 0: CBZ; 1: CBNZ */ - rt = extract32(insn, 0, 5); - diff = sextract32(insn, 5, 19) * 4; - - tcg_cmp = read_cpu_reg(s, rt, sf); - reset_btype(s); - - match = gen_disas_label(s); - tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ, - tcg_cmp, 0, match.label); - gen_goto_tb(s, 0, 4); - set_disas_label(s, match); - gen_goto_tb(s, 1, diff); -} - -/* Test and branch (immediate) - * 31 30 25 24 23 19 18 5 4 0 - * +----+-------------+----+-------+-------------+------+ - * | b5 | 0 1 1 0 1 1 | op | b40 | imm14 | Rt | - * +----+-------------+----+-------+-------------+------+ - */ -static void disas_test_b_imm(DisasContext *s, uint32_t insn) -{ - unsigned int bit_pos, op, rt; - int64_t diff; - DisasLabel match; - TCGv_i64 tcg_cmp; - - bit_pos = (extract32(insn, 31, 1) << 5) | extract32(insn, 19, 5); - op = extract32(insn, 24, 1); /* 0: TBZ; 1: TBNZ */ - diff = sextract32(insn, 5, 14) * 4; - rt = extract32(insn, 0, 5); - - tcg_cmp = tcg_temp_new_i64(); - tcg_gen_andi_i64(tcg_cmp, cpu_reg(s, rt), (1ULL << bit_pos)); - - reset_btype(s); - - match = gen_disas_label(s); - tcg_gen_brcondi_i64(op ? 
TCG_COND_NE : TCG_COND_EQ, - tcg_cmp, 0, match.label); - tcg_temp_free_i64(tcg_cmp); - gen_goto_tb(s, 0, 4); - set_disas_label(s, match); - gen_goto_tb(s, 1, diff); -} - -/* Conditional branch (immediate) - * 31 25 24 23 5 4 3 0 - * +---------------+----+---------------------+----+------+ - * | 0 1 0 1 0 1 0 | o1 | imm19 | o0 | cond | - * +---------------+----+---------------------+----+------+ - */ -static void disas_cond_b_imm(DisasContext *s, uint32_t insn) -{ - unsigned int cond; - int64_t diff; - - if ((insn & (1 << 4)) || (insn & (1 << 24))) { - unallocated_encoding(s); - return; - } - diff = sextract32(insn, 5, 19) * 4; - cond = extract32(insn, 0, 4); - - reset_btype(s); - if (cond < 0x0e) { - /* genuinely conditional branches */ - DisasLabel match = gen_disas_label(s); - arm_gen_test_cc(cond, match.label); - gen_goto_tb(s, 0, 4); - set_disas_label(s, match); - gen_goto_tb(s, 1, diff); - } else { - /* 0xe and 0xf are both "always" conditions */ - gen_goto_tb(s, 0, diff); - } -} - -/* HINT instruction group, including various allocated HINTs */ -static void handle_hint(DisasContext *s, uint32_t insn, - unsigned int op1, unsigned int op2, unsigned int crm) -{ - unsigned int selector = crm << 3 | op2; - - if (op1 != 3) { - unallocated_encoding(s); - return; - } - - switch (selector) { - case 0b00000: /* NOP */ - break; - case 0b00011: /* WFI */ - s->base.is_jmp = DISAS_WFI; - break; - case 0b00001: /* YIELD */ - /* When running in MTTCG we don't generate jumps to the yield and - * WFE helpers as it won't affect the scheduling of other vCPUs. - * If we wanted to more completely model WFE/SEV so we don't busy - * spin unnecessarily we would need to do something more involved. - */ - if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { - s->base.is_jmp = DISAS_YIELD; - } - break; - case 0b00010: /* WFE */ - if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) { - s->base.is_jmp = DISAS_WFE; - } - break; - case 0b00100: /* SEV */ - case 0b00101: /* SEVL */ - case 0b00110: /* DGH */ - /* we treat all as NOP at least for now */ - break; - case 0b00111: /* XPACLRI */ - if (s->pauth_active) { - gen_helper_xpaci(cpu_X[30], cpu_env, cpu_X[30]); - } - break; - case 0b01000: /* PACIA1716 */ - if (s->pauth_active) { - gen_helper_pacia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); - } - break; - case 0b01010: /* PACIB1716 */ - if (s->pauth_active) { - gen_helper_pacib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); - } - break; - case 0b01100: /* AUTIA1716 */ - if (s->pauth_active) { - gen_helper_autia(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); - } - break; - case 0b01110: /* AUTIB1716 */ - if (s->pauth_active) { - gen_helper_autib(cpu_X[17], cpu_env, cpu_X[17], cpu_X[16]); - } - break; - case 0b10000: /* ESB */ - /* Without RAS, we must implement this as NOP. */ - if (dc_isar_feature(aa64_ras, s)) { - /* - * QEMU does not have a source of physical SErrors, - * so we are only concerned with virtual SErrors. - * The pseudocode in the ARM for this case is - * if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then - * AArch64.vESBOperation(); - * Most of the condition can be evaluated at translation time. - * Test for EL2 present, and defer test for SEL2 to runtime. 
- */ - if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) { - gen_helper_vesb(cpu_env); - } - } - break; - case 0b11000: /* PACIAZ */ - if (s->pauth_active) { - gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30], - new_tmp_a64_zero(s)); - } - break; - case 0b11001: /* PACIASP */ - if (s->pauth_active) { - gen_helper_pacia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); - } - break; - case 0b11010: /* PACIBZ */ - if (s->pauth_active) { - gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30], - new_tmp_a64_zero(s)); - } - break; - case 0b11011: /* PACIBSP */ - if (s->pauth_active) { - gen_helper_pacib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); - } - break; - case 0b11100: /* AUTIAZ */ - if (s->pauth_active) { - gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30], - new_tmp_a64_zero(s)); - } - break; - case 0b11101: /* AUTIASP */ - if (s->pauth_active) { - gen_helper_autia(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); - } - break; - case 0b11110: /* AUTIBZ */ - if (s->pauth_active) { - gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30], - new_tmp_a64_zero(s)); - } - break; - case 0b11111: /* AUTIBSP */ - if (s->pauth_active) { - gen_helper_autib(cpu_X[30], cpu_env, cpu_X[30], cpu_X[31]); - } - break; - default: - /* default specified as NOP equivalent */ - break; - } -} - -static void gen_clrex(DisasContext *s, uint32_t insn) -{ - tcg_gen_movi_i64(cpu_exclusive_addr, -1); -} - -/* CLREX, DSB, DMB, ISB */ -static void handle_sync(DisasContext *s, uint32_t insn, - unsigned int op1, unsigned int op2, unsigned int crm) -{ - TCGBar bar; - - if (op1 != 3) { - unallocated_encoding(s); - return; - } - - switch (op2) { - case 2: /* CLREX */ - gen_clrex(s, insn); - return; - case 4: /* DSB */ - case 5: /* DMB */ - switch (crm & 3) { - case 1: /* MBReqTypes_Reads */ - bar = TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST; - break; - case 2: /* MBReqTypes_Writes */ - bar = TCG_BAR_SC | TCG_MO_ST_ST; - break; - default: /* MBReqTypes_All */ - bar = TCG_BAR_SC | TCG_MO_ALL; - break; - } - tcg_gen_mb(bar); - return; - case 6: /* ISB */ - /* We need to break the TB after this insn to execute - * a self-modified code correctly and also to take - * any pending interrupts immediately. - */ - reset_btype(s); - gen_goto_tb(s, 0, 4); - return; - - case 7: /* SB */ - if (crm != 0 || !dc_isar_feature(aa64_sb, s)) { - goto do_unallocated; - } - /* - * TODO: There is no speculation barrier opcode for TCG; - * MB and end the TB instead. - */ - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); - gen_goto_tb(s, 0, 4); - return; - - default: - do_unallocated: - unallocated_encoding(s); - return; - } -} - -static void gen_xaflag(void) -{ - TCGv_i32 z = tcg_temp_new_i32(); - - tcg_gen_setcondi_i32(TCG_COND_EQ, z, cpu_ZF, 0); - - /* - * (!C & !Z) << 31 - * (!(C | Z)) << 31 - * ~((C | Z) << 31) - * ~-(C | Z) - * (C | Z) - 1 - */ - tcg_gen_or_i32(cpu_NF, cpu_CF, z); - tcg_gen_subi_i32(cpu_NF, cpu_NF, 1); - - /* !(Z & C) */ - tcg_gen_and_i32(cpu_ZF, z, cpu_CF); - tcg_gen_xori_i32(cpu_ZF, cpu_ZF, 1); - - /* (!C & Z) << 31 -> -(Z & ~C) */ - tcg_gen_andc_i32(cpu_VF, z, cpu_CF); - tcg_gen_neg_i32(cpu_VF, cpu_VF); - - /* C | Z */ - tcg_gen_or_i32(cpu_CF, cpu_CF, z); - - tcg_temp_free_i32(z); -} - -static void gen_axflag(void) -{ - tcg_gen_sari_i32(cpu_VF, cpu_VF, 31); /* V ? 
-1 : 0 */ - tcg_gen_andc_i32(cpu_CF, cpu_CF, cpu_VF); /* C & !V */ - - /* !(Z | V) -> !(!ZF | V) -> ZF & !V -> ZF & ~VF */ - tcg_gen_andc_i32(cpu_ZF, cpu_ZF, cpu_VF); - - tcg_gen_movi_i32(cpu_NF, 0); - tcg_gen_movi_i32(cpu_VF, 0); -} - -/* MSR (immediate) - move immediate to processor state field */ -static void handle_msr_i(DisasContext *s, uint32_t insn, - unsigned int op1, unsigned int op2, unsigned int crm) -{ - int op = op1 << 3 | op2; - - /* End the TB by default, chaining is ok. */ - s->base.is_jmp = DISAS_TOO_MANY; - - switch (op) { - case 0x00: /* CFINV */ - if (crm != 0 || !dc_isar_feature(aa64_condm_4, s)) { - goto do_unallocated; - } - tcg_gen_xori_i32(cpu_CF, cpu_CF, 1); - s->base.is_jmp = DISAS_NEXT; - break; - - case 0x01: /* XAFlag */ - if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) { - goto do_unallocated; - } - gen_xaflag(); - s->base.is_jmp = DISAS_NEXT; - break; - - case 0x02: /* AXFlag */ - if (crm != 0 || !dc_isar_feature(aa64_condm_5, s)) { - goto do_unallocated; - } - gen_axflag(); - s->base.is_jmp = DISAS_NEXT; - break; - - case 0x03: /* UAO */ - if (!dc_isar_feature(aa64_uao, s) || s->current_el == 0) { - goto do_unallocated; - } - if (crm & 1) { - set_pstate_bits(PSTATE_UAO); - } else { - clear_pstate_bits(PSTATE_UAO); - } - gen_rebuild_hflags(s); - break; - - case 0x04: /* PAN */ - if (!dc_isar_feature(aa64_pan, s) || s->current_el == 0) { - goto do_unallocated; - } - if (crm & 1) { - set_pstate_bits(PSTATE_PAN); - } else { - clear_pstate_bits(PSTATE_PAN); - } - gen_rebuild_hflags(s); - break; - - case 0x05: /* SPSel */ - if (s->current_el == 0) { - goto do_unallocated; - } - gen_helper_msr_i_spsel(cpu_env, tcg_constant_i32(crm & PSTATE_SP)); - break; - - case 0x19: /* SSBS */ - if (!dc_isar_feature(aa64_ssbs, s)) { - goto do_unallocated; - } - if (crm & 1) { - set_pstate_bits(PSTATE_SSBS); - } else { - clear_pstate_bits(PSTATE_SSBS); - } - /* Don't need to rebuild hflags since SSBS is a nop */ - break; - - case 0x1a: /* DIT */ - if (!dc_isar_feature(aa64_dit, s)) { - goto do_unallocated; - } - if (crm & 1) { - set_pstate_bits(PSTATE_DIT); - } else { - clear_pstate_bits(PSTATE_DIT); - } - /* There's no need to rebuild hflags because DIT is a nop */ - break; - - case 0x1e: /* DAIFSet */ - gen_helper_msr_i_daifset(cpu_env, tcg_constant_i32(crm)); - break; - - case 0x1f: /* DAIFClear */ - gen_helper_msr_i_daifclear(cpu_env, tcg_constant_i32(crm)); - /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */ - s->base.is_jmp = DISAS_UPDATE_EXIT; - break; - - case 0x1c: /* TCO */ - if (dc_isar_feature(aa64_mte, s)) { - /* Full MTE is enabled -- set the TCO bit as directed. */ - if (crm & 1) { - set_pstate_bits(PSTATE_TCO); - } else { - clear_pstate_bits(PSTATE_TCO); - } - gen_rebuild_hflags(s); - /* Many factors, including TCO, go into MTE_ACTIVE. */ - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - } else if (dc_isar_feature(aa64_mte_insn_reg, s)) { - /* Only "instructions accessible at EL0" -- PSTATE.TCO is WI. */ - s->base.is_jmp = DISAS_NEXT; - } else { - goto do_unallocated; - } - break; - - case 0x1b: /* SVCR* */ - if (!dc_isar_feature(aa64_sme, s) || crm < 2 || crm > 7) { - goto do_unallocated; - } - if (sme_access_check(s)) { - int old = s->pstate_sm | (s->pstate_za << 1); - int new = (crm & 1) * 3; - int msk = (crm >> 1) & 3; - - if ((old ^ new) & msk) { - /* At least one bit changes. 
*/ - gen_helper_set_svcr(cpu_env, tcg_constant_i32(new), - tcg_constant_i32(msk)); - } else { - s->base.is_jmp = DISAS_NEXT; - } - } - break; - - default: - do_unallocated: - unallocated_encoding(s); - return; - } -} - -static void gen_get_nzcv(TCGv_i64 tcg_rt) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - TCGv_i32 nzcv = tcg_temp_new_i32(); - - /* build bit 31, N */ - tcg_gen_andi_i32(nzcv, cpu_NF, (1U << 31)); - /* build bit 30, Z */ - tcg_gen_setcondi_i32(TCG_COND_EQ, tmp, cpu_ZF, 0); - tcg_gen_deposit_i32(nzcv, nzcv, tmp, 30, 1); - /* build bit 29, C */ - tcg_gen_deposit_i32(nzcv, nzcv, cpu_CF, 29, 1); - /* build bit 28, V */ - tcg_gen_shri_i32(tmp, cpu_VF, 31); - tcg_gen_deposit_i32(nzcv, nzcv, tmp, 28, 1); - /* generate result */ - tcg_gen_extu_i32_i64(tcg_rt, nzcv); - - tcg_temp_free_i32(nzcv); - tcg_temp_free_i32(tmp); -} - -static void gen_set_nzcv(TCGv_i64 tcg_rt) -{ - TCGv_i32 nzcv = tcg_temp_new_i32(); - - /* take NZCV from R[t] */ - tcg_gen_extrl_i64_i32(nzcv, tcg_rt); - - /* bit 31, N */ - tcg_gen_andi_i32(cpu_NF, nzcv, (1U << 31)); - /* bit 30, Z */ - tcg_gen_andi_i32(cpu_ZF, nzcv, (1 << 30)); - tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_ZF, cpu_ZF, 0); - /* bit 29, C */ - tcg_gen_andi_i32(cpu_CF, nzcv, (1 << 29)); - tcg_gen_shri_i32(cpu_CF, cpu_CF, 29); - /* bit 28, V */ - tcg_gen_andi_i32(cpu_VF, nzcv, (1 << 28)); - tcg_gen_shli_i32(cpu_VF, cpu_VF, 3); - tcg_temp_free_i32(nzcv); -} - -static void gen_sysreg_undef(DisasContext *s, bool isread, - uint8_t op0, uint8_t op1, uint8_t op2, - uint8_t crn, uint8_t crm, uint8_t rt) -{ - /* - * Generate code to emit an UNDEF with correct syndrome - * information for a failed system register access. - * This is EC_UNCATEGORIZED (ie a standard UNDEF) in most cases, - * but if FEAT_IDST is implemented then read accesses to registers - * in the feature ID space are reported with the EC_SYSTEMREGISTERTRAP - * syndrome. - */ - uint32_t syndrome; - - if (isread && dc_isar_feature(aa64_ids, s) && - arm_cpreg_encoding_in_idspace(op0, op1, op2, crn, crm)) { - syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread); - } else { - syndrome = syn_uncategorized(); - } - gen_exception_insn(s, 0, EXCP_UDEF, syndrome); -} - -/* MRS - move from system register - * MSR (register) - move to system register - * SYS - * SYSL - * These are all essentially the same insn in 'read' and 'write' - * versions, with varying op0 fields. - */ -static void handle_sys(DisasContext *s, uint32_t insn, bool isread, - unsigned int op0, unsigned int op1, unsigned int op2, - unsigned int crn, unsigned int crm, unsigned int rt) -{ - uint32_t key = ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP, - crn, crm, op0, op1, op2); - const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key); - TCGv_ptr tcg_ri = NULL; - TCGv_i64 tcg_rt; - - if (!ri) { - /* Unknown register; this might be a guest error or a QEMU - * unimplemented feature. - */ - qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch64 " - "system register op0:%d op1:%d crn:%d crm:%d op2:%d\n", - isread ? "read" : "write", op0, op1, crn, crm, op2); - gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt); - return; - } - - /* Check access permissions */ - if (!cp_access_ok(s->current_el, ri, isread)) { - gen_sysreg_undef(s, isread, op0, op1, op2, crn, crm, rt); - return; - } - - if (ri->accessfn || (ri->fgt && s->fgt_active)) { - /* Emit code to perform further access permissions checks at - * runtime; this may result in an exception. 
- */ - uint32_t syndrome; - - syndrome = syn_aa64_sysregtrap(op0, op1, op2, crn, crm, rt, isread); - gen_a64_update_pc(s, 0); - tcg_ri = tcg_temp_new_ptr(); - gen_helper_access_check_cp_reg(tcg_ri, cpu_env, - tcg_constant_i32(key), - tcg_constant_i32(syndrome), - tcg_constant_i32(isread)); - } else if (ri->type & ARM_CP_RAISES_EXC) { - /* - * The readfn or writefn might raise an exception; - * synchronize the CPU state in case it does. - */ - gen_a64_update_pc(s, 0); - } - - /* Handle special cases first */ - switch (ri->type & ARM_CP_SPECIAL_MASK) { - case 0: - break; - case ARM_CP_NOP: - goto exit; - case ARM_CP_NZCV: - tcg_rt = cpu_reg(s, rt); - if (isread) { - gen_get_nzcv(tcg_rt); - } else { - gen_set_nzcv(tcg_rt); - } - goto exit; - case ARM_CP_CURRENTEL: - /* Reads as current EL value from pstate, which is - * guaranteed to be constant by the tb flags. - */ - tcg_rt = cpu_reg(s, rt); - tcg_gen_movi_i64(tcg_rt, s->current_el << 2); - goto exit; - case ARM_CP_DC_ZVA: - /* Writes clear the aligned block of memory which rt points into. */ - if (s->mte_active[0]) { - int desc = 0; - - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); - desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); - desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); - - tcg_rt = new_tmp_a64(s); - gen_helper_mte_check_zva(tcg_rt, cpu_env, - tcg_constant_i32(desc), cpu_reg(s, rt)); - } else { - tcg_rt = clean_data_tbi(s, cpu_reg(s, rt)); - } - gen_helper_dc_zva(cpu_env, tcg_rt); - goto exit; - case ARM_CP_DC_GVA: - { - TCGv_i64 clean_addr, tag; - - /* - * DC_GVA, like DC_ZVA, requires that we supply the original - * pointer for an invalid page. Probe that address first. - */ - tcg_rt = cpu_reg(s, rt); - clean_addr = clean_data_tbi(s, tcg_rt); - gen_probe_access(s, clean_addr, MMU_DATA_STORE, MO_8); - - if (s->ata) { - /* Extract the tag from the register to match STZGM. */ - tag = tcg_temp_new_i64(); - tcg_gen_shri_i64(tag, tcg_rt, 56); - gen_helper_stzgm_tags(cpu_env, clean_addr, tag); - tcg_temp_free_i64(tag); - } - } - goto exit; - case ARM_CP_DC_GZVA: - { - TCGv_i64 clean_addr, tag; - - /* For DC_GZVA, we can rely on DC_ZVA for the proper fault. */ - tcg_rt = cpu_reg(s, rt); - clean_addr = clean_data_tbi(s, tcg_rt); - gen_helper_dc_zva(cpu_env, clean_addr); - - if (s->ata) { - /* Extract the tag from the register to match STZGM. 
*/ - tag = tcg_temp_new_i64(); - tcg_gen_shri_i64(tag, tcg_rt, 56); - gen_helper_stzgm_tags(cpu_env, clean_addr, tag); - tcg_temp_free_i64(tag); - } - } - goto exit; - default: - g_assert_not_reached(); - } - if ((ri->type & ARM_CP_FPU) && !fp_access_check_only(s)) { - goto exit; - } else if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) { - goto exit; - } else if ((ri->type & ARM_CP_SME) && !sme_access_check(s)) { - goto exit; - } - - if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { - gen_io_start(); - } - - tcg_rt = cpu_reg(s, rt); - - if (isread) { - if (ri->type & ARM_CP_CONST) { - tcg_gen_movi_i64(tcg_rt, ri->resetvalue); - } else if (ri->readfn) { - if (!tcg_ri) { - tcg_ri = gen_lookup_cp_reg(key); - } - gen_helper_get_cp_reg64(tcg_rt, cpu_env, tcg_ri); - } else { - tcg_gen_ld_i64(tcg_rt, cpu_env, ri->fieldoffset); - } - } else { - if (ri->type & ARM_CP_CONST) { - /* If not forbidden by access permissions, treat as WI */ - goto exit; - } else if (ri->writefn) { - if (!tcg_ri) { - tcg_ri = gen_lookup_cp_reg(key); - } - gen_helper_set_cp_reg64(cpu_env, tcg_ri, tcg_rt); - } else { - tcg_gen_st_i64(tcg_rt, cpu_env, ri->fieldoffset); - } - } - - if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { - /* I/O operations must end the TB here (whether read or write) */ - s->base.is_jmp = DISAS_UPDATE_EXIT; - } - if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) { - /* - * A write to any coprocessor regiser that ends a TB - * must rebuild the hflags for the next TB. - */ - gen_rebuild_hflags(s); - /* - * We default to ending the TB on a coprocessor register write, - * but allow this to be suppressed by the register definition - * (usually only necessary to work around guest bugs). - */ - s->base.is_jmp = DISAS_UPDATE_EXIT; - } - - exit: - if (tcg_ri) { - tcg_temp_free_ptr(tcg_ri); - } -} - -/* System - * 31 22 21 20 19 18 16 15 12 11 8 7 5 4 0 - * +---------------------+---+-----+-----+-------+-------+-----+------+ - * | 1 1 0 1 0 1 0 1 0 0 | L | op0 | op1 | CRn | CRm | op2 | Rt | - * +---------------------+---+-----+-----+-------+-------+-----+------+ - */ -static void disas_system(DisasContext *s, uint32_t insn) -{ - unsigned int l, op0, op1, crn, crm, op2, rt; - l = extract32(insn, 21, 1); - op0 = extract32(insn, 19, 2); - op1 = extract32(insn, 16, 3); - crn = extract32(insn, 12, 4); - crm = extract32(insn, 8, 4); - op2 = extract32(insn, 5, 3); - rt = extract32(insn, 0, 5); - - if (op0 == 0) { - if (l || rt != 31) { - unallocated_encoding(s); - return; - } - switch (crn) { - case 2: /* HINT (including allocated hints like NOP, YIELD, etc) */ - handle_hint(s, insn, op1, op2, crm); - break; - case 3: /* CLREX, DSB, DMB, ISB */ - handle_sync(s, insn, op1, op2, crm); - break; - case 4: /* MSR (immediate) */ - handle_msr_i(s, insn, op1, op2, crm); - break; - default: - unallocated_encoding(s); - break; - } - return; - } - handle_sys(s, insn, l, op0, op1, op2, crn, crm, rt); -} - -/* Exception generation - * - * 31 24 23 21 20 5 4 2 1 0 - * +-----------------+-----+------------------------+-----+----+ - * | 1 1 0 1 0 1 0 0 | opc | imm16 | op2 | LL | - * +-----------------------+------------------------+----------+ - */ -static void disas_exc(DisasContext *s, uint32_t insn) -{ - int opc = extract32(insn, 21, 3); - int op2_ll = extract32(insn, 0, 5); - int imm16 = extract32(insn, 5, 16); - uint32_t syndrome; - - switch (opc) { - case 0: - /* For SVC, HVC and SMC we advance the single-step state - * machine before taking the exception. 
This is architecturally - * mandated, to ensure that single-stepping a system call - * instruction works properly. - */ - switch (op2_ll) { - case 1: /* SVC */ - syndrome = syn_aa64_svc(imm16); - if (s->fgt_svc) { - gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); - break; - } - gen_ss_advance(s); - gen_exception_insn(s, 4, EXCP_SWI, syndrome); - break; - case 2: /* HVC */ - if (s->current_el == 0) { - unallocated_encoding(s); - break; - } - /* The pre HVC helper handles cases when HVC gets trapped - * as an undefined insn by runtime configuration. - */ - gen_a64_update_pc(s, 0); - gen_helper_pre_hvc(cpu_env); - gen_ss_advance(s); - gen_exception_insn_el(s, 4, EXCP_HVC, syn_aa64_hvc(imm16), 2); - break; - case 3: /* SMC */ - if (s->current_el == 0) { - unallocated_encoding(s); - break; - } - gen_a64_update_pc(s, 0); - gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa64_smc(imm16))); - gen_ss_advance(s); - gen_exception_insn_el(s, 4, EXCP_SMC, syn_aa64_smc(imm16), 3); - break; - default: - unallocated_encoding(s); - break; - } - break; - case 1: - if (op2_ll != 0) { - unallocated_encoding(s); - break; - } - /* BRK */ - gen_exception_bkpt_insn(s, syn_aa64_bkpt(imm16)); - break; - case 2: - if (op2_ll != 0) { - unallocated_encoding(s); - break; - } - /* HLT. This has two purposes. - * Architecturally, it is an external halting debug instruction. - * Since QEMU doesn't implement external debug, we treat this as - * it is required for halting debug disabled: it will UNDEF. - * Secondly, "HLT 0xf000" is the A64 semihosting syscall instruction. - */ - if (semihosting_enabled(s->current_el == 0) && imm16 == 0xf000) { - gen_exception_internal_insn(s, EXCP_SEMIHOST); - } else { - unallocated_encoding(s); - } - break; - case 5: - if (op2_ll < 1 || op2_ll > 3) { - unallocated_encoding(s); - break; - } - /* DCPS1, DCPS2, DCPS3 */ - unallocated_encoding(s); - break; - default: - unallocated_encoding(s); - break; - } -} - -/* Unconditional branch (register) - * 31 25 24 21 20 16 15 10 9 5 4 0 - * +---------------+-------+-------+-------+------+-------+ - * | 1 1 0 1 0 1 1 | opc | op2 | op3 | Rn | op4 | - * +---------------+-------+-------+-------+------+-------+ - */ -static void disas_uncond_b_reg(DisasContext *s, uint32_t insn) -{ - unsigned int opc, op2, op3, rn, op4; - unsigned btype_mod = 2; /* 0: BR, 1: BLR, 2: other */ - TCGv_i64 dst; - TCGv_i64 modifier; - - opc = extract32(insn, 21, 4); - op2 = extract32(insn, 16, 5); - op3 = extract32(insn, 10, 6); - rn = extract32(insn, 5, 5); - op4 = extract32(insn, 0, 5); - - if (op2 != 0x1f) { - goto do_unallocated; - } - - switch (opc) { - case 0: /* BR */ - case 1: /* BLR */ - case 2: /* RET */ - btype_mod = opc; - switch (op3) { - case 0: - /* BR, BLR, RET */ - if (op4 != 0) { - goto do_unallocated; - } - dst = cpu_reg(s, rn); - break; - - case 2: - case 3: - if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - if (opc == 2) { - /* RETAA, RETAB */ - if (rn != 0x1f || op4 != 0x1f) { - goto do_unallocated; - } - rn = 30; - modifier = cpu_X[31]; - } else { - /* BRAAZ, BRABZ, BLRAAZ, BLRABZ */ - if (op4 != 0x1f) { - goto do_unallocated; - } - modifier = new_tmp_a64_zero(s); - } - if (s->pauth_active) { - dst = new_tmp_a64(s); - if (op3 == 2) { - gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier); - } else { - gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier); - } - } else { - dst = cpu_reg(s, rn); - } - break; - - default: - goto do_unallocated; - } - /* BLR also needs to load return address */ - if (opc == 1) { - TCGv_i64 
lr = cpu_reg(s, 30); - if (dst == lr) { - TCGv_i64 tmp = new_tmp_a64(s); - tcg_gen_mov_i64(tmp, dst); - dst = tmp; - } - gen_pc_plus_diff(s, lr, curr_insn_len(s)); - } - gen_a64_set_pc(s, dst); - break; - - case 8: /* BRAA */ - case 9: /* BLRAA */ - if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - if ((op3 & ~1) != 2) { - goto do_unallocated; - } - btype_mod = opc & 1; - if (s->pauth_active) { - dst = new_tmp_a64(s); - modifier = cpu_reg_sp(s, op4); - if (op3 == 2) { - gen_helper_autia(dst, cpu_env, cpu_reg(s, rn), modifier); - } else { - gen_helper_autib(dst, cpu_env, cpu_reg(s, rn), modifier); - } - } else { - dst = cpu_reg(s, rn); - } - /* BLRAA also needs to load return address */ - if (opc == 9) { - TCGv_i64 lr = cpu_reg(s, 30); - if (dst == lr) { - TCGv_i64 tmp = new_tmp_a64(s); - tcg_gen_mov_i64(tmp, dst); - dst = tmp; - } - gen_pc_plus_diff(s, lr, curr_insn_len(s)); - } - gen_a64_set_pc(s, dst); - break; - - case 4: /* ERET */ - if (s->current_el == 0) { - goto do_unallocated; - } - switch (op3) { - case 0: /* ERET */ - if (op4 != 0) { - goto do_unallocated; - } - if (s->fgt_eret) { - gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2); - return; - } - dst = tcg_temp_new_i64(); - tcg_gen_ld_i64(dst, cpu_env, - offsetof(CPUARMState, elr_el[s->current_el])); - break; - - case 2: /* ERETAA */ - case 3: /* ERETAB */ - if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - if (rn != 0x1f || op4 != 0x1f) { - goto do_unallocated; - } - /* The FGT trap takes precedence over an auth trap. */ - if (s->fgt_eret) { - gen_exception_insn_el(s, 0, EXCP_UDEF, syn_erettrap(op3), 2); - return; - } - dst = tcg_temp_new_i64(); - tcg_gen_ld_i64(dst, cpu_env, - offsetof(CPUARMState, elr_el[s->current_el])); - if (s->pauth_active) { - modifier = cpu_X[31]; - if (op3 == 2) { - gen_helper_autia(dst, cpu_env, dst, modifier); - } else { - gen_helper_autib(dst, cpu_env, dst, modifier); - } - } - break; - - default: - goto do_unallocated; - } - if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { - gen_io_start(); - } - - gen_helper_exception_return(cpu_env, dst); - tcg_temp_free_i64(dst); - /* Must exit loop to check un-masked IRQs */ - s->base.is_jmp = DISAS_EXIT; - return; - - case 5: /* DRPS */ - if (op3 != 0 || op4 != 0 || rn != 0x1f) { - goto do_unallocated; - } else { - unallocated_encoding(s); - } - return; - - default: - do_unallocated: - unallocated_encoding(s); - return; - } - - switch (btype_mod) { - case 0: /* BR */ - if (dc_isar_feature(aa64_bti, s)) { - /* BR to {x16,x17} or !guard -> 1, else 3. */ - set_btype(s, rn == 16 || rn == 17 || !s->guarded_page ? 1 : 3); - } - break; - - case 1: /* BLR */ - if (dc_isar_feature(aa64_bti, s)) { - /* BLR sets BTYPE to 2, regardless of source guarded page. */ - set_btype(s, 2); - } - break; - - default: /* RET or none of the above. */ - /* BTYPE will be set to 0 by normal end-of-insn processing. 
*/ - break; - } - - s->base.is_jmp = DISAS_JUMP; -} - -/* Branches, exception generating and system instructions */ -static void disas_b_exc_sys(DisasContext *s, uint32_t insn) -{ - switch (extract32(insn, 25, 7)) { - case 0x0a: case 0x0b: - case 0x4a: case 0x4b: /* Unconditional branch (immediate) */ - disas_uncond_b_imm(s, insn); - break; - case 0x1a: case 0x5a: /* Compare & branch (immediate) */ - disas_comp_b_imm(s, insn); - break; - case 0x1b: case 0x5b: /* Test & branch (immediate) */ - disas_test_b_imm(s, insn); - break; - case 0x2a: /* Conditional branch (immediate) */ - disas_cond_b_imm(s, insn); - break; - case 0x6a: /* Exception generation / System */ - if (insn & (1 << 24)) { - if (extract32(insn, 22, 2) == 0) { - disas_system(s, insn); - } else { - unallocated_encoding(s); - } - } else { - disas_exc(s, insn); - } - break; - case 0x6b: /* Unconditional branch (register) */ - disas_uncond_b_reg(s, insn); - break; - default: - unallocated_encoding(s); - break; - } -} - -/* - * Load/Store exclusive instructions are implemented by remembering - * the value/address loaded, and seeing if these are the same - * when the store is performed. This is not actually the architecturally - * mandated semantics, but it works for typical guest code sequences - * and avoids having to monitor regular stores. - * - * The store exclusive uses the atomic cmpxchg primitives to avoid - * races in multi-threaded linux-user and when MTTCG softmmu is - * enabled. - */ -static void gen_load_exclusive(DisasContext *s, int rt, int rt2, - TCGv_i64 addr, int size, bool is_pair) -{ - int idx = get_mem_index(s); - MemOp memop = s->be_data; - - g_assert(size <= 3); - if (is_pair) { - g_assert(size >= 2); - if (size == 2) { - /* The pair must be single-copy atomic for the doubleword. */ - memop |= MO_64 | MO_ALIGN; - tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); - if (s->be_data == MO_LE) { - tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 0, 32); - tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 32, 32); - } else { - tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 32, 32); - tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 0, 32); - } - } else { - /* The pair must be single-copy atomic for *each* doubleword, not - the entire quadword, however it must be quadword aligned. 
*/ - memop |= MO_64; - tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, - memop | MO_ALIGN_16); - - TCGv_i64 addr2 = tcg_temp_new_i64(); - tcg_gen_addi_i64(addr2, addr, 8); - tcg_gen_qemu_ld_i64(cpu_exclusive_high, addr2, idx, memop); - tcg_temp_free_i64(addr2); - - tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); - tcg_gen_mov_i64(cpu_reg(s, rt2), cpu_exclusive_high); - } - } else { - memop |= size | MO_ALIGN; - tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); - tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); - } - tcg_gen_mov_i64(cpu_exclusive_addr, addr); -} - -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i64 addr, int size, int is_pair) -{ - /* if (env->exclusive_addr == addr && env->exclusive_val == [addr] - * && (!is_pair || env->exclusive_high == [addr + datasize])) { - * [addr] = {Rt}; - * if (is_pair) { - * [addr + datasize] = {Rt2}; - * } - * {Rd} = 0; - * } else { - * {Rd} = 1; - * } - * env->exclusive_addr = -1; - */ - TCGLabel *fail_label = gen_new_label(); - TCGLabel *done_label = gen_new_label(); - TCGv_i64 tmp; - - tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label); - - tmp = tcg_temp_new_i64(); - if (is_pair) { - if (size == 2) { - if (s->be_data == MO_LE) { - tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2)); - } else { - tcg_gen_concat32_i64(tmp, cpu_reg(s, rt2), cpu_reg(s, rt)); - } - tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, - cpu_exclusive_val, tmp, - get_mem_index(s), - MO_64 | MO_ALIGN | s->be_data); - tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); - } else { - TCGv_i128 t16 = tcg_temp_new_i128(); - TCGv_i128 c16 = tcg_temp_new_i128(); - TCGv_i64 a, b; - - if (s->be_data == MO_LE) { - tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt), cpu_reg(s, rt2)); - tcg_gen_concat_i64_i128(c16, cpu_exclusive_val, - cpu_exclusive_high); - } else { - tcg_gen_concat_i64_i128(t16, cpu_reg(s, rt2), cpu_reg(s, rt)); - tcg_gen_concat_i64_i128(c16, cpu_exclusive_high, - cpu_exclusive_val); - } - - tcg_gen_atomic_cmpxchg_i128(t16, cpu_exclusive_addr, c16, t16, - get_mem_index(s), - MO_128 | MO_ALIGN | s->be_data); - tcg_temp_free_i128(c16); - - a = tcg_temp_new_i64(); - b = tcg_temp_new_i64(); - if (s->be_data == MO_LE) { - tcg_gen_extr_i128_i64(a, b, t16); - } else { - tcg_gen_extr_i128_i64(b, a, t16); - } - - tcg_gen_xor_i64(a, a, cpu_exclusive_val); - tcg_gen_xor_i64(b, b, cpu_exclusive_high); - tcg_gen_or_i64(tmp, a, b); - tcg_temp_free_i64(a); - tcg_temp_free_i64(b); - tcg_temp_free_i128(t16); - - tcg_gen_setcondi_i64(TCG_COND_NE, tmp, tmp, 0); - } - } else { - tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val, - cpu_reg(s, rt), get_mem_index(s), - size | MO_ALIGN | s->be_data); - tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); - } - tcg_gen_mov_i64(cpu_reg(s, rd), tmp); - tcg_temp_free_i64(tmp); - tcg_gen_br(done_label); - - gen_set_label(fail_label); - tcg_gen_movi_i64(cpu_reg(s, rd), 1); - gen_set_label(done_label); - tcg_gen_movi_i64(cpu_exclusive_addr, -1); -} - -static void gen_compare_and_swap(DisasContext *s, int rs, int rt, - int rn, int size) -{ - TCGv_i64 tcg_rs = cpu_reg(s, rs); - TCGv_i64 tcg_rt = cpu_reg(s, rt); - int memidx = get_mem_index(s); - TCGv_i64 clean_addr; - - if (rn == 31) { - gen_check_sp_alignment(s); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size); - tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, memidx, - size | MO_ALIGN | s->be_data); -} - -static void 
gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, - int rn, int size) -{ - TCGv_i64 s1 = cpu_reg(s, rs); - TCGv_i64 s2 = cpu_reg(s, rs + 1); - TCGv_i64 t1 = cpu_reg(s, rt); - TCGv_i64 t2 = cpu_reg(s, rt + 1); - TCGv_i64 clean_addr; - int memidx = get_mem_index(s); - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - /* This is a single atomic access, despite the "pair". */ - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size + 1); - - if (size == 2) { - TCGv_i64 cmp = tcg_temp_new_i64(); - TCGv_i64 val = tcg_temp_new_i64(); - - if (s->be_data == MO_LE) { - tcg_gen_concat32_i64(val, t1, t2); - tcg_gen_concat32_i64(cmp, s1, s2); - } else { - tcg_gen_concat32_i64(val, t2, t1); - tcg_gen_concat32_i64(cmp, s2, s1); - } - - tcg_gen_atomic_cmpxchg_i64(cmp, clean_addr, cmp, val, memidx, - MO_64 | MO_ALIGN | s->be_data); - tcg_temp_free_i64(val); - - if (s->be_data == MO_LE) { - tcg_gen_extr32_i64(s1, s2, cmp); - } else { - tcg_gen_extr32_i64(s2, s1, cmp); - } - tcg_temp_free_i64(cmp); - } else { - TCGv_i128 cmp = tcg_temp_new_i128(); - TCGv_i128 val = tcg_temp_new_i128(); - - if (s->be_data == MO_LE) { - tcg_gen_concat_i64_i128(val, t1, t2); - tcg_gen_concat_i64_i128(cmp, s1, s2); - } else { - tcg_gen_concat_i64_i128(val, t2, t1); - tcg_gen_concat_i64_i128(cmp, s2, s1); - } - - tcg_gen_atomic_cmpxchg_i128(cmp, clean_addr, cmp, val, memidx, - MO_128 | MO_ALIGN | s->be_data); - tcg_temp_free_i128(val); - - if (s->be_data == MO_LE) { - tcg_gen_extr_i128_i64(s1, s2, cmp); - } else { - tcg_gen_extr_i128_i64(s2, s1, cmp); - } - tcg_temp_free_i128(cmp); - } -} - -/* Update the Sixty-Four bit (SF) registersize. This logic is derived - * from the ARMv8 specs for LDR (Shared decode for all encodings). - */ -static bool disas_ldst_compute_iss_sf(int size, bool is_signed, int opc) -{ - int opc0 = extract32(opc, 0, 1); - int regsize; - - if (is_signed) { - regsize = opc0 ? 32 : 64; - } else { - regsize = size == 3 ? 
64 : 32; - } - return regsize == 64; -} - -/* Load/store exclusive - * - * 31 30 29 24 23 22 21 20 16 15 14 10 9 5 4 0 - * +-----+-------------+----+---+----+------+----+-------+------+------+ - * | sz | 0 0 1 0 0 0 | o2 | L | o1 | Rs | o0 | Rt2 | Rn | Rt | - * +-----+-------------+----+---+----+------+----+-------+------+------+ - * - * sz: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64 bit - * L: 0 -> store, 1 -> load - * o2: 0 -> exclusive, 1 -> not - * o1: 0 -> single register, 1 -> register pair - * o0: 1 -> load-acquire/store-release, 0 -> not - */ -static void disas_ldst_excl(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rt2 = extract32(insn, 10, 5); - int rs = extract32(insn, 16, 5); - int is_lasr = extract32(insn, 15, 1); - int o2_L_o1_o0 = extract32(insn, 21, 3) * 2 | is_lasr; - int size = extract32(insn, 30, 2); - TCGv_i64 clean_addr; - - switch (o2_L_o1_o0) { - case 0x0: /* STXR */ - case 0x1: /* STLXR */ - if (rn == 31) { - gen_check_sp_alignment(s); - } - if (is_lasr) { - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - true, rn != 31, size); - gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, false); - return; - - case 0x4: /* LDXR */ - case 0x5: /* LDAXR */ - if (rn == 31) { - gen_check_sp_alignment(s); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - false, rn != 31, size); - s->is_ldex = true; - gen_load_exclusive(s, rt, rt2, clean_addr, size, false); - if (is_lasr) { - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); - } - return; - - case 0x8: /* STLLR */ - if (!dc_isar_feature(aa64_lor, s)) { - break; - } - /* StoreLORelease is the same as Store-Release for QEMU. */ - /* fall through */ - case 0x9: /* STLR */ - /* Generate ISS for non-exclusive accesses including LASR. */ - if (rn == 31) { - gen_check_sp_alignment(s); - } - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - true, rn != 31, size); - /* TODO: ARMv8.4-LSE SCTLR.nAA */ - do_gpr_st(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, true, rt, - disas_ldst_compute_iss_sf(size, false, 0), is_lasr); - return; - - case 0xc: /* LDLAR */ - if (!dc_isar_feature(aa64_lor, s)) { - break; - } - /* LoadLOAcquire is the same as Load-Acquire for QEMU. */ - /* fall through */ - case 0xd: /* LDAR */ - /* Generate ISS for non-exclusive accesses including LASR. 
*/ - if (rn == 31) { - gen_check_sp_alignment(s); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - false, rn != 31, size); - /* TODO: ARMv8.4-LSE SCTLR.nAA */ - do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, false, true, - rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); - return; - - case 0x2: case 0x3: /* CASP / STXP */ - if (size & 2) { /* STXP / STLXP */ - if (rn == 31) { - gen_check_sp_alignment(s); - } - if (is_lasr) { - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - true, rn != 31, size); - gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, true); - return; - } - if (rt2 == 31 - && ((rt | rs) & 1) == 0 - && dc_isar_feature(aa64_atomics, s)) { - /* CASP / CASPL */ - gen_compare_and_swap_pair(s, rs, rt, rn, size | 2); - return; - } - break; - - case 0x6: case 0x7: /* CASPA / LDXP */ - if (size & 2) { /* LDXP / LDAXP */ - if (rn == 31) { - gen_check_sp_alignment(s); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - false, rn != 31, size); - s->is_ldex = true; - gen_load_exclusive(s, rt, rt2, clean_addr, size, true); - if (is_lasr) { - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); - } - return; - } - if (rt2 == 31 - && ((rt | rs) & 1) == 0 - && dc_isar_feature(aa64_atomics, s)) { - /* CASPA / CASPAL */ - gen_compare_and_swap_pair(s, rs, rt, rn, size | 2); - return; - } - break; - - case 0xa: /* CAS */ - case 0xb: /* CASL */ - case 0xe: /* CASA */ - case 0xf: /* CASAL */ - if (rt2 == 31 && dc_isar_feature(aa64_atomics, s)) { - gen_compare_and_swap(s, rs, rt, rn, size); - return; - } - break; - } - unallocated_encoding(s); -} - -/* - * Load register (literal) - * - * 31 30 29 27 26 25 24 23 5 4 0 - * +-----+-------+---+-----+-------------------+-------+ - * | opc | 0 1 1 | V | 0 0 | imm19 | Rt | - * +-----+-------+---+-----+-------------------+-------+ - * - * V: 1 -> vector (simd/fp) - * opc (non-vector): 00 -> 32 bit, 01 -> 64 bit, - * 10-> 32 bit signed, 11 -> prefetch - * opc (vector): 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit (11 unallocated) - */ -static void disas_ld_lit(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int64_t imm = sextract32(insn, 5, 19) << 2; - bool is_vector = extract32(insn, 26, 1); - int opc = extract32(insn, 30, 2); - bool is_signed = false; - int size = 2; - TCGv_i64 tcg_rt, clean_addr; - - if (is_vector) { - if (opc == 3) { - unallocated_encoding(s); - return; - } - size = 2 + opc; - if (!fp_access_check(s)) { - return; - } - } else { - if (opc == 3) { - /* PRFM (literal) : prefetch */ - return; - } - size = 2 + extract32(opc, 0, 1); - is_signed = extract32(opc, 1, 1); - } - - tcg_rt = cpu_reg(s, rt); - - clean_addr = new_tmp_a64(s); - gen_pc_plus_diff(s, clean_addr, imm); - if (is_vector) { - do_fp_ld(s, rt, clean_addr, size); - } else { - /* Only unsigned 32bit loads target 32bit registers. 
*/ - bool iss_sf = opc != 0; - - do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, - false, true, rt, iss_sf, false); - } -} - -/* - * LDNP (Load Pair - non-temporal hint) - * LDP (Load Pair - non vector) - * LDPSW (Load Pair Signed Word - non vector) - * STNP (Store Pair - non-temporal hint) - * STP (Store Pair - non vector) - * LDNP (Load Pair of SIMD&FP - non-temporal hint) - * LDP (Load Pair of SIMD&FP) - * STNP (Store Pair of SIMD&FP - non-temporal hint) - * STP (Store Pair of SIMD&FP) - * - * 31 30 29 27 26 25 24 23 22 21 15 14 10 9 5 4 0 - * +-----+-------+---+---+-------+---+-----------------------------+ - * | opc | 1 0 1 | V | 0 | index | L | imm7 | Rt2 | Rn | Rt | - * +-----+-------+---+---+-------+---+-------+-------+------+------+ - * - * opc: LDP/STP/LDNP/STNP 00 -> 32 bit, 10 -> 64 bit - * LDPSW/STGP 01 - * LDP/STP/LDNP/STNP (SIMD) 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit - * V: 0 -> GPR, 1 -> Vector - * idx: 00 -> signed offset with non-temporal hint, 01 -> post-index, - * 10 -> signed offset, 11 -> pre-index - * L: 0 -> Store 1 -> Load - * - * Rt, Rt2 = GPR or SIMD registers to be stored - * Rn = general purpose register containing address - * imm7 = signed offset (multiple of 4 or 8 depending on size) - */ -static void disas_ldst_pair(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rt2 = extract32(insn, 10, 5); - uint64_t offset = sextract64(insn, 15, 7); - int index = extract32(insn, 23, 2); - bool is_vector = extract32(insn, 26, 1); - bool is_load = extract32(insn, 22, 1); - int opc = extract32(insn, 30, 2); - - bool is_signed = false; - bool postindex = false; - bool wback = false; - bool set_tag = false; - - TCGv_i64 clean_addr, dirty_addr; - - int size; - - if (opc == 3) { - unallocated_encoding(s); - return; - } - - if (is_vector) { - size = 2 + opc; - } else if (opc == 1 && !is_load) { - /* STGP */ - if (!dc_isar_feature(aa64_mte_insn_reg, s) || index == 0) { - unallocated_encoding(s); - return; - } - size = 3; - set_tag = true; - } else { - size = 2 + extract32(opc, 1, 1); - is_signed = extract32(opc, 0, 1); - if (!is_load && is_signed) { - unallocated_encoding(s); - return; - } - } - - switch (index) { - case 1: /* post-index */ - postindex = true; - wback = true; - break; - case 0: - /* signed offset with "non-temporal" hint. Since we don't emulate - * caches we don't care about hints to the cache system about - * data access patterns, and handle this identically to plain - * signed offset. - */ - if (is_signed) { - /* There is no non-temporal-hint version of LDPSW */ - unallocated_encoding(s); - return; - } - postindex = false; - break; - case 2: /* signed offset, rn not updated */ - postindex = false; - break; - case 3: /* pre-index */ - postindex = false; - wback = true; - break; - } - - if (is_vector && !fp_access_check(s)) { - return; - } - - offset <<= (set_tag ? LOG2_TAG_GRANULE : size); - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - dirty_addr = read_cpu_reg_sp(s, rn, 1); - if (!postindex) { - tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - } - - if (set_tag) { - if (!s->ata) { - /* - * TODO: We could rely on the stores below, at least for - * system mode, if we arrange to add MO_ALIGN_16. 
- */ - gen_helper_stg_stub(cpu_env, dirty_addr); - } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { - gen_helper_stg_parallel(cpu_env, dirty_addr, dirty_addr); - } else { - gen_helper_stg(cpu_env, dirty_addr, dirty_addr); - } - } - - clean_addr = gen_mte_checkN(s, dirty_addr, !is_load, - (wback || rn != 31) && !set_tag, 2 << size); - - if (is_vector) { - if (is_load) { - do_fp_ld(s, rt, clean_addr, size); - } else { - do_fp_st(s, rt, clean_addr, size); - } - tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); - if (is_load) { - do_fp_ld(s, rt2, clean_addr, size); - } else { - do_fp_st(s, rt2, clean_addr, size); - } - } else { - TCGv_i64 tcg_rt = cpu_reg(s, rt); - TCGv_i64 tcg_rt2 = cpu_reg(s, rt2); - - if (is_load) { - TCGv_i64 tmp = tcg_temp_new_i64(); - - /* Do not modify tcg_rt before recognizing any exception - * from the second load. - */ - do_gpr_ld(s, tmp, clean_addr, size + is_signed * MO_SIGN, - false, false, 0, false, false); - tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); - do_gpr_ld(s, tcg_rt2, clean_addr, size + is_signed * MO_SIGN, - false, false, 0, false, false); - - tcg_gen_mov_i64(tcg_rt, tmp); - tcg_temp_free_i64(tmp); - } else { - do_gpr_st(s, tcg_rt, clean_addr, size, - false, 0, false, false); - tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); - do_gpr_st(s, tcg_rt2, clean_addr, size, - false, 0, false, false); - } - } - - if (wback) { - if (postindex) { - tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - } - tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr); - } -} - -/* - * Load/store (immediate post-indexed) - * Load/store (immediate pre-indexed) - * Load/store (unscaled immediate) - * - * 31 30 29 27 26 25 24 23 22 21 20 12 11 10 9 5 4 0 - * +----+-------+---+-----+-----+---+--------+-----+------+------+ - * |size| 1 1 1 | V | 0 0 | opc | 0 | imm9 | idx | Rn | Rt | - * +----+-------+---+-----+-----+---+--------+-----+------+------+ - * - * idx = 01 -> post-indexed, 11 pre-indexed, 00 unscaled imm. 
(no writeback) - 10 -> unprivileged - * V = 0 -> non-vector - * size: 00 -> 8 bit, 01 -> 16 bit, 10 -> 32 bit, 11 -> 64bit - * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 - */ -static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, - int opc, - int size, - int rt, - bool is_vector) -{ - int rn = extract32(insn, 5, 5); - int imm9 = sextract32(insn, 12, 9); - int idx = extract32(insn, 10, 2); - bool is_signed = false; - bool is_store = false; - bool is_extended = false; - bool is_unpriv = (idx == 2); - bool iss_valid; - bool post_index; - bool writeback; - int memidx; - - TCGv_i64 clean_addr, dirty_addr; - - if (is_vector) { - size |= (opc & 2) << 1; - if (size > 4 || is_unpriv) { - unallocated_encoding(s); - return; - } - is_store = ((opc & 1) == 0); - if (!fp_access_check(s)) { - return; - } - } else { - if (size == 3 && opc == 2) { - /* PRFM - prefetch */ - if (idx != 0) { - unallocated_encoding(s); - return; - } - return; - } - if (opc == 3 && size > 1) { - unallocated_encoding(s); - return; - } - is_store = (opc == 0); - is_signed = extract32(opc, 1, 1); - is_extended = (size < 3) && extract32(opc, 0, 1); - } - - switch (idx) { - case 0: - case 2: - post_index = false; - writeback = false; - break; - case 1: - post_index = true; - writeback = true; - break; - case 3: - post_index = false; - writeback = true; - break; - default: - g_assert_not_reached(); - } - - iss_valid = !is_vector && !writeback; - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - dirty_addr = read_cpu_reg_sp(s, rn, 1); - if (!post_index) { - tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); - } - - memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s); - clean_addr = gen_mte_check1_mmuidx(s, dirty_addr, is_store, - writeback || rn != 31, - size, is_unpriv, memidx); - - if (is_vector) { - if (is_store) { - do_fp_st(s, rt, clean_addr, size); - } else { - do_fp_ld(s, rt, clean_addr, size); - } - } else { - TCGv_i64 tcg_rt = cpu_reg(s, rt); - bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); - - if (is_store) { - do_gpr_st_memidx(s, tcg_rt, clean_addr, size, memidx, - iss_valid, rt, iss_sf, false); - } else { - do_gpr_ld_memidx(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, - is_extended, memidx, - iss_valid, rt, iss_sf, false); - } - } - - if (writeback) { - TCGv_i64 tcg_rn = cpu_reg_sp(s, rn); - if (post_index) { - tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); - } - tcg_gen_mov_i64(tcg_rn, dirty_addr); - } -} - -/* - * Load/store (register offset) - * - * 31 30 29 27 26 25 24 23 22 21 20 16 15 13 12 11 10 9 5 4 0 - * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+ - * |size| 1 1 1 | V | 0 0 | opc | 1 | Rm | opt | S| 1 0 | Rn | Rt | - * +----+-------+---+-----+-----+---+------+-----+--+-----+----+----+ - * - * For non-vector: - * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit - * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 - * For vector: - * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated - * opc<0>: 0 -> store, 1 -> load - * V: 1 -> vector/simd - * opt: extend encoding (see DecodeRegExtend) - * S: if S=1 then scale (essentially index by sizeof(size)) - * Rt: register to transfer into/out of - * Rn: address register or SP for base - * Rm: offset register or ZR for offset - */ -static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, - int opc, - int size, - int rt, - bool is_vector) -{ - int rn = extract32(insn, 5, 5); - int shift = extract32(insn, 12, 1); - int rm = extract32(insn, 
16, 5); - int opt = extract32(insn, 13, 3); - bool is_signed = false; - bool is_store = false; - bool is_extended = false; - - TCGv_i64 tcg_rm, clean_addr, dirty_addr; - - if (extract32(opt, 1, 1) == 0) { - unallocated_encoding(s); - return; - } - - if (is_vector) { - size |= (opc & 2) << 1; - if (size > 4) { - unallocated_encoding(s); - return; - } - is_store = !extract32(opc, 0, 1); - if (!fp_access_check(s)) { - return; - } - } else { - if (size == 3 && opc == 2) { - /* PRFM - prefetch */ - return; - } - if (opc == 3 && size > 1) { - unallocated_encoding(s); - return; - } - is_store = (opc == 0); - is_signed = extract32(opc, 1, 1); - is_extended = (size < 3) && extract32(opc, 0, 1); - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - dirty_addr = read_cpu_reg_sp(s, rn, 1); - - tcg_rm = read_cpu_reg(s, rm, 1); - ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0); - - tcg_gen_add_i64(dirty_addr, dirty_addr, tcg_rm); - clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, size); - - if (is_vector) { - if (is_store) { - do_fp_st(s, rt, clean_addr, size); - } else { - do_fp_ld(s, rt, clean_addr, size); - } - } else { - TCGv_i64 tcg_rt = cpu_reg(s, rt); - bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); - if (is_store) { - do_gpr_st(s, tcg_rt, clean_addr, size, - true, rt, iss_sf, false); - } else { - do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, - is_extended, true, rt, iss_sf, false); - } - } -} - -/* - * Load/store (unsigned immediate) - * - * 31 30 29 27 26 25 24 23 22 21 10 9 5 - * +----+-------+---+-----+-----+------------+-------+------+ - * |size| 1 1 1 | V | 0 1 | opc | imm12 | Rn | Rt | - * +----+-------+---+-----+-----+------------+-------+------+ - * - * For non-vector: - * size: 00-> byte, 01 -> 16 bit, 10 -> 32bit, 11 -> 64bit - * opc: 00 -> store, 01 -> loadu, 10 -> loads 64, 11 -> loads 32 - * For vector: - * size is opc<1>:size<1:0> so 100 -> 128 bit; 110 and 111 unallocated - * opc<0>: 0 -> store, 1 -> load - * Rn: base address register (inc SP) - * Rt: target register - */ -static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, - int opc, - int size, - int rt, - bool is_vector) -{ - int rn = extract32(insn, 5, 5); - unsigned int imm12 = extract32(insn, 10, 12); - unsigned int offset; - - TCGv_i64 clean_addr, dirty_addr; - - bool is_store; - bool is_signed = false; - bool is_extended = false; - - if (is_vector) { - size |= (opc & 2) << 1; - if (size > 4) { - unallocated_encoding(s); - return; - } - is_store = !extract32(opc, 0, 1); - if (!fp_access_check(s)) { - return; - } - } else { - if (size == 3 && opc == 2) { - /* PRFM - prefetch */ - return; - } - if (opc == 3 && size > 1) { - unallocated_encoding(s); - return; - } - is_store = (opc == 0); - is_signed = extract32(opc, 1, 1); - is_extended = (size < 3) && extract32(opc, 0, 1); - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - dirty_addr = read_cpu_reg_sp(s, rn, 1); - offset = imm12 << size; - tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, size); - - if (is_vector) { - if (is_store) { - do_fp_st(s, rt, clean_addr, size); - } else { - do_fp_ld(s, rt, clean_addr, size); - } - } else { - TCGv_i64 tcg_rt = cpu_reg(s, rt); - bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); - if (is_store) { - do_gpr_st(s, tcg_rt, clean_addr, size, - true, rt, iss_sf, false); - } else { - do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, - is_extended, true, rt, iss_sf, false); - 
} - } -} - -/* Atomic memory operations - * - * 31 30 27 26 24 22 21 16 15 12 10 5 0 - * +------+-------+---+-----+-----+---+----+----+-----+-----+----+-----+ - * | size | 1 1 1 | V | 0 0 | A R | 1 | Rs | o3 | opc | 0 0 | Rn | Rt | - * +------+-------+---+-----+-----+--------+----+-----+-----+----+-----+ - * - * Rt: the result register - * Rn: base address or SP - * Rs: the source register for the operation - * V: vector flag (always 0 as of v8.3) - * A: acquire flag - * R: release flag - */ -static void disas_ldst_atomic(DisasContext *s, uint32_t insn, - int size, int rt, bool is_vector) -{ - int rs = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int o3_opc = extract32(insn, 12, 4); - bool r = extract32(insn, 22, 1); - bool a = extract32(insn, 23, 1); - TCGv_i64 tcg_rs, tcg_rt, clean_addr; - AtomicThreeOpFn *fn = NULL; - MemOp mop = s->be_data | size | MO_ALIGN; - - if (is_vector || !dc_isar_feature(aa64_atomics, s)) { - unallocated_encoding(s); - return; - } - switch (o3_opc) { - case 000: /* LDADD */ - fn = tcg_gen_atomic_fetch_add_i64; - break; - case 001: /* LDCLR */ - fn = tcg_gen_atomic_fetch_and_i64; - break; - case 002: /* LDEOR */ - fn = tcg_gen_atomic_fetch_xor_i64; - break; - case 003: /* LDSET */ - fn = tcg_gen_atomic_fetch_or_i64; - break; - case 004: /* LDSMAX */ - fn = tcg_gen_atomic_fetch_smax_i64; - mop |= MO_SIGN; - break; - case 005: /* LDSMIN */ - fn = tcg_gen_atomic_fetch_smin_i64; - mop |= MO_SIGN; - break; - case 006: /* LDUMAX */ - fn = tcg_gen_atomic_fetch_umax_i64; - break; - case 007: /* LDUMIN */ - fn = tcg_gen_atomic_fetch_umin_i64; - break; - case 010: /* SWP */ - fn = tcg_gen_atomic_xchg_i64; - break; - case 014: /* LDAPR, LDAPRH, LDAPRB */ - if (!dc_isar_feature(aa64_rcpc_8_3, s) || - rs != 31 || a != 1 || r != 0) { - unallocated_encoding(s); - return; - } - break; - default: - unallocated_encoding(s); - return; - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, size); - - if (o3_opc == 014) { - /* - * LDAPR* are a special case because they are a simple load, not a - * fetch-and-do-something op. - * The architectural consistency requirements here are weaker than - * full load-acquire (we only need "load-acquire processor consistent"), - * but we choose to implement them as full LDAQ. - */ - do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size, false, - true, rt, disas_ldst_compute_iss_sf(size, false, 0), true); - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); - return; - } - - tcg_rs = read_cpu_reg(s, rs, true); - tcg_rt = cpu_reg(s, rt); - - if (o3_opc == 1) { /* LDCLR */ - tcg_gen_not_i64(tcg_rs, tcg_rs); - } - - /* The tcg atomic primitives are all full barriers. Therefore we - * can ignore the Acquire and Release bits of this instruction. - */ - fn(tcg_rt, clean_addr, tcg_rs, get_mem_index(s), mop); - - if ((mop & MO_SIGN) && size != MO_64) { - tcg_gen_ext32u_i64(tcg_rt, tcg_rt); - } -} - -/* - * PAC memory operations - * - * 31 30 27 26 24 22 21 12 11 10 5 0 - * +------+-------+---+-----+-----+---+--------+---+---+----+-----+ - * | size | 1 1 1 | V | 0 0 | M S | 1 | imm9 | W | 1 | Rn | Rt | - * +------+-------+---+-----+-----+---+--------+---+---+----+-----+ - * - * Rt: the result register - * Rn: base address or SP - * V: vector flag (always 0 as of v8.3) - * M: clear for key DA, set for key DB - * W: pre-indexing flag - * S: sign for imm9. 
- */ -static void disas_ldst_pac(DisasContext *s, uint32_t insn, - int size, int rt, bool is_vector) -{ - int rn = extract32(insn, 5, 5); - bool is_wback = extract32(insn, 11, 1); - bool use_key_a = !extract32(insn, 23, 1); - int offset; - TCGv_i64 clean_addr, dirty_addr, tcg_rt; - - if (size != 3 || is_vector || !dc_isar_feature(aa64_pauth, s)) { - unallocated_encoding(s); - return; - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - dirty_addr = read_cpu_reg_sp(s, rn, 1); - - if (s->pauth_active) { - if (use_key_a) { - gen_helper_autda(dirty_addr, cpu_env, dirty_addr, - new_tmp_a64_zero(s)); - } else { - gen_helper_autdb(dirty_addr, cpu_env, dirty_addr, - new_tmp_a64_zero(s)); - } - } - - /* Form the 10-bit signed, scaled offset. */ - offset = (extract32(insn, 22, 1) << 9) | extract32(insn, 12, 9); - offset = sextract32(offset << size, 0, 10 + size); - tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - - /* Note that "clean" and "dirty" here refer to TBI not PAC. */ - clean_addr = gen_mte_check1(s, dirty_addr, false, - is_wback || rn != 31, size); - - tcg_rt = cpu_reg(s, rt); - do_gpr_ld(s, tcg_rt, clean_addr, size, - /* extend */ false, /* iss_valid */ !is_wback, - /* iss_srt */ rt, /* iss_sf */ true, /* iss_ar */ false); - - if (is_wback) { - tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr); - } -} - -/* - * LDAPR/STLR (unscaled immediate) - * - * 31 30 24 22 21 12 10 5 0 - * +------+-------------+-----+---+--------+-----+----+-----+ - * | size | 0 1 1 0 0 1 | opc | 0 | imm9 | 0 0 | Rn | Rt | - * +------+-------------+-----+---+--------+-----+----+-----+ - * - * Rt: source or destination register - * Rn: base register - * imm9: unscaled immediate offset - * opc: 00: STLUR*, 01/10/11: various LDAPUR* - * size: size of load/store - */ -static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int offset = sextract32(insn, 12, 9); - int opc = extract32(insn, 22, 2); - int size = extract32(insn, 30, 2); - TCGv_i64 clean_addr, dirty_addr; - bool is_store = false; - bool extend = false; - bool iss_sf; - MemOp mop; - - if (!dc_isar_feature(aa64_rcpc_8_4, s)) { - unallocated_encoding(s); - return; - } - - /* TODO: ARMv8.4-LSE SCTLR.nAA */ - mop = size | MO_ALIGN; - - switch (opc) { - case 0: /* STLURB */ - is_store = true; - break; - case 1: /* LDAPUR* */ - break; - case 2: /* LDAPURS* 64-bit variant */ - if (size == 3) { - unallocated_encoding(s); - return; - } - mop |= MO_SIGN; - break; - case 3: /* LDAPURS* 32-bit variant */ - if (size > 1) { - unallocated_encoding(s); - return; - } - mop |= MO_SIGN; - extend = true; /* zero-extend 32->64 after signed load */ - break; - default: - g_assert_not_reached(); - } - - iss_sf = disas_ldst_compute_iss_sf(size, (mop & MO_SIGN) != 0, opc); - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - dirty_addr = read_cpu_reg_sp(s, rn, 1); - tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - clean_addr = clean_data_tbi(s, dirty_addr); - - if (is_store) { - /* Store-Release semantics */ - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - do_gpr_st(s, cpu_reg(s, rt), clean_addr, mop, true, rt, iss_sf, true); - } else { - /* - * Load-AcquirePC semantics; we implement as the slightly more - * restrictive Load-Acquire. 
- */ - do_gpr_ld(s, cpu_reg(s, rt), clean_addr, mop, - extend, true, rt, iss_sf, true); - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); - } -} - -/* Load/store register (all forms) */ -static void disas_ldst_reg(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int opc = extract32(insn, 22, 2); - bool is_vector = extract32(insn, 26, 1); - int size = extract32(insn, 30, 2); - - switch (extract32(insn, 24, 2)) { - case 0: - if (extract32(insn, 21, 1) == 0) { - /* Load/store register (unscaled immediate) - * Load/store immediate pre/post-indexed - * Load/store register unprivileged - */ - disas_ldst_reg_imm9(s, insn, opc, size, rt, is_vector); - return; - } - switch (extract32(insn, 10, 2)) { - case 0: - disas_ldst_atomic(s, insn, size, rt, is_vector); - return; - case 2: - disas_ldst_reg_roffset(s, insn, opc, size, rt, is_vector); - return; - default: - disas_ldst_pac(s, insn, size, rt, is_vector); - return; - } - break; - case 1: - disas_ldst_reg_unsigned_imm(s, insn, opc, size, rt, is_vector); - return; - } - unallocated_encoding(s); -} - -/* AdvSIMD load/store multiple structures - * - * 31 30 29 23 22 21 16 15 12 11 10 9 5 4 0 - * +---+---+---------------+---+-------------+--------+------+------+------+ - * | 0 | Q | 0 0 1 1 0 0 0 | L | 0 0 0 0 0 0 | opcode | size | Rn | Rt | - * +---+---+---------------+---+-------------+--------+------+------+------+ - * - * AdvSIMD load/store multiple structures (post-indexed) - * - * 31 30 29 23 22 21 20 16 15 12 11 10 9 5 4 0 - * +---+---+---------------+---+---+---------+--------+------+------+------+ - * | 0 | Q | 0 0 1 1 0 0 1 | L | 0 | Rm | opcode | size | Rn | Rt | - * +---+---+---------------+---+---+---------+--------+------+------+------+ - * - * Rt: first (or only) SIMD&FP register to be transferred - * Rn: base address or SP - * Rm (post-index only): post-index register (when !31) or size dependent #imm - */ -static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 10, 2); - int opcode = extract32(insn, 12, 4); - bool is_store = !extract32(insn, 22, 1); - bool is_postidx = extract32(insn, 23, 1); - bool is_q = extract32(insn, 30, 1); - TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; - MemOp endian, align, mop; - - int total; /* total bytes */ - int elements; /* elements per vector */ - int rpt; /* num iterations */ - int selem; /* structure elements */ - int r; - - if (extract32(insn, 31, 1) || extract32(insn, 21, 1)) { - unallocated_encoding(s); - return; - } - - if (!is_postidx && rm != 0) { - unallocated_encoding(s); - return; - } - - /* From the shared decode logic */ - switch (opcode) { - case 0x0: - rpt = 1; - selem = 4; - break; - case 0x2: - rpt = 4; - selem = 1; - break; - case 0x4: - rpt = 1; - selem = 3; - break; - case 0x6: - rpt = 3; - selem = 1; - break; - case 0x7: - rpt = 1; - selem = 1; - break; - case 0x8: - rpt = 1; - selem = 2; - break; - case 0xa: - rpt = 2; - selem = 1; - break; - default: - unallocated_encoding(s); - return; - } - - if (size == 3 && !is_q && selem != 1) { - /* reserved */ - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - /* For our purposes, bytes are always little-endian. */ - endian = s->be_data; - if (size == 0) { - endian = MO_LE; - } - - total = rpt * selem * (is_q ? 
16 : 8); - tcg_rn = cpu_reg_sp(s, rn); - - /* - * Issue the MTE check vs the logical repeat count, before we - * promote consecutive little-endian elements below. - */ - clean_addr = gen_mte_checkN(s, tcg_rn, is_store, is_postidx || rn != 31, - total); - - /* - * Consecutive little-endian elements from a single register - * can be promoted to a larger little-endian operation. - */ - align = MO_ALIGN; - if (selem == 1 && endian == MO_LE) { - align = pow2_align(size); - size = 3; - } - if (!s->align_mem) { - align = 0; - } - mop = endian | size | align; - - elements = (is_q ? 16 : 8) >> size; - tcg_ebytes = tcg_constant_i64(1 << size); - for (r = 0; r < rpt; r++) { - int e; - for (e = 0; e < elements; e++) { - int xs; - for (xs = 0; xs < selem; xs++) { - int tt = (rt + r + xs) % 32; - if (is_store) { - do_vec_st(s, tt, e, clean_addr, mop); - } else { - do_vec_ld(s, tt, e, clean_addr, mop); - } - tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes); - } - } - } - - if (!is_store) { - /* For non-quad operations, setting a slice of the low - * 64 bits of the register clears the high 64 bits (in - * the ARM ARM pseudocode this is implicit in the fact - * that 'rval' is a 64 bit wide variable). - * For quad operations, we might still need to zero the - * high bits of SVE. - */ - for (r = 0; r < rpt * selem; r++) { - int tt = (rt + r) % 32; - clear_vec_high(s, is_q, tt); - } - } - - if (is_postidx) { - if (rm == 31) { - tcg_gen_addi_i64(tcg_rn, tcg_rn, total); - } else { - tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); - } - } -} - -/* AdvSIMD load/store single structure - * - * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0 - * +---+---+---------------+-----+-----------+-----+---+------+------+------+ - * | 0 | Q | 0 0 1 1 0 1 0 | L R | 0 0 0 0 0 | opc | S | size | Rn | Rt | - * +---+---+---------------+-----+-----------+-----+---+------+------+------+ - * - * AdvSIMD load/store single structure (post-indexed) - * - * 31 30 29 23 22 21 20 16 15 13 12 11 10 9 5 4 0 - * +---+---+---------------+-----+-----------+-----+---+------+------+------+ - * | 0 | Q | 0 0 1 1 0 1 1 | L R | Rm | opc | S | size | Rn | Rt | - * +---+---+---------------+-----+-----------+-----+---+------+------+------+ - * - * Rt: first (or only) SIMD&FP register to be transferred - * Rn: base address or SP - * Rm (post-index only): post-index register (when !31) or size dependent #imm - * index = encoded in Q:S:size dependent on size - * - * lane_size = encoded in R, opc - * transfer width = encoded in opc, S, size - */ -static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 10, 2); - int S = extract32(insn, 12, 1); - int opc = extract32(insn, 13, 3); - int R = extract32(insn, 21, 1); - int is_load = extract32(insn, 22, 1); - int is_postidx = extract32(insn, 23, 1); - int is_q = extract32(insn, 30, 1); - - int scale = extract32(opc, 1, 2); - int selem = (extract32(opc, 0, 1) << 1 | R) + 1; - bool replicate = false; - int index = is_q << 3 | S << 2 | size; - int xs, total; - TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; - MemOp mop; - - if (extract32(insn, 31, 1)) { - unallocated_encoding(s); - return; - } - if (!is_postidx && rm != 0) { - unallocated_encoding(s); - return; - } - - switch (scale) { - case 3: - if (!is_load || S) { - unallocated_encoding(s); - return; - } - scale = size; - replicate = true; - break; - case 0: - break; - case 1: - if (extract32(size, 0, 1)) { - 
unallocated_encoding(s); - return; - } - index >>= 1; - break; - case 2: - if (extract32(size, 1, 1)) { - unallocated_encoding(s); - return; - } - if (!extract32(size, 0, 1)) { - index >>= 2; - } else { - if (S) { - unallocated_encoding(s); - return; - } - index >>= 3; - scale = 3; - } - break; - default: - g_assert_not_reached(); - } - - if (!fp_access_check(s)) { - return; - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - total = selem << scale; - tcg_rn = cpu_reg_sp(s, rn); - - clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31, - total); - mop = finalize_memop(s, scale); - - tcg_ebytes = tcg_constant_i64(1 << scale); - for (xs = 0; xs < selem; xs++) { - if (replicate) { - /* Load and replicate to all elements */ - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - - tcg_gen_qemu_ld_i64(tcg_tmp, clean_addr, get_mem_index(s), mop); - tcg_gen_gvec_dup_i64(scale, vec_full_reg_offset(s, rt), - (is_q + 1) * 8, vec_full_reg_size(s), - tcg_tmp); - tcg_temp_free_i64(tcg_tmp); - } else { - /* Load/store one element per register */ - if (is_load) { - do_vec_ld(s, rt, index, clean_addr, mop); - } else { - do_vec_st(s, rt, index, clean_addr, mop); - } - } - tcg_gen_add_i64(clean_addr, clean_addr, tcg_ebytes); - rt = (rt + 1) % 32; - } - - if (is_postidx) { - if (rm == 31) { - tcg_gen_addi_i64(tcg_rn, tcg_rn, total); - } else { - tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); - } - } -} - -/* - * Load/Store memory tags - * - * 31 30 29 24 22 21 12 10 5 0 - * +-----+-------------+-----+---+------+-----+------+------+ - * | 1 1 | 0 1 1 0 0 1 | op1 | 1 | imm9 | op2 | Rn | Rt | - * +-----+-------------+-----+---+------+-----+------+------+ - */ -static void disas_ldst_tag(DisasContext *s, uint32_t insn) -{ - int rt = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - uint64_t offset = sextract64(insn, 12, 9) << LOG2_TAG_GRANULE; - int op2 = extract32(insn, 10, 2); - int op1 = extract32(insn, 22, 2); - bool is_load = false, is_pair = false, is_zero = false, is_mult = false; - int index = 0; - TCGv_i64 addr, clean_addr, tcg_rt; - - /* We checked insn bits [29:24,21] in the caller. */ - if (extract32(insn, 30, 2) != 3) { - goto do_unallocated; - } - - /* - * @index is a tri-state variable which has 3 states: - * < 0 : post-index, writeback - * = 0 : signed offset - * > 0 : pre-index, writeback - */ - switch (op1) { - case 0: - if (op2 != 0) { - /* STG */ - index = op2 - 2; - } else { - /* STZGM */ - if (s->current_el == 0 || offset != 0) { - goto do_unallocated; - } - is_mult = is_zero = true; - } - break; - case 1: - if (op2 != 0) { - /* STZG */ - is_zero = true; - index = op2 - 2; - } else { - /* LDG */ - is_load = true; - } - break; - case 2: - if (op2 != 0) { - /* ST2G */ - is_pair = true; - index = op2 - 2; - } else { - /* STGM */ - if (s->current_el == 0 || offset != 0) { - goto do_unallocated; - } - is_mult = true; - } - break; - case 3: - if (op2 != 0) { - /* STZ2G */ - is_pair = is_zero = true; - index = op2 - 2; - } else { - /* LDGM */ - if (s->current_el == 0 || offset != 0) { - goto do_unallocated; - } - is_mult = is_load = true; - } - break; - - default: - do_unallocated: - unallocated_encoding(s); - return; - } - - if (is_mult - ? 
!dc_isar_feature(aa64_mte, s) - : !dc_isar_feature(aa64_mte_insn_reg, s)) { - goto do_unallocated; - } - - if (rn == 31) { - gen_check_sp_alignment(s); - } - - addr = read_cpu_reg_sp(s, rn, true); - if (index >= 0) { - /* pre-index or signed offset */ - tcg_gen_addi_i64(addr, addr, offset); - } - - if (is_mult) { - tcg_rt = cpu_reg(s, rt); - - if (is_zero) { - int size = 4 << s->dcz_blocksize; - - if (s->ata) { - gen_helper_stzgm_tags(cpu_env, addr, tcg_rt); - } - /* - * The non-tags portion of STZGM is mostly like DC_ZVA, - * except the alignment happens before the access. - */ - clean_addr = clean_data_tbi(s, addr); - tcg_gen_andi_i64(clean_addr, clean_addr, -size); - gen_helper_dc_zva(cpu_env, clean_addr); - } else if (s->ata) { - if (is_load) { - gen_helper_ldgm(tcg_rt, cpu_env, addr); - } else { - gen_helper_stgm(cpu_env, addr, tcg_rt); - } - } else { - MMUAccessType acc = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE; - int size = 4 << GMID_EL1_BS; - - clean_addr = clean_data_tbi(s, addr); - tcg_gen_andi_i64(clean_addr, clean_addr, -size); - gen_probe_access(s, clean_addr, acc, size); - - if (is_load) { - /* The result tags are zeros. */ - tcg_gen_movi_i64(tcg_rt, 0); - } - } - return; - } - - if (is_load) { - tcg_gen_andi_i64(addr, addr, -TAG_GRANULE); - tcg_rt = cpu_reg(s, rt); - if (s->ata) { - gen_helper_ldg(tcg_rt, cpu_env, addr, tcg_rt); - } else { - clean_addr = clean_data_tbi(s, addr); - gen_probe_access(s, clean_addr, MMU_DATA_LOAD, MO_8); - gen_address_with_allocation_tag0(tcg_rt, addr); - } - } else { - tcg_rt = cpu_reg_sp(s, rt); - if (!s->ata) { - /* - * For STG and ST2G, we need to check alignment and probe memory. - * TODO: For STZG and STZ2G, we could rely on the stores below, - * at least for system mode; user-only won't enforce alignment. 
- */ - if (is_pair) { - gen_helper_st2g_stub(cpu_env, addr); - } else { - gen_helper_stg_stub(cpu_env, addr); - } - } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { - if (is_pair) { - gen_helper_st2g_parallel(cpu_env, addr, tcg_rt); - } else { - gen_helper_stg_parallel(cpu_env, addr, tcg_rt); - } - } else { - if (is_pair) { - gen_helper_st2g(cpu_env, addr, tcg_rt); - } else { - gen_helper_stg(cpu_env, addr, tcg_rt); - } - } - } - - if (is_zero) { - TCGv_i64 clean_addr = clean_data_tbi(s, addr); - TCGv_i64 tcg_zero = tcg_constant_i64(0); - int mem_index = get_mem_index(s); - int i, n = (1 + is_pair) << LOG2_TAG_GRANULE; - - tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, - MO_UQ | MO_ALIGN_16); - for (i = 8; i < n; i += 8) { - tcg_gen_addi_i64(clean_addr, clean_addr, 8); - tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_UQ); - } - } - - if (index != 0) { - /* pre-index or post-index */ - if (index < 0) { - /* post-index */ - tcg_gen_addi_i64(addr, addr, offset); - } - tcg_gen_mov_i64(cpu_reg_sp(s, rn), addr); - } -} - -/* Loads and stores */ -static void disas_ldst(DisasContext *s, uint32_t insn) -{ - switch (extract32(insn, 24, 6)) { - case 0x08: /* Load/store exclusive */ - disas_ldst_excl(s, insn); - break; - case 0x18: case 0x1c: /* Load register (literal) */ - disas_ld_lit(s, insn); - break; - case 0x28: case 0x29: - case 0x2c: case 0x2d: /* Load/store pair (all forms) */ - disas_ldst_pair(s, insn); - break; - case 0x38: case 0x39: - case 0x3c: case 0x3d: /* Load/store register (all forms) */ - disas_ldst_reg(s, insn); - break; - case 0x0c: /* AdvSIMD load/store multiple structures */ - disas_ldst_multiple_struct(s, insn); - break; - case 0x0d: /* AdvSIMD load/store single structure */ - disas_ldst_single_struct(s, insn); - break; - case 0x19: - if (extract32(insn, 21, 1) != 0) { - disas_ldst_tag(s, insn); - } else if (extract32(insn, 10, 2) == 0) { - disas_ldst_ldapr_stlr(s, insn); - } else { - unallocated_encoding(s); - } - break; - default: - unallocated_encoding(s); - break; - } -} - -/* PC-rel. addressing - * 31 30 29 28 24 23 5 4 0 - * +----+-------+-----------+-------------------+------+ - * | op | immlo | 1 0 0 0 0 | immhi | Rd | - * +----+-------+-----------+-------------------+------+ - */ -static void disas_pc_rel_adr(DisasContext *s, uint32_t insn) -{ - unsigned int page, rd; - int64_t offset; - - page = extract32(insn, 31, 1); - /* SignExtend(immhi:immlo) -> offset */ - offset = sextract64(insn, 5, 19); - offset = offset << 2 | extract32(insn, 29, 2); - rd = extract32(insn, 0, 5); - - if (page) { - /* ADRP (page based) */ - offset <<= 12; - /* The page offset is ok for TARGET_TB_PCREL. */ - offset -= s->pc_curr & 0xfff; - } - - gen_pc_plus_diff(s, cpu_reg(s, rd), offset); -} - -/* - * Add/subtract (immediate) - * - * 31 30 29 28 23 22 21 10 9 5 4 0 - * +--+--+--+-------------+--+-------------+-----+-----+ - * |sf|op| S| 1 0 0 0 1 0 |sh| imm12 | Rn | Rd | - * +--+--+--+-------------+--+-------------+-----+-----+ - * - * sf: 0 -> 32bit, 1 -> 64bit - * op: 0 -> add , 1 -> sub - * S: 1 -> set flags - * sh: 1 -> LSL imm by 12 - */ -static void disas_add_sub_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - uint64_t imm = extract32(insn, 10, 12); - bool shift = extract32(insn, 22, 1); - bool setflags = extract32(insn, 29, 1); - bool sub_op = extract32(insn, 30, 1); - bool is_64bit = extract32(insn, 31, 1); - - TCGv_i64 tcg_rn = cpu_reg_sp(s, rn); - TCGv_i64 tcg_rd = setflags ? 
cpu_reg(s, rd) : cpu_reg_sp(s, rd); - TCGv_i64 tcg_result; - - if (shift) { - imm <<= 12; - } - - tcg_result = tcg_temp_new_i64(); - if (!setflags) { - if (sub_op) { - tcg_gen_subi_i64(tcg_result, tcg_rn, imm); - } else { - tcg_gen_addi_i64(tcg_result, tcg_rn, imm); - } - } else { - TCGv_i64 tcg_imm = tcg_constant_i64(imm); - if (sub_op) { - gen_sub_CC(is_64bit, tcg_result, tcg_rn, tcg_imm); - } else { - gen_add_CC(is_64bit, tcg_result, tcg_rn, tcg_imm); - } - } - - if (is_64bit) { - tcg_gen_mov_i64(tcg_rd, tcg_result); - } else { - tcg_gen_ext32u_i64(tcg_rd, tcg_result); - } - - tcg_temp_free_i64(tcg_result); -} - -/* - * Add/subtract (immediate, with tags) - * - * 31 30 29 28 23 22 21 16 14 10 9 5 4 0 - * +--+--+--+-------------+--+---------+--+-------+-----+-----+ - * |sf|op| S| 1 0 0 0 1 1 |o2| uimm6 |o3| uimm4 | Rn | Rd | - * +--+--+--+-------------+--+---------+--+-------+-----+-----+ - * - * op: 0 -> add, 1 -> sub - */ -static void disas_add_sub_imm_with_tags(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int uimm4 = extract32(insn, 10, 4); - int uimm6 = extract32(insn, 16, 6); - bool sub_op = extract32(insn, 30, 1); - TCGv_i64 tcg_rn, tcg_rd; - int imm; - - /* Test all of sf=1, S=0, o2=0, o3=0. */ - if ((insn & 0xa040c000u) != 0x80000000u || - !dc_isar_feature(aa64_mte_insn_reg, s)) { - unallocated_encoding(s); - return; - } - - imm = uimm6 << LOG2_TAG_GRANULE; - if (sub_op) { - imm = -imm; - } - - tcg_rn = cpu_reg_sp(s, rn); - tcg_rd = cpu_reg_sp(s, rd); - - if (s->ata) { - gen_helper_addsubg(tcg_rd, cpu_env, tcg_rn, - tcg_constant_i32(imm), - tcg_constant_i32(uimm4)); - } else { - tcg_gen_addi_i64(tcg_rd, tcg_rn, imm); - gen_address_with_allocation_tag0(tcg_rd, tcg_rd); - } -} - -/* The input should be a value in the bottom e bits (with higher - * bits zero); returns that value replicated into every element - * of size e in a 64 bit integer. - */ -static uint64_t bitfield_replicate(uint64_t mask, unsigned int e) -{ - assert(e != 0); - while (e < 64) { - mask |= mask << e; - e *= 2; - } - return mask; -} - -/* Return a value with the bottom len bits set (where 0 < len <= 64) */ -static inline uint64_t bitmask64(unsigned int length) -{ - assert(length > 0 && length <= 64); - return ~0ULL >> (64 - length); -} - -/* Simplified variant of pseudocode DecodeBitMasks() for the case where we - * only require the wmask. Returns false if the imms/immr/immn are a reserved - * value (ie should cause a guest UNDEF exception), and true if they are - * valid, in which case the decoded bit pattern is written to result. - */ -bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, - unsigned int imms, unsigned int immr) -{ - uint64_t mask; - unsigned e, levels, s, r; - int len; - - assert(immn < 2 && imms < 64 && immr < 64); - - /* The bit patterns we create here are 64 bit patterns which - * are vectors of identical elements of size e = 2, 4, 8, 16, 32 or - * 64 bits each. Each element contains the same value: a run - * of between 1 and e-1 non-zero bits, rotated within the - * element by between 0 and e-1 bits. 
- * - * The element size and run length are encoded into immn (1 bit) - * and imms (6 bits) as follows: - * 64 bit elements: immn = 1, imms = - * 32 bit elements: immn = 0, imms = 0 : - * 16 bit elements: immn = 0, imms = 10 : - * 8 bit elements: immn = 0, imms = 110 : - * 4 bit elements: immn = 0, imms = 1110 : - * 2 bit elements: immn = 0, imms = 11110 : - * Notice that immn = 0, imms = 11111x is the only combination - * not covered by one of the above options; this is reserved. - * Further, all-ones is a reserved pattern. - * - * In all cases the rotation is by immr % e (and immr is 6 bits). - */ - - /* First determine the element size */ - len = 31 - clz32((immn << 6) | (~imms & 0x3f)); - if (len < 1) { - /* This is the immn == 0, imms == 0x11111x case */ - return false; - } - e = 1 << len; - - levels = e - 1; - s = imms & levels; - r = immr & levels; - - if (s == levels) { - /* mustn't be all-ones. */ - return false; - } - - /* Create the value of one element: s+1 set bits rotated - * by r within the element (which is e bits wide)... - */ - mask = bitmask64(s + 1); - if (r) { - mask = (mask >> r) | (mask << (e - r)); - mask &= bitmask64(e); - } - /* ...then replicate the element over the whole 64 bit value */ - mask = bitfield_replicate(mask, e); - *result = mask; - return true; -} - -/* Logical (immediate) - * 31 30 29 28 23 22 21 16 15 10 9 5 4 0 - * +----+-----+-------------+---+------+------+------+------+ - * | sf | opc | 1 0 0 1 0 0 | N | immr | imms | Rn | Rd | - * +----+-----+-------------+---+------+------+------+------+ - */ -static void disas_logic_imm(DisasContext *s, uint32_t insn) -{ - unsigned int sf, opc, is_n, immr, imms, rn, rd; - TCGv_i64 tcg_rd, tcg_rn; - uint64_t wmask; - bool is_and = false; - - sf = extract32(insn, 31, 1); - opc = extract32(insn, 29, 2); - is_n = extract32(insn, 22, 1); - immr = extract32(insn, 16, 6); - imms = extract32(insn, 10, 6); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - if (!sf && is_n) { - unallocated_encoding(s); - return; - } - - if (opc == 0x3) { /* ANDS */ - tcg_rd = cpu_reg(s, rd); - } else { - tcg_rd = cpu_reg_sp(s, rd); - } - tcg_rn = cpu_reg(s, rn); - - if (!logic_imm_decode_wmask(&wmask, is_n, imms, immr)) { - /* some immediate field values are reserved */ - unallocated_encoding(s); - return; - } - - if (!sf) { - wmask &= 0xffffffff; - } - - switch (opc) { - case 0x3: /* ANDS */ - case 0x0: /* AND */ - tcg_gen_andi_i64(tcg_rd, tcg_rn, wmask); - is_and = true; - break; - case 0x1: /* ORR */ - tcg_gen_ori_i64(tcg_rd, tcg_rn, wmask); - break; - case 0x2: /* EOR */ - tcg_gen_xori_i64(tcg_rd, tcg_rn, wmask); - break; - default: - assert(FALSE); /* must handle all above */ - break; - } - - if (!sf && !is_and) { - /* zero extend final result; we know we can skip this for AND - * since the immediate had the high 32 bits clear. 
- */ - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } - - if (opc == 3) { /* ANDS */ - gen_logic_CC(sf, tcg_rd); - } -} - -/* - * Move wide (immediate) - * - * 31 30 29 28 23 22 21 20 5 4 0 - * +--+-----+-------------+-----+----------------+------+ - * |sf| opc | 1 0 0 1 0 1 | hw | imm16 | Rd | - * +--+-----+-------------+-----+----------------+------+ - * - * sf: 0 -> 32 bit, 1 -> 64 bit - * opc: 00 -> N, 10 -> Z, 11 -> K - * hw: shift/16 (0,16, and sf only 32, 48) - */ -static void disas_movw_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - uint64_t imm = extract32(insn, 5, 16); - int sf = extract32(insn, 31, 1); - int opc = extract32(insn, 29, 2); - int pos = extract32(insn, 21, 2) << 4; - TCGv_i64 tcg_rd = cpu_reg(s, rd); - - if (!sf && (pos >= 32)) { - unallocated_encoding(s); - return; - } - - switch (opc) { - case 0: /* MOVN */ - case 2: /* MOVZ */ - imm <<= pos; - if (opc == 0) { - imm = ~imm; - } - if (!sf) { - imm &= 0xffffffffu; - } - tcg_gen_movi_i64(tcg_rd, imm); - break; - case 3: /* MOVK */ - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_constant_i64(imm), pos, 16); - if (!sf) { - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } - break; - default: - unallocated_encoding(s); - break; - } -} - -/* Bitfield - * 31 30 29 28 23 22 21 16 15 10 9 5 4 0 - * +----+-----+-------------+---+------+------+------+------+ - * | sf | opc | 1 0 0 1 1 0 | N | immr | imms | Rn | Rd | - * +----+-----+-------------+---+------+------+------+------+ - */ -static void disas_bitfield(DisasContext *s, uint32_t insn) -{ - unsigned int sf, n, opc, ri, si, rn, rd, bitsize, pos, len; - TCGv_i64 tcg_rd, tcg_tmp; - - sf = extract32(insn, 31, 1); - opc = extract32(insn, 29, 2); - n = extract32(insn, 22, 1); - ri = extract32(insn, 16, 6); - si = extract32(insn, 10, 6); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - bitsize = sf ? 64 : 32; - - if (sf != n || ri >= bitsize || si >= bitsize || opc > 2) { - unallocated_encoding(s); - return; - } - - tcg_rd = cpu_reg(s, rd); - - /* Suppress the zero-extend for !sf. Since RI and SI are constrained - to be smaller than bitsize, we'll never reference data outside the - low 32-bits anyway. */ - tcg_tmp = read_cpu_reg(s, rn, 1); - - /* Recognize simple(r) extractions. */ - if (si >= ri) { - /* Wd = Wn */ - len = (si - ri) + 1; - if (opc == 0) { /* SBFM: ASR, SBFX, SXTB, SXTH, SXTW */ - tcg_gen_sextract_i64(tcg_rd, tcg_tmp, ri, len); - goto done; - } else if (opc == 2) { /* UBFM: UBFX, LSR, UXTB, UXTH */ - tcg_gen_extract_i64(tcg_rd, tcg_tmp, ri, len); - return; - } - /* opc == 1, BFXIL fall through to deposit */ - tcg_gen_shri_i64(tcg_tmp, tcg_tmp, ri); - pos = 0; - } else { - /* Handle the ri > si case with a deposit - * Wd<32+s-r,32-r> = Wn - */ - len = si + 1; - pos = (bitsize - ri) & (bitsize - 1); - } - - if (opc == 0 && len < ri) { - /* SBFM: sign extend the destination field from len to fill - the balance of the word. Let the deposit below insert all - of those sign bits. */ - tcg_gen_sextract_i64(tcg_tmp, tcg_tmp, 0, len); - len = ri; - } - - if (opc == 1) { /* BFM, BFXIL */ - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, pos, len); - } else { - /* SBFM or UBFM: We start with zero, and we haven't modified - any bits outside bitsize, therefore the zero-extension - below is unneeded. 
*/ - tcg_gen_deposit_z_i64(tcg_rd, tcg_tmp, pos, len); - return; - } - - done: - if (!sf) { /* zero extend final result */ - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } -} - -/* Extract - * 31 30 29 28 23 22 21 20 16 15 10 9 5 4 0 - * +----+------+-------------+---+----+------+--------+------+------+ - * | sf | op21 | 1 0 0 1 1 1 | N | o0 | Rm | imms | Rn | Rd | - * +----+------+-------------+---+----+------+--------+------+------+ - */ -static void disas_extract(DisasContext *s, uint32_t insn) -{ - unsigned int sf, n, rm, imm, rn, rd, bitsize, op21, op0; - - sf = extract32(insn, 31, 1); - n = extract32(insn, 22, 1); - rm = extract32(insn, 16, 5); - imm = extract32(insn, 10, 6); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - op21 = extract32(insn, 29, 2); - op0 = extract32(insn, 21, 1); - bitsize = sf ? 64 : 32; - - if (sf != n || op21 || op0 || imm >= bitsize) { - unallocated_encoding(s); - } else { - TCGv_i64 tcg_rd, tcg_rm, tcg_rn; - - tcg_rd = cpu_reg(s, rd); - - if (unlikely(imm == 0)) { - /* tcg shl_i32/shl_i64 is undefined for 32/64 bit shifts, - * so an extract from bit 0 is a special case. - */ - if (sf) { - tcg_gen_mov_i64(tcg_rd, cpu_reg(s, rm)); - } else { - tcg_gen_ext32u_i64(tcg_rd, cpu_reg(s, rm)); - } - } else { - tcg_rm = cpu_reg(s, rm); - tcg_rn = cpu_reg(s, rn); - - if (sf) { - /* Specialization to ROR happens in EXTRACT2. */ - tcg_gen_extract2_i64(tcg_rd, tcg_rm, tcg_rn, imm); - } else { - TCGv_i32 t0 = tcg_temp_new_i32(); - - tcg_gen_extrl_i64_i32(t0, tcg_rm); - if (rm == rn) { - tcg_gen_rotri_i32(t0, t0, imm); - } else { - TCGv_i32 t1 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t1, tcg_rn); - tcg_gen_extract2_i32(t0, t0, t1, imm); - tcg_temp_free_i32(t1); - } - tcg_gen_extu_i32_i64(tcg_rd, t0); - tcg_temp_free_i32(t0); - } - } - } -} - -/* Data processing - immediate */ -static void disas_data_proc_imm(DisasContext *s, uint32_t insn) -{ - switch (extract32(insn, 23, 6)) { - case 0x20: case 0x21: /* PC-rel. addressing */ - disas_pc_rel_adr(s, insn); - break; - case 0x22: /* Add/subtract (immediate) */ - disas_add_sub_imm(s, insn); - break; - case 0x23: /* Add/subtract (immediate, with tags) */ - disas_add_sub_imm_with_tags(s, insn); - break; - case 0x24: /* Logical (immediate) */ - disas_logic_imm(s, insn); - break; - case 0x25: /* Move wide (immediate) */ - disas_movw_imm(s, insn); - break; - case 0x26: /* Bitfield */ - disas_bitfield(s, insn); - break; - case 0x27: /* Extract */ - disas_extract(s, insn); - break; - default: - unallocated_encoding(s); - break; - } -} - -/* Shift a TCGv src by TCGv shift_amount, put result in dst. - * Note that it is the caller's responsibility to ensure that the - * shift amount is in range (ie 0..31 or 0..63) and provide the ARM - * mandated semantics for out of range shifts. - */ -static void shift_reg(TCGv_i64 dst, TCGv_i64 src, int sf, - enum a64_shift_type shift_type, TCGv_i64 shift_amount) -{ - switch (shift_type) { - case A64_SHIFT_TYPE_LSL: - tcg_gen_shl_i64(dst, src, shift_amount); - break; - case A64_SHIFT_TYPE_LSR: - tcg_gen_shr_i64(dst, src, shift_amount); - break; - case A64_SHIFT_TYPE_ASR: - if (!sf) { - tcg_gen_ext32s_i64(dst, src); - } - tcg_gen_sar_i64(dst, sf ? 
src : dst, shift_amount); - break; - case A64_SHIFT_TYPE_ROR: - if (sf) { - tcg_gen_rotr_i64(dst, src, shift_amount); - } else { - TCGv_i32 t0, t1; - t0 = tcg_temp_new_i32(); - t1 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t0, src); - tcg_gen_extrl_i64_i32(t1, shift_amount); - tcg_gen_rotr_i32(t0, t0, t1); - tcg_gen_extu_i32_i64(dst, t0); - tcg_temp_free_i32(t0); - tcg_temp_free_i32(t1); - } - break; - default: - assert(FALSE); /* all shift types should be handled */ - break; - } - - if (!sf) { /* zero extend final result */ - tcg_gen_ext32u_i64(dst, dst); - } -} - -/* Shift a TCGv src by immediate, put result in dst. - * The shift amount must be in range (this should always be true as the - * relevant instructions will UNDEF on bad shift immediates). - */ -static void shift_reg_imm(TCGv_i64 dst, TCGv_i64 src, int sf, - enum a64_shift_type shift_type, unsigned int shift_i) -{ - assert(shift_i < (sf ? 64 : 32)); - - if (shift_i == 0) { - tcg_gen_mov_i64(dst, src); - } else { - shift_reg(dst, src, sf, shift_type, tcg_constant_i64(shift_i)); - } -} - -/* Logical (shifted register) - * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0 - * +----+-----+-----------+-------+---+------+--------+------+------+ - * | sf | opc | 0 1 0 1 0 | shift | N | Rm | imm6 | Rn | Rd | - * +----+-----+-----------+-------+---+------+--------+------+------+ - */ -static void disas_logic_reg(DisasContext *s, uint32_t insn) -{ - TCGv_i64 tcg_rd, tcg_rn, tcg_rm; - unsigned int sf, opc, shift_type, invert, rm, shift_amount, rn, rd; - - sf = extract32(insn, 31, 1); - opc = extract32(insn, 29, 2); - shift_type = extract32(insn, 22, 2); - invert = extract32(insn, 21, 1); - rm = extract32(insn, 16, 5); - shift_amount = extract32(insn, 10, 6); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - if (!sf && (shift_amount & (1 << 5))) { - unallocated_encoding(s); - return; - } - - tcg_rd = cpu_reg(s, rd); - - if (opc == 1 && shift_amount == 0 && shift_type == 0 && rn == 31) { - /* Unshifted ORR and ORN with WZR/XZR is the standard encoding for - * register-register MOV and MVN, so it is worth special casing. 
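The width handling in shift_reg() above (32-bit operands are shifted at 32 bits, ASR sign-extends first, and W-form results keep bits 63:32 clear) can be sketched in plain C along these lines; shift_operand() is an invented name and the signed shifts assume an arithmetic right shift:

#include <stdint.h>
#include <stdio.h>

enum shift_type { SH_LSL, SH_LSR, SH_ASR, SH_ROR };

static uint64_t shift_operand(uint64_t v, int sf,
                              enum shift_type type, unsigned amount)
{
    if (!sf) {
        uint32_t w = (uint32_t)v;
        amount &= 31;
        switch (type) {
        case SH_LSL: w <<= amount; break;
        case SH_LSR: w >>= amount; break;
        case SH_ASR: w = (uint32_t)((int32_t)w >> amount); break;
        case SH_ROR: w = amount ? (w >> amount) | (w << (32 - amount)) : w; break;
        }
        return w;                 /* W-form result: bits 63:32 are zero */
    }
    amount &= 63;
    switch (type) {
    case SH_LSL: return v << amount;
    case SH_LSR: return v >> amount;
    case SH_ASR: return (uint64_t)((int64_t)v >> amount);
    case SH_ROR: return amount ? (v >> amount) | (v << (64 - amount)) : v;
    }
    return v;
}

int main(void)
{
    /* 32-bit ASR of 0x80000000 by 4: 0xf8000000, upper half stays zero */
    printf("%#llx\n",
           (unsigned long long)shift_operand(0x80000000u, 0, SH_ASR, 4));
    return 0;
}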
- */ - tcg_rm = cpu_reg(s, rm); - if (invert) { - tcg_gen_not_i64(tcg_rd, tcg_rm); - if (!sf) { - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } - } else { - if (sf) { - tcg_gen_mov_i64(tcg_rd, tcg_rm); - } else { - tcg_gen_ext32u_i64(tcg_rd, tcg_rm); - } - } - return; - } - - tcg_rm = read_cpu_reg(s, rm, sf); - - if (shift_amount) { - shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, shift_amount); - } - - tcg_rn = cpu_reg(s, rn); - - switch (opc | (invert << 2)) { - case 0: /* AND */ - case 3: /* ANDS */ - tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm); - break; - case 1: /* ORR */ - tcg_gen_or_i64(tcg_rd, tcg_rn, tcg_rm); - break; - case 2: /* EOR */ - tcg_gen_xor_i64(tcg_rd, tcg_rn, tcg_rm); - break; - case 4: /* BIC */ - case 7: /* BICS */ - tcg_gen_andc_i64(tcg_rd, tcg_rn, tcg_rm); - break; - case 5: /* ORN */ - tcg_gen_orc_i64(tcg_rd, tcg_rn, tcg_rm); - break; - case 6: /* EON */ - tcg_gen_eqv_i64(tcg_rd, tcg_rn, tcg_rm); - break; - default: - assert(FALSE); - break; - } - - if (!sf) { - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } - - if (opc == 3) { - gen_logic_CC(sf, tcg_rd); - } -} - -/* - * Add/subtract (extended register) - * - * 31|30|29|28 24|23 22|21|20 16|15 13|12 10|9 5|4 0| - * +--+--+--+-----------+-----+--+-------+------+------+----+----+ - * |sf|op| S| 0 1 0 1 1 | opt | 1| Rm |option| imm3 | Rn | Rd | - * +--+--+--+-----------+-----+--+-------+------+------+----+----+ - * - * sf: 0 -> 32bit, 1 -> 64bit - * op: 0 -> add , 1 -> sub - * S: 1 -> set flags - * opt: 00 - * option: extension type (see DecodeRegExtend) - * imm3: optional shift to Rm - * - * Rd = Rn + LSL(extend(Rm), amount) - */ -static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int imm3 = extract32(insn, 10, 3); - int option = extract32(insn, 13, 3); - int rm = extract32(insn, 16, 5); - int opt = extract32(insn, 22, 2); - bool setflags = extract32(insn, 29, 1); - bool sub_op = extract32(insn, 30, 1); - bool sf = extract32(insn, 31, 1); - - TCGv_i64 tcg_rm, tcg_rn; /* temps */ - TCGv_i64 tcg_rd; - TCGv_i64 tcg_result; - - if (imm3 > 4 || opt != 0) { - unallocated_encoding(s); - return; - } - - /* non-flag setting ops may use SP */ - if (!setflags) { - tcg_rd = cpu_reg_sp(s, rd); - } else { - tcg_rd = cpu_reg(s, rd); - } - tcg_rn = read_cpu_reg_sp(s, rn, sf); - - tcg_rm = read_cpu_reg(s, rm, sf); - ext_and_shift_reg(tcg_rm, tcg_rm, option, imm3); - - tcg_result = tcg_temp_new_i64(); - - if (!setflags) { - if (sub_op) { - tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm); - } else { - tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm); - } - } else { - if (sub_op) { - gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm); - } else { - gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm); - } - } - - if (sf) { - tcg_gen_mov_i64(tcg_rd, tcg_result); - } else { - tcg_gen_ext32u_i64(tcg_rd, tcg_result); - } - - tcg_temp_free_i64(tcg_result); -} - -/* - * Add/subtract (shifted register) - * - * 31 30 29 28 24 23 22 21 20 16 15 10 9 5 4 0 - * +--+--+--+-----------+-----+--+-------+---------+------+------+ - * |sf|op| S| 0 1 0 1 1 |shift| 0| Rm | imm6 | Rn | Rd | - * +--+--+--+-----------+-----+--+-------+---------+------+------+ - * - * sf: 0 -> 32bit, 1 -> 64bit - * op: 0 -> add , 1 -> sub - * S: 1 -> set flags - * shift: 00 -> LSL, 01 -> LSR, 10 -> ASR, 11 -> RESERVED - * imm6: Shift amount to apply to Rm before the add/sub - */ -static void disas_add_sub_reg(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int imm6 = 
extract32(insn, 10, 6); - int rm = extract32(insn, 16, 5); - int shift_type = extract32(insn, 22, 2); - bool setflags = extract32(insn, 29, 1); - bool sub_op = extract32(insn, 30, 1); - bool sf = extract32(insn, 31, 1); - - TCGv_i64 tcg_rd = cpu_reg(s, rd); - TCGv_i64 tcg_rn, tcg_rm; - TCGv_i64 tcg_result; - - if ((shift_type == 3) || (!sf && (imm6 > 31))) { - unallocated_encoding(s); - return; - } - - tcg_rn = read_cpu_reg(s, rn, sf); - tcg_rm = read_cpu_reg(s, rm, sf); - - shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, imm6); - - tcg_result = tcg_temp_new_i64(); - - if (!setflags) { - if (sub_op) { - tcg_gen_sub_i64(tcg_result, tcg_rn, tcg_rm); - } else { - tcg_gen_add_i64(tcg_result, tcg_rn, tcg_rm); - } - } else { - if (sub_op) { - gen_sub_CC(sf, tcg_result, tcg_rn, tcg_rm); - } else { - gen_add_CC(sf, tcg_result, tcg_rn, tcg_rm); - } - } - - if (sf) { - tcg_gen_mov_i64(tcg_rd, tcg_result); - } else { - tcg_gen_ext32u_i64(tcg_rd, tcg_result); - } - - tcg_temp_free_i64(tcg_result); -} - -/* Data-processing (3 source) - * - * 31 30 29 28 24 23 21 20 16 15 14 10 9 5 4 0 - * +--+------+-----------+------+------+----+------+------+------+ - * |sf| op54 | 1 1 0 1 1 | op31 | Rm | o0 | Ra | Rn | Rd | - * +--+------+-----------+------+------+----+------+------+------+ - */ -static void disas_data_proc_3src(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int ra = extract32(insn, 10, 5); - int rm = extract32(insn, 16, 5); - int op_id = (extract32(insn, 29, 3) << 4) | - (extract32(insn, 21, 3) << 1) | - extract32(insn, 15, 1); - bool sf = extract32(insn, 31, 1); - bool is_sub = extract32(op_id, 0, 1); - bool is_high = extract32(op_id, 2, 1); - bool is_signed = false; - TCGv_i64 tcg_op1; - TCGv_i64 tcg_op2; - TCGv_i64 tcg_tmp; - - /* Note that op_id is sf:op54:op31:o0 so it includes the 32/64 size flag */ - switch (op_id) { - case 0x42: /* SMADDL */ - case 0x43: /* SMSUBL */ - case 0x44: /* SMULH */ - is_signed = true; - break; - case 0x0: /* MADD (32bit) */ - case 0x1: /* MSUB (32bit) */ - case 0x40: /* MADD (64bit) */ - case 0x41: /* MSUB (64bit) */ - case 0x4a: /* UMADDL */ - case 0x4b: /* UMSUBL */ - case 0x4c: /* UMULH */ - break; - default: - unallocated_encoding(s); - return; - } - - if (is_high) { - TCGv_i64 low_bits = tcg_temp_new_i64(); /* low bits discarded */ - TCGv_i64 tcg_rd = cpu_reg(s, rd); - TCGv_i64 tcg_rn = cpu_reg(s, rn); - TCGv_i64 tcg_rm = cpu_reg(s, rm); - - if (is_signed) { - tcg_gen_muls2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm); - } else { - tcg_gen_mulu2_i64(low_bits, tcg_rd, tcg_rn, tcg_rm); - } - - tcg_temp_free_i64(low_bits); - return; - } - - tcg_op1 = tcg_temp_new_i64(); - tcg_op2 = tcg_temp_new_i64(); - tcg_tmp = tcg_temp_new_i64(); - - if (op_id < 0x42) { - tcg_gen_mov_i64(tcg_op1, cpu_reg(s, rn)); - tcg_gen_mov_i64(tcg_op2, cpu_reg(s, rm)); - } else { - if (is_signed) { - tcg_gen_ext32s_i64(tcg_op1, cpu_reg(s, rn)); - tcg_gen_ext32s_i64(tcg_op2, cpu_reg(s, rm)); - } else { - tcg_gen_ext32u_i64(tcg_op1, cpu_reg(s, rn)); - tcg_gen_ext32u_i64(tcg_op2, cpu_reg(s, rm)); - } - } - - if (ra == 31 && !is_sub) { - /* Special-case MADD with rA == XZR; it is the standard MUL alias */ - tcg_gen_mul_i64(cpu_reg(s, rd), tcg_op1, tcg_op2); - } else { - tcg_gen_mul_i64(tcg_tmp, tcg_op1, tcg_op2); - if (is_sub) { - tcg_gen_sub_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp); - } else { - tcg_gen_add_i64(cpu_reg(s, rd), cpu_reg(s, ra), tcg_tmp); - } - } - - if (!sf) { - tcg_gen_ext32u_i64(cpu_reg(s, rd), cpu_reg(s, rd)); - } - - 
tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_tmp); -} - -/* Add/subtract (with carry) - * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 10 9 5 4 0 - * +--+--+--+------------------------+------+-------------+------+-----+ - * |sf|op| S| 1 1 0 1 0 0 0 0 | rm | 0 0 0 0 0 0 | Rn | Rd | - * +--+--+--+------------------------+------+-------------+------+-----+ - */ - -static void disas_adc_sbc(DisasContext *s, uint32_t insn) -{ - unsigned int sf, op, setflags, rm, rn, rd; - TCGv_i64 tcg_y, tcg_rn, tcg_rd; - - sf = extract32(insn, 31, 1); - op = extract32(insn, 30, 1); - setflags = extract32(insn, 29, 1); - rm = extract32(insn, 16, 5); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - tcg_rd = cpu_reg(s, rd); - tcg_rn = cpu_reg(s, rn); - - if (op) { - tcg_y = new_tmp_a64(s); - tcg_gen_not_i64(tcg_y, cpu_reg(s, rm)); - } else { - tcg_y = cpu_reg(s, rm); - } - - if (setflags) { - gen_adc_CC(sf, tcg_rd, tcg_rn, tcg_y); - } else { - gen_adc(sf, tcg_rd, tcg_rn, tcg_y); - } -} - -/* - * Rotate right into flags - * 31 30 29 21 15 10 5 4 0 - * +--+--+--+-----------------+--------+-----------+------+--+------+ - * |sf|op| S| 1 1 0 1 0 0 0 0 | imm6 | 0 0 0 0 1 | Rn |o2| mask | - * +--+--+--+-----------------+--------+-----------+------+--+------+ - */ -static void disas_rotate_right_into_flags(DisasContext *s, uint32_t insn) -{ - int mask = extract32(insn, 0, 4); - int o2 = extract32(insn, 4, 1); - int rn = extract32(insn, 5, 5); - int imm6 = extract32(insn, 15, 6); - int sf_op_s = extract32(insn, 29, 3); - TCGv_i64 tcg_rn; - TCGv_i32 nzcv; - - if (sf_op_s != 5 || o2 != 0 || !dc_isar_feature(aa64_condm_4, s)) { - unallocated_encoding(s); - return; - } - - tcg_rn = read_cpu_reg(s, rn, 1); - tcg_gen_rotri_i64(tcg_rn, tcg_rn, imm6); - - nzcv = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(nzcv, tcg_rn); - - if (mask & 8) { /* N */ - tcg_gen_shli_i32(cpu_NF, nzcv, 31 - 3); - } - if (mask & 4) { /* Z */ - tcg_gen_not_i32(cpu_ZF, nzcv); - tcg_gen_andi_i32(cpu_ZF, cpu_ZF, 4); - } - if (mask & 2) { /* C */ - tcg_gen_extract_i32(cpu_CF, nzcv, 1, 1); - } - if (mask & 1) { /* V */ - tcg_gen_shli_i32(cpu_VF, nzcv, 31 - 0); - } - - tcg_temp_free_i32(nzcv); -} - -/* - * Evaluate into flags - * 31 30 29 21 15 14 10 5 4 0 - * +--+--+--+-----------------+---------+----+---------+------+--+------+ - * |sf|op| S| 1 1 0 1 0 0 0 0 | opcode2 | sz | 0 0 1 0 | Rn |o3| mask | - * +--+--+--+-----------------+---------+----+---------+------+--+------+ - */ -static void disas_evaluate_into_flags(DisasContext *s, uint32_t insn) -{ - int o3_mask = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int o2 = extract32(insn, 15, 6); - int sz = extract32(insn, 14, 1); - int sf_op_s = extract32(insn, 29, 3); - TCGv_i32 tmp; - int shift; - - if (sf_op_s != 1 || o2 != 0 || o3_mask != 0xd || - !dc_isar_feature(aa64_condm_4, s)) { - unallocated_encoding(s); - return; - } - shift = sz ? 
16 : 24; /* SETF16 or SETF8 */ - - tmp = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(tmp, cpu_reg(s, rn)); - tcg_gen_shli_i32(cpu_NF, tmp, shift); - tcg_gen_shli_i32(cpu_VF, tmp, shift - 1); - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_xor_i32(cpu_VF, cpu_VF, cpu_NF); - tcg_temp_free_i32(tmp); -} - -/* Conditional compare (immediate / register) - * 31 30 29 28 27 26 25 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0 - * +--+--+--+------------------------+--------+------+----+--+------+--+-----+ - * |sf|op| S| 1 1 0 1 0 0 1 0 |imm5/rm | cond |i/r |o2| Rn |o3|nzcv | - * +--+--+--+------------------------+--------+------+----+--+------+--+-----+ - * [1] y [0] [0] - */ -static void disas_cc(DisasContext *s, uint32_t insn) -{ - unsigned int sf, op, y, cond, rn, nzcv, is_imm; - TCGv_i32 tcg_t0, tcg_t1, tcg_t2; - TCGv_i64 tcg_tmp, tcg_y, tcg_rn; - DisasCompare c; - - if (!extract32(insn, 29, 1)) { - unallocated_encoding(s); - return; - } - if (insn & (1 << 10 | 1 << 4)) { - unallocated_encoding(s); - return; - } - sf = extract32(insn, 31, 1); - op = extract32(insn, 30, 1); - is_imm = extract32(insn, 11, 1); - y = extract32(insn, 16, 5); /* y = rm (reg) or imm5 (imm) */ - cond = extract32(insn, 12, 4); - rn = extract32(insn, 5, 5); - nzcv = extract32(insn, 0, 4); - - /* Set T0 = !COND. */ - tcg_t0 = tcg_temp_new_i32(); - arm_test_cc(&c, cond); - tcg_gen_setcondi_i32(tcg_invert_cond(c.cond), tcg_t0, c.value, 0); - arm_free_cc(&c); - - /* Load the arguments for the new comparison. */ - if (is_imm) { - tcg_y = new_tmp_a64(s); - tcg_gen_movi_i64(tcg_y, y); - } else { - tcg_y = cpu_reg(s, y); - } - tcg_rn = cpu_reg(s, rn); - - /* Set the flags for the new comparison. */ - tcg_tmp = tcg_temp_new_i64(); - if (op) { - gen_sub_CC(sf, tcg_tmp, tcg_rn, tcg_y); - } else { - gen_add_CC(sf, tcg_tmp, tcg_rn, tcg_y); - } - tcg_temp_free_i64(tcg_tmp); - - /* If COND was false, force the flags to #nzcv. Compute two masks - * to help with this: T1 = (COND ? 0 : -1), T2 = (COND ? -1 : 0). - * For tcg hosts that support ANDC, we can make do with just T1. - * In either case, allow the tcg optimizer to delete any unused mask. 
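The two-mask trick described in the comment above can be shown on ordinary integers; this sketch (ccmp_flag() is an invented name, and it handles a single flag bit rather than the full NZCV update) keeps the computed flag when the condition holds and substitutes the #nzcv bit otherwise, without a branch:

#include <stdint.h>
#include <stdio.h>

static uint32_t ccmp_flag(uint32_t computed, uint32_t forced, int cond_holds)
{
    uint32_t t0 = !cond_holds;    /* 1 if the condition failed */
    uint32_t t1 = -t0;            /* T1 = cond ? 0 : 0xffffffff */
    uint32_t t2 = t0 - 1;         /* T2 = cond ? 0xffffffff : 0 */

    /* keep the computed flag when cond holds, the forced bit otherwise */
    return (computed & t2) | (forced & t1);
}

int main(void)
{
    printf("%u\n", ccmp_flag(0, 1, 0));   /* cond failed -> forced:   1 */
    printf("%u\n", ccmp_flag(0, 1, 1));   /* cond held   -> computed: 0 */
    return 0;
}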
- */ - tcg_t1 = tcg_temp_new_i32(); - tcg_t2 = tcg_temp_new_i32(); - tcg_gen_neg_i32(tcg_t1, tcg_t0); - tcg_gen_subi_i32(tcg_t2, tcg_t0, 1); - - if (nzcv & 8) { /* N */ - tcg_gen_or_i32(cpu_NF, cpu_NF, tcg_t1); - } else { - if (TCG_TARGET_HAS_andc_i32) { - tcg_gen_andc_i32(cpu_NF, cpu_NF, tcg_t1); - } else { - tcg_gen_and_i32(cpu_NF, cpu_NF, tcg_t2); - } - } - if (nzcv & 4) { /* Z */ - if (TCG_TARGET_HAS_andc_i32) { - tcg_gen_andc_i32(cpu_ZF, cpu_ZF, tcg_t1); - } else { - tcg_gen_and_i32(cpu_ZF, cpu_ZF, tcg_t2); - } - } else { - tcg_gen_or_i32(cpu_ZF, cpu_ZF, tcg_t0); - } - if (nzcv & 2) { /* C */ - tcg_gen_or_i32(cpu_CF, cpu_CF, tcg_t0); - } else { - if (TCG_TARGET_HAS_andc_i32) { - tcg_gen_andc_i32(cpu_CF, cpu_CF, tcg_t1); - } else { - tcg_gen_and_i32(cpu_CF, cpu_CF, tcg_t2); - } - } - if (nzcv & 1) { /* V */ - tcg_gen_or_i32(cpu_VF, cpu_VF, tcg_t1); - } else { - if (TCG_TARGET_HAS_andc_i32) { - tcg_gen_andc_i32(cpu_VF, cpu_VF, tcg_t1); - } else { - tcg_gen_and_i32(cpu_VF, cpu_VF, tcg_t2); - } - } - tcg_temp_free_i32(tcg_t0); - tcg_temp_free_i32(tcg_t1); - tcg_temp_free_i32(tcg_t2); -} - -/* Conditional select - * 31 30 29 28 21 20 16 15 12 11 10 9 5 4 0 - * +----+----+---+-----------------+------+------+-----+------+------+ - * | sf | op | S | 1 1 0 1 0 1 0 0 | Rm | cond | op2 | Rn | Rd | - * +----+----+---+-----------------+------+------+-----+------+------+ - */ -static void disas_cond_select(DisasContext *s, uint32_t insn) -{ - unsigned int sf, else_inv, rm, cond, else_inc, rn, rd; - TCGv_i64 tcg_rd, zero; - DisasCompare64 c; - - if (extract32(insn, 29, 1) || extract32(insn, 11, 1)) { - /* S == 1 or op2<1> == 1 */ - unallocated_encoding(s); - return; - } - sf = extract32(insn, 31, 1); - else_inv = extract32(insn, 30, 1); - rm = extract32(insn, 16, 5); - cond = extract32(insn, 12, 4); - else_inc = extract32(insn, 10, 1); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - tcg_rd = cpu_reg(s, rd); - - a64_test_cc(&c, cond); - zero = tcg_constant_i64(0); - - if (rn == 31 && rm == 31 && (else_inc ^ else_inv)) { - /* CSET & CSETM. 
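For reference, the else_inv/else_inc decode above maps onto the CSEL family as in this plain-C sketch (cond_select() is an invented name); the Rn == Rm == XZR cases are the CSET/CSETM aliases handled by the fast path:

#include <stdint.h>
#include <stdio.h>

static uint64_t cond_select(int cond_holds, uint64_t t_true, uint64_t t_false,
                            int else_inv, int else_inc)
{
    if (else_inv && else_inc) {
        t_false = -t_false;            /* CSNEG */
    } else if (else_inv) {
        t_false = ~t_false;            /* CSINV */
    } else if (else_inc) {
        t_false = t_false + 1;         /* CSINC */
    }                                  /* neither bit set: plain CSEL */
    return cond_holds ? t_true : t_false;
}

int main(void)
{
    /* CSINC Xd, XZR, XZR, cond: 0 when cond holds, 1 otherwise
     * (the CSET alias encodes the inverted condition) */
    printf("%llu\n", (unsigned long long)cond_select(1, 0, 0, 0, 1)); /* 0 */
    printf("%llu\n", (unsigned long long)cond_select(0, 0, 0, 0, 1)); /* 1 */
    /* CSINV Xd, XZR, XZR, cond: all-ones when cond fails (CSETM alias) */
    printf("%#llx\n", (unsigned long long)cond_select(0, 0, 0, 1, 0));
    return 0;
}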
*/ - tcg_gen_setcond_i64(tcg_invert_cond(c.cond), tcg_rd, c.value, zero); - if (else_inv) { - tcg_gen_neg_i64(tcg_rd, tcg_rd); - } - } else { - TCGv_i64 t_true = cpu_reg(s, rn); - TCGv_i64 t_false = read_cpu_reg(s, rm, 1); - if (else_inv && else_inc) { - tcg_gen_neg_i64(t_false, t_false); - } else if (else_inv) { - tcg_gen_not_i64(t_false, t_false); - } else if (else_inc) { - tcg_gen_addi_i64(t_false, t_false, 1); - } - tcg_gen_movcond_i64(c.cond, tcg_rd, c.value, zero, t_true, t_false); - } - - a64_free_cc(&c); - - if (!sf) { - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } -} - -static void handle_clz(DisasContext *s, unsigned int sf, - unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_rd, tcg_rn; - tcg_rd = cpu_reg(s, rd); - tcg_rn = cpu_reg(s, rn); - - if (sf) { - tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64); - } else { - TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); - tcg_gen_clzi_i32(tcg_tmp32, tcg_tmp32, 32); - tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); - tcg_temp_free_i32(tcg_tmp32); - } -} - -static void handle_cls(DisasContext *s, unsigned int sf, - unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_rd, tcg_rn; - tcg_rd = cpu_reg(s, rd); - tcg_rn = cpu_reg(s, rn); - - if (sf) { - tcg_gen_clrsb_i64(tcg_rd, tcg_rn); - } else { - TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); - tcg_gen_clrsb_i32(tcg_tmp32, tcg_tmp32); - tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); - tcg_temp_free_i32(tcg_tmp32); - } -} - -static void handle_rbit(DisasContext *s, unsigned int sf, - unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_rd, tcg_rn; - tcg_rd = cpu_reg(s, rd); - tcg_rn = cpu_reg(s, rn); - - if (sf) { - gen_helper_rbit64(tcg_rd, tcg_rn); - } else { - TCGv_i32 tcg_tmp32 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(tcg_tmp32, tcg_rn); - gen_helper_rbit(tcg_tmp32, tcg_tmp32); - tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32); - tcg_temp_free_i32(tcg_tmp32); - } -} - -/* REV with sf==1, opcode==3 ("REV64") */ -static void handle_rev64(DisasContext *s, unsigned int sf, - unsigned int rn, unsigned int rd) -{ - if (!sf) { - unallocated_encoding(s); - return; - } - tcg_gen_bswap64_i64(cpu_reg(s, rd), cpu_reg(s, rn)); -} - -/* REV with sf==0, opcode==2 - * REV32 (sf==1, opcode==2) - */ -static void handle_rev32(DisasContext *s, unsigned int sf, - unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_rd = cpu_reg(s, rd); - TCGv_i64 tcg_rn = cpu_reg(s, rn); - - if (sf) { - tcg_gen_bswap64_i64(tcg_rd, tcg_rn); - tcg_gen_rotri_i64(tcg_rd, tcg_rd, 32); - } else { - tcg_gen_bswap32_i64(tcg_rd, tcg_rn, TCG_BSWAP_OZ); - } -} - -/* REV16 (opcode==1) */ -static void handle_rev16(DisasContext *s, unsigned int sf, - unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_rd = cpu_reg(s, rd); - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf); - TCGv_i64 mask = tcg_constant_i64(sf ? 
0x00ff00ff00ff00ffull : 0x00ff00ff); - - tcg_gen_shri_i64(tcg_tmp, tcg_rn, 8); - tcg_gen_and_i64(tcg_rd, tcg_rn, mask); - tcg_gen_and_i64(tcg_tmp, tcg_tmp, mask); - tcg_gen_shli_i64(tcg_rd, tcg_rd, 8); - tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_tmp); - - tcg_temp_free_i64(tcg_tmp); -} - -/* Data-processing (1 source) - * 31 30 29 28 21 20 16 15 10 9 5 4 0 - * +----+---+---+-----------------+---------+--------+------+------+ - * | sf | 1 | S | 1 1 0 1 0 1 1 0 | opcode2 | opcode | Rn | Rd | - * +----+---+---+-----------------+---------+--------+------+------+ - */ -static void disas_data_proc_1src(DisasContext *s, uint32_t insn) -{ - unsigned int sf, opcode, opcode2, rn, rd; - TCGv_i64 tcg_rd; - - if (extract32(insn, 29, 1)) { - unallocated_encoding(s); - return; - } - - sf = extract32(insn, 31, 1); - opcode = extract32(insn, 10, 6); - opcode2 = extract32(insn, 16, 5); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - -#define MAP(SF, O2, O1) ((SF) | (O1 << 1) | (O2 << 7)) - - switch (MAP(sf, opcode2, opcode)) { - case MAP(0, 0x00, 0x00): /* RBIT */ - case MAP(1, 0x00, 0x00): - handle_rbit(s, sf, rn, rd); - break; - case MAP(0, 0x00, 0x01): /* REV16 */ - case MAP(1, 0x00, 0x01): - handle_rev16(s, sf, rn, rd); - break; - case MAP(0, 0x00, 0x02): /* REV/REV32 */ - case MAP(1, 0x00, 0x02): - handle_rev32(s, sf, rn, rd); - break; - case MAP(1, 0x00, 0x03): /* REV64 */ - handle_rev64(s, sf, rn, rd); - break; - case MAP(0, 0x00, 0x04): /* CLZ */ - case MAP(1, 0x00, 0x04): - handle_clz(s, sf, rn, rd); - break; - case MAP(0, 0x00, 0x05): /* CLS */ - case MAP(1, 0x00, 0x05): - handle_cls(s, sf, rn, rd); - break; - case MAP(1, 0x01, 0x00): /* PACIA */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x01): /* PACIB */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x02): /* PACDA */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x03): /* PACDB */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x04): /* AUTIA */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autia(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x05): /* AUTIB */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autib(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x06): /* AUTDA */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autda(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case MAP(1, 0x01, 0x07): /* AUTDB */ - if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, cpu_reg_sp(s, rn)); - } else if (!dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - break; - case 
MAP(1, 0x01, 0x08): /* PACIZA */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x09): /* PACIZB */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x0a): /* PACDZA */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x0b): /* PACDZB */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_pacdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x0c): /* AUTIZA */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autia(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x0d): /* AUTIZB */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autib(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x0e): /* AUTDZA */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autda(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x0f): /* AUTDZB */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_autdb(tcg_rd, cpu_env, tcg_rd, new_tmp_a64_zero(s)); - } - break; - case MAP(1, 0x01, 0x10): /* XPACI */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_xpaci(tcg_rd, cpu_env, tcg_rd); - } - break; - case MAP(1, 0x01, 0x11): /* XPACD */ - if (!dc_isar_feature(aa64_pauth, s) || rn != 31) { - goto do_unallocated; - } else if (s->pauth_active) { - tcg_rd = cpu_reg(s, rd); - gen_helper_xpacd(tcg_rd, cpu_env, tcg_rd); - } - break; - default: - do_unallocated: - unallocated_encoding(s); - break; - } - -#undef MAP -} - -static void handle_div(DisasContext *s, bool is_signed, unsigned int sf, - unsigned int rm, unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_n, tcg_m, tcg_rd; - tcg_rd = cpu_reg(s, rd); - - if (!sf && is_signed) { - tcg_n = new_tmp_a64(s); - tcg_m = new_tmp_a64(s); - tcg_gen_ext32s_i64(tcg_n, cpu_reg(s, rn)); - tcg_gen_ext32s_i64(tcg_m, cpu_reg(s, rm)); - } else { - tcg_n = read_cpu_reg(s, rn, sf); - tcg_m = read_cpu_reg(s, rm, sf); - } - - if (is_signed) { - gen_helper_sdiv64(tcg_rd, tcg_n, tcg_m); - } else { - gen_helper_udiv64(tcg_rd, tcg_n, tcg_m); - } - - if (!sf) { /* zero extend final result */ - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } -} - -/* LSLV, LSRV, ASRV, RORV */ -static void handle_shift_reg(DisasContext *s, - enum a64_shift_type shift_type, unsigned int sf, - unsigned int rm, unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_shift = tcg_temp_new_i64(); - TCGv_i64 tcg_rd = cpu_reg(s, rd); - TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf); - - 
tcg_gen_andi_i64(tcg_shift, cpu_reg(s, rm), sf ? 63 : 31); - shift_reg(tcg_rd, tcg_rn, sf, shift_type, tcg_shift); - tcg_temp_free_i64(tcg_shift); -} - -/* CRC32[BHWX], CRC32C[BHWX] */ -static void handle_crc32(DisasContext *s, - unsigned int sf, unsigned int sz, bool crc32c, - unsigned int rm, unsigned int rn, unsigned int rd) -{ - TCGv_i64 tcg_acc, tcg_val; - TCGv_i32 tcg_bytes; - - if (!dc_isar_feature(aa64_crc32, s) - || (sf == 1 && sz != 3) - || (sf == 0 && sz == 3)) { - unallocated_encoding(s); - return; - } - - if (sz == 3) { - tcg_val = cpu_reg(s, rm); - } else { - uint64_t mask; - switch (sz) { - case 0: - mask = 0xFF; - break; - case 1: - mask = 0xFFFF; - break; - case 2: - mask = 0xFFFFFFFF; - break; - default: - g_assert_not_reached(); - } - tcg_val = new_tmp_a64(s); - tcg_gen_andi_i64(tcg_val, cpu_reg(s, rm), mask); - } - - tcg_acc = cpu_reg(s, rn); - tcg_bytes = tcg_constant_i32(1 << sz); - - if (crc32c) { - gen_helper_crc32c_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes); - } else { - gen_helper_crc32_64(cpu_reg(s, rd), tcg_acc, tcg_val, tcg_bytes); - } -} - -/* Data-processing (2 source) - * 31 30 29 28 21 20 16 15 10 9 5 4 0 - * +----+---+---+-----------------+------+--------+------+------+ - * | sf | 0 | S | 1 1 0 1 0 1 1 0 | Rm | opcode | Rn | Rd | - * +----+---+---+-----------------+------+--------+------+------+ - */ -static void disas_data_proc_2src(DisasContext *s, uint32_t insn) -{ - unsigned int sf, rm, opcode, rn, rd, setflag; - sf = extract32(insn, 31, 1); - setflag = extract32(insn, 29, 1); - rm = extract32(insn, 16, 5); - opcode = extract32(insn, 10, 6); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - if (setflag && opcode != 0) { - unallocated_encoding(s); - return; - } - - switch (opcode) { - case 0: /* SUBP(S) */ - if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { - goto do_unallocated; - } else { - TCGv_i64 tcg_n, tcg_m, tcg_d; - - tcg_n = read_cpu_reg_sp(s, rn, true); - tcg_m = read_cpu_reg_sp(s, rm, true); - tcg_gen_sextract_i64(tcg_n, tcg_n, 0, 56); - tcg_gen_sextract_i64(tcg_m, tcg_m, 0, 56); - tcg_d = cpu_reg(s, rd); - - if (setflag) { - gen_sub_CC(true, tcg_d, tcg_n, tcg_m); - } else { - tcg_gen_sub_i64(tcg_d, tcg_n, tcg_m); - } - } - break; - case 2: /* UDIV */ - handle_div(s, false, sf, rm, rn, rd); - break; - case 3: /* SDIV */ - handle_div(s, true, sf, rm, rn, rd); - break; - case 4: /* IRG */ - if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { - goto do_unallocated; - } - if (s->ata) { - gen_helper_irg(cpu_reg_sp(s, rd), cpu_env, - cpu_reg_sp(s, rn), cpu_reg(s, rm)); - } else { - gen_address_with_allocation_tag0(cpu_reg_sp(s, rd), - cpu_reg_sp(s, rn)); - } - break; - case 5: /* GMI */ - if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { - goto do_unallocated; - } else { - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_extract_i64(t, cpu_reg_sp(s, rn), 56, 4); - tcg_gen_shl_i64(t, tcg_constant_i64(1), t); - tcg_gen_or_i64(cpu_reg(s, rd), cpu_reg(s, rm), t); - - tcg_temp_free_i64(t); - } - break; - case 8: /* LSLV */ - handle_shift_reg(s, A64_SHIFT_TYPE_LSL, sf, rm, rn, rd); - break; - case 9: /* LSRV */ - handle_shift_reg(s, A64_SHIFT_TYPE_LSR, sf, rm, rn, rd); - break; - case 10: /* ASRV */ - handle_shift_reg(s, A64_SHIFT_TYPE_ASR, sf, rm, rn, rd); - break; - case 11: /* RORV */ - handle_shift_reg(s, A64_SHIFT_TYPE_ROR, sf, rm, rn, rd); - break; - case 12: /* PACGA */ - if (sf == 0 || !dc_isar_feature(aa64_pauth, s)) { - goto do_unallocated; - } - gen_helper_pacga(cpu_reg(s, rd), cpu_env, - cpu_reg(s, rn), 
cpu_reg_sp(s, rm)); - break; - case 16: - case 17: - case 18: - case 19: - case 20: - case 21: - case 22: - case 23: /* CRC32 */ - { - int sz = extract32(opcode, 0, 2); - bool crc32c = extract32(opcode, 2, 1); - handle_crc32(s, sf, sz, crc32c, rm, rn, rd); - break; - } - default: - do_unallocated: - unallocated_encoding(s); - break; - } -} - -/* - * Data processing - register - * 31 30 29 28 25 21 20 16 10 0 - * +--+---+--+---+-------+-----+-------+-------+---------+ - * | |op0| |op1| 1 0 1 | op2 | | op3 | | - * +--+---+--+---+-------+-----+-------+-------+---------+ - */ -static void disas_data_proc_reg(DisasContext *s, uint32_t insn) -{ - int op0 = extract32(insn, 30, 1); - int op1 = extract32(insn, 28, 1); - int op2 = extract32(insn, 21, 4); - int op3 = extract32(insn, 10, 6); - - if (!op1) { - if (op2 & 8) { - if (op2 & 1) { - /* Add/sub (extended register) */ - disas_add_sub_ext_reg(s, insn); - } else { - /* Add/sub (shifted register) */ - disas_add_sub_reg(s, insn); - } - } else { - /* Logical (shifted register) */ - disas_logic_reg(s, insn); - } - return; - } - - switch (op2) { - case 0x0: - switch (op3) { - case 0x00: /* Add/subtract (with carry) */ - disas_adc_sbc(s, insn); - break; - - case 0x01: /* Rotate right into flags */ - case 0x21: - disas_rotate_right_into_flags(s, insn); - break; - - case 0x02: /* Evaluate into flags */ - case 0x12: - case 0x22: - case 0x32: - disas_evaluate_into_flags(s, insn); - break; - - default: - goto do_unallocated; - } - break; - - case 0x2: /* Conditional compare */ - disas_cc(s, insn); /* both imm and reg forms */ - break; - - case 0x4: /* Conditional select */ - disas_cond_select(s, insn); - break; - - case 0x6: /* Data-processing */ - if (op0) { /* (1 source) */ - disas_data_proc_1src(s, insn); - } else { /* (2 source) */ - disas_data_proc_2src(s, insn); - } - break; - case 0x8 ... 0xf: /* (3 source) */ - disas_data_proc_3src(s, insn); - break; - - default: - do_unallocated: - unallocated_encoding(s); - break; - } -} - -static void handle_fp_compare(DisasContext *s, int size, - unsigned int rn, unsigned int rm, - bool cmp_with_zero, bool signal_all_nans) -{ - TCGv_i64 tcg_flags = tcg_temp_new_i64(); - TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); - - if (size == MO_64) { - TCGv_i64 tcg_vn, tcg_vm; - - tcg_vn = read_fp_dreg(s, rn); - if (cmp_with_zero) { - tcg_vm = tcg_constant_i64(0); - } else { - tcg_vm = read_fp_dreg(s, rm); - } - if (signal_all_nans) { - gen_helper_vfp_cmped_a64(tcg_flags, tcg_vn, tcg_vm, fpst); - } else { - gen_helper_vfp_cmpd_a64(tcg_flags, tcg_vn, tcg_vm, fpst); - } - tcg_temp_free_i64(tcg_vn); - tcg_temp_free_i64(tcg_vm); - } else { - TCGv_i32 tcg_vn = tcg_temp_new_i32(); - TCGv_i32 tcg_vm = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_vn, rn, 0, size); - if (cmp_with_zero) { - tcg_gen_movi_i32(tcg_vm, 0); - } else { - read_vec_element_i32(s, tcg_vm, rm, 0, size); - } - - switch (size) { - case MO_32: - if (signal_all_nans) { - gen_helper_vfp_cmpes_a64(tcg_flags, tcg_vn, tcg_vm, fpst); - } else { - gen_helper_vfp_cmps_a64(tcg_flags, tcg_vn, tcg_vm, fpst); - } - break; - case MO_16: - if (signal_all_nans) { - gen_helper_vfp_cmpeh_a64(tcg_flags, tcg_vn, tcg_vm, fpst); - } else { - gen_helper_vfp_cmph_a64(tcg_flags, tcg_vn, tcg_vm, fpst); - } - break; - default: - g_assert_not_reached(); - } - - tcg_temp_free_i32(tcg_vn); - tcg_temp_free_i32(tcg_vm); - } - - tcg_temp_free_ptr(fpst); - - gen_set_nzcv(tcg_flags); - - tcg_temp_free_i64(tcg_flags); -} - -/* Floating point compare - * 31 30 29 28 24 23 22 21 20 16 15 14 13 10 9 5 4 0 - * +---+---+---+-----------+------+---+------+-----+---------+------+-------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | op | 1 0 0 0 | Rn | op2 | - * +---+---+---+-----------+------+---+------+-----+---------+------+-------+ - */ -static void disas_fp_compare(DisasContext *s, uint32_t insn) -{ - unsigned int mos, type, rm, op, rn, opc, op2r; - int size; - - mos = extract32(insn, 29, 3); - type = extract32(insn, 22, 2); - rm = extract32(insn, 16, 5); - op = extract32(insn, 14, 2); - rn = extract32(insn, 5, 5); - opc = extract32(insn, 3, 2); - op2r = extract32(insn, 0, 3); - - if (mos || op || op2r) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - size = MO_32; - break; - case 1: - size = MO_64; - break; - case 3: - size = MO_16; - if (dc_isar_feature(aa64_fp16, s)) { - break; - } - /* fallthru */ - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - handle_fp_compare(s, size, rn, rm, opc & 1, opc & 2); -} - -/* Floating point conditional compare - * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 3 0 - * +---+---+---+-----------+------+---+------+------+-----+------+----+------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 0 1 | Rn | op | nzcv | - * +---+---+---+-----------+------+---+------+------+-----+------+----+------+ - */ -static void disas_fp_ccomp(DisasContext *s, uint32_t insn) -{ - unsigned int mos, type, rm, cond, rn, op, nzcv; - TCGLabel *label_continue = NULL; - int size; - - mos = extract32(insn, 29, 3); - type = extract32(insn, 22, 2); - rm = extract32(insn, 16, 5); - cond = extract32(insn, 12, 4); - rn = extract32(insn, 5, 5); - op = extract32(insn, 4, 1); - nzcv = extract32(insn, 0, 4); - - if (mos) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - size = MO_32; - break; - case 1: - size = MO_64; - break; - case 3: - size = MO_16; - if (dc_isar_feature(aa64_fp16, s)) { - break; - } - /* fallthru */ - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (cond < 0x0e) { /* not always */ - TCGLabel *label_match = gen_new_label(); - label_continue = gen_new_label(); - 
arm_gen_test_cc(cond, label_match); - /* nomatch: */ - gen_set_nzcv(tcg_constant_i64(nzcv << 28)); - tcg_gen_br(label_continue); - gen_set_label(label_match); - } - - handle_fp_compare(s, size, rn, rm, false, op); - - if (cond < 0x0e) { - gen_set_label(label_continue); - } -} - -/* Floating point conditional select - * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+---+------+------+-----+------+------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | cond | 1 1 | Rn | Rd | - * +---+---+---+-----------+------+---+------+------+-----+------+------+ - */ -static void disas_fp_csel(DisasContext *s, uint32_t insn) -{ - unsigned int mos, type, rm, cond, rn, rd; - TCGv_i64 t_true, t_false; - DisasCompare64 c; - MemOp sz; - - mos = extract32(insn, 29, 3); - type = extract32(insn, 22, 2); - rm = extract32(insn, 16, 5); - cond = extract32(insn, 12, 4); - rn = extract32(insn, 5, 5); - rd = extract32(insn, 0, 5); - - if (mos) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - sz = MO_32; - break; - case 1: - sz = MO_64; - break; - case 3: - sz = MO_16; - if (dc_isar_feature(aa64_fp16, s)) { - break; - } - /* fallthru */ - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - /* Zero extend sreg & hreg inputs to 64 bits now. */ - t_true = tcg_temp_new_i64(); - t_false = tcg_temp_new_i64(); - read_vec_element(s, t_true, rn, 0, sz); - read_vec_element(s, t_false, rm, 0, sz); - - a64_test_cc(&c, cond); - tcg_gen_movcond_i64(c.cond, t_true, c.value, tcg_constant_i64(0), - t_true, t_false); - tcg_temp_free_i64(t_false); - a64_free_cc(&c); - - /* Note that sregs & hregs write back zeros to the high bits, - and we've already done the zero-extension. */ - write_fp_dreg(s, rd, t_true); - tcg_temp_free_i64(t_true); -} - -/* Floating-point data-processing (1 source) - half precision */ -static void handle_fp_1src_half(DisasContext *s, int opcode, int rd, int rn) -{ - TCGv_ptr fpst = NULL; - TCGv_i32 tcg_op = read_fp_hreg(s, rn); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - switch (opcode) { - case 0x0: /* FMOV */ - tcg_gen_mov_i32(tcg_res, tcg_op); - break; - case 0x1: /* FABS */ - tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff); - break; - case 0x2: /* FNEG */ - tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); - break; - case 0x3: /* FSQRT */ - fpst = fpstatus_ptr(FPST_FPCR_F16); - gen_helper_sqrt_f16(tcg_res, tcg_op, fpst); - break; - case 0x8: /* FRINTN */ - case 0x9: /* FRINTP */ - case 0xa: /* FRINTM */ - case 0xb: /* FRINTZ */ - case 0xc: /* FRINTA */ - { - TCGv_i32 tcg_rmode = tcg_const_i32(arm_rmode_to_sf(opcode & 7)); - fpst = fpstatus_ptr(FPST_FPCR_F16); - - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst); - - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - tcg_temp_free_i32(tcg_rmode); - break; - } - case 0xe: /* FRINTX */ - fpst = fpstatus_ptr(FPST_FPCR_F16); - gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, fpst); - break; - case 0xf: /* FRINTI */ - fpst = fpstatus_ptr(FPST_FPCR_F16); - gen_helper_advsimd_rinth(tcg_res, tcg_op, fpst); - break; - default: - g_assert_not_reached(); - } - - write_fp_sreg(s, rd, tcg_res); - - if (fpst) { - tcg_temp_free_ptr(fpst); - } - tcg_temp_free_i32(tcg_op); - tcg_temp_free_i32(tcg_res); -} - -/* Floating-point data-processing (1 source) - single precision */ -static void handle_fp_1src_single(DisasContext *s, int opcode, int rd, int rn) -{ - void (*gen_fpst)(TCGv_i32, TCGv_i32, TCGv_ptr); - TCGv_i32 tcg_op, tcg_res; - 
TCGv_ptr fpst; - int rmode = -1; - - tcg_op = read_fp_sreg(s, rn); - tcg_res = tcg_temp_new_i32(); - - switch (opcode) { - case 0x0: /* FMOV */ - tcg_gen_mov_i32(tcg_res, tcg_op); - goto done; - case 0x1: /* FABS */ - gen_helper_vfp_abss(tcg_res, tcg_op); - goto done; - case 0x2: /* FNEG */ - gen_helper_vfp_negs(tcg_res, tcg_op); - goto done; - case 0x3: /* FSQRT */ - gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); - goto done; - case 0x6: /* BFCVT */ - gen_fpst = gen_helper_bfcvt; - break; - case 0x8: /* FRINTN */ - case 0x9: /* FRINTP */ - case 0xa: /* FRINTM */ - case 0xb: /* FRINTZ */ - case 0xc: /* FRINTA */ - rmode = arm_rmode_to_sf(opcode & 7); - gen_fpst = gen_helper_rints; - break; - case 0xe: /* FRINTX */ - gen_fpst = gen_helper_rints_exact; - break; - case 0xf: /* FRINTI */ - gen_fpst = gen_helper_rints; - break; - case 0x10: /* FRINT32Z */ - rmode = float_round_to_zero; - gen_fpst = gen_helper_frint32_s; - break; - case 0x11: /* FRINT32X */ - gen_fpst = gen_helper_frint32_s; - break; - case 0x12: /* FRINT64Z */ - rmode = float_round_to_zero; - gen_fpst = gen_helper_frint64_s; - break; - case 0x13: /* FRINT64X */ - gen_fpst = gen_helper_frint64_s; - break; - default: - g_assert_not_reached(); - } - - fpst = fpstatus_ptr(FPST_FPCR); - if (rmode >= 0) { - TCGv_i32 tcg_rmode = tcg_const_i32(rmode); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - gen_fpst(tcg_res, tcg_op, fpst); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - tcg_temp_free_i32(tcg_rmode); - } else { - gen_fpst(tcg_res, tcg_op, fpst); - } - tcg_temp_free_ptr(fpst); - - done: - write_fp_sreg(s, rd, tcg_res); - tcg_temp_free_i32(tcg_op); - tcg_temp_free_i32(tcg_res); -} - -/* Floating-point data-processing (1 source) - double precision */ -static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn) -{ - void (*gen_fpst)(TCGv_i64, TCGv_i64, TCGv_ptr); - TCGv_i64 tcg_op, tcg_res; - TCGv_ptr fpst; - int rmode = -1; - - switch (opcode) { - case 0x0: /* FMOV */ - gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0); - return; - } - - tcg_op = read_fp_dreg(s, rn); - tcg_res = tcg_temp_new_i64(); - - switch (opcode) { - case 0x1: /* FABS */ - gen_helper_vfp_absd(tcg_res, tcg_op); - goto done; - case 0x2: /* FNEG */ - gen_helper_vfp_negd(tcg_res, tcg_op); - goto done; - case 0x3: /* FSQRT */ - gen_helper_vfp_sqrtd(tcg_res, tcg_op, cpu_env); - goto done; - case 0x8: /* FRINTN */ - case 0x9: /* FRINTP */ - case 0xa: /* FRINTM */ - case 0xb: /* FRINTZ */ - case 0xc: /* FRINTA */ - rmode = arm_rmode_to_sf(opcode & 7); - gen_fpst = gen_helper_rintd; - break; - case 0xe: /* FRINTX */ - gen_fpst = gen_helper_rintd_exact; - break; - case 0xf: /* FRINTI */ - gen_fpst = gen_helper_rintd; - break; - case 0x10: /* FRINT32Z */ - rmode = float_round_to_zero; - gen_fpst = gen_helper_frint32_d; - break; - case 0x11: /* FRINT32X */ - gen_fpst = gen_helper_frint32_d; - break; - case 0x12: /* FRINT64Z */ - rmode = float_round_to_zero; - gen_fpst = gen_helper_frint64_d; - break; - case 0x13: /* FRINT64X */ - gen_fpst = gen_helper_frint64_d; - break; - default: - g_assert_not_reached(); - } - - fpst = fpstatus_ptr(FPST_FPCR); - if (rmode >= 0) { - TCGv_i32 tcg_rmode = tcg_const_i32(rmode); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - gen_fpst(tcg_res, tcg_op, fpst); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - tcg_temp_free_i32(tcg_rmode); - } else { - gen_fpst(tcg_res, tcg_op, fpst); - } - tcg_temp_free_ptr(fpst); - - done: - write_fp_dreg(s, rd, tcg_res); - tcg_temp_free_i64(tcg_op); - 
tcg_temp_free_i64(tcg_res); -} - -static void handle_fp_fcvt(DisasContext *s, int opcode, - int rd, int rn, int dtype, int ntype) -{ - switch (ntype) { - case 0x0: - { - TCGv_i32 tcg_rn = read_fp_sreg(s, rn); - if (dtype == 1) { - /* Single to double */ - TCGv_i64 tcg_rd = tcg_temp_new_i64(); - gen_helper_vfp_fcvtds(tcg_rd, tcg_rn, cpu_env); - write_fp_dreg(s, rd, tcg_rd); - tcg_temp_free_i64(tcg_rd); - } else { - /* Single to half */ - TCGv_i32 tcg_rd = tcg_temp_new_i32(); - TCGv_i32 ahp = get_ahp_flag(); - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - - gen_helper_vfp_fcvt_f32_to_f16(tcg_rd, tcg_rn, fpst, ahp); - /* write_fp_sreg is OK here because top half of tcg_rd is zero */ - write_fp_sreg(s, rd, tcg_rd); - tcg_temp_free_i32(tcg_rd); - tcg_temp_free_i32(ahp); - tcg_temp_free_ptr(fpst); - } - tcg_temp_free_i32(tcg_rn); - break; - } - case 0x1: - { - TCGv_i64 tcg_rn = read_fp_dreg(s, rn); - TCGv_i32 tcg_rd = tcg_temp_new_i32(); - if (dtype == 0) { - /* Double to single */ - gen_helper_vfp_fcvtsd(tcg_rd, tcg_rn, cpu_env); - } else { - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - TCGv_i32 ahp = get_ahp_flag(); - /* Double to half */ - gen_helper_vfp_fcvt_f64_to_f16(tcg_rd, tcg_rn, fpst, ahp); - /* write_fp_sreg is OK here because top half of tcg_rd is zero */ - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(ahp); - } - write_fp_sreg(s, rd, tcg_rd); - tcg_temp_free_i32(tcg_rd); - tcg_temp_free_i64(tcg_rn); - break; - } - case 0x3: - { - TCGv_i32 tcg_rn = read_fp_sreg(s, rn); - TCGv_ptr tcg_fpst = fpstatus_ptr(FPST_FPCR); - TCGv_i32 tcg_ahp = get_ahp_flag(); - tcg_gen_ext16u_i32(tcg_rn, tcg_rn); - if (dtype == 0) { - /* Half to single */ - TCGv_i32 tcg_rd = tcg_temp_new_i32(); - gen_helper_vfp_fcvt_f16_to_f32(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); - write_fp_sreg(s, rd, tcg_rd); - tcg_temp_free_i32(tcg_rd); - } else { - /* Half to double */ - TCGv_i64 tcg_rd = tcg_temp_new_i64(); - gen_helper_vfp_fcvt_f16_to_f64(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); - write_fp_dreg(s, rd, tcg_rd); - tcg_temp_free_i64(tcg_rd); - } - tcg_temp_free_i32(tcg_rn); - tcg_temp_free_ptr(tcg_fpst); - tcg_temp_free_i32(tcg_ahp); - break; - } - default: - g_assert_not_reached(); - } -} - -/* Floating point data-processing (1 source) - * 31 30 29 28 24 23 22 21 20 15 14 10 9 5 4 0 - * +---+---+---+-----------+------+---+--------+-----------+------+------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | opcode | 1 0 0 0 0 | Rn | Rd | - * +---+---+---+-----------+------+---+--------+-----------+------+------+ - */ -static void disas_fp_1src(DisasContext *s, uint32_t insn) -{ - int mos = extract32(insn, 29, 3); - int type = extract32(insn, 22, 2); - int opcode = extract32(insn, 15, 6); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - - if (mos) { - goto do_unallocated; - } - - switch (opcode) { - case 0x4: case 0x5: case 0x7: - { - /* FCVT between half, single and double precision */ - int dtype = extract32(opcode, 0, 2); - if (type == 2 || dtype == type) { - goto do_unallocated; - } - if (!fp_access_check(s)) { - return; - } - - handle_fp_fcvt(s, opcode, rd, rn, dtype, type); - break; - } - - case 0x10 ... 0x13: /* FRINT{32,64}{X,Z} */ - if (type > 1 || !dc_isar_feature(aa64_frint, s)) { - goto do_unallocated; - } - /* fall through */ - case 0x0 ... 0x3: - case 0x8 ... 0xc: - case 0xe ... 
0xf: - /* 32-to-32 and 64-to-64 ops */ - switch (type) { - case 0: - if (!fp_access_check(s)) { - return; - } - handle_fp_1src_single(s, opcode, rd, rn); - break; - case 1: - if (!fp_access_check(s)) { - return; - } - handle_fp_1src_double(s, opcode, rd, rn); - break; - case 3: - if (!dc_isar_feature(aa64_fp16, s)) { - goto do_unallocated; - } - - if (!fp_access_check(s)) { - return; - } - handle_fp_1src_half(s, opcode, rd, rn); - break; - default: - goto do_unallocated; - } - break; - - case 0x6: - switch (type) { - case 1: /* BFCVT */ - if (!dc_isar_feature(aa64_bf16, s)) { - goto do_unallocated; - } - if (!fp_access_check(s)) { - return; - } - handle_fp_1src_single(s, opcode, rd, rn); - break; - default: - goto do_unallocated; - } - break; - - default: - do_unallocated: - unallocated_encoding(s); - break; - } -} - -/* Floating-point data-processing (2 source) - single precision */ -static void handle_fp_2src_single(DisasContext *s, int opcode, - int rd, int rn, int rm) -{ - TCGv_i32 tcg_op1; - TCGv_i32 tcg_op2; - TCGv_i32 tcg_res; - TCGv_ptr fpst; - - tcg_res = tcg_temp_new_i32(); - fpst = fpstatus_ptr(FPST_FPCR); - tcg_op1 = read_fp_sreg(s, rn); - tcg_op2 = read_fp_sreg(s, rm); - - switch (opcode) { - case 0x0: /* FMUL */ - gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1: /* FDIV */ - gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2: /* FADD */ - gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3: /* FSUB */ - gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x4: /* FMAX */ - gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5: /* FMIN */ - gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x6: /* FMAXNM */ - gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7: /* FMINNM */ - gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x8: /* FNMUL */ - gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); - gen_helper_vfp_negs(tcg_res, tcg_res); - break; - } - - write_fp_sreg(s, rd, tcg_res); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i32(tcg_res); -} - -/* Floating-point data-processing (2 source) - double precision */ -static void handle_fp_2src_double(DisasContext *s, int opcode, - int rd, int rn, int rm) -{ - TCGv_i64 tcg_op1; - TCGv_i64 tcg_op2; - TCGv_i64 tcg_res; - TCGv_ptr fpst; - - tcg_res = tcg_temp_new_i64(); - fpst = fpstatus_ptr(FPST_FPCR); - tcg_op1 = read_fp_dreg(s, rn); - tcg_op2 = read_fp_dreg(s, rm); - - switch (opcode) { - case 0x0: /* FMUL */ - gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1: /* FDIV */ - gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2: /* FADD */ - gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3: /* FSUB */ - gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x4: /* FMAX */ - gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5: /* FMIN */ - gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x6: /* FMAXNM */ - gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7: /* FMINNM */ - gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x8: /* FNMUL */ - gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); - gen_helper_vfp_negd(tcg_res, tcg_res); - break; - } - - write_fp_dreg(s, rd, tcg_res); - - tcg_temp_free_ptr(fpst); - 
tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_res); -} - -/* Floating-point data-processing (2 source) - half precision */ -static void handle_fp_2src_half(DisasContext *s, int opcode, - int rd, int rn, int rm) -{ - TCGv_i32 tcg_op1; - TCGv_i32 tcg_op2; - TCGv_i32 tcg_res; - TCGv_ptr fpst; - - tcg_res = tcg_temp_new_i32(); - fpst = fpstatus_ptr(FPST_FPCR_F16); - tcg_op1 = read_fp_hreg(s, rn); - tcg_op2 = read_fp_hreg(s, rm); - - switch (opcode) { - case 0x0: /* FMUL */ - gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1: /* FDIV */ - gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2: /* FADD */ - gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3: /* FSUB */ - gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x4: /* FMAX */ - gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5: /* FMIN */ - gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x6: /* FMAXNM */ - gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7: /* FMINNM */ - gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x8: /* FNMUL */ - gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); - tcg_gen_xori_i32(tcg_res, tcg_res, 0x8000); - break; - default: - g_assert_not_reached(); - } - - write_fp_sreg(s, rd, tcg_res); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i32(tcg_res); -} - -/* Floating point data-processing (2 source) - * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+---+------+--------+-----+------+------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | Rm | opcode | 1 0 | Rn | Rd | - * +---+---+---+-----------+------+---+------+--------+-----+------+------+ - */ -static void disas_fp_2src(DisasContext *s, uint32_t insn) -{ - int mos = extract32(insn, 29, 3); - int type = extract32(insn, 22, 2); - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rm = extract32(insn, 16, 5); - int opcode = extract32(insn, 12, 4); - - if (opcode > 8 || mos) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - if (!fp_access_check(s)) { - return; - } - handle_fp_2src_single(s, opcode, rd, rn, rm); - break; - case 1: - if (!fp_access_check(s)) { - return; - } - handle_fp_2src_double(s, opcode, rd, rn, rm); - break; - case 3: - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_fp_2src_half(s, opcode, rd, rn, rm); - break; - default: - unallocated_encoding(s); - } -} - -/* Floating-point data-processing (3 source) - single precision */ -static void handle_fp_3src_single(DisasContext *s, bool o0, bool o1, - int rd, int rn, int rm, int ra) -{ - TCGv_i32 tcg_op1, tcg_op2, tcg_op3; - TCGv_i32 tcg_res = tcg_temp_new_i32(); - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - - tcg_op1 = read_fp_sreg(s, rn); - tcg_op2 = read_fp_sreg(s, rm); - tcg_op3 = read_fp_sreg(s, ra); - - /* These are fused multiply-add, and must be done as one - * floating point operation with no rounding between the - * multiplication and addition steps. - * NB that doing the negations here as separate steps is - * correct : an input NaN should come out with its sign bit - * flipped if it is a negated-input. 
- */ - if (o1 == true) { - gen_helper_vfp_negs(tcg_op3, tcg_op3); - } - - if (o0 != o1) { - gen_helper_vfp_negs(tcg_op1, tcg_op1); - } - - gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); - - write_fp_sreg(s, rd, tcg_res); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i32(tcg_op3); - tcg_temp_free_i32(tcg_res); -} - -/* Floating-point data-processing (3 source) - double precision */ -static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1, - int rd, int rn, int rm, int ra) -{ - TCGv_i64 tcg_op1, tcg_op2, tcg_op3; - TCGv_i64 tcg_res = tcg_temp_new_i64(); - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - - tcg_op1 = read_fp_dreg(s, rn); - tcg_op2 = read_fp_dreg(s, rm); - tcg_op3 = read_fp_dreg(s, ra); - - /* These are fused multiply-add, and must be done as one - * floating point operation with no rounding between the - * multiplication and addition steps. - * NB that doing the negations here as separate steps is - * correct : an input NaN should come out with its sign bit - * flipped if it is a negated-input. - */ - if (o1 == true) { - gen_helper_vfp_negd(tcg_op3, tcg_op3); - } - - if (o0 != o1) { - gen_helper_vfp_negd(tcg_op1, tcg_op1); - } - - gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); - - write_fp_dreg(s, rd, tcg_res); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_op3); - tcg_temp_free_i64(tcg_res); -} - -/* Floating-point data-processing (3 source) - half precision */ -static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1, - int rd, int rn, int rm, int ra) -{ - TCGv_i32 tcg_op1, tcg_op2, tcg_op3; - TCGv_i32 tcg_res = tcg_temp_new_i32(); - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR_F16); - - tcg_op1 = read_fp_hreg(s, rn); - tcg_op2 = read_fp_hreg(s, rm); - tcg_op3 = read_fp_hreg(s, ra); - - /* These are fused multiply-add, and must be done as one - * floating point operation with no rounding between the - * multiplication and addition steps. - * NB that doing the negations here as separate steps is - * correct : an input NaN should come out with its sign bit - * flipped if it is a negated-input. 
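The o0/o1 handling above amounts to negating the addend and/or the product before a single fused multiply-add; as a rough sketch on doubles (fp_3src() is an invented name; link with -lm), the four encodings behave like:

#include <math.h>
#include <stdio.h>

/* o1 negates the addend (Ra); o0 != o1 additionally negates the product,
 * mirroring the input negations performed by the translator above. */
static double fp_3src(int o0, int o1, double rn, double rm, double ra)
{
    if (o1) {
        ra = -ra;
    }
    if (o0 != o1) {
        rn = -rn;
    }
    return fma(rn, rm, ra);     /* single rounding for multiply + add */
}

int main(void)
{
    printf("%g\n", fp_3src(0, 0, 2.0, 3.0, 1.0)); /* FMADD:   2*3 + 1 =  7 */
    printf("%g\n", fp_3src(1, 0, 2.0, 3.0, 1.0)); /* FMSUB:   1 - 2*3 = -5 */
    printf("%g\n", fp_3src(0, 1, 2.0, 3.0, 1.0)); /* FNMADD: -1 - 2*3 = -7 */
    printf("%g\n", fp_3src(1, 1, 2.0, 3.0, 1.0)); /* FNMSUB:  2*3 - 1 =  5 */
    return 0;
}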
- */ - if (o1 == true) { - tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000); - } - - if (o0 != o1) { - tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); - } - - gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); - - write_fp_sreg(s, rd, tcg_res); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i32(tcg_op3); - tcg_temp_free_i32(tcg_res); -} - -/* Floating point data-processing (3 source) - * 31 30 29 28 24 23 22 21 20 16 15 14 10 9 5 4 0 - * +---+---+---+-----------+------+----+------+----+------+------+------+ - * | M | 0 | S | 1 1 1 1 1 | type | o1 | Rm | o0 | Ra | Rn | Rd | - * +---+---+---+-----------+------+----+------+----+------+------+------+ - */ -static void disas_fp_3src(DisasContext *s, uint32_t insn) -{ - int mos = extract32(insn, 29, 3); - int type = extract32(insn, 22, 2); - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int ra = extract32(insn, 10, 5); - int rm = extract32(insn, 16, 5); - bool o0 = extract32(insn, 15, 1); - bool o1 = extract32(insn, 21, 1); - - if (mos) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - if (!fp_access_check(s)) { - return; - } - handle_fp_3src_single(s, o0, o1, rd, rn, rm, ra); - break; - case 1: - if (!fp_access_check(s)) { - return; - } - handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra); - break; - case 3: - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra); - break; - default: - unallocated_encoding(s); - } -} - -/* Floating point immediate - * 31 30 29 28 24 23 22 21 20 13 12 10 9 5 4 0 - * +---+---+---+-----------+------+---+------------+-------+------+------+ - * | M | 0 | S | 1 1 1 1 0 | type | 1 | imm8 | 1 0 0 | imm5 | Rd | - * +---+---+---+-----------+------+---+------------+-------+------+------+ - */ -static void disas_fp_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int imm5 = extract32(insn, 5, 5); - int imm8 = extract32(insn, 13, 8); - int type = extract32(insn, 22, 2); - int mos = extract32(insn, 29, 3); - uint64_t imm; - MemOp sz; - - if (mos || imm5) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: - sz = MO_32; - break; - case 1: - sz = MO_64; - break; - case 3: - sz = MO_16; - if (dc_isar_feature(aa64_fp16, s)) { - break; - } - /* fallthru */ - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - imm = vfp_expand_imm(sz, imm8); - write_fp_dreg(s, rd, tcg_constant_i64(imm)); -} - -/* Handle floating point <=> fixed point conversions. Note that we can - * also deal with fp <=> integer conversions as a special case (scale == 64) - * OPTME: consider handling that special case specially or at least skipping - * the call to scalbn in the helpers for zero shifts. - */ -static void handle_fpfpcvt(DisasContext *s, int rd, int rn, int opcode, - bool itof, int rmode, int scale, int sf, int type) -{ - bool is_signed = !(opcode & 1); - TCGv_ptr tcg_fpstatus; - TCGv_i32 tcg_shift, tcg_single; - TCGv_i64 tcg_double; - - tcg_fpstatus = fpstatus_ptr(type == 3 ? 
FPST_FPCR_F16 : FPST_FPCR); - - tcg_shift = tcg_constant_i32(64 - scale); - - if (itof) { - TCGv_i64 tcg_int = cpu_reg(s, rn); - if (!sf) { - TCGv_i64 tcg_extend = new_tmp_a64(s); - - if (is_signed) { - tcg_gen_ext32s_i64(tcg_extend, tcg_int); - } else { - tcg_gen_ext32u_i64(tcg_extend, tcg_int); - } - - tcg_int = tcg_extend; - } - - switch (type) { - case 1: /* float64 */ - tcg_double = tcg_temp_new_i64(); - if (is_signed) { - gen_helper_vfp_sqtod(tcg_double, tcg_int, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_uqtod(tcg_double, tcg_int, - tcg_shift, tcg_fpstatus); - } - write_fp_dreg(s, rd, tcg_double); - tcg_temp_free_i64(tcg_double); - break; - - case 0: /* float32 */ - tcg_single = tcg_temp_new_i32(); - if (is_signed) { - gen_helper_vfp_sqtos(tcg_single, tcg_int, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_uqtos(tcg_single, tcg_int, - tcg_shift, tcg_fpstatus); - } - write_fp_sreg(s, rd, tcg_single); - tcg_temp_free_i32(tcg_single); - break; - - case 3: /* float16 */ - tcg_single = tcg_temp_new_i32(); - if (is_signed) { - gen_helper_vfp_sqtoh(tcg_single, tcg_int, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_uqtoh(tcg_single, tcg_int, - tcg_shift, tcg_fpstatus); - } - write_fp_sreg(s, rd, tcg_single); - tcg_temp_free_i32(tcg_single); - break; - - default: - g_assert_not_reached(); - } - } else { - TCGv_i64 tcg_int = cpu_reg(s, rd); - TCGv_i32 tcg_rmode; - - if (extract32(opcode, 2, 1)) { - /* There are too many rounding modes to all fit into rmode, - * so FCVTA[US] is a special case. - */ - rmode = FPROUNDING_TIEAWAY; - } - - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); - - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - - switch (type) { - case 1: /* float64 */ - tcg_double = read_fp_dreg(s, rn); - if (is_signed) { - if (!sf) { - gen_helper_vfp_tosld(tcg_int, tcg_double, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_tosqd(tcg_int, tcg_double, - tcg_shift, tcg_fpstatus); - } - } else { - if (!sf) { - gen_helper_vfp_tould(tcg_int, tcg_double, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_touqd(tcg_int, tcg_double, - tcg_shift, tcg_fpstatus); - } - } - if (!sf) { - tcg_gen_ext32u_i64(tcg_int, tcg_int); - } - tcg_temp_free_i64(tcg_double); - break; - - case 0: /* float32 */ - tcg_single = read_fp_sreg(s, rn); - if (sf) { - if (is_signed) { - gen_helper_vfp_tosqs(tcg_int, tcg_single, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_touqs(tcg_int, tcg_single, - tcg_shift, tcg_fpstatus); - } - } else { - TCGv_i32 tcg_dest = tcg_temp_new_i32(); - if (is_signed) { - gen_helper_vfp_tosls(tcg_dest, tcg_single, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_touls(tcg_dest, tcg_single, - tcg_shift, tcg_fpstatus); - } - tcg_gen_extu_i32_i64(tcg_int, tcg_dest); - tcg_temp_free_i32(tcg_dest); - } - tcg_temp_free_i32(tcg_single); - break; - - case 3: /* float16 */ - tcg_single = read_fp_sreg(s, rn); - if (sf) { - if (is_signed) { - gen_helper_vfp_tosqh(tcg_int, tcg_single, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_touqh(tcg_int, tcg_single, - tcg_shift, tcg_fpstatus); - } - } else { - TCGv_i32 tcg_dest = tcg_temp_new_i32(); - if (is_signed) { - gen_helper_vfp_toslh(tcg_dest, tcg_single, - tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_toulh(tcg_dest, tcg_single, - tcg_shift, tcg_fpstatus); - } - tcg_gen_extu_i32_i64(tcg_int, tcg_dest); - tcg_temp_free_i32(tcg_dest); - } - tcg_temp_free_i32(tcg_single); - break; - - default: - g_assert_not_reached(); - } - - gen_helper_set_rmode(tcg_rmode, 
tcg_rmode, tcg_fpstatus); - tcg_temp_free_i32(tcg_rmode); - } - - tcg_temp_free_ptr(tcg_fpstatus); -} - -/* Floating point <-> fixed point conversions - * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0 - * +----+---+---+-----------+------+---+-------+--------+-------+------+------+ - * | sf | 0 | S | 1 1 1 1 0 | type | 0 | rmode | opcode | scale | Rn | Rd | - * +----+---+---+-----------+------+---+-------+--------+-------+------+------+ - */ -static void disas_fp_fixed_conv(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int scale = extract32(insn, 10, 6); - int opcode = extract32(insn, 16, 3); - int rmode = extract32(insn, 19, 2); - int type = extract32(insn, 22, 2); - bool sbit = extract32(insn, 29, 1); - bool sf = extract32(insn, 31, 1); - bool itof; - - if (sbit || (!sf && scale < 32)) { - unallocated_encoding(s); - return; - } - - switch (type) { - case 0: /* float32 */ - case 1: /* float64 */ - break; - case 3: /* float16 */ - if (dc_isar_feature(aa64_fp16, s)) { - break; - } - /* fallthru */ - default: - unallocated_encoding(s); - return; - } - - switch ((rmode << 3) | opcode) { - case 0x2: /* SCVTF */ - case 0x3: /* UCVTF */ - itof = true; - break; - case 0x18: /* FCVTZS */ - case 0x19: /* FCVTZU */ - itof = false; - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - handle_fpfpcvt(s, rd, rn, opcode, itof, FPROUNDING_ZERO, scale, sf, type); -} - -static void handle_fmov(DisasContext *s, int rd, int rn, int type, bool itof) -{ - /* FMOV: gpr to or from float, double, or top half of quad fp reg, - * without conversion. - */ - - if (itof) { - TCGv_i64 tcg_rn = cpu_reg(s, rn); - TCGv_i64 tmp; - - switch (type) { - case 0: - /* 32 bit */ - tmp = tcg_temp_new_i64(); - tcg_gen_ext32u_i64(tmp, tcg_rn); - write_fp_dreg(s, rd, tmp); - tcg_temp_free_i64(tmp); - break; - case 1: - /* 64 bit */ - write_fp_dreg(s, rd, tcg_rn); - break; - case 2: - /* 64 bit to top half. 
*/ - tcg_gen_st_i64(tcg_rn, cpu_env, fp_reg_hi_offset(s, rd)); - clear_vec_high(s, true, rd); - break; - case 3: - /* 16 bit */ - tmp = tcg_temp_new_i64(); - tcg_gen_ext16u_i64(tmp, tcg_rn); - write_fp_dreg(s, rd, tmp); - tcg_temp_free_i64(tmp); - break; - default: - g_assert_not_reached(); - } - } else { - TCGv_i64 tcg_rd = cpu_reg(s, rd); - - switch (type) { - case 0: - /* 32 bit */ - tcg_gen_ld32u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_32)); - break; - case 1: - /* 64 bit */ - tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_64)); - break; - case 2: - /* 64 bits from top half */ - tcg_gen_ld_i64(tcg_rd, cpu_env, fp_reg_hi_offset(s, rn)); - break; - case 3: - /* 16 bit */ - tcg_gen_ld16u_i64(tcg_rd, cpu_env, fp_reg_offset(s, rn, MO_16)); - break; - default: - g_assert_not_reached(); - } - } -} - -static void handle_fjcvtzs(DisasContext *s, int rd, int rn) -{ - TCGv_i64 t = read_fp_dreg(s, rn); - TCGv_ptr fpstatus = fpstatus_ptr(FPST_FPCR); - - gen_helper_fjcvtzs(t, t, fpstatus); - - tcg_temp_free_ptr(fpstatus); - - tcg_gen_ext32u_i64(cpu_reg(s, rd), t); - tcg_gen_extrh_i64_i32(cpu_ZF, t); - tcg_gen_movi_i32(cpu_CF, 0); - tcg_gen_movi_i32(cpu_NF, 0); - tcg_gen_movi_i32(cpu_VF, 0); - - tcg_temp_free_i64(t); -} - -/* Floating point <-> integer conversions - * 31 30 29 28 24 23 22 21 20 19 18 16 15 10 9 5 4 0 - * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+ - * | sf | 0 | S | 1 1 1 1 0 | type | 1 | rmode | opc | 0 0 0 0 0 0 | Rn | Rd | - * +----+---+---+-----------+------+---+-------+-----+-------------+----+----+ - */ -static void disas_fp_int_conv(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 16, 3); - int rmode = extract32(insn, 19, 2); - int type = extract32(insn, 22, 2); - bool sbit = extract32(insn, 29, 1); - bool sf = extract32(insn, 31, 1); - bool itof = false; - - if (sbit) { - goto do_unallocated; - } - - switch (opcode) { - case 2: /* SCVTF */ - case 3: /* UCVTF */ - itof = true; - /* fallthru */ - case 4: /* FCVTAS */ - case 5: /* FCVTAU */ - if (rmode != 0) { - goto do_unallocated; - } - /* fallthru */ - case 0: /* FCVT[NPMZ]S */ - case 1: /* FCVT[NPMZ]U */ - switch (type) { - case 0: /* float32 */ - case 1: /* float64 */ - break; - case 3: /* float16 */ - if (!dc_isar_feature(aa64_fp16, s)) { - goto do_unallocated; - } - break; - default: - goto do_unallocated; - } - if (!fp_access_check(s)) { - return; - } - handle_fpfpcvt(s, rd, rn, opcode, itof, rmode, 64, sf, type); - break; - - default: - switch (sf << 7 | type << 5 | rmode << 3 | opcode) { - case 0b01100110: /* FMOV half <-> 32-bit int */ - case 0b01100111: - case 0b11100110: /* FMOV half <-> 64-bit int */ - case 0b11100111: - if (!dc_isar_feature(aa64_fp16, s)) { - goto do_unallocated; - } - /* fallthru */ - case 0b00000110: /* FMOV 32-bit */ - case 0b00000111: - case 0b10100110: /* FMOV 64-bit */ - case 0b10100111: - case 0b11001110: /* FMOV top half of 128-bit */ - case 0b11001111: - if (!fp_access_check(s)) { - return; - } - itof = opcode & 1; - handle_fmov(s, rd, rn, type, itof); - break; - - case 0b00111110: /* FJCVTZS */ - if (!dc_isar_feature(aa64_jscvt, s)) { - goto do_unallocated; - } else if (fp_access_check(s)) { - handle_fjcvtzs(s, rd, rn); - } - break; - - default: - do_unallocated: - unallocated_encoding(s); - return; - } - break; - } -} - -/* FP-specific subcases of table C3-6 (SIMD and FP data processing) - * 31 30 29 28 25 24 0 - * 
+---+---+---+---------+-----------------------------+ - * | | 0 | | 1 1 1 1 | | - * +---+---+---+---------+-----------------------------+ - */ -static void disas_data_proc_fp(DisasContext *s, uint32_t insn) -{ - if (extract32(insn, 24, 1)) { - /* Floating point data-processing (3 source) */ - disas_fp_3src(s, insn); - } else if (extract32(insn, 21, 1) == 0) { - /* Floating point to fixed point conversions */ - disas_fp_fixed_conv(s, insn); - } else { - switch (extract32(insn, 10, 2)) { - case 1: - /* Floating point conditional compare */ - disas_fp_ccomp(s, insn); - break; - case 2: - /* Floating point data-processing (2 source) */ - disas_fp_2src(s, insn); - break; - case 3: - /* Floating point conditional select */ - disas_fp_csel(s, insn); - break; - case 0: - switch (ctz32(extract32(insn, 12, 4))) { - case 0: /* [15:12] == xxx1 */ - /* Floating point immediate */ - disas_fp_imm(s, insn); - break; - case 1: /* [15:12] == xx10 */ - /* Floating point compare */ - disas_fp_compare(s, insn); - break; - case 2: /* [15:12] == x100 */ - /* Floating point data-processing (1 source) */ - disas_fp_1src(s, insn); - break; - case 3: /* [15:12] == 1000 */ - unallocated_encoding(s); - break; - default: /* [15:12] == 0000 */ - /* Floating point <-> integer conversions */ - disas_fp_int_conv(s, insn); - break; - } - break; - } - } -} - -static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right, - int pos) -{ - /* Extract 64 bits from the middle of two concatenated 64 bit - * vector register slices left:right. The extracted bits start - * at 'pos' bits into the right (least significant) side. - * We return the result in tcg_right, and guarantee not to - * trash tcg_left. - */ - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - assert(pos > 0 && pos < 64); - - tcg_gen_shri_i64(tcg_right, tcg_right, pos); - tcg_gen_shli_i64(tcg_tmp, tcg_left, 64 - pos); - tcg_gen_or_i64(tcg_right, tcg_right, tcg_tmp); - - tcg_temp_free_i64(tcg_tmp); -} - -/* EXT - * 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0 - * +---+---+-------------+-----+---+------+---+------+---+------+------+ - * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 | Rm | 0 | imm4 | 0 | Rn | Rd | - * +---+---+-------------+-----+---+------+---+------+---+------+------+ - */ -static void disas_simd_ext(DisasContext *s, uint32_t insn) -{ - int is_q = extract32(insn, 30, 1); - int op2 = extract32(insn, 22, 2); - int imm4 = extract32(insn, 11, 4); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - int pos = imm4 << 3; - TCGv_i64 tcg_resl, tcg_resh; - - if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_resh = tcg_temp_new_i64(); - tcg_resl = tcg_temp_new_i64(); - - /* Vd gets bits starting at pos bits into Vm:Vn. This is - * either extracting 128 bits from a 128:128 concatenation, or - * extracting 64 bits from a 64:64 concatenation. 
- */ - if (!is_q) { - read_vec_element(s, tcg_resl, rn, 0, MO_64); - if (pos != 0) { - read_vec_element(s, tcg_resh, rm, 0, MO_64); - do_ext64(s, tcg_resh, tcg_resl, pos); - } - } else { - TCGv_i64 tcg_hh; - typedef struct { - int reg; - int elt; - } EltPosns; - EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} }; - EltPosns *elt = eltposns; - - if (pos >= 64) { - elt++; - pos -= 64; - } - - read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64); - elt++; - read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64); - elt++; - if (pos != 0) { - do_ext64(s, tcg_resh, tcg_resl, pos); - tcg_hh = tcg_temp_new_i64(); - read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64); - do_ext64(s, tcg_hh, tcg_resh, pos); - tcg_temp_free_i64(tcg_hh); - } - } - - write_vec_element(s, tcg_resl, rd, 0, MO_64); - tcg_temp_free_i64(tcg_resl); - if (is_q) { - write_vec_element(s, tcg_resh, rd, 1, MO_64); - } - tcg_temp_free_i64(tcg_resh); - clear_vec_high(s, is_q, rd); -} - -/* TBL/TBX - * 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0 - * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ - * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 | Rm | 0 | len | op | 0 0 | Rn | Rd | - * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+ - */ -static void disas_simd_tb(DisasContext *s, uint32_t insn) -{ - int op2 = extract32(insn, 22, 2); - int is_q = extract32(insn, 30, 1); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - int is_tbx = extract32(insn, 12, 1); - int len = (extract32(insn, 13, 2) + 1) * 16; - - if (op2 != 0) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rm), cpu_env, - is_q ? 16 : 8, vec_full_reg_size(s), - (len << 6) | (is_tbx << 5) | rn, - gen_helper_simd_tblx); -} - -/* ZIP/UZP/TRN - * 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 - * +---+---+-------------+------+---+------+---+------------------+------+ - * | 0 | Q | 0 0 1 1 1 0 | size | 0 | Rm | 0 | opc | 1 0 | Rn | Rd | - * +---+---+-------------+------+---+------+---+------------------+------+ - */ -static void disas_simd_zip_trn(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 22, 2); - /* opc field bits [1:0] indicate ZIP/UZP/TRN; - * bit 2 indicates 1 vs 2 variant of the insn. - */ - int opcode = extract32(insn, 12, 2); - bool part = extract32(insn, 14, 1); - bool is_q = extract32(insn, 30, 1); - int esize = 8 << size; - int i, ofs; - int datasize = is_q ? 128 : 64; - int elements = datasize / esize; - TCGv_i64 tcg_res, tcg_resl, tcg_resh; - - if (opcode == 0 || (size == 3 && !is_q)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_resl = tcg_const_i64(0); - tcg_resh = is_q ? 
tcg_const_i64(0) : NULL; - tcg_res = tcg_temp_new_i64(); - - for (i = 0; i < elements; i++) { - switch (opcode) { - case 1: /* UZP1/2 */ - { - int midpoint = elements / 2; - if (i < midpoint) { - read_vec_element(s, tcg_res, rn, 2 * i + part, size); - } else { - read_vec_element(s, tcg_res, rm, - 2 * (i - midpoint) + part, size); - } - break; - } - case 2: /* TRN1/2 */ - if (i & 1) { - read_vec_element(s, tcg_res, rm, (i & ~1) + part, size); - } else { - read_vec_element(s, tcg_res, rn, (i & ~1) + part, size); - } - break; - case 3: /* ZIP1/2 */ - { - int base = part * elements / 2; - if (i & 1) { - read_vec_element(s, tcg_res, rm, base + (i >> 1), size); - } else { - read_vec_element(s, tcg_res, rn, base + (i >> 1), size); - } - break; - } - default: - g_assert_not_reached(); - } - - ofs = i * esize; - if (ofs < 64) { - tcg_gen_shli_i64(tcg_res, tcg_res, ofs); - tcg_gen_or_i64(tcg_resl, tcg_resl, tcg_res); - } else { - tcg_gen_shli_i64(tcg_res, tcg_res, ofs - 64); - tcg_gen_or_i64(tcg_resh, tcg_resh, tcg_res); - } - } - - tcg_temp_free_i64(tcg_res); - - write_vec_element(s, tcg_resl, rd, 0, MO_64); - tcg_temp_free_i64(tcg_resl); - - if (is_q) { - write_vec_element(s, tcg_resh, rd, 1, MO_64); - tcg_temp_free_i64(tcg_resh); - } - clear_vec_high(s, is_q, rd); -} - -/* - * do_reduction_op helper - * - * This mirrors the Reduce() pseudocode in the ARM ARM. It is - * important for correct NaN propagation that we do these - * operations in exactly the order specified by the pseudocode. - * - * This is a recursive function, TCG temps should be freed by the - * calling function once it is done with the values. - */ -static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn, - int esize, int size, int vmap, TCGv_ptr fpst) -{ - if (esize == size) { - int element; - MemOp msize = esize == 16 ? 
MO_16 : MO_32; - TCGv_i32 tcg_elem; - - /* We should have one register left here */ - assert(ctpop8(vmap) == 1); - element = ctz32(vmap); - assert(element < 8); - - tcg_elem = tcg_temp_new_i32(); - read_vec_element_i32(s, tcg_elem, rn, element, msize); - return tcg_elem; - } else { - int bits = size / 2; - int shift = ctpop8(vmap) / 2; - int vmap_lo = (vmap >> shift) & vmap; - int vmap_hi = (vmap & ~vmap_lo); - TCGv_i32 tcg_hi, tcg_lo, tcg_res; - - tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst); - tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst); - tcg_res = tcg_temp_new_i32(); - - switch (fpopcode) { - case 0x0c: /* fmaxnmv half-precision */ - gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x0f: /* fmaxv half-precision */ - gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x1c: /* fminnmv half-precision */ - gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x1f: /* fminv half-precision */ - gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x2c: /* fmaxnmv */ - gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x2f: /* fmaxv */ - gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x3c: /* fminnmv */ - gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst); - break; - case 0x3f: /* fminv */ - gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst); - break; - default: - g_assert_not_reached(); - } - - tcg_temp_free_i32(tcg_hi); - tcg_temp_free_i32(tcg_lo); - return tcg_res; - } -} - -/* AdvSIMD across lanes - * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+-----------+--------+-----+------+------+ - * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | - * +---+---+---+-----------+------+-----------+--------+-----+------+------+ - */ -static void disas_simd_across_lanes(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 5); - bool is_q = extract32(insn, 30, 1); - bool is_u = extract32(insn, 29, 1); - bool is_fp = false; - bool is_min = false; - int esize; - int elements; - int i; - TCGv_i64 tcg_res, tcg_elt; - - switch (opcode) { - case 0x1b: /* ADDV */ - if (is_u) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x3: /* SADDLV, UADDLV */ - case 0xa: /* SMAXV, UMAXV */ - case 0x1a: /* SMINV, UMINV */ - if (size == 3 || (size == 2 && !is_q)) { - unallocated_encoding(s); - return; - } - break; - case 0xc: /* FMAXNMV, FMINNMV */ - case 0xf: /* FMAXV, FMINV */ - /* Bit 1 of size field encodes min vs max and the actual size - * depends on the encoding of the U bit. If not set (and FP16 - * enabled) then we do half-precision float instead of single - * precision. - */ - is_min = extract32(size, 1, 1); - is_fp = true; - if (!is_u && dc_isar_feature(aa64_fp16, s)) { - size = 1; - } else if (!is_u || !is_q || extract32(size, 0, 1)) { - unallocated_encoding(s); - return; - } else { - size = 2; - } - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - esize = 8 << size; - elements = (is_q ? 128 : 64) / esize; - - tcg_res = tcg_temp_new_i64(); - tcg_elt = tcg_temp_new_i64(); - - /* These instructions operate across all lanes of a vector - * to produce a single result. 
We can guarantee that a 64 - * bit intermediate is sufficient: - * + for [US]ADDLV the maximum element size is 32 bits, and - * the result type is 64 bits - * + for FMAX*V, FMIN*V, ADDV the intermediate type is the - * same as the element size, which is 32 bits at most - * For the integer operations we can choose to work at 64 - * or 32 bits and truncate at the end; for simplicity - * we use 64 bits always. The floating point - * ops do require 32 bit intermediates, though. - */ - if (!is_fp) { - read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN)); - - for (i = 1; i < elements; i++) { - read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN)); - - switch (opcode) { - case 0x03: /* SADDLV / UADDLV */ - case 0x1b: /* ADDV */ - tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt); - break; - case 0x0a: /* SMAXV / UMAXV */ - if (is_u) { - tcg_gen_umax_i64(tcg_res, tcg_res, tcg_elt); - } else { - tcg_gen_smax_i64(tcg_res, tcg_res, tcg_elt); - } - break; - case 0x1a: /* SMINV / UMINV */ - if (is_u) { - tcg_gen_umin_i64(tcg_res, tcg_res, tcg_elt); - } else { - tcg_gen_smin_i64(tcg_res, tcg_res, tcg_elt); - } - break; - default: - g_assert_not_reached(); - } - - } - } else { - /* Floating point vector reduction ops which work across 32 - * bit (single) or 16 bit (half-precision) intermediates. - * Note that correct NaN propagation requires that we do these - * operations in exactly the order specified by the pseudocode. - */ - TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - int fpopcode = opcode | is_min << 4 | is_u << 5; - int vmap = (1 << elements) - 1; - TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize, - (is_q ? 128 : 64), vmap, fpst); - tcg_gen_extu_i32_i64(tcg_res, tcg_res32); - tcg_temp_free_i32(tcg_res32); - tcg_temp_free_ptr(fpst); - } - - tcg_temp_free_i64(tcg_elt); - - /* Now truncate the result to the width required for the final output */ - if (opcode == 0x03) { - /* SADDLV, UADDLV: result is 2*esize */ - size++; - } - - switch (size) { - case 0: - tcg_gen_ext8u_i64(tcg_res, tcg_res); - break; - case 1: - tcg_gen_ext16u_i64(tcg_res, tcg_res); - break; - case 2: - tcg_gen_ext32u_i64(tcg_res, tcg_res); - break; - case 3: - break; - default: - g_assert_not_reached(); - } - - write_fp_dreg(s, rd, tcg_res); - tcg_temp_free_i64(tcg_res); -} - -/* DUP (Element, Vector) - * - * 31 30 29 21 20 16 15 10 9 5 4 0 - * +---+---+-------------------+--------+-------------+------+------+ - * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd | - * +---+---+-------------------+--------+-------------+------+------+ - * - * size: encoded in imm5 (see ARM ARM LowestSetBit()) - */ -static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, - int imm5) -{ - int size = ctz32(imm5); - int index; - - if (size > 3 || (size == 3 && !is_q)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - index = imm5 >> (size + 1); - tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd), - vec_reg_offset(s, rn, index, size), - is_q ? 
16 : 8, vec_full_reg_size(s)); -} - -/* DUP (element, scalar) - * 31 21 20 16 15 10 9 5 4 0 - * +-----------------------+--------+-------------+------+------+ - * | 0 1 0 1 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 0 1 | Rn | Rd | - * +-----------------------+--------+-------------+------+------+ - */ -static void handle_simd_dupes(DisasContext *s, int rd, int rn, - int imm5) -{ - int size = ctz32(imm5); - int index; - TCGv_i64 tmp; - - if (size > 3) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - index = imm5 >> (size + 1); - - /* This instruction just extracts the specified element and - * zero-extends it into the bottom of the destination register. - */ - tmp = tcg_temp_new_i64(); - read_vec_element(s, tmp, rn, index, size); - write_fp_dreg(s, rd, tmp); - tcg_temp_free_i64(tmp); -} - -/* DUP (General) - * - * 31 30 29 21 20 16 15 10 9 5 4 0 - * +---+---+-------------------+--------+-------------+------+------+ - * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 0 1 1 | Rn | Rd | - * +---+---+-------------------+--------+-------------+------+------+ - * - * size: encoded in imm5 (see ARM ARM LowestSetBit()) - */ -static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, - int imm5) -{ - int size = ctz32(imm5); - uint32_t dofs, oprsz, maxsz; - - if (size > 3 || ((size == 3) && !is_q)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - dofs = vec_full_reg_offset(s, rd); - oprsz = is_q ? 16 : 8; - maxsz = vec_full_reg_size(s); - - tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn)); -} - -/* INS (Element) - * - * 31 21 20 16 15 14 11 10 9 5 4 0 - * +-----------------------+--------+------------+---+------+------+ - * | 0 1 1 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | - * +-----------------------+--------+------------+---+------+------+ - * - * size: encoded in imm5 (see ARM ARM LowestSetBit()) - * index: encoded in imm5<4:size+1> - */ -static void handle_simd_inse(DisasContext *s, int rd, int rn, - int imm4, int imm5) -{ - int size = ctz32(imm5); - int src_index, dst_index; - TCGv_i64 tmp; - - if (size > 3) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - dst_index = extract32(imm5, 1+size, 5); - src_index = extract32(imm4, size, 4); - - tmp = tcg_temp_new_i64(); - - read_vec_element(s, tmp, rn, src_index, size); - write_vec_element(s, tmp, rd, dst_index, size); - - tcg_temp_free_i64(tmp); - - /* INS is considered a 128-bit write for SVE. */ - clear_vec_high(s, true, rd); -} - - -/* INS (General) - * - * 31 21 20 16 15 10 9 5 4 0 - * +-----------------------+--------+-------------+------+------+ - * | 0 1 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 0 1 1 1 | Rn | Rd | - * +-----------------------+--------+-------------+------+------+ - * - * size: encoded in imm5 (see ARM ARM LowestSetBit()) - * index: encoded in imm5<4:size+1> - */ -static void handle_simd_insg(DisasContext *s, int rd, int rn, int imm5) -{ - int size = ctz32(imm5); - int idx; - - if (size > 3) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - idx = extract32(imm5, 1 + size, 4 - size); - write_vec_element(s, cpu_reg(s, rn), rd, idx, size); - - /* INS is considered a 128-bit write for SVE. 
*/ - clear_vec_high(s, true, rd); -} - -/* - * UMOV (General) - * SMOV (General) - * - * 31 30 29 21 20 16 15 12 10 9 5 4 0 - * +---+---+-------------------+--------+-------------+------+------+ - * | 0 | Q | 0 0 1 1 1 0 0 0 0 | imm5 | 0 0 1 U 1 1 | Rn | Rd | - * +---+---+-------------------+--------+-------------+------+------+ - * - * U: unsigned when set - * size: encoded in imm5 (see ARM ARM LowestSetBit()) - */ -static void handle_simd_umov_smov(DisasContext *s, int is_q, int is_signed, - int rn, int rd, int imm5) -{ - int size = ctz32(imm5); - int element; - TCGv_i64 tcg_rd; - - /* Check for UnallocatedEncodings */ - if (is_signed) { - if (size > 2 || (size == 2 && !is_q)) { - unallocated_encoding(s); - return; - } - } else { - if (size > 3 - || (size < 3 && is_q) - || (size == 3 && !is_q)) { - unallocated_encoding(s); - return; - } - } - - if (!fp_access_check(s)) { - return; - } - - element = extract32(imm5, 1+size, 4); - - tcg_rd = cpu_reg(s, rd); - read_vec_element(s, tcg_rd, rn, element, size | (is_signed ? MO_SIGN : 0)); - if (is_signed && !is_q) { - tcg_gen_ext32u_i64(tcg_rd, tcg_rd); - } -} - -/* AdvSIMD copy - * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0 - * +---+---+----+-----------------+------+---+------+---+------+------+ - * | 0 | Q | op | 0 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | - * +---+---+----+-----------------+------+---+------+---+------+------+ - */ -static void disas_simd_copy(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int imm4 = extract32(insn, 11, 4); - int op = extract32(insn, 29, 1); - int is_q = extract32(insn, 30, 1); - int imm5 = extract32(insn, 16, 5); - - if (op) { - if (is_q) { - /* INS (element) */ - handle_simd_inse(s, rd, rn, imm4, imm5); - } else { - unallocated_encoding(s); - } - } else { - switch (imm4) { - case 0: - /* DUP (element - vector) */ - handle_simd_dupe(s, is_q, rd, rn, imm5); - break; - case 1: - /* DUP (general) */ - handle_simd_dupg(s, is_q, rd, rn, imm5); - break; - case 3: - if (is_q) { - /* INS (general) */ - handle_simd_insg(s, rd, rn, imm5); - } else { - unallocated_encoding(s); - } - break; - case 5: - case 7: - /* UMOV/SMOV (is_q indicates 32/64; imm4 indicates signedness) */ - handle_simd_umov_smov(s, is_q, (imm4 == 5), rn, rd, imm5); - break; - default: - unallocated_encoding(s); - break; - } - } -} - -/* AdvSIMD modified immediate - * 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0 - * +---+---+----+---------------------+-----+-------+----+---+-------+------+ - * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh | Rd | - * +---+---+----+---------------------+-----+-------+----+---+-------+------+ - * - * There are a number of operations that can be carried out here: - * MOVI - move (shifted) imm into register - * MVNI - move inverted (shifted) imm into register - * ORR - bitwise OR of (shifted) imm with register - * BIC - bitwise clear of (shifted) imm with register - * With ARMv8.2 we also have: - * FMOV half-precision - */ -static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int cmode = extract32(insn, 12, 4); - int o2 = extract32(insn, 11, 1); - uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5); - bool is_neg = extract32(insn, 29, 1); - bool is_q = extract32(insn, 30, 1); - uint64_t imm = 0; - - if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) { - /* Check for FMOV (vector, immediate) - half-precision */ - if (!(dc_isar_feature(aa64_fp16, s) && o2 && cmode == 0xf)) { 
- unallocated_encoding(s); - return; - } - } - - if (!fp_access_check(s)) { - return; - } - - if (cmode == 15 && o2 && !is_neg) { - /* FMOV (vector, immediate) - half-precision */ - imm = vfp_expand_imm(MO_16, abcdefgh); - /* now duplicate across the lanes */ - imm = dup_const(MO_16, imm); - } else { - imm = asimd_imm_const(abcdefgh, cmode, is_neg); - } - - if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) { - /* MOVI or MVNI, with MVNI negation handled above. */ - tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8, - vec_full_reg_size(s), imm); - } else { - /* ORR or BIC, with BIC negation to AND handled above. */ - if (is_neg) { - gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64); - } else { - gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64); - } - } -} - -/* AdvSIMD scalar copy - * 31 30 29 28 21 20 16 15 14 11 10 9 5 4 0 - * +-----+----+-----------------+------+---+------+---+------+------+ - * | 0 1 | op | 1 1 1 1 0 0 0 0 | imm5 | 0 | imm4 | 1 | Rn | Rd | - * +-----+----+-----------------+------+---+------+---+------+------+ - */ -static void disas_simd_scalar_copy(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int imm4 = extract32(insn, 11, 4); - int imm5 = extract32(insn, 16, 5); - int op = extract32(insn, 29, 1); - - if (op != 0 || imm4 != 0) { - unallocated_encoding(s); - return; - } - - /* DUP (element, scalar) */ - handle_simd_dupes(s, rd, rn, imm5); -} - -/* AdvSIMD scalar pairwise - * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +-----+---+-----------+------+-----------+--------+-----+------+------+ - * | 0 1 | U | 1 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd | - * +-----+---+-----------+------+-----------+--------+-----+------+------+ - */ -static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn) -{ - int u = extract32(insn, 29, 1); - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - TCGv_ptr fpst; - - /* For some ops (the FP ones), size[1] is part of the encoding. - * For ADDP strictly it is not but size[1] is always 1 for valid - * encodings. - */ - opcode |= (extract32(size, 1, 1) << 5); - - switch (opcode) { - case 0x3b: /* ADDP */ - if (u || size != 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - - fpst = NULL; - break; - case 0xc: /* FMAXNMP */ - case 0xd: /* FADDP */ - case 0xf: /* FMAXP */ - case 0x2c: /* FMINNMP */ - case 0x2f: /* FMINP */ - /* FP op, size[0] is 32 or 64 bit*/ - if (!u) { - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } else { - size = MO_16; - } - } else { - size = extract32(size, 0, 1) ? MO_64 : MO_32; - } - - if (!fp_access_check(s)) { - return; - } - - fpst = fpstatus_ptr(size == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); - break; - default: - unallocated_encoding(s); - return; - } - - if (size == MO_64) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op1, rn, 0, MO_64); - read_vec_element(s, tcg_op2, rn, 1, MO_64); - - switch (opcode) { - case 0x3b: /* ADDP */ - tcg_gen_add_i64(tcg_res, tcg_op1, tcg_op2); - break; - case 0xc: /* FMAXNMP */ - gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xd: /* FADDP */ - gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xf: /* FMAXP */ - gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2c: /* FMINNMP */ - gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2f: /* FMINP */ - gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - write_fp_dreg(s, rd, tcg_res); - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_res); - } else { - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op1, rn, 0, size); - read_vec_element_i32(s, tcg_op2, rn, 1, size); - - if (size == MO_16) { - switch (opcode) { - case 0xc: /* FMAXNMP */ - gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xd: /* FADDP */ - gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xf: /* FMAXP */ - gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2c: /* FMINNMP */ - gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2f: /* FMINP */ - gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - } else { - switch (opcode) { - case 0xc: /* FMAXNMP */ - gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xd: /* FADDP */ - gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xf: /* FMAXP */ - gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2c: /* FMINNMP */ - gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x2f: /* FMINP */ - gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - } - - write_fp_sreg(s, rd, tcg_res); - - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i32(tcg_res); - } - - if (fpst) { - tcg_temp_free_ptr(fpst); - } -} - -/* - * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate) - * - * This code is handles the common shifting code and is used by both - * the vector and scalar code. 
- */ -static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src, - TCGv_i64 tcg_rnd, bool accumulate, - bool is_u, int size, int shift) -{ - bool extended_result = false; - bool round = tcg_rnd != NULL; - int ext_lshift = 0; - TCGv_i64 tcg_src_hi; - - if (round && size == 3) { - extended_result = true; - ext_lshift = 64 - shift; - tcg_src_hi = tcg_temp_new_i64(); - } else if (shift == 64) { - if (!accumulate && is_u) { - /* result is zero */ - tcg_gen_movi_i64(tcg_res, 0); - return; - } - } - - /* Deal with the rounding step */ - if (round) { - if (extended_result) { - TCGv_i64 tcg_zero = tcg_constant_i64(0); - if (!is_u) { - /* take care of sign extending tcg_res */ - tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63); - tcg_gen_add2_i64(tcg_src, tcg_src_hi, - tcg_src, tcg_src_hi, - tcg_rnd, tcg_zero); - } else { - tcg_gen_add2_i64(tcg_src, tcg_src_hi, - tcg_src, tcg_zero, - tcg_rnd, tcg_zero); - } - } else { - tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd); - } - } - - /* Now do the shift right */ - if (round && extended_result) { - /* extended case, >64 bit precision required */ - if (ext_lshift == 0) { - /* special case, only high bits matter */ - tcg_gen_mov_i64(tcg_src, tcg_src_hi); - } else { - tcg_gen_shri_i64(tcg_src, tcg_src, shift); - tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift); - tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi); - } - } else { - if (is_u) { - if (shift == 64) { - /* essentially shifting in 64 zeros */ - tcg_gen_movi_i64(tcg_src, 0); - } else { - tcg_gen_shri_i64(tcg_src, tcg_src, shift); - } - } else { - if (shift == 64) { - /* effectively extending the sign-bit */ - tcg_gen_sari_i64(tcg_src, tcg_src, 63); - } else { - tcg_gen_sari_i64(tcg_src, tcg_src, shift); - } - } - } - - if (accumulate) { - tcg_gen_add_i64(tcg_res, tcg_res, tcg_src); - } else { - tcg_gen_mov_i64(tcg_res, tcg_src); - } - - if (extended_result) { - tcg_temp_free_i64(tcg_src_hi); - } -} - -/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ -static void handle_scalar_simd_shri(DisasContext *s, - bool is_u, int immh, int immb, - int opcode, int rn, int rd) -{ - const int size = 3; - int immhb = immh << 3 | immb; - int shift = 2 * (8 << size) - immhb; - bool accumulate = false; - bool round = false; - bool insert = false; - TCGv_i64 tcg_rn; - TCGv_i64 tcg_rd; - TCGv_i64 tcg_round; - - if (!extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0x02: /* SSRA / USRA (accumulate) */ - accumulate = true; - break; - case 0x04: /* SRSHR / URSHR (rounding) */ - round = true; - break; - case 0x06: /* SRSRA / URSRA (accum + rounding) */ - accumulate = round = true; - break; - case 0x08: /* SRI */ - insert = true; - break; - } - - if (round) { - tcg_round = tcg_constant_i64(1ULL << (shift - 1)); - } else { - tcg_round = NULL; - } - - tcg_rn = read_fp_dreg(s, rn); - tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - - if (insert) { - /* shift count same as element size is valid but does nothing; - * special case to avoid potential shift by 64. 
- */ - int esize = 8 << size; - if (shift != esize) { - tcg_gen_shri_i64(tcg_rn, tcg_rn, shift); - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift); - } - } else { - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - accumulate, is_u, size, shift); - } - - write_fp_dreg(s, rd, tcg_rd); - - tcg_temp_free_i64(tcg_rn); - tcg_temp_free_i64(tcg_rd); -} - -/* SHL/SLI - Scalar shift left */ -static void handle_scalar_simd_shli(DisasContext *s, bool insert, - int immh, int immb, int opcode, - int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = immhb - (8 << size); - TCGv_i64 tcg_rn; - TCGv_i64 tcg_rd; - - if (!extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_rn = read_fp_dreg(s, rn); - tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - - if (insert) { - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift); - } else { - tcg_gen_shli_i64(tcg_rd, tcg_rn, shift); - } - - write_fp_dreg(s, rd, tcg_rd); - - tcg_temp_free_i64(tcg_rn); - tcg_temp_free_i64(tcg_rd); -} - -/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with - * (signed/unsigned) narrowing */ -static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, - bool is_u_shift, bool is_u_narrow, - int immh, int immb, int opcode, - int rn, int rd) -{ - int immhb = immh << 3 | immb; - int size = 32 - clz32(immh) - 1; - int esize = 8 << size; - int shift = (2 * esize) - immhb; - int elements = is_scalar ? 1 : (64 / esize); - bool round = extract32(opcode, 0, 1); - MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN); - TCGv_i64 tcg_rn, tcg_rd, tcg_round; - TCGv_i32 tcg_rd_narrowed; - TCGv_i64 tcg_final; - - static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = { - { gen_helper_neon_narrow_sat_s8, - gen_helper_neon_unarrow_sat8 }, - { gen_helper_neon_narrow_sat_s16, - gen_helper_neon_unarrow_sat16 }, - { gen_helper_neon_narrow_sat_s32, - gen_helper_neon_unarrow_sat32 }, - { NULL, NULL }, - }; - static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = { - gen_helper_neon_narrow_sat_u8, - gen_helper_neon_narrow_sat_u16, - gen_helper_neon_narrow_sat_u32, - NULL - }; - NeonGenNarrowEnvFn *narrowfn; - - int i; - - assert(size < 4); - - if (extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (is_u_shift) { - narrowfn = unsigned_narrow_fns[size]; - } else { - narrowfn = signed_narrow_fns[size][is_u_narrow ? 
1 : 0]; - } - - tcg_rn = tcg_temp_new_i64(); - tcg_rd = tcg_temp_new_i64(); - tcg_rd_narrowed = tcg_temp_new_i32(); - tcg_final = tcg_const_i64(0); - - if (round) { - tcg_round = tcg_constant_i64(1ULL << (shift - 1)); - } else { - tcg_round = NULL; - } - - for (i = 0; i < elements; i++) { - read_vec_element(s, tcg_rn, rn, i, ldop); - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - false, is_u_shift, size+1, shift); - narrowfn(tcg_rd_narrowed, cpu_env, tcg_rd); - tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); - tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); - } - - if (!is_q) { - write_vec_element(s, tcg_final, rd, 0, MO_64); - } else { - write_vec_element(s, tcg_final, rd, 1, MO_64); - } - - tcg_temp_free_i64(tcg_rn); - tcg_temp_free_i64(tcg_rd); - tcg_temp_free_i32(tcg_rd_narrowed); - tcg_temp_free_i64(tcg_final); - - clear_vec_high(s, is_q, rd); -} - -/* SQSHLU, UQSHL, SQSHL: saturating left shifts */ -static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q, - bool src_unsigned, bool dst_unsigned, - int immh, int immb, int rn, int rd) -{ - int immhb = immh << 3 | immb; - int size = 32 - clz32(immh) - 1; - int shift = immhb - (8 << size); - int pass; - - assert(immh != 0); - assert(!(scalar && is_q)); - - if (!scalar) { - if (!is_q && extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - /* Since we use the variable-shift helpers we must - * replicate the shift count into each element of - * the tcg_shift value. - */ - switch (size) { - case 0: - shift |= shift << 8; - /* fall through */ - case 1: - shift |= shift << 16; - break; - case 2: - case 3: - break; - default: - g_assert_not_reached(); - } - } - - if (!fp_access_check(s)) { - return; - } - - if (size == 3) { - TCGv_i64 tcg_shift = tcg_constant_i64(shift); - static NeonGenTwo64OpEnvFn * const fns[2][2] = { - { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 }, - { NULL, gen_helper_neon_qshl_u64 }, - }; - NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned]; - int maxpass = is_q ? 2 : 1; - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op, rn, pass, MO_64); - genfn(tcg_op, cpu_env, tcg_op, tcg_shift); - write_vec_element(s, tcg_op, rd, pass, MO_64); - - tcg_temp_free_i64(tcg_op); - } - clear_vec_high(s, is_q, rd); - } else { - TCGv_i32 tcg_shift = tcg_constant_i32(shift); - static NeonGenTwoOpEnvFn * const fns[2][2][3] = { - { - { gen_helper_neon_qshl_s8, - gen_helper_neon_qshl_s16, - gen_helper_neon_qshl_s32 }, - { gen_helper_neon_qshlu_s8, - gen_helper_neon_qshlu_s16, - gen_helper_neon_qshlu_s32 } - }, { - { NULL, NULL, NULL }, - { gen_helper_neon_qshl_u8, - gen_helper_neon_qshl_u16, - gen_helper_neon_qshl_u32 } - } - }; - NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size]; - MemOp memop = scalar ? size : MO_32; - int maxpass = scalar ? 1 : is_q ? 
4 : 2; - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, pass, memop); - genfn(tcg_op, cpu_env, tcg_op, tcg_shift); - if (scalar) { - switch (size) { - case 0: - tcg_gen_ext8u_i32(tcg_op, tcg_op); - break; - case 1: - tcg_gen_ext16u_i32(tcg_op, tcg_op); - break; - case 2: - break; - default: - g_assert_not_reached(); - } - write_fp_sreg(s, rd, tcg_op); - } else { - write_vec_element_i32(s, tcg_op, rd, pass, MO_32); - } - - tcg_temp_free_i32(tcg_op); - } - - if (!scalar) { - clear_vec_high(s, is_q, rd); - } - } -} - -/* Common vector code for handling integer to FP conversion */ -static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn, - int elements, int is_signed, - int fracbits, int size) -{ - TCGv_ptr tcg_fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - TCGv_i32 tcg_shift = NULL; - - MemOp mop = size | (is_signed ? MO_SIGN : 0); - int pass; - - if (fracbits || size == MO_64) { - tcg_shift = tcg_constant_i32(fracbits); - } - - if (size == MO_64) { - TCGv_i64 tcg_int64 = tcg_temp_new_i64(); - TCGv_i64 tcg_double = tcg_temp_new_i64(); - - for (pass = 0; pass < elements; pass++) { - read_vec_element(s, tcg_int64, rn, pass, mop); - - if (is_signed) { - gen_helper_vfp_sqtod(tcg_double, tcg_int64, - tcg_shift, tcg_fpst); - } else { - gen_helper_vfp_uqtod(tcg_double, tcg_int64, - tcg_shift, tcg_fpst); - } - if (elements == 1) { - write_fp_dreg(s, rd, tcg_double); - } else { - write_vec_element(s, tcg_double, rd, pass, MO_64); - } - } - - tcg_temp_free_i64(tcg_int64); - tcg_temp_free_i64(tcg_double); - - } else { - TCGv_i32 tcg_int32 = tcg_temp_new_i32(); - TCGv_i32 tcg_float = tcg_temp_new_i32(); - - for (pass = 0; pass < elements; pass++) { - read_vec_element_i32(s, tcg_int32, rn, pass, mop); - - switch (size) { - case MO_32: - if (fracbits) { - if (is_signed) { - gen_helper_vfp_sltos(tcg_float, tcg_int32, - tcg_shift, tcg_fpst); - } else { - gen_helper_vfp_ultos(tcg_float, tcg_int32, - tcg_shift, tcg_fpst); - } - } else { - if (is_signed) { - gen_helper_vfp_sitos(tcg_float, tcg_int32, tcg_fpst); - } else { - gen_helper_vfp_uitos(tcg_float, tcg_int32, tcg_fpst); - } - } - break; - case MO_16: - if (fracbits) { - if (is_signed) { - gen_helper_vfp_sltoh(tcg_float, tcg_int32, - tcg_shift, tcg_fpst); - } else { - gen_helper_vfp_ultoh(tcg_float, tcg_int32, - tcg_shift, tcg_fpst); - } - } else { - if (is_signed) { - gen_helper_vfp_sitoh(tcg_float, tcg_int32, tcg_fpst); - } else { - gen_helper_vfp_uitoh(tcg_float, tcg_int32, tcg_fpst); - } - } - break; - default: - g_assert_not_reached(); - } - - if (elements == 1) { - write_fp_sreg(s, rd, tcg_float); - } else { - write_vec_element_i32(s, tcg_float, rd, pass, size); - } - } - - tcg_temp_free_i32(tcg_int32); - tcg_temp_free_i32(tcg_float); - } - - tcg_temp_free_ptr(tcg_fpst); - - clear_vec_high(s, elements << size == 16, rd); -} - -/* UCVTF/SCVTF - Integer to FP conversion */ -static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar, - bool is_q, bool is_u, - int immh, int immb, int opcode, - int rn, int rd) -{ - int size, elements, fracbits; - int immhb = immh << 3 | immb; - - if (immh & 8) { - size = MO_64; - if (!is_scalar && !is_q) { - unallocated_encoding(s); - return; - } - } else if (immh & 4) { - size = MO_32; - } else if (immh & 2) { - size = MO_16; - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - } else { - /* immh == 0 would be a failure of the decode logic */ - g_assert(immh == 1); - 
unallocated_encoding(s); - return; - } - - if (is_scalar) { - elements = 1; - } else { - elements = (8 << is_q) >> size; - } - fracbits = (16 << size) - immhb; - - if (!fp_access_check(s)) { - return; - } - - handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size); -} - -/* FCVTZS, FVCVTZU - FP to fixedpoint conversion */ -static void handle_simd_shift_fpint_conv(DisasContext *s, bool is_scalar, - bool is_q, bool is_u, - int immh, int immb, int rn, int rd) -{ - int immhb = immh << 3 | immb; - int pass, size, fracbits; - TCGv_ptr tcg_fpstatus; - TCGv_i32 tcg_rmode, tcg_shift; - - if (immh & 0x8) { - size = MO_64; - if (!is_scalar && !is_q) { - unallocated_encoding(s); - return; - } - } else if (immh & 0x4) { - size = MO_32; - } else if (immh & 0x2) { - size = MO_16; - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - } else { - /* Should have split out AdvSIMD modified immediate earlier. */ - assert(immh == 1); - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - assert(!(is_scalar && is_q)); - - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(FPROUNDING_ZERO)); - tcg_fpstatus = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - fracbits = (16 << size) - immhb; - tcg_shift = tcg_constant_i32(fracbits); - - if (size == MO_64) { - int maxpass = is_scalar ? 1 : 2; - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op, rn, pass, MO_64); - if (is_u) { - gen_helper_vfp_touqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); - } else { - gen_helper_vfp_tosqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); - } - write_vec_element(s, tcg_op, rd, pass, MO_64); - tcg_temp_free_i64(tcg_op); - } - clear_vec_high(s, is_q, rd); - } else { - void (*fn)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); - int maxpass = is_scalar ? 
1 : ((8 << is_q) >> size); - - switch (size) { - case MO_16: - if (is_u) { - fn = gen_helper_vfp_touhh; - } else { - fn = gen_helper_vfp_toshh; - } - break; - case MO_32: - if (is_u) { - fn = gen_helper_vfp_touls; - } else { - fn = gen_helper_vfp_tosls; - } - break; - default: - g_assert_not_reached(); - } - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, pass, size); - fn(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); - if (is_scalar) { - write_fp_sreg(s, rd, tcg_op); - } else { - write_vec_element_i32(s, tcg_op, rd, pass, size); - } - tcg_temp_free_i32(tcg_op); - } - if (!is_scalar) { - clear_vec_high(s, is_q, rd); - } - } - - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - tcg_temp_free_ptr(tcg_fpstatus); - tcg_temp_free_i32(tcg_rmode); -} - -/* AdvSIMD scalar shift by immediate - * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 - * +-----+---+-------------+------+------+--------+---+------+------+ - * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd | - * +-----+---+-------------+------+------+--------+---+------+------+ - * - * This is the scalar version so it works on a fixed sized registers - */ -static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 11, 5); - int immb = extract32(insn, 16, 3); - int immh = extract32(insn, 19, 4); - bool is_u = extract32(insn, 29, 1); - - if (immh == 0) { - unallocated_encoding(s); - return; - } - - switch (opcode) { - case 0x08: /* SRI */ - if (!is_u) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x00: /* SSHR / USHR */ - case 0x02: /* SSRA / USRA */ - case 0x04: /* SRSHR / URSHR */ - case 0x06: /* SRSRA / URSRA */ - handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd); - break; - case 0x0a: /* SHL / SLI */ - handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd); - break; - case 0x1c: /* SCVTF, UCVTF */ - handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb, - opcode, rn, rd); - break; - case 0x10: /* SQSHRUN, SQSHRUN2 */ - case 0x11: /* SQRSHRUN, SQRSHRUN2 */ - if (!is_u) { - unallocated_encoding(s); - return; - } - handle_vec_simd_sqshrn(s, true, false, false, true, - immh, immb, opcode, rn, rd); - break; - case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */ - case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */ - handle_vec_simd_sqshrn(s, true, false, is_u, is_u, - immh, immb, opcode, rn, rd); - break; - case 0xc: /* SQSHLU */ - if (!is_u) { - unallocated_encoding(s); - return; - } - handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd); - break; - case 0xe: /* SQSHL, UQSHL */ - handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd); - break; - case 0x1f: /* FCVTZS, FCVTZU */ - handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd); - break; - default: - unallocated_encoding(s); - break; - } -} - -/* AdvSIMD scalar three different - * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 - * +-----+---+-----------+------+---+------+--------+-----+------+------+ - * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd | - * +-----+---+-----------+------+---+------+--------+-----+------+------+ - */ -static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn) -{ - bool is_u = extract32(insn, 29, 1); - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 4); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - 
int rd = extract32(insn, 0, 5); - - if (is_u) { - unallocated_encoding(s); - return; - } - - switch (opcode) { - case 0x9: /* SQDMLAL, SQDMLAL2 */ - case 0xb: /* SQDMLSL, SQDMLSL2 */ - case 0xd: /* SQDMULL, SQDMULL2 */ - if (size == 0 || size == 3) { - unallocated_encoding(s); - return; - } - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (size == 2) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op1, rn, 0, MO_32 | MO_SIGN); - read_vec_element(s, tcg_op2, rm, 0, MO_32 | MO_SIGN); - - tcg_gen_mul_i64(tcg_res, tcg_op1, tcg_op2); - gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, tcg_res, tcg_res); - - switch (opcode) { - case 0xd: /* SQDMULL, SQDMULL2 */ - break; - case 0xb: /* SQDMLSL, SQDMLSL2 */ - tcg_gen_neg_i64(tcg_res, tcg_res); - /* fall through */ - case 0x9: /* SQDMLAL, SQDMLAL2 */ - read_vec_element(s, tcg_op1, rd, 0, MO_64); - gen_helper_neon_addl_saturate_s64(tcg_res, cpu_env, - tcg_res, tcg_op1); - break; - default: - g_assert_not_reached(); - } - - write_fp_dreg(s, rd, tcg_res); - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_res); - } else { - TCGv_i32 tcg_op1 = read_fp_hreg(s, rn); - TCGv_i32 tcg_op2 = read_fp_hreg(s, rm); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - gen_helper_neon_mull_s16(tcg_res, tcg_op1, tcg_op2); - gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, tcg_res, tcg_res); - - switch (opcode) { - case 0xd: /* SQDMULL, SQDMULL2 */ - break; - case 0xb: /* SQDMLSL, SQDMLSL2 */ - gen_helper_neon_negl_u32(tcg_res, tcg_res); - /* fall through */ - case 0x9: /* SQDMLAL, SQDMLAL2 */ - { - TCGv_i64 tcg_op3 = tcg_temp_new_i64(); - read_vec_element(s, tcg_op3, rd, 0, MO_32); - gen_helper_neon_addl_saturate_s32(tcg_res, cpu_env, - tcg_res, tcg_op3); - tcg_temp_free_i64(tcg_op3); - break; - } - default: - g_assert_not_reached(); - } - - tcg_gen_ext32u_i64(tcg_res, tcg_res); - write_fp_dreg(s, rd, tcg_res); - - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i64(tcg_res); - } -} - -static void handle_3same_64(DisasContext *s, int opcode, bool u, - TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm) -{ - /* Handle 64x64->64 opcodes which are shared between the scalar - * and vector 3-same groups. We cover every opcode where size == 3 - * is valid in either the three-reg-same (integer, not pairwise) - * or scalar-three-reg-same groups. - */ - TCGCond cond; - - switch (opcode) { - case 0x1: /* SQADD */ - if (u) { - gen_helper_neon_qadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } else { - gen_helper_neon_qadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } - break; - case 0x5: /* SQSUB */ - if (u) { - gen_helper_neon_qsub_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } else { - gen_helper_neon_qsub_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } - break; - case 0x6: /* CMGT, CMHI */ - /* 64 bit integer comparison, result = test ? (2^64 - 1) : 0. - * We implement this using setcond (test) and then negating. - */ - cond = u ? TCG_COND_GTU : TCG_COND_GT; - do_cmop: - tcg_gen_setcond_i64(cond, tcg_rd, tcg_rn, tcg_rm); - tcg_gen_neg_i64(tcg_rd, tcg_rd); - break; - case 0x7: /* CMGE, CMHS */ - cond = u ? 
TCG_COND_GEU : TCG_COND_GE; - goto do_cmop; - case 0x11: /* CMTST, CMEQ */ - if (u) { - cond = TCG_COND_EQ; - goto do_cmop; - } - gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm); - break; - case 0x8: /* SSHL, USHL */ - if (u) { - gen_ushl_i64(tcg_rd, tcg_rn, tcg_rm); - } else { - gen_sshl_i64(tcg_rd, tcg_rn, tcg_rm); - } - break; - case 0x9: /* SQSHL, UQSHL */ - if (u) { - gen_helper_neon_qshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } else { - gen_helper_neon_qshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } - break; - case 0xa: /* SRSHL, URSHL */ - if (u) { - gen_helper_neon_rshl_u64(tcg_rd, tcg_rn, tcg_rm); - } else { - gen_helper_neon_rshl_s64(tcg_rd, tcg_rn, tcg_rm); - } - break; - case 0xb: /* SQRSHL, UQRSHL */ - if (u) { - gen_helper_neon_qrshl_u64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } else { - gen_helper_neon_qrshl_s64(tcg_rd, cpu_env, tcg_rn, tcg_rm); - } - break; - case 0x10: /* ADD, SUB */ - if (u) { - tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm); - } else { - tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm); - } - break; - default: - g_assert_not_reached(); - } -} - -/* Handle the 3-same-operands float operations; shared by the scalar - * and vector encodings. The caller must filter out any encodings - * not allocated for the encoding it is dealing with. - */ -static void handle_3same_float(DisasContext *s, int size, int elements, - int fpopcode, int rd, int rn, int rm) -{ - int pass; - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - - for (pass = 0; pass < elements; pass++) { - if (size) { - /* Double */ - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element(s, tcg_op2, rm, pass, MO_64); - - switch (fpopcode) { - case 0x39: /* FMLS */ - /* As usual for ARM, separate negation for fused multiply-add */ - gen_helper_vfp_negd(tcg_op1, tcg_op1); - /* fall through */ - case 0x19: /* FMLA */ - read_vec_element(s, tcg_res, rd, pass, MO_64); - gen_helper_vfp_muladdd(tcg_res, tcg_op1, tcg_op2, - tcg_res, fpst); - break; - case 0x18: /* FMAXNM */ - gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1a: /* FADD */ - gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1b: /* FMULX */ - gen_helper_vfp_mulxd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1c: /* FCMEQ */ - gen_helper_neon_ceq_f64(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1e: /* FMAX */ - gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1f: /* FRECPS */ - gen_helper_recpsf_f64(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x38: /* FMINNM */ - gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3a: /* FSUB */ - gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3e: /* FMIN */ - gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3f: /* FRSQRTS */ - gen_helper_rsqrtsf_f64(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5b: /* FMUL */ - gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5c: /* FCMGE */ - gen_helper_neon_cge_f64(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5d: /* FACGE */ - gen_helper_neon_acge_f64(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5f: /* FDIV */ - gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7a: /* FABD */ - gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst); - gen_helper_vfp_absd(tcg_res, tcg_res); - break; - case 0x7c: /* FCMGT */ - gen_helper_neon_cgt_f64(tcg_res, tcg_op1, tcg_op2, 
fpst); - break; - case 0x7d: /* FACGT */ - gen_helper_neon_acgt_f64(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - write_vec_element(s, tcg_res, rd, pass, MO_64); - - tcg_temp_free_i64(tcg_res); - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - } else { - /* Single */ - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op1, rn, pass, MO_32); - read_vec_element_i32(s, tcg_op2, rm, pass, MO_32); - - switch (fpopcode) { - case 0x39: /* FMLS */ - /* As usual for ARM, separate negation for fused multiply-add */ - gen_helper_vfp_negs(tcg_op1, tcg_op1); - /* fall through */ - case 0x19: /* FMLA */ - read_vec_element_i32(s, tcg_res, rd, pass, MO_32); - gen_helper_vfp_muladds(tcg_res, tcg_op1, tcg_op2, - tcg_res, fpst); - break; - case 0x1a: /* FADD */ - gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1b: /* FMULX */ - gen_helper_vfp_mulxs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1c: /* FCMEQ */ - gen_helper_neon_ceq_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1e: /* FMAX */ - gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1f: /* FRECPS */ - gen_helper_recpsf_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x18: /* FMAXNM */ - gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x38: /* FMINNM */ - gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3a: /* FSUB */ - gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3e: /* FMIN */ - gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3f: /* FRSQRTS */ - gen_helper_rsqrtsf_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5b: /* FMUL */ - gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5c: /* FCMGE */ - gen_helper_neon_cge_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5d: /* FACGE */ - gen_helper_neon_acge_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x5f: /* FDIV */ - gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7a: /* FABD */ - gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst); - gen_helper_vfp_abss(tcg_res, tcg_res); - break; - case 0x7c: /* FCMGT */ - gen_helper_neon_cgt_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7d: /* FACGT */ - gen_helper_neon_acgt_f32(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - if (elements == 1) { - /* scalar single so clear high part */ - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - - tcg_gen_extu_i32_i64(tcg_tmp, tcg_res); - write_vec_element(s, tcg_tmp, rd, pass, MO_64); - tcg_temp_free_i64(tcg_tmp); - } else { - write_vec_element_i32(s, tcg_res, rd, pass, MO_32); - } - - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - } - } - - tcg_temp_free_ptr(fpst); - - clear_vec_high(s, elements * (size ? 
8 : 4) > 8, rd); -} - -/* AdvSIMD scalar three same - * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0 - * +-----+---+-----------+------+---+------+--------+---+------+------+ - * | 0 1 | U | 1 1 1 1 0 | size | 1 | Rm | opcode | 1 | Rn | Rd | - * +-----+---+-----------+------+---+------+--------+---+------+------+ - */ -static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 11, 5); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 22, 2); - bool u = extract32(insn, 29, 1); - TCGv_i64 tcg_rd; - - if (opcode >= 0x18) { - /* Floating point: U, size[1] and opcode indicate operation */ - int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6); - switch (fpopcode) { - case 0x1b: /* FMULX */ - case 0x1f: /* FRECPS */ - case 0x3f: /* FRSQRTS */ - case 0x5d: /* FACGE */ - case 0x7d: /* FACGT */ - case 0x1c: /* FCMEQ */ - case 0x5c: /* FCMGE */ - case 0x7c: /* FCMGT */ - case 0x7a: /* FABD */ - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - handle_3same_float(s, extract32(size, 0, 1), 1, fpopcode, rd, rn, rm); - return; - } - - switch (opcode) { - case 0x1: /* SQADD, UQADD */ - case 0x5: /* SQSUB, UQSUB */ - case 0x9: /* SQSHL, UQSHL */ - case 0xb: /* SQRSHL, UQRSHL */ - break; - case 0x8: /* SSHL, USHL */ - case 0xa: /* SRSHL, URSHL */ - case 0x6: /* CMGT, CMHI */ - case 0x7: /* CMGE, CMHS */ - case 0x11: /* CMTST, CMEQ */ - case 0x10: /* ADD, SUB (vector) */ - if (size != 3) { - unallocated_encoding(s); - return; - } - break; - case 0x16: /* SQDMULH, SQRDMULH (vector) */ - if (size != 1 && size != 2) { - unallocated_encoding(s); - return; - } - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_rd = tcg_temp_new_i64(); - - if (size == 3) { - TCGv_i64 tcg_rn = read_fp_dreg(s, rn); - TCGv_i64 tcg_rm = read_fp_dreg(s, rm); - - handle_3same_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rm); - tcg_temp_free_i64(tcg_rn); - tcg_temp_free_i64(tcg_rm); - } else { - /* Do a single operation on the lowest element in the vector. - * We use the standard Neon helpers and rely on 0 OP 0 == 0 with - * no side effects for all these operations. - * OPTME: special-purpose helpers would avoid doing some - * unnecessary work in the helper for the 8 and 16 bit cases. 
- */ - NeonGenTwoOpEnvFn *genenvfn; - TCGv_i32 tcg_rn = tcg_temp_new_i32(); - TCGv_i32 tcg_rm = tcg_temp_new_i32(); - TCGv_i32 tcg_rd32 = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_rn, rn, 0, size); - read_vec_element_i32(s, tcg_rm, rm, 0, size); - - switch (opcode) { - case 0x1: /* SQADD, UQADD */ - { - static NeonGenTwoOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qadd_s8, gen_helper_neon_qadd_u8 }, - { gen_helper_neon_qadd_s16, gen_helper_neon_qadd_u16 }, - { gen_helper_neon_qadd_s32, gen_helper_neon_qadd_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - case 0x5: /* SQSUB, UQSUB */ - { - static NeonGenTwoOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qsub_s8, gen_helper_neon_qsub_u8 }, - { gen_helper_neon_qsub_s16, gen_helper_neon_qsub_u16 }, - { gen_helper_neon_qsub_s32, gen_helper_neon_qsub_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - case 0x9: /* SQSHL, UQSHL */ - { - static NeonGenTwoOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 }, - { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 }, - { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - case 0xb: /* SQRSHL, UQRSHL */ - { - static NeonGenTwoOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 }, - { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 }, - { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - case 0x16: /* SQDMULH, SQRDMULH */ - { - static NeonGenTwoOpEnvFn * const fns[2][2] = { - { gen_helper_neon_qdmulh_s16, gen_helper_neon_qrdmulh_s16 }, - { gen_helper_neon_qdmulh_s32, gen_helper_neon_qrdmulh_s32 }, - }; - assert(size == 1 || size == 2); - genenvfn = fns[size - 1][u]; - break; - } - default: - g_assert_not_reached(); - } - - genenvfn(tcg_rd32, cpu_env, tcg_rn, tcg_rm); - tcg_gen_extu_i32_i64(tcg_rd, tcg_rd32); - tcg_temp_free_i32(tcg_rd32); - tcg_temp_free_i32(tcg_rn); - tcg_temp_free_i32(tcg_rm); - } - - write_fp_dreg(s, rd, tcg_rd); - - tcg_temp_free_i64(tcg_rd); -} - -/* AdvSIMD scalar three same FP16 - * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0 - * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+ - * | 0 1 | U | 1 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd | - * +-----+---+-----------+---+-----+------+-----+--------+---+----+----+ - * v: 0101 1110 0100 0000 0000 0100 0000 0000 => 5e400400 - * m: 1101 1111 0110 0000 1100 0100 0000 0000 => df60c400 - */ -static void disas_simd_scalar_three_reg_same_fp16(DisasContext *s, - uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 11, 3); - int rm = extract32(insn, 16, 5); - bool u = extract32(insn, 29, 1); - bool a = extract32(insn, 23, 1); - int fpopcode = opcode | (a << 3) | (u << 4); - TCGv_ptr fpst; - TCGv_i32 tcg_op1; - TCGv_i32 tcg_op2; - TCGv_i32 tcg_res; - - switch (fpopcode) { - case 0x03: /* FMULX */ - case 0x04: /* FCMEQ (reg) */ - case 0x07: /* FRECPS */ - case 0x0f: /* FRSQRTS */ - case 0x14: /* FCMGE (reg) */ - case 0x15: /* FACGE */ - case 0x1a: /* FABD */ - case 0x1c: /* FCMGT (reg) */ - case 0x1d: /* FACGT */ - break; - default: - unallocated_encoding(s); - return; - } - - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - } - - if (!fp_access_check(s)) { - return; - } - - fpst = fpstatus_ptr(FPST_FPCR_F16); - - tcg_op1 = read_fp_hreg(s, rn); - tcg_op2 = read_fp_hreg(s, rm); - tcg_res = tcg_temp_new_i32(); - - switch (fpopcode) { 
- case 0x03: /* FMULX */ - gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x04: /* FCMEQ (reg) */ - gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x07: /* FRECPS */ - gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x0f: /* FRSQRTS */ - gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x14: /* FCMGE (reg) */ - gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x15: /* FACGE */ - gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1a: /* FABD */ - gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); - tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff); - break; - case 0x1c: /* FCMGT (reg) */ - gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1d: /* FACGT */ - gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - write_fp_sreg(s, rd, tcg_res); - - - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_ptr(fpst); -} - -/* AdvSIMD scalar three same extra - * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 - * +-----+---+-----------+------+---+------+---+--------+---+----+----+ - * | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | - * +-----+---+-----------+------+---+------+---+--------+---+----+----+ - */ -static void disas_simd_scalar_three_reg_same_extra(DisasContext *s, - uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 11, 4); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 22, 2); - bool u = extract32(insn, 29, 1); - TCGv_i32 ele1, ele2, ele3; - TCGv_i64 res; - bool feature; - - switch (u * 16 + opcode) { - case 0x10: /* SQRDMLAH (vector) */ - case 0x11: /* SQRDMLSH (vector) */ - if (size != 1 && size != 2) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_rdm, s); - break; - default: - unallocated_encoding(s); - return; - } - if (!feature) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - - /* Do a single operation on the lowest element in the vector. - * We use the standard Neon helpers and rely on 0 OP 0 == 0 - * with no side effects for all these operations. - * OPTME: special-purpose helpers would avoid doing some - * unnecessary work in the helper for the 16 bit cases. 
- */ - ele1 = tcg_temp_new_i32(); - ele2 = tcg_temp_new_i32(); - ele3 = tcg_temp_new_i32(); - - read_vec_element_i32(s, ele1, rn, 0, size); - read_vec_element_i32(s, ele2, rm, 0, size); - read_vec_element_i32(s, ele3, rd, 0, size); - - switch (opcode) { - case 0x0: /* SQRDMLAH */ - if (size == 1) { - gen_helper_neon_qrdmlah_s16(ele3, cpu_env, ele1, ele2, ele3); - } else { - gen_helper_neon_qrdmlah_s32(ele3, cpu_env, ele1, ele2, ele3); - } - break; - case 0x1: /* SQRDMLSH */ - if (size == 1) { - gen_helper_neon_qrdmlsh_s16(ele3, cpu_env, ele1, ele2, ele3); - } else { - gen_helper_neon_qrdmlsh_s32(ele3, cpu_env, ele1, ele2, ele3); - } - break; - default: - g_assert_not_reached(); - } - tcg_temp_free_i32(ele1); - tcg_temp_free_i32(ele2); - - res = tcg_temp_new_i64(); - tcg_gen_extu_i32_i64(res, ele3); - tcg_temp_free_i32(ele3); - - write_fp_dreg(s, rd, res); - tcg_temp_free_i64(res); -} - -static void handle_2misc_64(DisasContext *s, int opcode, bool u, - TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, - TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus) -{ - /* Handle 64->64 opcodes which are shared between the scalar and - * vector 2-reg-misc groups. We cover every integer opcode where size == 3 - * is valid in either group and also the double-precision fp ops. - * The caller only need provide tcg_rmode and tcg_fpstatus if the op - * requires them. - */ - TCGCond cond; - - switch (opcode) { - case 0x4: /* CLS, CLZ */ - if (u) { - tcg_gen_clzi_i64(tcg_rd, tcg_rn, 64); - } else { - tcg_gen_clrsb_i64(tcg_rd, tcg_rn); - } - break; - case 0x5: /* NOT */ - /* This opcode is shared with CNT and RBIT but we have earlier - * enforced that size == 3 if and only if this is the NOT insn. - */ - tcg_gen_not_i64(tcg_rd, tcg_rn); - break; - case 0x7: /* SQABS, SQNEG */ - if (u) { - gen_helper_neon_qneg_s64(tcg_rd, cpu_env, tcg_rn); - } else { - gen_helper_neon_qabs_s64(tcg_rd, cpu_env, tcg_rn); - } - break; - case 0xa: /* CMLT */ - /* 64 bit integer comparison against zero, result is - * test ? (2^64 - 1) : 0. We implement via setcond(!test) and - * subtracting 1. - */ - cond = TCG_COND_LT; - do_cmop: - tcg_gen_setcondi_i64(cond, tcg_rd, tcg_rn, 0); - tcg_gen_neg_i64(tcg_rd, tcg_rd); - break; - case 0x8: /* CMGT, CMGE */ - cond = u ? TCG_COND_GE : TCG_COND_GT; - goto do_cmop; - case 0x9: /* CMEQ, CMLE */ - cond = u ? 
TCG_COND_LE : TCG_COND_EQ; - goto do_cmop; - case 0xb: /* ABS, NEG */ - if (u) { - tcg_gen_neg_i64(tcg_rd, tcg_rn); - } else { - tcg_gen_abs_i64(tcg_rd, tcg_rn); - } - break; - case 0x2f: /* FABS */ - gen_helper_vfp_absd(tcg_rd, tcg_rn); - break; - case 0x6f: /* FNEG */ - gen_helper_vfp_negd(tcg_rd, tcg_rn); - break; - case 0x7f: /* FSQRT */ - gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env); - break; - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - gen_helper_vfp_tosqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus); - break; - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - gen_helper_vfp_touqd(tcg_rd, tcg_rn, tcg_constant_i32(0), tcg_fpstatus); - break; - case 0x18: /* FRINTN */ - case 0x19: /* FRINTM */ - case 0x38: /* FRINTP */ - case 0x39: /* FRINTZ */ - case 0x58: /* FRINTA */ - case 0x79: /* FRINTI */ - gen_helper_rintd(tcg_rd, tcg_rn, tcg_fpstatus); - break; - case 0x59: /* FRINTX */ - gen_helper_rintd_exact(tcg_rd, tcg_rn, tcg_fpstatus); - break; - case 0x1e: /* FRINT32Z */ - case 0x5e: /* FRINT32X */ - gen_helper_frint32_d(tcg_rd, tcg_rn, tcg_fpstatus); - break; - case 0x1f: /* FRINT64Z */ - case 0x5f: /* FRINT64X */ - gen_helper_frint64_d(tcg_rd, tcg_rn, tcg_fpstatus); - break; - default: - g_assert_not_reached(); - } -} - -static void handle_2misc_fcmp_zero(DisasContext *s, int opcode, - bool is_scalar, bool is_u, bool is_q, - int size, int rn, int rd) -{ - bool is_double = (size == MO_64); - TCGv_ptr fpst; - - if (!fp_access_check(s)) { - return; - } - - fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - - if (is_double) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - TCGv_i64 tcg_zero = tcg_constant_i64(0); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - NeonGenTwoDoubleOpFn *genfn; - bool swap = false; - int pass; - - switch (opcode) { - case 0x2e: /* FCMLT (zero) */ - swap = true; - /* fallthrough */ - case 0x2c: /* FCMGT (zero) */ - genfn = gen_helper_neon_cgt_f64; - break; - case 0x2d: /* FCMEQ (zero) */ - genfn = gen_helper_neon_ceq_f64; - break; - case 0x6d: /* FCMLE (zero) */ - swap = true; - /* fall through */ - case 0x6c: /* FCMGE (zero) */ - genfn = gen_helper_neon_cge_f64; - break; - default: - g_assert_not_reached(); - } - - for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { - read_vec_element(s, tcg_op, rn, pass, MO_64); - if (swap) { - genfn(tcg_res, tcg_zero, tcg_op, fpst); - } else { - genfn(tcg_res, tcg_op, tcg_zero, fpst); - } - write_vec_element(s, tcg_res, rd, pass, MO_64); - } - tcg_temp_free_i64(tcg_res); - tcg_temp_free_i64(tcg_op); - - clear_vec_high(s, !is_scalar, rd); - } else { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i32 tcg_zero = tcg_constant_i32(0); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - NeonGenTwoSingleOpFn *genfn; - bool swap = false; - int pass, maxpasses; - - if (size == MO_16) { - switch (opcode) { - case 0x2e: /* FCMLT (zero) */ - swap = true; - /* fall through */ - case 0x2c: /* FCMGT (zero) */ - genfn = gen_helper_advsimd_cgt_f16; - break; - case 0x2d: /* FCMEQ (zero) */ - genfn = gen_helper_advsimd_ceq_f16; - break; - case 0x6d: /* FCMLE (zero) */ - swap = true; - /* fall through */ - case 0x6c: /* FCMGE (zero) */ - genfn = gen_helper_advsimd_cge_f16; - break; - default: - g_assert_not_reached(); - } - } else { - switch (opcode) { - case 0x2e: /* FCMLT (zero) */ - swap = true; - /* fall through */ - case 0x2c: /* FCMGT (zero) */ - genfn = gen_helper_neon_cgt_f32; - break; - case 0x2d: /* FCMEQ (zero) */ - genfn = gen_helper_neon_ceq_f32; - break; - case 0x6d: /* FCMLE (zero) */ - swap = true; - /* fall through */ - case 0x6c: /* FCMGE (zero) */ - genfn = gen_helper_neon_cge_f32; - break; - default: - g_assert_not_reached(); - } - } - - if (is_scalar) { - maxpasses = 1; - } else { - int vector_size = 8 << is_q; - maxpasses = vector_size >> size; - } - - for (pass = 0; pass < maxpasses; pass++) { - read_vec_element_i32(s, tcg_op, rn, pass, size); - if (swap) { - genfn(tcg_res, tcg_zero, tcg_op, fpst); - } else { - genfn(tcg_res, tcg_op, tcg_zero, fpst); - } - if (is_scalar) { - write_fp_sreg(s, rd, tcg_res); - } else { - write_vec_element_i32(s, tcg_res, rd, pass, size); - } - } - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op); - if (!is_scalar) { - clear_vec_high(s, is_q, rd); - } - } - - tcg_temp_free_ptr(fpst); -} - -static void handle_2misc_reciprocal(DisasContext *s, int opcode, - bool is_scalar, bool is_u, bool is_q, - int size, int rn, int rd) -{ - bool is_double = (size == 3); - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - - if (is_double) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - int pass; - - for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { - read_vec_element(s, tcg_op, rn, pass, MO_64); - switch (opcode) { - case 0x3d: /* FRECPE */ - gen_helper_recpe_f64(tcg_res, tcg_op, fpst); - break; - case 0x3f: /* FRECPX */ - gen_helper_frecpx_f64(tcg_res, tcg_op, fpst); - break; - case 0x7d: /* FRSQRTE */ - gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst); - break; - default: - g_assert_not_reached(); - } - write_vec_element(s, tcg_res, rd, pass, MO_64); - } - tcg_temp_free_i64(tcg_res); - tcg_temp_free_i64(tcg_op); - clear_vec_high(s, !is_scalar, rd); - } else { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - int pass, maxpasses; - - if (is_scalar) { - maxpasses = 1; - } else { - maxpasses = is_q ? 
4 : 2; - } - - for (pass = 0; pass < maxpasses; pass++) { - read_vec_element_i32(s, tcg_op, rn, pass, MO_32); - - switch (opcode) { - case 0x3c: /* URECPE */ - gen_helper_recpe_u32(tcg_res, tcg_op); - break; - case 0x3d: /* FRECPE */ - gen_helper_recpe_f32(tcg_res, tcg_op, fpst); - break; - case 0x3f: /* FRECPX */ - gen_helper_frecpx_f32(tcg_res, tcg_op, fpst); - break; - case 0x7d: /* FRSQRTE */ - gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst); - break; - default: - g_assert_not_reached(); - } - - if (is_scalar) { - write_fp_sreg(s, rd, tcg_res); - } else { - write_vec_element_i32(s, tcg_res, rd, pass, MO_32); - } - } - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op); - if (!is_scalar) { - clear_vec_high(s, is_q, rd); - } - } - tcg_temp_free_ptr(fpst); -} - -static void handle_2misc_narrow(DisasContext *s, bool scalar, - int opcode, bool u, bool is_q, - int size, int rn, int rd) -{ - /* Handle 2-reg-misc ops which are narrowing (so each 2*size element - * in the source becomes a size element in the destination). - */ - int pass; - TCGv_i32 tcg_res[2]; - int destelt = is_q ? 2 : 0; - int passes = scalar ? 1 : 2; - - if (scalar) { - tcg_res[1] = tcg_constant_i32(0); - } - - for (pass = 0; pass < passes; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - NeonGenNarrowFn *genfn = NULL; - NeonGenNarrowEnvFn *genenvfn = NULL; - - if (scalar) { - read_vec_element(s, tcg_op, rn, pass, size + 1); - } else { - read_vec_element(s, tcg_op, rn, pass, MO_64); - } - tcg_res[pass] = tcg_temp_new_i32(); - - switch (opcode) { - case 0x12: /* XTN, SQXTUN */ - { - static NeonGenNarrowFn * const xtnfns[3] = { - gen_helper_neon_narrow_u8, - gen_helper_neon_narrow_u16, - tcg_gen_extrl_i64_i32, - }; - static NeonGenNarrowEnvFn * const sqxtunfns[3] = { - gen_helper_neon_unarrow_sat8, - gen_helper_neon_unarrow_sat16, - gen_helper_neon_unarrow_sat32, - }; - if (u) { - genenvfn = sqxtunfns[size]; - } else { - genfn = xtnfns[size]; - } - break; - } - case 0x14: /* SQXTN, UQXTN */ - { - static NeonGenNarrowEnvFn * const fns[3][2] = { - { gen_helper_neon_narrow_sat_s8, - gen_helper_neon_narrow_sat_u8 }, - { gen_helper_neon_narrow_sat_s16, - gen_helper_neon_narrow_sat_u16 }, - { gen_helper_neon_narrow_sat_s32, - gen_helper_neon_narrow_sat_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - case 0x16: /* FCVTN, FCVTN2 */ - /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */ - if (size == 2) { - gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env); - } else { - TCGv_i32 tcg_lo = tcg_temp_new_i32(); - TCGv_i32 tcg_hi = tcg_temp_new_i32(); - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - TCGv_i32 ahp = get_ahp_flag(); - - tcg_gen_extr_i64_i32(tcg_lo, tcg_hi, tcg_op); - gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, fpst, ahp); - gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, fpst, ahp); - tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16); - tcg_temp_free_i32(tcg_lo); - tcg_temp_free_i32(tcg_hi); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(ahp); - } - break; - case 0x36: /* BFCVTN, BFCVTN2 */ - { - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst); - tcg_temp_free_ptr(fpst); - } - break; - case 0x56: /* FCVTXN, FCVTXN2 */ - /* 64 bit to 32 bit float conversion - * with von Neumann rounding (round to odd) - */ - assert(size == 2); - gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, cpu_env); - break; - default: - g_assert_not_reached(); - } - - if (genfn) { - genfn(tcg_res[pass], tcg_op); - } else if (genenvfn) { - genenvfn(tcg_res[pass], cpu_env, tcg_op); 
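/*
 * Note: the saturating narrows above are the NeonGenNarrowEnvFn variants and
 * take cpu_env because their helpers may set the cumulative saturation flag
 * (FPSR.QC); the plain XTN narrows have no such side effect and use the
 * env-less NeonGenNarrowFn form.
 */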
- } - - tcg_temp_free_i64(tcg_op); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32); - tcg_temp_free_i32(tcg_res[pass]); - } - clear_vec_high(s, is_q, rd); -} - -/* Remaining saturating accumulating ops */ -static void handle_2misc_satacc(DisasContext *s, bool is_scalar, bool is_u, - bool is_q, int size, int rn, int rd) -{ - bool is_double = (size == 3); - - if (is_double) { - TCGv_i64 tcg_rn = tcg_temp_new_i64(); - TCGv_i64 tcg_rd = tcg_temp_new_i64(); - int pass; - - for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { - read_vec_element(s, tcg_rn, rn, pass, MO_64); - read_vec_element(s, tcg_rd, rd, pass, MO_64); - - if (is_u) { /* USQADD */ - gen_helper_neon_uqadd_s64(tcg_rd, cpu_env, tcg_rn, tcg_rd); - } else { /* SUQADD */ - gen_helper_neon_sqadd_u64(tcg_rd, cpu_env, tcg_rn, tcg_rd); - } - write_vec_element(s, tcg_rd, rd, pass, MO_64); - } - tcg_temp_free_i64(tcg_rd); - tcg_temp_free_i64(tcg_rn); - clear_vec_high(s, !is_scalar, rd); - } else { - TCGv_i32 tcg_rn = tcg_temp_new_i32(); - TCGv_i32 tcg_rd = tcg_temp_new_i32(); - int pass, maxpasses; - - if (is_scalar) { - maxpasses = 1; - } else { - maxpasses = is_q ? 4 : 2; - } - - for (pass = 0; pass < maxpasses; pass++) { - if (is_scalar) { - read_vec_element_i32(s, tcg_rn, rn, pass, size); - read_vec_element_i32(s, tcg_rd, rd, pass, size); - } else { - read_vec_element_i32(s, tcg_rn, rn, pass, MO_32); - read_vec_element_i32(s, tcg_rd, rd, pass, MO_32); - } - - if (is_u) { /* USQADD */ - switch (size) { - case 0: - gen_helper_neon_uqadd_s8(tcg_rd, cpu_env, tcg_rn, tcg_rd); - break; - case 1: - gen_helper_neon_uqadd_s16(tcg_rd, cpu_env, tcg_rn, tcg_rd); - break; - case 2: - gen_helper_neon_uqadd_s32(tcg_rd, cpu_env, tcg_rn, tcg_rd); - break; - default: - g_assert_not_reached(); - } - } else { /* SUQADD */ - switch (size) { - case 0: - gen_helper_neon_sqadd_u8(tcg_rd, cpu_env, tcg_rn, tcg_rd); - break; - case 1: - gen_helper_neon_sqadd_u16(tcg_rd, cpu_env, tcg_rn, tcg_rd); - break; - case 2: - gen_helper_neon_sqadd_u32(tcg_rd, cpu_env, tcg_rn, tcg_rd); - break; - default: - g_assert_not_reached(); - } - } - - if (is_scalar) { - write_vec_element(s, tcg_constant_i64(0), rd, 0, MO_64); - } - write_vec_element_i32(s, tcg_rd, rd, pass, MO_32); - } - tcg_temp_free_i32(tcg_rd); - tcg_temp_free_i32(tcg_rn); - clear_vec_high(s, is_q, rd); - } -} - -/* AdvSIMD scalar two reg misc - * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +-----+---+-----------+------+-----------+--------+-----+------+------+ - * | 0 1 | U | 1 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd | - * +-----+---+-----------+------+-----------+--------+-----+------+------+ - */ -static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 12, 5); - int size = extract32(insn, 22, 2); - bool u = extract32(insn, 29, 1); - bool is_fcvt = false; - int rmode; - TCGv_i32 tcg_rmode; - TCGv_ptr tcg_fpstatus; - - switch (opcode) { - case 0x3: /* USQADD / SUQADD*/ - if (!fp_access_check(s)) { - return; - } - handle_2misc_satacc(s, true, u, false, size, rn, rd); - return; - case 0x7: /* SQABS / SQNEG */ - break; - case 0xa: /* CMLT */ - if (u) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x8: /* CMGT, CMGE */ - case 0x9: /* CMEQ, CMLE */ - case 0xb: /* ABS, NEG */ - if (size != 3) { - unallocated_encoding(s); - return; - } - break; - case 0x12: /* SQXTUN */ - if (!u) { - 
unallocated_encoding(s); - return; - } - /* fall through */ - case 0x14: /* SQXTN, UQXTN */ - if (size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_2misc_narrow(s, true, opcode, u, false, size, rn, rd); - return; - case 0xc ... 0xf: - case 0x16 ... 0x1d: - case 0x1f: - /* Floating point: U, size[1] and opcode indicate operation; - * size[0] indicates single or double precision. - */ - opcode |= (extract32(size, 1, 1) << 5) | (u << 6); - size = extract32(size, 0, 1) ? 3 : 2; - switch (opcode) { - case 0x2c: /* FCMGT (zero) */ - case 0x2d: /* FCMEQ (zero) */ - case 0x2e: /* FCMLT (zero) */ - case 0x6c: /* FCMGE (zero) */ - case 0x6d: /* FCMLE (zero) */ - handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd); - return; - case 0x1d: /* SCVTF */ - case 0x5d: /* UCVTF */ - { - bool is_signed = (opcode == 0x1d); - if (!fp_access_check(s)) { - return; - } - handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size); - return; - } - case 0x3d: /* FRECPE */ - case 0x3f: /* FRECPX */ - case 0x7d: /* FRSQRTE */ - if (!fp_access_check(s)) { - return; - } - handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd); - return; - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - is_fcvt = true; - rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); - break; - case 0x1c: /* FCVTAS */ - case 0x5c: /* FCVTAU */ - /* TIEAWAY doesn't fit in the usual rounding mode encoding */ - is_fcvt = true; - rmode = FPROUNDING_TIEAWAY; - break; - case 0x56: /* FCVTXN, FCVTXN2 */ - if (size == 2) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd); - return; - default: - unallocated_encoding(s); - return; - } - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (is_fcvt) { - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); - tcg_fpstatus = fpstatus_ptr(FPST_FPCR); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - } else { - tcg_rmode = NULL; - tcg_fpstatus = NULL; - } - - if (size == 3) { - TCGv_i64 tcg_rn = read_fp_dreg(s, rn); - TCGv_i64 tcg_rd = tcg_temp_new_i64(); - - handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rmode, tcg_fpstatus); - write_fp_dreg(s, rd, tcg_rd); - tcg_temp_free_i64(tcg_rd); - tcg_temp_free_i64(tcg_rn); - } else { - TCGv_i32 tcg_rn = tcg_temp_new_i32(); - TCGv_i32 tcg_rd = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_rn, rn, 0, size); - - switch (opcode) { - case 0x7: /* SQABS, SQNEG */ - { - NeonGenOneOpEnvFn *genfn; - static NeonGenOneOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 }, - { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 }, - { gen_helper_neon_qabs_s32, gen_helper_neon_qneg_s32 }, - }; - genfn = fns[size][u]; - genfn(tcg_rd, cpu_env, tcg_rn); - break; - } - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - gen_helper_vfp_tosls(tcg_rd, tcg_rn, tcg_constant_i32(0), - tcg_fpstatus); - break; - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - gen_helper_vfp_touls(tcg_rd, tcg_rn, tcg_constant_i32(0), - tcg_fpstatus); - break; - default: - g_assert_not_reached(); - } - - 
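/*
 * Note: the single-precision path above reuses the VFP fixed-point conversion
 * helpers with a fracbits argument of zero (tcg_constant_i32(0)) to get plain
 * float-to-integer conversions; the rounding mode for the FCVT* cases was
 * already installed via gen_helper_set_rmode().  The 32-bit result is then
 * written back with write_fp_sreg(), which zero-extends it into the low
 * doubleword of the destination and clears the rest of the vector register,
 * as scalar writes require.
 */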
write_fp_sreg(s, rd, tcg_rd); - tcg_temp_free_i32(tcg_rd); - tcg_temp_free_i32(tcg_rn); - } - - if (is_fcvt) { - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - tcg_temp_free_i32(tcg_rmode); - tcg_temp_free_ptr(tcg_fpstatus); - } -} - -/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ -static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, - int immh, int immb, int opcode, int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = 2 * (8 << size) - immhb; - GVecGen2iFn *gvec_fn; - - if (extract32(immh, 3, 1) && !is_q) { - unallocated_encoding(s); - return; - } - tcg_debug_assert(size <= 3); - - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0x02: /* SSRA / USRA (accumulate) */ - gvec_fn = is_u ? gen_gvec_usra : gen_gvec_ssra; - break; - - case 0x08: /* SRI */ - gvec_fn = gen_gvec_sri; - break; - - case 0x00: /* SSHR / USHR */ - if (is_u) { - if (shift == 8 << size) { - /* Shift count the same size as element size produces zero. */ - tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd), - is_q ? 16 : 8, vec_full_reg_size(s), 0); - return; - } - gvec_fn = tcg_gen_gvec_shri; - } else { - /* Shift count the same size as element size produces all sign. */ - if (shift == 8 << size) { - shift -= 1; - } - gvec_fn = tcg_gen_gvec_sari; - } - break; - - case 0x04: /* SRSHR / URSHR (rounding) */ - gvec_fn = is_u ? gen_gvec_urshr : gen_gvec_srshr; - break; - - case 0x06: /* SRSRA / URSRA (accum + rounding) */ - gvec_fn = is_u ? gen_gvec_ursra : gen_gvec_srsra; - break; - - default: - g_assert_not_reached(); - } - - gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size); -} - -/* SHL/SLI - Vector shift left */ -static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, - int immh, int immb, int opcode, int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = immhb - (8 << size); - - /* Range of size is limited by decode: immh is a non-zero 4 bit field */ - assert(size >= 0 && size <= 3); - - if (extract32(immh, 3, 1) && !is_q) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (insert) { - gen_gvec_fn2i(s, is_q, rd, rn, shift, gen_gvec_sli, size); - } else { - gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size); - } -} - -/* USHLL/SHLL - Vector shift left with widening */ -static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, - int immh, int immb, int opcode, int rn, int rd) -{ - int size = 32 - clz32(immh) - 1; - int immhb = immh << 3 | immb; - int shift = immhb - (8 << size); - int dsize = 64; - int esize = 8 << size; - int elements = dsize/esize; - TCGv_i64 tcg_rn = new_tmp_a64(s); - TCGv_i64 tcg_rd = new_tmp_a64(s); - int i; - - if (size >= 3) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - /* For the LL variants the store is larger than the load, - * so if rd == rn we would overwrite parts of our input. - * So load everything right now and use shifts in the main loop. - */ - read_vec_element(s, tcg_rn, rn, is_q ? 
1 : 0, MO_64); - - for (i = 0; i < elements; i++) { - tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize); - ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0); - tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); - write_vec_element(s, tcg_rd, rd, i, size + 1); - } -} - -/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ -static void handle_vec_simd_shrn(DisasContext *s, bool is_q, - int immh, int immb, int opcode, int rn, int rd) -{ - int immhb = immh << 3 | immb; - int size = 32 - clz32(immh) - 1; - int dsize = 64; - int esize = 8 << size; - int elements = dsize/esize; - int shift = (2 * esize) - immhb; - bool round = extract32(opcode, 0, 1); - TCGv_i64 tcg_rn, tcg_rd, tcg_final; - TCGv_i64 tcg_round; - int i; - - if (extract32(immh, 3, 1)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - tcg_rn = tcg_temp_new_i64(); - tcg_rd = tcg_temp_new_i64(); - tcg_final = tcg_temp_new_i64(); - read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64); - - if (round) { - tcg_round = tcg_constant_i64(1ULL << (shift - 1)); - } else { - tcg_round = NULL; - } - - for (i = 0; i < elements; i++) { - read_vec_element(s, tcg_rn, rn, i, size+1); - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - false, true, size+1, shift); - - tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); - } - - if (!is_q) { - write_vec_element(s, tcg_final, rd, 0, MO_64); - } else { - write_vec_element(s, tcg_final, rd, 1, MO_64); - } - tcg_temp_free_i64(tcg_rn); - tcg_temp_free_i64(tcg_rd); - tcg_temp_free_i64(tcg_final); - - clear_vec_high(s, is_q, rd); -} - - -/* AdvSIMD shift by immediate - * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 - * +---+---+---+-------------+------+------+--------+---+------+------+ - * | 0 | Q | U | 0 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd | - * +---+---+---+-------------+------+------+--------+---+------+------+ - */ -static void disas_simd_shift_imm(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 11, 5); - int immb = extract32(insn, 16, 3); - int immh = extract32(insn, 19, 4); - bool is_u = extract32(insn, 29, 1); - bool is_q = extract32(insn, 30, 1); - - /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. 
*/ - assert(immh != 0); - - switch (opcode) { - case 0x08: /* SRI */ - if (!is_u) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x00: /* SSHR / USHR */ - case 0x02: /* SSRA / USRA (accumulate) */ - case 0x04: /* SRSHR / URSHR (rounding) */ - case 0x06: /* SRSRA / URSRA (accum + rounding) */ - handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd); - break; - case 0x0a: /* SHL / SLI */ - handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd); - break; - case 0x10: /* SHRN */ - case 0x11: /* RSHRN / SQRSHRUN */ - if (is_u) { - handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb, - opcode, rn, rd); - } else { - handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd); - } - break; - case 0x12: /* SQSHRN / UQSHRN */ - case 0x13: /* SQRSHRN / UQRSHRN */ - handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb, - opcode, rn, rd); - break; - case 0x14: /* SSHLL / USHLL */ - handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd); - break; - case 0x1c: /* SCVTF / UCVTF */ - handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb, - opcode, rn, rd); - break; - case 0xc: /* SQSHLU */ - if (!is_u) { - unallocated_encoding(s); - return; - } - handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd); - break; - case 0xe: /* SQSHL, UQSHL */ - handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd); - break; - case 0x1f: /* FCVTZS/ FCVTZU */ - handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd); - return; - default: - unallocated_encoding(s); - return; - } -} - -/* Generate code to do a "long" addition or subtraction, ie one done in - * TCGv_i64 on vector lanes twice the width specified by size. - */ -static void gen_neon_addl(int size, bool is_sub, TCGv_i64 tcg_res, - TCGv_i64 tcg_op1, TCGv_i64 tcg_op2) -{ - static NeonGenTwo64OpFn * const fns[3][2] = { - { gen_helper_neon_addl_u16, gen_helper_neon_subl_u16 }, - { gen_helper_neon_addl_u32, gen_helper_neon_subl_u32 }, - { tcg_gen_add_i64, tcg_gen_sub_i64 }, - }; - NeonGenTwo64OpFn *genfn; - assert(size < 3); - - genfn = fns[size][is_sub]; - genfn(tcg_res, tcg_op1, tcg_op2); -} - -static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size, - int opcode, int rd, int rn, int rm) -{ - /* 3-reg-different widening insns: 64 x 64 -> 128 */ - TCGv_i64 tcg_res[2]; - int pass, accop; - - tcg_res[0] = tcg_temp_new_i64(); - tcg_res[1] = tcg_temp_new_i64(); - - /* Does this op do an adding accumulate, a subtracting accumulate, - * or no accumulate at all? - */ - switch (opcode) { - case 5: - case 8: - case 9: - accop = 1; - break; - case 10: - case 11: - accop = -1; - break; - default: - accop = 0; - break; - } - - if (accop != 0) { - read_vec_element(s, tcg_res[0], rd, 0, MO_64); - read_vec_element(s, tcg_res[1], rd, 1, MO_64); - } - - /* size == 2 means two 32x32->64 operations; this is worth special - * casing because we can generally handle it inline. - */ - if (size == 2) { - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_passres; - MemOp memop = MO_32 | (is_u ? 
0 : MO_SIGN); - - int elt = pass + is_q * 2; - - read_vec_element(s, tcg_op1, rn, elt, memop); - read_vec_element(s, tcg_op2, rm, elt, memop); - - if (accop == 0) { - tcg_passres = tcg_res[pass]; - } else { - tcg_passres = tcg_temp_new_i64(); - } - - switch (opcode) { - case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ - tcg_gen_add_i64(tcg_passres, tcg_op1, tcg_op2); - break; - case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ - tcg_gen_sub_i64(tcg_passres, tcg_op1, tcg_op2); - break; - case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ - case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ - { - TCGv_i64 tcg_tmp1 = tcg_temp_new_i64(); - TCGv_i64 tcg_tmp2 = tcg_temp_new_i64(); - - tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2); - tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1); - tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE, - tcg_passres, - tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2); - tcg_temp_free_i64(tcg_tmp1); - tcg_temp_free_i64(tcg_tmp2); - break; - } - case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - case 12: /* UMULL, UMULL2, SMULL, SMULL2 */ - tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2); - break; - case 9: /* SQDMLAL, SQDMLAL2 */ - case 11: /* SQDMLSL, SQDMLSL2 */ - case 13: /* SQDMULL, SQDMULL2 */ - tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2); - gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env, - tcg_passres, tcg_passres); - break; - default: - g_assert_not_reached(); - } - - if (opcode == 9 || opcode == 11) { - /* saturating accumulate ops */ - if (accop < 0) { - tcg_gen_neg_i64(tcg_passres, tcg_passres); - } - gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env, - tcg_res[pass], tcg_passres); - } else if (accop > 0) { - tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); - } else if (accop < 0) { - tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); - } - - if (accop != 0) { - tcg_temp_free_i64(tcg_passres); - } - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - } - } else { - /* size 0 or 1, generally helper functions */ - for (pass = 0; pass < 2; pass++) { - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i64 tcg_passres; - int elt = pass + is_q * 2; - - read_vec_element_i32(s, tcg_op1, rn, elt, MO_32); - read_vec_element_i32(s, tcg_op2, rm, elt, MO_32); - - if (accop == 0) { - tcg_passres = tcg_res[pass]; - } else { - tcg_passres = tcg_temp_new_i64(); - } - - switch (opcode) { - case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ - case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ - { - TCGv_i64 tcg_op2_64 = tcg_temp_new_i64(); - static NeonGenWidenFn * const widenfns[2][2] = { - { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 }, - { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 }, - }; - NeonGenWidenFn *widenfn = widenfns[size][is_u]; - - widenfn(tcg_op2_64, tcg_op2); - widenfn(tcg_passres, tcg_op1); - gen_neon_addl(size, (opcode == 2), tcg_passres, - tcg_passres, tcg_op2_64); - tcg_temp_free_i64(tcg_op2_64); - break; - } - case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ - case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ - if (size == 0) { - if (is_u) { - gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2); - } else { - gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2); - } - } else { - if (is_u) { - gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2); - } else { - gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2); - } - } - break; - case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - case 12: /* UMULL, UMULL2, SMULL, SMULL2 */ - if (size 
== 0) { - if (is_u) { - gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2); - } else { - gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2); - } - } else { - if (is_u) { - gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2); - } else { - gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2); - } - } - break; - case 9: /* SQDMLAL, SQDMLAL2 */ - case 11: /* SQDMLSL, SQDMLSL2 */ - case 13: /* SQDMULL, SQDMULL2 */ - assert(size == 1); - gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2); - gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, - tcg_passres, tcg_passres); - break; - default: - g_assert_not_reached(); - } - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - - if (accop != 0) { - if (opcode == 9 || opcode == 11) { - /* saturating accumulate ops */ - if (accop < 0) { - gen_helper_neon_negl_u32(tcg_passres, tcg_passres); - } - gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env, - tcg_res[pass], - tcg_passres); - } else { - gen_neon_addl(size, (accop < 0), tcg_res[pass], - tcg_res[pass], tcg_passres); - } - tcg_temp_free_i64(tcg_passres); - } - } - } - - write_vec_element(s, tcg_res[0], rd, 0, MO_64); - write_vec_element(s, tcg_res[1], rd, 1, MO_64); - tcg_temp_free_i64(tcg_res[0]); - tcg_temp_free_i64(tcg_res[1]); -} - -static void handle_3rd_wide(DisasContext *s, int is_q, int is_u, int size, - int opcode, int rd, int rn, int rm) -{ - TCGv_i64 tcg_res[2]; - int part = is_q ? 2 : 0; - int pass; - - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i64 tcg_op2_wide = tcg_temp_new_i64(); - static NeonGenWidenFn * const widenfns[3][2] = { - { gen_helper_neon_widen_s8, gen_helper_neon_widen_u8 }, - { gen_helper_neon_widen_s16, gen_helper_neon_widen_u16 }, - { tcg_gen_ext_i32_i64, tcg_gen_extu_i32_i64 }, - }; - NeonGenWidenFn *widenfn = widenfns[size][is_u]; - - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element_i32(s, tcg_op2, rm, part + pass, MO_32); - widenfn(tcg_op2_wide, tcg_op2); - tcg_temp_free_i32(tcg_op2); - tcg_res[pass] = tcg_temp_new_i64(); - gen_neon_addl(size, (opcode == 3), - tcg_res[pass], tcg_op1, tcg_op2_wide); - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2_wide); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - tcg_temp_free_i64(tcg_res[pass]); - } -} - -static void do_narrow_round_high_u32(TCGv_i32 res, TCGv_i64 in) -{ - tcg_gen_addi_i64(in, in, 1U << 31); - tcg_gen_extrh_i64_i32(res, in); -} - -static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, - int opcode, int rd, int rn, int rm) -{ - TCGv_i32 tcg_res[2]; - int part = is_q ? 
2 : 0; - int pass; - - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_wideres = tcg_temp_new_i64(); - static NeonGenNarrowFn * const narrowfns[3][2] = { - { gen_helper_neon_narrow_high_u8, - gen_helper_neon_narrow_round_high_u8 }, - { gen_helper_neon_narrow_high_u16, - gen_helper_neon_narrow_round_high_u16 }, - { tcg_gen_extrh_i64_i32, do_narrow_round_high_u32 }, - }; - NeonGenNarrowFn *gennarrow = narrowfns[size][is_u]; - - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element(s, tcg_op2, rm, pass, MO_64); - - gen_neon_addl(size, (opcode == 6), tcg_wideres, tcg_op1, tcg_op2); - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - - tcg_res[pass] = tcg_temp_new_i32(); - gennarrow(tcg_res[pass], tcg_wideres); - tcg_temp_free_i64(tcg_wideres); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, pass + part, MO_32); - tcg_temp_free_i32(tcg_res[pass]); - } - clear_vec_high(s, is_q, rd); -} - -/* AdvSIMD three different - * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+---+------+--------+-----+------+------+ - * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 0 0 | Rn | Rd | - * +---+---+---+-----------+------+---+------+--------+-----+------+------+ - */ -static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) -{ - /* Instructions in this group fall into three basic classes - * (in each case with the operation working on each element in - * the input vectors): - * (1) widening 64 x 64 -> 128 (with possibly Vd as an extra - * 128 bit input) - * (2) wide 64 x 128 -> 128 - * (3) narrowing 128 x 128 -> 64 - * Here we do initial decode, catch unallocated cases and - * dispatch to separate functions for each class. - */ - int is_q = extract32(insn, 30, 1); - int is_u = extract32(insn, 29, 1); - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 4); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - - switch (opcode) { - case 1: /* SADDW, SADDW2, UADDW, UADDW2 */ - case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */ - /* 64 x 128 -> 128 */ - if (size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_3rd_wide(s, is_q, is_u, size, opcode, rd, rn, rm); - break; - case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */ - case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */ - /* 128 x 128 -> 64 */ - if (size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm); - break; - case 14: /* PMULL, PMULL2 */ - if (is_u) { - unallocated_encoding(s); - return; - } - switch (size) { - case 0: /* PMULL.P8 */ - if (!fp_access_check(s)) { - return; - } - /* The Q field specifies lo/hi half input for this insn. */ - gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, - gen_helper_neon_pmull_h); - break; - - case 3: /* PMULL.P64 */ - if (!dc_isar_feature(aa64_pmull, s)) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - /* The Q field specifies lo/hi half input for this insn. 
*/ - gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, - gen_helper_gvec_pmull_q); - break; - - default: - unallocated_encoding(s); - break; - } - return; - case 9: /* SQDMLAL, SQDMLAL2 */ - case 11: /* SQDMLSL, SQDMLSL2 */ - case 13: /* SQDMULL, SQDMULL2 */ - if (is_u || size == 0) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0: /* SADDL, SADDL2, UADDL, UADDL2 */ - case 2: /* SSUBL, SSUBL2, USUBL, USUBL2 */ - case 5: /* SABAL, SABAL2, UABAL, UABAL2 */ - case 7: /* SABDL, SABDL2, UABDL, UABDL2 */ - case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - case 12: /* SMULL, SMULL2, UMULL, UMULL2 */ - /* 64 x 64 -> 128 */ - if (size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - - handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm); - break; - default: - /* opcode 15 not allocated */ - unallocated_encoding(s); - break; - } -} - -/* Logic op (opcode == 3) subgroup of C3.6.16. */ -static void disas_simd_3same_logic(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 22, 2); - bool is_u = extract32(insn, 29, 1); - bool is_q = extract32(insn, 30, 1); - - if (!fp_access_check(s)) { - return; - } - - switch (size + 4 * is_u) { - case 0: /* AND */ - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0); - return; - case 1: /* BIC */ - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0); - return; - case 2: /* ORR */ - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0); - return; - case 3: /* ORN */ - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0); - return; - case 4: /* EOR */ - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0); - return; - - case 5: /* BSL bitwise select */ - gen_gvec_fn4(s, is_q, rd, rd, rn, rm, tcg_gen_gvec_bitsel, 0); - return; - case 6: /* BIT, bitwise insert if true */ - gen_gvec_fn4(s, is_q, rd, rm, rn, rd, tcg_gen_gvec_bitsel, 0); - return; - case 7: /* BIF, bitwise insert if false */ - gen_gvec_fn4(s, is_q, rd, rm, rd, rn, tcg_gen_gvec_bitsel, 0); - return; - - default: - g_assert_not_reached(); - } -} - -/* Pairwise op subgroup of C3.6.16. - * - * This is called directly or via the handle_3same_float for float pairwise - * operations where the opcode and size are calculated differently. - */ -static void handle_simd_3same_pair(DisasContext *s, int is_q, int u, int opcode, - int size, int rn, int rm, int rd) -{ - TCGv_ptr fpst; - int pass; - - /* Floating point operations need fpst */ - if (opcode >= 0x58) { - fpst = fpstatus_ptr(FPST_FPCR); - } else { - fpst = NULL; - } - - if (!fp_access_check(s)) { - return; - } - - /* These operations work on the concatenated rm:rn, with each pair of - * adjacent elements being operated on to produce an element in the result. - */ - if (size == 3) { - TCGv_i64 tcg_res[2]; - - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - int passreg = (pass == 0) ? 
rn : rm; - - read_vec_element(s, tcg_op1, passreg, 0, MO_64); - read_vec_element(s, tcg_op2, passreg, 1, MO_64); - tcg_res[pass] = tcg_temp_new_i64(); - - switch (opcode) { - case 0x17: /* ADDP */ - tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 0x58: /* FMAXNMP */ - gen_helper_vfp_maxnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x5a: /* FADDP */ - gen_helper_vfp_addd(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x5e: /* FMAXP */ - gen_helper_vfp_maxd(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x78: /* FMINNMP */ - gen_helper_vfp_minnumd(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x7e: /* FMINP */ - gen_helper_vfp_mind(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - tcg_temp_free_i64(tcg_res[pass]); - } - } else { - int maxpass = is_q ? 4 : 2; - TCGv_i32 tcg_res[4]; - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - NeonGenTwoOpFn *genfn = NULL; - int passreg = pass < (maxpass / 2) ? rn : rm; - int passelt = (is_q && (pass & 1)) ? 2 : 0; - - read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_32); - read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_32); - tcg_res[pass] = tcg_temp_new_i32(); - - switch (opcode) { - case 0x17: /* ADDP */ - { - static NeonGenTwoOpFn * const fns[3] = { - gen_helper_neon_padd_u8, - gen_helper_neon_padd_u16, - tcg_gen_add_i32, - }; - genfn = fns[size]; - break; - } - case 0x14: /* SMAXP, UMAXP */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_pmax_s8, gen_helper_neon_pmax_u8 }, - { gen_helper_neon_pmax_s16, gen_helper_neon_pmax_u16 }, - { tcg_gen_smax_i32, tcg_gen_umax_i32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x15: /* SMINP, UMINP */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_pmin_s8, gen_helper_neon_pmin_u8 }, - { gen_helper_neon_pmin_s16, gen_helper_neon_pmin_u16 }, - { tcg_gen_smin_i32, tcg_gen_umin_i32 }, - }; - genfn = fns[size][u]; - break; - } - /* The FP operations are all on single floats (32 bit) */ - case 0x58: /* FMAXNMP */ - gen_helper_vfp_maxnums(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x5a: /* FADDP */ - gen_helper_vfp_adds(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x5e: /* FMAXP */ - gen_helper_vfp_maxs(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x78: /* FMINNMP */ - gen_helper_vfp_minnums(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x7e: /* FMINP */ - gen_helper_vfp_mins(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - /* FP ops called directly, otherwise call now */ - if (genfn) { - genfn(tcg_res[pass], tcg_op1, tcg_op2); - } - - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - } - - for (pass = 0; pass < maxpass; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); - tcg_temp_free_i32(tcg_res[pass]); - } - clear_vec_high(s, is_q, rd); - } - - if (fpst) { - tcg_temp_free_ptr(fpst); - } -} - -/* Floating point op subgroup of C3.6.16. */ -static void disas_simd_3same_float(DisasContext *s, uint32_t insn) -{ - /* For floating point ops, the U, size[1] and opcode bits - * together indicate the operation. size[0] indicates single - * or double. 
- */ - int fpopcode = extract32(insn, 11, 5) - | (extract32(insn, 23, 1) << 5) - | (extract32(insn, 29, 1) << 6); - int is_q = extract32(insn, 30, 1); - int size = extract32(insn, 22, 1); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - - int datasize = is_q ? 128 : 64; - int esize = 32 << size; - int elements = datasize / esize; - - if (size == 1 && !is_q) { - unallocated_encoding(s); - return; - } - - switch (fpopcode) { - case 0x58: /* FMAXNMP */ - case 0x5a: /* FADDP */ - case 0x5e: /* FMAXP */ - case 0x78: /* FMINNMP */ - case 0x7e: /* FMINP */ - if (size && !is_q) { - unallocated_encoding(s); - return; - } - handle_simd_3same_pair(s, is_q, 0, fpopcode, size ? MO_64 : MO_32, - rn, rm, rd); - return; - case 0x1b: /* FMULX */ - case 0x1f: /* FRECPS */ - case 0x3f: /* FRSQRTS */ - case 0x5d: /* FACGE */ - case 0x7d: /* FACGT */ - case 0x19: /* FMLA */ - case 0x39: /* FMLS */ - case 0x18: /* FMAXNM */ - case 0x1a: /* FADD */ - case 0x1c: /* FCMEQ */ - case 0x1e: /* FMAX */ - case 0x38: /* FMINNM */ - case 0x3a: /* FSUB */ - case 0x3e: /* FMIN */ - case 0x5b: /* FMUL */ - case 0x5c: /* FCMGE */ - case 0x5f: /* FDIV */ - case 0x7a: /* FABD */ - case 0x7c: /* FCMGT */ - if (!fp_access_check(s)) { - return; - } - handle_3same_float(s, size, elements, fpopcode, rd, rn, rm); - return; - - case 0x1d: /* FMLAL */ - case 0x3d: /* FMLSL */ - case 0x59: /* FMLAL2 */ - case 0x79: /* FMLSL2 */ - if (size & 1 || !dc_isar_feature(aa64_fhm, s)) { - unallocated_encoding(s); - return; - } - if (fp_access_check(s)) { - int is_s = extract32(insn, 23, 1); - int is_2 = extract32(insn, 29, 1); - int data = (is_2 << 1) | is_s; - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), cpu_env, - is_q ? 16 : 8, vec_full_reg_size(s), - data, gen_helper_gvec_fmlal_a64); - } - return; - - default: - unallocated_encoding(s); - return; - } -} - -/* Integer op subgroup of C3.6.16. 
*/ -static void disas_simd_3same_int(DisasContext *s, uint32_t insn) -{ - int is_q = extract32(insn, 30, 1); - int u = extract32(insn, 29, 1); - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 11, 5); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - int pass; - TCGCond cond; - - switch (opcode) { - case 0x13: /* MUL, PMUL */ - if (u && size != 0) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x0: /* SHADD, UHADD */ - case 0x2: /* SRHADD, URHADD */ - case 0x4: /* SHSUB, UHSUB */ - case 0xc: /* SMAX, UMAX */ - case 0xd: /* SMIN, UMIN */ - case 0xe: /* SABD, UABD */ - case 0xf: /* SABA, UABA */ - case 0x12: /* MLA, MLS */ - if (size == 3) { - unallocated_encoding(s); - return; - } - break; - case 0x16: /* SQDMULH, SQRDMULH */ - if (size == 0 || size == 3) { - unallocated_encoding(s); - return; - } - break; - default: - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - } - - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0x01: /* SQADD, UQADD */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqadd_qc, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqadd_qc, size); - } - return; - case 0x05: /* SQSUB, UQSUB */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uqsub_qc, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqsub_qc, size); - } - return; - case 0x08: /* SSHL, USHL */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_ushl, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sshl, size); - } - return; - case 0x0c: /* SMAX, UMAX */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umax, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smax, size); - } - return; - case 0x0d: /* SMIN, UMIN */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_umin, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_smin, size); - } - return; - case 0xe: /* SABD, UABD */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uabd, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sabd, size); - } - return; - case 0xf: /* SABA, UABA */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_uaba, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_saba, size); - } - return; - case 0x10: /* ADD, SUB */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size); - } - return; - case 0x13: /* MUL, PMUL */ - if (!u) { /* MUL */ - gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size); - } else { /* PMUL */ - gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0, gen_helper_gvec_pmul_b); - } - return; - case 0x12: /* MLA, MLS */ - if (u) { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mls, size); - } else { - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_mla, size); - } - return; - case 0x16: /* SQDMULH, SQRDMULH */ - { - static gen_helper_gvec_3_ptr * const fns[2][2] = { - { gen_helper_neon_sqdmulh_h, gen_helper_neon_sqrdmulh_h }, - { gen_helper_neon_sqdmulh_s, gen_helper_neon_sqrdmulh_s }, - }; - gen_gvec_op3_qc(s, is_q, rd, rn, rm, fns[size - 1][u]); - } - return; - case 0x11: - if (!u) { /* CMTST */ - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_cmtst, size); - return; - } - /* else CMEQ */ - cond = TCG_COND_EQ; - goto do_gvec_cmp; - case 0x06: /* CMGT, CMHI */ - cond = u ? 
TCG_COND_GTU : TCG_COND_GT; - goto do_gvec_cmp; - case 0x07: /* CMGE, CMHS */ - cond = u ? TCG_COND_GEU : TCG_COND_GE; - do_gvec_cmp: - tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - is_q ? 16 : 8, vec_full_reg_size(s)); - return; - } - - if (size == 3) { - assert(is_q); - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element(s, tcg_op2, rm, pass, MO_64); - - handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2); - - write_vec_element(s, tcg_res, rd, pass, MO_64); - - tcg_temp_free_i64(tcg_res); - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - } - } else { - for (pass = 0; pass < (is_q ? 4 : 2); pass++) { - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - NeonGenTwoOpFn *genfn = NULL; - NeonGenTwoOpEnvFn *genenvfn = NULL; - - read_vec_element_i32(s, tcg_op1, rn, pass, MO_32); - read_vec_element_i32(s, tcg_op2, rm, pass, MO_32); - - switch (opcode) { - case 0x0: /* SHADD, UHADD */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_hadd_s8, gen_helper_neon_hadd_u8 }, - { gen_helper_neon_hadd_s16, gen_helper_neon_hadd_u16 }, - { gen_helper_neon_hadd_s32, gen_helper_neon_hadd_u32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x2: /* SRHADD, URHADD */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_rhadd_s8, gen_helper_neon_rhadd_u8 }, - { gen_helper_neon_rhadd_s16, gen_helper_neon_rhadd_u16 }, - { gen_helper_neon_rhadd_s32, gen_helper_neon_rhadd_u32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x4: /* SHSUB, UHSUB */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_hsub_s8, gen_helper_neon_hsub_u8 }, - { gen_helper_neon_hsub_s16, gen_helper_neon_hsub_u16 }, - { gen_helper_neon_hsub_s32, gen_helper_neon_hsub_u32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x9: /* SQSHL, UQSHL */ - { - static NeonGenTwoOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qshl_s8, gen_helper_neon_qshl_u8 }, - { gen_helper_neon_qshl_s16, gen_helper_neon_qshl_u16 }, - { gen_helper_neon_qshl_s32, gen_helper_neon_qshl_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - case 0xa: /* SRSHL, URSHL */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_rshl_s8, gen_helper_neon_rshl_u8 }, - { gen_helper_neon_rshl_s16, gen_helper_neon_rshl_u16 }, - { gen_helper_neon_rshl_s32, gen_helper_neon_rshl_u32 }, - }; - genfn = fns[size][u]; - break; - } - case 0xb: /* SQRSHL, UQRSHL */ - { - static NeonGenTwoOpEnvFn * const fns[3][2] = { - { gen_helper_neon_qrshl_s8, gen_helper_neon_qrshl_u8 }, - { gen_helper_neon_qrshl_s16, gen_helper_neon_qrshl_u16 }, - { gen_helper_neon_qrshl_s32, gen_helper_neon_qrshl_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - default: - g_assert_not_reached(); - } - - if (genenvfn) { - genenvfn(tcg_res, cpu_env, tcg_op1, tcg_op2); - } else { - genfn(tcg_res, tcg_op1, tcg_op2); - } - - write_vec_element_i32(s, tcg_res, rd, pass, MO_32); - - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - } - } - clear_vec_high(s, is_q, rd); -} - -/* AdvSIMD three same - * 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0 - * +---+---+---+-----------+------+---+------+--------+---+------+------+ - * | 0 | Q | U | 0 1 1 1 0 | size | 1 | Rm | opcode | 1 
| Rn | Rd | - * +---+---+---+-----------+------+---+------+--------+---+------+------+ - */ -static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn) -{ - int opcode = extract32(insn, 11, 5); - - switch (opcode) { - case 0x3: /* logic ops */ - disas_simd_3same_logic(s, insn); - break; - case 0x17: /* ADDP */ - case 0x14: /* SMAXP, UMAXP */ - case 0x15: /* SMINP, UMINP */ - { - /* Pairwise operations */ - int is_q = extract32(insn, 30, 1); - int u = extract32(insn, 29, 1); - int size = extract32(insn, 22, 2); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - if (opcode == 0x17) { - if (u || (size == 3 && !is_q)) { - unallocated_encoding(s); - return; - } - } else { - if (size == 3) { - unallocated_encoding(s); - return; - } - } - handle_simd_3same_pair(s, is_q, u, opcode, size, rn, rm, rd); - break; - } - case 0x18 ... 0x31: - /* floating point ops, sz[1] and U are part of opcode */ - disas_simd_3same_float(s, insn); - break; - default: - disas_simd_3same_int(s, insn); - break; - } -} - -/* - * Advanced SIMD three same (ARMv8.2 FP16 variants) - * - * 31 30 29 28 24 23 22 21 20 16 15 14 13 11 10 9 5 4 0 - * +---+---+---+-----------+---------+------+-----+--------+---+------+------+ - * | 0 | Q | U | 0 1 1 1 0 | a | 1 0 | Rm | 0 0 | opcode | 1 | Rn | Rd | - * +---+---+---+-----------+---------+------+-----+--------+---+------+------+ - * - * This includes FMULX, FCMEQ (register), FRECPS, FRSQRTS, FCMGE - * (register), FACGE, FABD, FCMGT (register) and FACGT. - * - */ -static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) -{ - int opcode = extract32(insn, 11, 3); - int u = extract32(insn, 29, 1); - int a = extract32(insn, 23, 1); - int is_q = extract32(insn, 30, 1); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - /* - * For these floating point ops, the U, a and opcode bits - * together indicate the operation. - */ - int fpopcode = opcode | (a << 3) | (u << 4); - int datasize = is_q ? 128 : 64; - int elements = datasize / 16; - bool pairwise; - TCGv_ptr fpst; - int pass; - - switch (fpopcode) { - case 0x0: /* FMAXNM */ - case 0x1: /* FMLA */ - case 0x2: /* FADD */ - case 0x3: /* FMULX */ - case 0x4: /* FCMEQ */ - case 0x6: /* FMAX */ - case 0x7: /* FRECPS */ - case 0x8: /* FMINNM */ - case 0x9: /* FMLS */ - case 0xa: /* FSUB */ - case 0xe: /* FMIN */ - case 0xf: /* FRSQRTS */ - case 0x13: /* FMUL */ - case 0x14: /* FCMGE */ - case 0x15: /* FACGE */ - case 0x17: /* FDIV */ - case 0x1a: /* FABD */ - case 0x1c: /* FCMGT */ - case 0x1d: /* FACGT */ - pairwise = false; - break; - case 0x10: /* FMAXNMP */ - case 0x12: /* FADDP */ - case 0x16: /* FMAXP */ - case 0x18: /* FMINNMP */ - case 0x1e: /* FMINP */ - pairwise = true; - break; - default: - unallocated_encoding(s); - return; - } - - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - fpst = fpstatus_ptr(FPST_FPCR_F16); - - if (pairwise) { - int maxpass = is_q ? 8 : 4; - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i32 tcg_res[8]; - - for (pass = 0; pass < maxpass; pass++) { - int passreg = pass < (maxpass / 2) ? 
rn : rm; - int passelt = (pass << 1) & (maxpass - 1); - - read_vec_element_i32(s, tcg_op1, passreg, passelt, MO_16); - read_vec_element_i32(s, tcg_op2, passreg, passelt + 1, MO_16); - tcg_res[pass] = tcg_temp_new_i32(); - - switch (fpopcode) { - case 0x10: /* FMAXNMP */ - gen_helper_advsimd_maxnumh(tcg_res[pass], tcg_op1, tcg_op2, - fpst); - break; - case 0x12: /* FADDP */ - gen_helper_advsimd_addh(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x16: /* FMAXP */ - gen_helper_advsimd_maxh(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - case 0x18: /* FMINNMP */ - gen_helper_advsimd_minnumh(tcg_res[pass], tcg_op1, tcg_op2, - fpst); - break; - case 0x1e: /* FMINP */ - gen_helper_advsimd_minh(tcg_res[pass], tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - } - - for (pass = 0; pass < maxpass; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_16); - tcg_temp_free_i32(tcg_res[pass]); - } - - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - - } else { - for (pass = 0; pass < elements; pass++) { - TCGv_i32 tcg_op1 = tcg_temp_new_i32(); - TCGv_i32 tcg_op2 = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op1, rn, pass, MO_16); - read_vec_element_i32(s, tcg_op2, rm, pass, MO_16); - - switch (fpopcode) { - case 0x0: /* FMAXNM */ - gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1: /* FMLA */ - read_vec_element_i32(s, tcg_res, rd, pass, MO_16); - gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res, - fpst); - break; - case 0x2: /* FADD */ - gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x3: /* FMULX */ - gen_helper_advsimd_mulxh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x4: /* FCMEQ */ - gen_helper_advsimd_ceq_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x6: /* FMAX */ - gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x7: /* FRECPS */ - gen_helper_recpsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x8: /* FMINNM */ - gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x9: /* FMLS */ - /* As usual for ARM, separate negation for fused multiply-add */ - tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); - read_vec_element_i32(s, tcg_res, rd, pass, MO_16); - gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_res, - fpst); - break; - case 0xa: /* FSUB */ - gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xe: /* FMIN */ - gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0xf: /* FRSQRTS */ - gen_helper_rsqrtsf_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x13: /* FMUL */ - gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x14: /* FCMGE */ - gen_helper_advsimd_cge_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x15: /* FACGE */ - gen_helper_advsimd_acge_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x17: /* FDIV */ - gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1a: /* FABD */ - gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst); - tcg_gen_andi_i32(tcg_res, tcg_res, 0x7fff); - break; - case 0x1c: /* FCMGT */ - gen_helper_advsimd_cgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - case 0x1d: /* FACGT */ - gen_helper_advsimd_acgt_f16(tcg_res, tcg_op1, tcg_op2, fpst); - break; - default: - g_assert_not_reached(); - } - - write_vec_element_i32(s, tcg_res, rd, pass, MO_16); - tcg_temp_free_i32(tcg_res); - 
tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - } - } - - tcg_temp_free_ptr(fpst); - - clear_vec_high(s, is_q, rd); -} - -/* AdvSIMD three same extra - * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 - * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ - * | 0 | Q | U | 0 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | - * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ - */ -static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn) -{ - int rd = extract32(insn, 0, 5); - int rn = extract32(insn, 5, 5); - int opcode = extract32(insn, 11, 4); - int rm = extract32(insn, 16, 5); - int size = extract32(insn, 22, 2); - bool u = extract32(insn, 29, 1); - bool is_q = extract32(insn, 30, 1); - bool feature; - int rot; - - switch (u * 16 + opcode) { - case 0x10: /* SQRDMLAH (vector) */ - case 0x11: /* SQRDMLSH (vector) */ - if (size != 1 && size != 2) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_rdm, s); - break; - case 0x02: /* SDOT (vector) */ - case 0x12: /* UDOT (vector) */ - if (size != MO_32) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_dp, s); - break; - case 0x03: /* USDOT */ - if (size != MO_32) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_i8mm, s); - break; - case 0x04: /* SMMLA */ - case 0x14: /* UMMLA */ - case 0x05: /* USMMLA */ - if (!is_q || size != MO_32) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_i8mm, s); - break; - case 0x18: /* FCMLA, #0 */ - case 0x19: /* FCMLA, #90 */ - case 0x1a: /* FCMLA, #180 */ - case 0x1b: /* FCMLA, #270 */ - case 0x1c: /* FCADD, #90 */ - case 0x1e: /* FCADD, #270 */ - if (size == 0 - || (size == 1 && !dc_isar_feature(aa64_fp16, s)) - || (size == 3 && !is_q)) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_fcma, s); - break; - case 0x1d: /* BFMMLA */ - if (size != MO_16 || !is_q) { - unallocated_encoding(s); - return; - } - feature = dc_isar_feature(aa64_bf16, s); - break; - case 0x1f: - switch (size) { - case 1: /* BFDOT */ - case 3: /* BFMLAL{B,T} */ - feature = dc_isar_feature(aa64_bf16, s); - break; - default: - unallocated_encoding(s); - return; - } - break; - default: - unallocated_encoding(s); - return; - } - if (!feature) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0x0: /* SQRDMLAH (vector) */ - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlah_qc, size); - return; - - case 0x1: /* SQRDMLSH (vector) */ - gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlsh_qc, size); - return; - - case 0x2: /* SDOT / UDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, - u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b); - return; - - case 0x3: /* USDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_usdot_b); - return; - - case 0x04: /* SMMLA, UMMLA */ - gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, - u ? 
gen_helper_gvec_ummla_b - : gen_helper_gvec_smmla_b); - return; - case 0x05: /* USMMLA */ - gen_gvec_op4_ool(s, 1, rd, rn, rm, rd, 0, gen_helper_gvec_usmmla_b); - return; - - case 0x8: /* FCMLA, #0 */ - case 0x9: /* FCMLA, #90 */ - case 0xa: /* FCMLA, #180 */ - case 0xb: /* FCMLA, #270 */ - rot = extract32(opcode, 0, 2); - switch (size) { - case 1: - gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, true, rot, - gen_helper_gvec_fcmlah); - break; - case 2: - gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot, - gen_helper_gvec_fcmlas); - break; - case 3: - gen_gvec_op4_fpst(s, is_q, rd, rn, rm, rd, false, rot, - gen_helper_gvec_fcmlad); - break; - default: - g_assert_not_reached(); - } - return; - - case 0xc: /* FCADD, #90 */ - case 0xe: /* FCADD, #270 */ - rot = extract32(opcode, 1, 1); - switch (size) { - case 1: - gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, - gen_helper_gvec_fcaddh); - break; - case 2: - gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, - gen_helper_gvec_fcadds); - break; - case 3: - gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, - gen_helper_gvec_fcaddd); - break; - default: - g_assert_not_reached(); - } - return; - - case 0xd: /* BFMMLA */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfmmla); - return; - case 0xf: - switch (size) { - case 1: /* BFDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot); - break; - case 3: /* BFMLAL{B,T} */ - gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, false, is_q, - gen_helper_gvec_bfmlal); - break; - default: - g_assert_not_reached(); - } - return; - - default: - g_assert_not_reached(); - } -} - -static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q, - int size, int rn, int rd) -{ - /* Handle 2-reg-misc ops which are widening (so each size element - * in the source becomes a 2*size element in the destination. - * The only instruction like this is FCVTL. - */ - int pass; - - if (size == 3) { - /* 32 -> 64 bit fp conversion */ - TCGv_i64 tcg_res[2]; - int srcelt = is_q ? 2 : 0; - - for (pass = 0; pass < 2; pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - tcg_res[pass] = tcg_temp_new_i64(); - - read_vec_element_i32(s, tcg_op, rn, srcelt + pass, MO_32); - gen_helper_vfp_fcvtds(tcg_res[pass], tcg_op, cpu_env); - tcg_temp_free_i32(tcg_op); - } - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - tcg_temp_free_i64(tcg_res[pass]); - } - } else { - /* 16 -> 32 bit fp conversion */ - int srcelt = is_q ? 4 : 0; - TCGv_i32 tcg_res[4]; - TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR); - TCGv_i32 ahp = get_ahp_flag(); - - for (pass = 0; pass < 4; pass++) { - tcg_res[pass] = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_res[pass], rn, srcelt + pass, MO_16); - gen_helper_vfp_fcvt_f16_to_f32(tcg_res[pass], tcg_res[pass], - fpst, ahp); - } - for (pass = 0; pass < 4; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); - tcg_temp_free_i32(tcg_res[pass]); - } - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(ahp); - } -} - -static void handle_rev(DisasContext *s, int opcode, bool u, - bool is_q, int size, int rn, int rd) -{ - int op = (opcode << 1) | u; - int opsz = op + size; - int grp_size = 3 - opsz; - int dsize = is_q ? 
128 : 64; - int i; - - if (opsz >= 3) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (size == 0) { - /* Special case bytes, use bswap op on each group of elements */ - int groups = dsize / (8 << grp_size); - - for (i = 0; i < groups; i++) { - TCGv_i64 tcg_tmp = tcg_temp_new_i64(); - - read_vec_element(s, tcg_tmp, rn, i, grp_size); - switch (grp_size) { - case MO_16: - tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ); - break; - case MO_32: - tcg_gen_bswap32_i64(tcg_tmp, tcg_tmp, TCG_BSWAP_IZ); - break; - case MO_64: - tcg_gen_bswap64_i64(tcg_tmp, tcg_tmp); - break; - default: - g_assert_not_reached(); - } - write_vec_element(s, tcg_tmp, rd, i, grp_size); - tcg_temp_free_i64(tcg_tmp); - } - clear_vec_high(s, is_q, rd); - } else { - int revmask = (1 << grp_size) - 1; - int esize = 8 << size; - int elements = dsize / esize; - TCGv_i64 tcg_rn = tcg_temp_new_i64(); - TCGv_i64 tcg_rd = tcg_const_i64(0); - TCGv_i64 tcg_rd_hi = tcg_const_i64(0); - - for (i = 0; i < elements; i++) { - int e_rev = (i & 0xf) ^ revmask; - int off = e_rev * esize; - read_vec_element(s, tcg_rn, rn, i, size); - if (off >= 64) { - tcg_gen_deposit_i64(tcg_rd_hi, tcg_rd_hi, - tcg_rn, off - 64, esize); - } else { - tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, off, esize); - } - } - write_vec_element(s, tcg_rd, rd, 0, MO_64); - write_vec_element(s, tcg_rd_hi, rd, 1, MO_64); - - tcg_temp_free_i64(tcg_rd_hi); - tcg_temp_free_i64(tcg_rd); - tcg_temp_free_i64(tcg_rn); - } -} - -static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u, - bool is_q, int size, int rn, int rd) -{ - /* Implement the pairwise operations from 2-misc: - * SADDLP, UADDLP, SADALP, UADALP. - * These all add pairs of elements in the input to produce a - * double-width result element in the output (possibly accumulating). - */ - bool accum = (opcode == 0x6); - int maxpass = is_q ? 2 : 1; - int pass; - TCGv_i64 tcg_res[2]; - - if (size == 2) { - /* 32 + 32 -> 64 op */ - MemOp memop = size + (u ? 
0 : MO_SIGN); - - for (pass = 0; pass < maxpass; pass++) { - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - - tcg_res[pass] = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op1, rn, pass * 2, memop); - read_vec_element(s, tcg_op2, rn, pass * 2 + 1, memop); - tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); - if (accum) { - read_vec_element(s, tcg_op1, rd, pass, MO_64); - tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_op1); - } - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - } - } else { - for (pass = 0; pass < maxpass; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - NeonGenOne64OpFn *genfn; - static NeonGenOne64OpFn * const fns[2][2] = { - { gen_helper_neon_addlp_s8, gen_helper_neon_addlp_u8 }, - { gen_helper_neon_addlp_s16, gen_helper_neon_addlp_u16 }, - }; - - genfn = fns[size][u]; - - tcg_res[pass] = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op, rn, pass, MO_64); - genfn(tcg_res[pass], tcg_op); - - if (accum) { - read_vec_element(s, tcg_op, rd, pass, MO_64); - if (size == 0) { - gen_helper_neon_addl_u16(tcg_res[pass], - tcg_res[pass], tcg_op); - } else { - gen_helper_neon_addl_u32(tcg_res[pass], - tcg_res[pass], tcg_op); - } - } - tcg_temp_free_i64(tcg_op); - } - } - if (!is_q) { - tcg_res[1] = tcg_constant_i64(0); - } - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - tcg_temp_free_i64(tcg_res[pass]); - } -} - -static void handle_shll(DisasContext *s, bool is_q, int size, int rn, int rd) -{ - /* Implement SHLL and SHLL2 */ - int pass; - int part = is_q ? 2 : 0; - TCGv_i64 tcg_res[2]; - - for (pass = 0; pass < 2; pass++) { - static NeonGenWidenFn * const widenfns[3] = { - gen_helper_neon_widen_u8, - gen_helper_neon_widen_u16, - tcg_gen_extu_i32_i64, - }; - NeonGenWidenFn *widenfn = widenfns[size]; - TCGv_i32 tcg_op = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, part + pass, MO_32); - tcg_res[pass] = tcg_temp_new_i64(); - widenfn(tcg_res[pass], tcg_op); - tcg_gen_shli_i64(tcg_res[pass], tcg_res[pass], 8 << size); - - tcg_temp_free_i32(tcg_op); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - tcg_temp_free_i64(tcg_res[pass]); - } -} - -/* AdvSIMD two reg misc - * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+-----------+--------+-----+------+------+ - * | 0 | Q | U | 0 1 1 1 0 | size | 1 0 0 0 0 | opcode | 1 0 | Rn | Rd | - * +---+---+---+-----------+------+-----------+--------+-----+------+------+ - */ -static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) -{ - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 5); - bool u = extract32(insn, 29, 1); - bool is_q = extract32(insn, 30, 1); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - bool need_fpstatus = false; - bool need_rmode = false; - int rmode = -1; - TCGv_i32 tcg_rmode; - TCGv_ptr tcg_fpstatus; - - switch (opcode) { - case 0x0: /* REV64, REV32 */ - case 0x1: /* REV16 */ - handle_rev(s, opcode, u, is_q, size, rn, rd); - return; - case 0x5: /* CNT, NOT, RBIT */ - if (u && size == 0) { - /* NOT */ - break; - } else if (u && size == 1) { - /* RBIT */ - break; - } else if (!u && size == 0) { - /* CNT */ - break; - } - unallocated_encoding(s); - return; - case 0x12: /* XTN, XTN2, SQXTUN, SQXTUN2 */ - case 0x14: /* SQXTN, SQXTN2, UQXTN, UQXTN2 */ - if (size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - - handle_2misc_narrow(s, 
false, opcode, u, is_q, size, rn, rd); - return; - case 0x4: /* CLS, CLZ */ - if (size == 3) { - unallocated_encoding(s); - return; - } - break; - case 0x2: /* SADDLP, UADDLP */ - case 0x6: /* SADALP, UADALP */ - if (size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_2misc_pairwise(s, opcode, u, is_q, size, rn, rd); - return; - case 0x13: /* SHLL, SHLL2 */ - if (u == 0 || size == 3) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_shll(s, is_q, size, rn, rd); - return; - case 0xa: /* CMLT */ - if (u == 1) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x8: /* CMGT, CMGE */ - case 0x9: /* CMEQ, CMLE */ - case 0xb: /* ABS, NEG */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x3: /* SUQADD, USQADD */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_2misc_satacc(s, false, u, is_q, size, rn, rd); - return; - case 0x7: /* SQABS, SQNEG */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0xc ... 0xf: - case 0x16 ... 0x1f: - { - /* Floating point: U, size[1] and opcode indicate operation; - * size[0] indicates single or double precision. - */ - int is_double = extract32(size, 0, 1); - opcode |= (extract32(size, 1, 1) << 5) | (u << 6); - size = is_double ? 3 : 2; - switch (opcode) { - case 0x2f: /* FABS */ - case 0x6f: /* FNEG */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x1d: /* SCVTF */ - case 0x5d: /* UCVTF */ - { - bool is_signed = (opcode == 0x1d) ? true : false; - int elements = is_double ? 2 : is_q ? 4 : 2; - if (is_double && !is_q) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_simd_intfp_conv(s, rd, rn, elements, is_signed, 0, size); - return; - } - case 0x2c: /* FCMGT (zero) */ - case 0x2d: /* FCMEQ (zero) */ - case 0x2e: /* FCMLT (zero) */ - case 0x6c: /* FCMGE (zero) */ - case 0x6d: /* FCMLE (zero) */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd); - return; - case 0x7f: /* FSQRT */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - need_fpstatus = true; - need_rmode = true; - rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x5c: /* FCVTAU */ - case 0x1c: /* FCVTAS */ - need_fpstatus = true; - need_rmode = true; - rmode = FPROUNDING_TIEAWAY; - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x3c: /* URECPE */ - if (size == 3) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x3d: /* FRECPE */ - case 0x7d: /* FRSQRTE */ - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd); - return; - case 0x56: /* FCVTXN, FCVTXN2 */ - if (size == 2) { - unallocated_encoding(s); - return; - } - /* fall through */ - case 0x16: /* FCVTN, FCVTN2 */ - /* handle_2misc_narrow does a 2*size -> size operation, but these - * instructions encode the source 
size rather than dest size. - */ - if (!fp_access_check(s)) { - return; - } - handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); - return; - case 0x36: /* BFCVTN, BFCVTN2 */ - if (!dc_isar_feature(aa64_bf16, s) || size != 2) { - unallocated_encoding(s); - return; - } - if (!fp_access_check(s)) { - return; - } - handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); - return; - case 0x17: /* FCVTL, FCVTL2 */ - if (!fp_access_check(s)) { - return; - } - handle_2misc_widening(s, opcode, is_q, size, rn, rd); - return; - case 0x18: /* FRINTN */ - case 0x19: /* FRINTM */ - case 0x38: /* FRINTP */ - case 0x39: /* FRINTZ */ - need_rmode = true; - rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); - /* fall through */ - case 0x59: /* FRINTX */ - case 0x79: /* FRINTI */ - need_fpstatus = true; - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x58: /* FRINTA */ - need_rmode = true; - rmode = FPROUNDING_TIEAWAY; - need_fpstatus = true; - if (size == 3 && !is_q) { - unallocated_encoding(s); - return; - } - break; - case 0x7c: /* URSQRTE */ - if (size == 3) { - unallocated_encoding(s); - return; - } - break; - case 0x1e: /* FRINT32Z */ - case 0x1f: /* FRINT64Z */ - need_rmode = true; - rmode = FPROUNDING_ZERO; - /* fall through */ - case 0x5e: /* FRINT32X */ - case 0x5f: /* FRINT64X */ - need_fpstatus = true; - if ((size == 3 && !is_q) || !dc_isar_feature(aa64_frint, s)) { - unallocated_encoding(s); - return; - } - break; - default: - unallocated_encoding(s); - return; - } - break; - } - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (need_fpstatus || need_rmode) { - tcg_fpstatus = fpstatus_ptr(FPST_FPCR); - } else { - tcg_fpstatus = NULL; - } - if (need_rmode) { - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - } else { - tcg_rmode = NULL; - } - - switch (opcode) { - case 0x5: - if (u && size == 0) { /* NOT */ - gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0); - return; - } - break; - case 0x8: /* CMGT, CMGE */ - if (u) { - gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cge0, size); - } else { - gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cgt0, size); - } - return; - case 0x9: /* CMEQ, CMLE */ - if (u) { - gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_cle0, size); - } else { - gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_ceq0, size); - } - return; - case 0xa: /* CMLT */ - gen_gvec_fn2(s, is_q, rd, rn, gen_gvec_clt0, size); - return; - case 0xb: - if (u) { /* ABS, NEG */ - gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size); - } else { - gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_abs, size); - } - return; - } - - if (size == 3) { - /* All 64-bit element operations can be shared with scalar 2misc */ - int pass; - - /* Coverity claims (size == 3 && !is_q) has been eliminated - * from all paths leading to here. - */ - tcg_debug_assert(is_q); - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op, rn, pass, MO_64); - - handle_2misc_64(s, opcode, u, tcg_res, tcg_op, - tcg_rmode, tcg_fpstatus); - - write_vec_element(s, tcg_res, rd, pass, MO_64); - - tcg_temp_free_i64(tcg_res); - tcg_temp_free_i64(tcg_op); - } - } else { - int pass; - - for (pass = 0; pass < (is_q ? 
4 : 2); pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, pass, MO_32); - - if (size == 2) { - /* Special cases for 32 bit elements */ - switch (opcode) { - case 0x4: /* CLS */ - if (u) { - tcg_gen_clzi_i32(tcg_res, tcg_op, 32); - } else { - tcg_gen_clrsb_i32(tcg_res, tcg_op); - } - break; - case 0x7: /* SQABS, SQNEG */ - if (u) { - gen_helper_neon_qneg_s32(tcg_res, cpu_env, tcg_op); - } else { - gen_helper_neon_qabs_s32(tcg_res, cpu_env, tcg_op); - } - break; - case 0x2f: /* FABS */ - gen_helper_vfp_abss(tcg_res, tcg_op); - break; - case 0x6f: /* FNEG */ - gen_helper_vfp_negs(tcg_res, tcg_op); - break; - case 0x7f: /* FSQRT */ - gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); - break; - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - gen_helper_vfp_tosls(tcg_res, tcg_op, - tcg_constant_i32(0), tcg_fpstatus); - break; - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - gen_helper_vfp_touls(tcg_res, tcg_op, - tcg_constant_i32(0), tcg_fpstatus); - break; - case 0x18: /* FRINTN */ - case 0x19: /* FRINTM */ - case 0x38: /* FRINTP */ - case 0x39: /* FRINTZ */ - case 0x58: /* FRINTA */ - case 0x79: /* FRINTI */ - gen_helper_rints(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x59: /* FRINTX */ - gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x7c: /* URSQRTE */ - gen_helper_rsqrte_u32(tcg_res, tcg_op); - break; - case 0x1e: /* FRINT32Z */ - case 0x5e: /* FRINT32X */ - gen_helper_frint32_s(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x1f: /* FRINT64Z */ - case 0x5f: /* FRINT64X */ - gen_helper_frint64_s(tcg_res, tcg_op, tcg_fpstatus); - break; - default: - g_assert_not_reached(); - } - } else { - /* Use helpers for 8 and 16 bit elements */ - switch (opcode) { - case 0x5: /* CNT, RBIT */ - /* For these two insns size is part of the opcode specifier - * (handled earlier); they always operate on byte elements. 
- */ - if (u) { - gen_helper_neon_rbit_u8(tcg_res, tcg_op); - } else { - gen_helper_neon_cnt_u8(tcg_res, tcg_op); - } - break; - case 0x7: /* SQABS, SQNEG */ - { - NeonGenOneOpEnvFn *genfn; - static NeonGenOneOpEnvFn * const fns[2][2] = { - { gen_helper_neon_qabs_s8, gen_helper_neon_qneg_s8 }, - { gen_helper_neon_qabs_s16, gen_helper_neon_qneg_s16 }, - }; - genfn = fns[size][u]; - genfn(tcg_res, cpu_env, tcg_op); - break; - } - case 0x4: /* CLS, CLZ */ - if (u) { - if (size == 0) { - gen_helper_neon_clz_u8(tcg_res, tcg_op); - } else { - gen_helper_neon_clz_u16(tcg_res, tcg_op); - } - } else { - if (size == 0) { - gen_helper_neon_cls_s8(tcg_res, tcg_op); - } else { - gen_helper_neon_cls_s16(tcg_res, tcg_op); - } - } - break; - default: - g_assert_not_reached(); - } - } - - write_vec_element_i32(s, tcg_res, rd, pass, MO_32); - - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op); - } - } - clear_vec_high(s, is_q, rd); - - if (need_rmode) { - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - tcg_temp_free_i32(tcg_rmode); - } - if (need_fpstatus) { - tcg_temp_free_ptr(tcg_fpstatus); - } -} - -/* AdvSIMD [scalar] two register miscellaneous (FP16) - * - * 31 30 29 28 27 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +---+---+---+---+---------+---+-------------+--------+-----+------+------+ - * | 0 | Q | U | S | 1 1 1 0 | a | 1 1 1 1 0 0 | opcode | 1 0 | Rn | Rd | - * +---+---+---+---+---------+---+-------------+--------+-----+------+------+ - * mask: 1000 1111 0111 1110 0000 1100 0000 0000 0x8f7e 0c00 - * val: 0000 1110 0111 1000 0000 1000 0000 0000 0x0e78 0800 - * - * This actually covers two groups where scalar access is governed by - * bit 28. A bunch of the instructions (float to integral) only exist - * in the vector form and are un-allocated for the scalar decode. Also - * in the scalar decode Q is always 1. - */ -static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn) -{ - int fpop, opcode, a, u; - int rn, rd; - bool is_q; - bool is_scalar; - bool only_in_vector = false; - - int pass; - TCGv_i32 tcg_rmode = NULL; - TCGv_ptr tcg_fpstatus = NULL; - bool need_rmode = false; - bool need_fpst = true; - int rmode; - - if (!dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - - rd = extract32(insn, 0, 5); - rn = extract32(insn, 5, 5); - - a = extract32(insn, 23, 1); - u = extract32(insn, 29, 1); - is_scalar = extract32(insn, 28, 1); - is_q = extract32(insn, 30, 1); - - opcode = extract32(insn, 12, 5); - fpop = deposit32(opcode, 5, 1, a); - fpop = deposit32(fpop, 6, 1, u); - - switch (fpop) { - case 0x1d: /* SCVTF */ - case 0x5d: /* UCVTF */ - { - int elements; - - if (is_scalar) { - elements = 1; - } else { - elements = (is_q ? 
8 : 4); - } - - if (!fp_access_check(s)) { - return; - } - handle_simd_intfp_conv(s, rd, rn, elements, !u, 0, MO_16); - return; - } - break; - case 0x2c: /* FCMGT (zero) */ - case 0x2d: /* FCMEQ (zero) */ - case 0x2e: /* FCMLT (zero) */ - case 0x6c: /* FCMGE (zero) */ - case 0x6d: /* FCMLE (zero) */ - handle_2misc_fcmp_zero(s, fpop, is_scalar, 0, is_q, MO_16, rn, rd); - return; - case 0x3d: /* FRECPE */ - case 0x3f: /* FRECPX */ - break; - case 0x18: /* FRINTN */ - need_rmode = true; - only_in_vector = true; - rmode = FPROUNDING_TIEEVEN; - break; - case 0x19: /* FRINTM */ - need_rmode = true; - only_in_vector = true; - rmode = FPROUNDING_NEGINF; - break; - case 0x38: /* FRINTP */ - need_rmode = true; - only_in_vector = true; - rmode = FPROUNDING_POSINF; - break; - case 0x39: /* FRINTZ */ - need_rmode = true; - only_in_vector = true; - rmode = FPROUNDING_ZERO; - break; - case 0x58: /* FRINTA */ - need_rmode = true; - only_in_vector = true; - rmode = FPROUNDING_TIEAWAY; - break; - case 0x59: /* FRINTX */ - case 0x79: /* FRINTI */ - only_in_vector = true; - /* current rounding mode */ - break; - case 0x1a: /* FCVTNS */ - need_rmode = true; - rmode = FPROUNDING_TIEEVEN; - break; - case 0x1b: /* FCVTMS */ - need_rmode = true; - rmode = FPROUNDING_NEGINF; - break; - case 0x1c: /* FCVTAS */ - need_rmode = true; - rmode = FPROUNDING_TIEAWAY; - break; - case 0x3a: /* FCVTPS */ - need_rmode = true; - rmode = FPROUNDING_POSINF; - break; - case 0x3b: /* FCVTZS */ - need_rmode = true; - rmode = FPROUNDING_ZERO; - break; - case 0x5a: /* FCVTNU */ - need_rmode = true; - rmode = FPROUNDING_TIEEVEN; - break; - case 0x5b: /* FCVTMU */ - need_rmode = true; - rmode = FPROUNDING_NEGINF; - break; - case 0x5c: /* FCVTAU */ - need_rmode = true; - rmode = FPROUNDING_TIEAWAY; - break; - case 0x7a: /* FCVTPU */ - need_rmode = true; - rmode = FPROUNDING_POSINF; - break; - case 0x7b: /* FCVTZU */ - need_rmode = true; - rmode = FPROUNDING_ZERO; - break; - case 0x2f: /* FABS */ - case 0x6f: /* FNEG */ - need_fpst = false; - break; - case 0x7d: /* FRSQRTE */ - case 0x7f: /* FSQRT (vector) */ - break; - default: - unallocated_encoding(s); - return; - } - - - /* Check additional constraints for the scalar encoding */ - if (is_scalar) { - if (!is_q) { - unallocated_encoding(s); - return; - } - /* FRINTxx is only in the vector form */ - if (only_in_vector) { - unallocated_encoding(s); - return; - } - } - - if (!fp_access_check(s)) { - return; - } - - if (need_rmode || need_fpst) { - tcg_fpstatus = fpstatus_ptr(FPST_FPCR_F16); - } - - if (need_rmode) { - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - } - - if (is_scalar) { - TCGv_i32 tcg_op = read_fp_hreg(s, rn); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - switch (fpop) { - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x3d: /* FRECPE */ - gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x3f: /* FRECPX */ - gen_helper_frecpx_f16(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x6f: /* FNEG */ - tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); - break; - case 0x7d: /* FRSQRTE */ - gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus); - 
break; - default: - g_assert_not_reached(); - } - - /* limit any sign extension going on */ - tcg_gen_andi_i32(tcg_res, tcg_res, 0xffff); - write_fp_sreg(s, rd, tcg_res); - - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op); - } else { - for (pass = 0; pass < (is_q ? 8 : 4); pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, pass, MO_16); - - switch (fpop) { - case 0x1a: /* FCVTNS */ - case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x3a: /* FCVTPS */ - case 0x3b: /* FCVTZS */ - gen_helper_advsimd_f16tosinth(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x3d: /* FRECPE */ - gen_helper_recpe_f16(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x5a: /* FCVTNU */ - case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x7a: /* FCVTPU */ - case 0x7b: /* FCVTZU */ - gen_helper_advsimd_f16touinth(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x18: /* FRINTN */ - case 0x19: /* FRINTM */ - case 0x38: /* FRINTP */ - case 0x39: /* FRINTZ */ - case 0x58: /* FRINTA */ - case 0x79: /* FRINTI */ - gen_helper_advsimd_rinth(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x59: /* FRINTX */ - gen_helper_advsimd_rinth_exact(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x2f: /* FABS */ - tcg_gen_andi_i32(tcg_res, tcg_op, 0x7fff); - break; - case 0x6f: /* FNEG */ - tcg_gen_xori_i32(tcg_res, tcg_op, 0x8000); - break; - case 0x7d: /* FRSQRTE */ - gen_helper_rsqrte_f16(tcg_res, tcg_op, tcg_fpstatus); - break; - case 0x7f: /* FSQRT */ - gen_helper_sqrt_f16(tcg_res, tcg_op, tcg_fpstatus); - break; - default: - g_assert_not_reached(); - } - - write_vec_element_i32(s, tcg_res, rd, pass, MO_16); - - tcg_temp_free_i32(tcg_res); - tcg_temp_free_i32(tcg_op); - } - - clear_vec_high(s, is_q, rd); - } - - if (tcg_rmode) { - gen_helper_set_rmode(tcg_rmode, tcg_rmode, tcg_fpstatus); - tcg_temp_free_i32(tcg_rmode); - } - - if (tcg_fpstatus) { - tcg_temp_free_ptr(tcg_fpstatus); - } -} - -/* AdvSIMD scalar x indexed element - * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0 - * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+ - * | 0 1 | U | 1 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd | - * +-----+---+-----------+------+---+---+------+-----+---+---+------+------+ - * AdvSIMD vector x indexed element - * 31 30 29 28 24 23 22 21 20 19 16 15 12 11 10 9 5 4 0 - * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+ - * | 0 | Q | U | 0 1 1 1 1 | size | L | M | Rm | opc | H | 0 | Rn | Rd | - * +---+---+---+-----------+------+---+---+------+-----+---+---+------+------+ - */ -static void disas_simd_indexed(DisasContext *s, uint32_t insn) -{ - /* This encoding has two kinds of instruction: - * normal, where we perform elt x idxelt => elt for each - * element in the vector - * long, where we perform elt x idxelt and generate a result of - * double the width of the input element - * The long ops have a 'part' specifier (ie come in INSN, INSN2 pairs). 
- */ - bool is_scalar = extract32(insn, 28, 1); - bool is_q = extract32(insn, 30, 1); - bool u = extract32(insn, 29, 1); - int size = extract32(insn, 22, 2); - int l = extract32(insn, 21, 1); - int m = extract32(insn, 20, 1); - /* Note that the Rm field here is only 4 bits, not 5 as it usually is */ - int rm = extract32(insn, 16, 4); - int opcode = extract32(insn, 12, 4); - int h = extract32(insn, 11, 1); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - bool is_long = false; - int is_fp = 0; - bool is_fp16 = false; - int index; - TCGv_ptr fpst; - - switch (16 * u + opcode) { - case 0x08: /* MUL */ - case 0x10: /* MLA */ - case 0x14: /* MLS */ - if (is_scalar) { - unallocated_encoding(s); - return; - } - break; - case 0x02: /* SMLAL, SMLAL2 */ - case 0x12: /* UMLAL, UMLAL2 */ - case 0x06: /* SMLSL, SMLSL2 */ - case 0x16: /* UMLSL, UMLSL2 */ - case 0x0a: /* SMULL, SMULL2 */ - case 0x1a: /* UMULL, UMULL2 */ - if (is_scalar) { - unallocated_encoding(s); - return; - } - is_long = true; - break; - case 0x03: /* SQDMLAL, SQDMLAL2 */ - case 0x07: /* SQDMLSL, SQDMLSL2 */ - case 0x0b: /* SQDMULL, SQDMULL2 */ - is_long = true; - break; - case 0x0c: /* SQDMULH */ - case 0x0d: /* SQRDMULH */ - break; - case 0x01: /* FMLA */ - case 0x05: /* FMLS */ - case 0x09: /* FMUL */ - case 0x19: /* FMULX */ - is_fp = 1; - break; - case 0x1d: /* SQRDMLAH */ - case 0x1f: /* SQRDMLSH */ - if (!dc_isar_feature(aa64_rdm, s)) { - unallocated_encoding(s); - return; - } - break; - case 0x0e: /* SDOT */ - case 0x1e: /* UDOT */ - if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_dp, s)) { - unallocated_encoding(s); - return; - } - break; - case 0x0f: - switch (size) { - case 0: /* SUDOT */ - case 2: /* USDOT */ - if (is_scalar || !dc_isar_feature(aa64_i8mm, s)) { - unallocated_encoding(s); - return; - } - size = MO_32; - break; - case 1: /* BFDOT */ - if (is_scalar || !dc_isar_feature(aa64_bf16, s)) { - unallocated_encoding(s); - return; - } - size = MO_32; - break; - case 3: /* BFMLAL{B,T} */ - if (is_scalar || !dc_isar_feature(aa64_bf16, s)) { - unallocated_encoding(s); - return; - } - /* can't set is_fp without other incorrect size checks */ - size = MO_16; - break; - default: - unallocated_encoding(s); - return; - } - break; - case 0x11: /* FCMLA #0 */ - case 0x13: /* FCMLA #90 */ - case 0x15: /* FCMLA #180 */ - case 0x17: /* FCMLA #270 */ - if (is_scalar || !dc_isar_feature(aa64_fcma, s)) { - unallocated_encoding(s); - return; - } - is_fp = 2; - break; - case 0x00: /* FMLAL */ - case 0x04: /* FMLSL */ - case 0x18: /* FMLAL2 */ - case 0x1c: /* FMLSL2 */ - if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_fhm, s)) { - unallocated_encoding(s); - return; - } - size = MO_16; - /* is_fp, but we pass cpu_env not fp_status. */ - break; - default: - unallocated_encoding(s); - return; - } - - switch (is_fp) { - case 1: /* normal fp */ - /* convert insn encoded size to MemOp size */ - switch (size) { - case 0: /* half-precision */ - size = MO_16; - is_fp16 = true; - break; - case MO_32: /* single precision */ - case MO_64: /* double precision */ - break; - default: - unallocated_encoding(s); - return; - } - break; - - case 2: /* complex fp */ - /* Each indexable element is a complex pair. 
*/ - size += 1; - switch (size) { - case MO_32: - if (h && !is_q) { - unallocated_encoding(s); - return; - } - is_fp16 = true; - break; - case MO_64: - break; - default: - unallocated_encoding(s); - return; - } - break; - - default: /* integer */ - switch (size) { - case MO_8: - case MO_64: - unallocated_encoding(s); - return; - } - break; - } - if (is_fp16 && !dc_isar_feature(aa64_fp16, s)) { - unallocated_encoding(s); - return; - } - - /* Given MemOp size, adjust register and indexing. */ - switch (size) { - case MO_16: - index = h << 2 | l << 1 | m; - break; - case MO_32: - index = h << 1 | l; - rm |= m << 4; - break; - case MO_64: - if (l || !is_q) { - unallocated_encoding(s); - return; - } - index = h; - rm |= m << 4; - break; - default: - g_assert_not_reached(); - } - - if (!fp_access_check(s)) { - return; - } - - if (is_fp) { - fpst = fpstatus_ptr(is_fp16 ? FPST_FPCR_F16 : FPST_FPCR); - } else { - fpst = NULL; - } - - switch (16 * u + opcode) { - case 0x0e: /* SDOT */ - case 0x1e: /* UDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, - u ? gen_helper_gvec_udot_idx_b - : gen_helper_gvec_sdot_idx_b); - return; - case 0x0f: - switch (extract32(insn, 22, 2)) { - case 0: /* SUDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, - gen_helper_gvec_sudot_idx_b); - return; - case 1: /* BFDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, - gen_helper_gvec_bfdot_idx); - return; - case 2: /* USDOT */ - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index, - gen_helper_gvec_usdot_idx_b); - return; - case 3: /* BFMLAL{B,T} */ - gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, 0, (index << 1) | is_q, - gen_helper_gvec_bfmlal_idx); - return; - } - g_assert_not_reached(); - case 0x11: /* FCMLA #0 */ - case 0x13: /* FCMLA #90 */ - case 0x15: /* FCMLA #180 */ - case 0x17: /* FCMLA #270 */ - { - int rot = extract32(insn, 13, 2); - int data = (index << 2) | rot; - tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, rd), fpst, - is_q ? 16 : 8, vec_full_reg_size(s), data, - size == MO_64 - ? gen_helper_gvec_fcmlas_idx - : gen_helper_gvec_fcmlah_idx); - tcg_temp_free_ptr(fpst); - } - return; - - case 0x00: /* FMLAL */ - case 0x04: /* FMLSL */ - case 0x18: /* FMLAL2 */ - case 0x1c: /* FMLSL2 */ - { - int is_s = extract32(opcode, 2, 1); - int is_2 = u; - int data = (index << 2) | (is_2 << 1) | is_s; - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), cpu_env, - is_q ? 16 : 8, vec_full_reg_size(s), - data, gen_helper_gvec_fmlal_idx_a64); - } - return; - - case 0x08: /* MUL */ - if (!is_long && !is_scalar) { - static gen_helper_gvec_3 * const fns[3] = { - gen_helper_gvec_mul_idx_h, - gen_helper_gvec_mul_idx_s, - gen_helper_gvec_mul_idx_d, - }; - tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - is_q ? 16 : 8, vec_full_reg_size(s), - index, fns[size - 1]); - return; - } - break; - - case 0x10: /* MLA */ - if (!is_long && !is_scalar) { - static gen_helper_gvec_4 * const fns[3] = { - gen_helper_gvec_mla_idx_h, - gen_helper_gvec_mla_idx_s, - gen_helper_gvec_mla_idx_d, - }; - tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, rd), - is_q ? 
16 : 8, vec_full_reg_size(s), - index, fns[size - 1]); - return; - } - break; - - case 0x14: /* MLS */ - if (!is_long && !is_scalar) { - static gen_helper_gvec_4 * const fns[3] = { - gen_helper_gvec_mls_idx_h, - gen_helper_gvec_mls_idx_s, - gen_helper_gvec_mls_idx_d, - }; - tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, rd), - is_q ? 16 : 8, vec_full_reg_size(s), - index, fns[size - 1]); - return; - } - break; - } - - if (size == 3) { - TCGv_i64 tcg_idx = tcg_temp_new_i64(); - int pass; - - assert(is_fp && is_q && !is_long); - - read_vec_element(s, tcg_idx, rm, index, MO_64); - - for (pass = 0; pass < (is_scalar ? 1 : 2); pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op, rn, pass, MO_64); - - switch (16 * u + opcode) { - case 0x05: /* FMLS */ - /* As usual for ARM, separate negation for fused multiply-add */ - gen_helper_vfp_negd(tcg_op, tcg_op); - /* fall through */ - case 0x01: /* FMLA */ - read_vec_element(s, tcg_res, rd, pass, MO_64); - gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst); - break; - case 0x09: /* FMUL */ - gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst); - break; - case 0x19: /* FMULX */ - gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst); - break; - default: - g_assert_not_reached(); - } - - write_vec_element(s, tcg_res, rd, pass, MO_64); - tcg_temp_free_i64(tcg_op); - tcg_temp_free_i64(tcg_res); - } - - tcg_temp_free_i64(tcg_idx); - clear_vec_high(s, !is_scalar, rd); - } else if (!is_long) { - /* 32 bit floating point, or 16 or 32 bit integer. - * For the 16 bit scalar case we use the usual Neon helpers and - * rely on the fact that 0 op 0 == 0 with no side effects. - */ - TCGv_i32 tcg_idx = tcg_temp_new_i32(); - int pass, maxpasses; - - if (is_scalar) { - maxpasses = 1; - } else { - maxpasses = is_q ? 4 : 2; - } - - read_vec_element_i32(s, tcg_idx, rm, index, size); - - if (size == 1 && !is_scalar) { - /* The simplest way to handle the 16x16 indexed ops is to duplicate - * the index into both halves of the 32 bit tcg_idx and then use - * the usual Neon helpers. - */ - tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); - } - - for (pass = 0; pass < maxpasses; pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i32 tcg_res = tcg_temp_new_i32(); - - read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32); - - switch (16 * u + opcode) { - case 0x08: /* MUL */ - case 0x10: /* MLA */ - case 0x14: /* MLS */ - { - static NeonGenTwoOpFn * const fns[2][2] = { - { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, - { tcg_gen_add_i32, tcg_gen_sub_i32 }, - }; - NeonGenTwoOpFn *genfn; - bool is_sub = opcode == 0x4; - - if (size == 1) { - gen_helper_neon_mul_u16(tcg_res, tcg_op, tcg_idx); - } else { - tcg_gen_mul_i32(tcg_res, tcg_op, tcg_idx); - } - if (opcode == 0x8) { - break; - } - read_vec_element_i32(s, tcg_op, rd, pass, MO_32); - genfn = fns[size - 1][is_sub]; - genfn(tcg_res, tcg_op, tcg_res); - break; - } - case 0x05: /* FMLS */ - case 0x01: /* FMLA */ - read_vec_element_i32(s, tcg_res, rd, pass, - is_scalar ? 
size : MO_32); - switch (size) { - case 1: - if (opcode == 0x5) { - /* As usual for ARM, separate negation for fused - * multiply-add */ - tcg_gen_xori_i32(tcg_op, tcg_op, 0x80008000); - } - if (is_scalar) { - gen_helper_advsimd_muladdh(tcg_res, tcg_op, tcg_idx, - tcg_res, fpst); - } else { - gen_helper_advsimd_muladd2h(tcg_res, tcg_op, tcg_idx, - tcg_res, fpst); - } - break; - case 2: - if (opcode == 0x5) { - /* As usual for ARM, separate negation for - * fused multiply-add */ - tcg_gen_xori_i32(tcg_op, tcg_op, 0x80000000); - } - gen_helper_vfp_muladds(tcg_res, tcg_op, tcg_idx, - tcg_res, fpst); - break; - default: - g_assert_not_reached(); - } - break; - case 0x09: /* FMUL */ - switch (size) { - case 1: - if (is_scalar) { - gen_helper_advsimd_mulh(tcg_res, tcg_op, - tcg_idx, fpst); - } else { - gen_helper_advsimd_mul2h(tcg_res, tcg_op, - tcg_idx, fpst); - } - break; - case 2: - gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst); - break; - default: - g_assert_not_reached(); - } - break; - case 0x19: /* FMULX */ - switch (size) { - case 1: - if (is_scalar) { - gen_helper_advsimd_mulxh(tcg_res, tcg_op, - tcg_idx, fpst); - } else { - gen_helper_advsimd_mulx2h(tcg_res, tcg_op, - tcg_idx, fpst); - } - break; - case 2: - gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst); - break; - default: - g_assert_not_reached(); - } - break; - case 0x0c: /* SQDMULH */ - if (size == 1) { - gen_helper_neon_qdmulh_s16(tcg_res, cpu_env, - tcg_op, tcg_idx); - } else { - gen_helper_neon_qdmulh_s32(tcg_res, cpu_env, - tcg_op, tcg_idx); - } - break; - case 0x0d: /* SQRDMULH */ - if (size == 1) { - gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env, - tcg_op, tcg_idx); - } else { - gen_helper_neon_qrdmulh_s32(tcg_res, cpu_env, - tcg_op, tcg_idx); - } - break; - case 0x1d: /* SQRDMLAH */ - read_vec_element_i32(s, tcg_res, rd, pass, - is_scalar ? size : MO_32); - if (size == 1) { - gen_helper_neon_qrdmlah_s16(tcg_res, cpu_env, - tcg_op, tcg_idx, tcg_res); - } else { - gen_helper_neon_qrdmlah_s32(tcg_res, cpu_env, - tcg_op, tcg_idx, tcg_res); - } - break; - case 0x1f: /* SQRDMLSH */ - read_vec_element_i32(s, tcg_res, rd, pass, - is_scalar ? size : MO_32); - if (size == 1) { - gen_helper_neon_qrdmlsh_s16(tcg_res, cpu_env, - tcg_op, tcg_idx, tcg_res); - } else { - gen_helper_neon_qrdmlsh_s32(tcg_res, cpu_env, - tcg_op, tcg_idx, tcg_res); - } - break; - default: - g_assert_not_reached(); - } - - if (is_scalar) { - write_fp_sreg(s, rd, tcg_res); - } else { - write_vec_element_i32(s, tcg_res, rd, pass, MO_32); - } - - tcg_temp_free_i32(tcg_op); - tcg_temp_free_i32(tcg_res); - } - - tcg_temp_free_i32(tcg_idx); - clear_vec_high(s, is_q, rd); - } else { - /* long ops: 16x16->32 or 32x32->64 */ - TCGv_i64 tcg_res[2]; - int pass; - bool satop = extract32(opcode, 0, 1); - MemOp memop = MO_32; - - if (satop || !u) { - memop |= MO_SIGN; - } - - if (size == 2) { - TCGv_i64 tcg_idx = tcg_temp_new_i64(); - - read_vec_element(s, tcg_idx, rm, index, memop); - - for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - TCGv_i64 tcg_passres; - int passelt; - - if (is_scalar) { - passelt = 0; - } else { - passelt = pass + (is_q * 2); - } - - read_vec_element(s, tcg_op, rn, passelt, memop); - - tcg_res[pass] = tcg_temp_new_i64(); - - if (opcode == 0xa || opcode == 0xb) { - /* Non-accumulating ops */ - tcg_passres = tcg_res[pass]; - } else { - tcg_passres = tcg_temp_new_i64(); - } - - tcg_gen_mul_i64(tcg_passres, tcg_op, tcg_idx); - tcg_temp_free_i64(tcg_op); - - if (satop) { - /* saturating, doubling */ - gen_helper_neon_addl_saturate_s64(tcg_passres, cpu_env, - tcg_passres, tcg_passres); - } - - if (opcode == 0xa || opcode == 0xb) { - continue; - } - - /* Accumulating op: handle accumulate step */ - read_vec_element(s, tcg_res[pass], rd, pass, MO_64); - - switch (opcode) { - case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres); - break; - case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres); - break; - case 0x7: /* SQDMLSL, SQDMLSL2 */ - tcg_gen_neg_i64(tcg_passres, tcg_passres); - /* fall through */ - case 0x3: /* SQDMLAL, SQDMLAL2 */ - gen_helper_neon_addl_saturate_s64(tcg_res[pass], cpu_env, - tcg_res[pass], - tcg_passres); - break; - default: - g_assert_not_reached(); - } - tcg_temp_free_i64(tcg_passres); - } - tcg_temp_free_i64(tcg_idx); - - clear_vec_high(s, !is_scalar, rd); - } else { - TCGv_i32 tcg_idx = tcg_temp_new_i32(); - - assert(size == 1); - read_vec_element_i32(s, tcg_idx, rm, index, size); - - if (!is_scalar) { - /* The simplest way to handle the 16x16 indexed ops is to - * duplicate the index into both halves of the 32 bit tcg_idx - * and then use the usual Neon helpers. - */ - tcg_gen_deposit_i32(tcg_idx, tcg_idx, tcg_idx, 16, 16); - } - - for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { - TCGv_i32 tcg_op = tcg_temp_new_i32(); - TCGv_i64 tcg_passres; - - if (is_scalar) { - read_vec_element_i32(s, tcg_op, rn, pass, size); - } else { - read_vec_element_i32(s, tcg_op, rn, - pass + (is_q * 2), MO_32); - } - - tcg_res[pass] = tcg_temp_new_i64(); - - if (opcode == 0xa || opcode == 0xb) { - /* Non-accumulating ops */ - tcg_passres = tcg_res[pass]; - } else { - tcg_passres = tcg_temp_new_i64(); - } - - if (memop & MO_SIGN) { - gen_helper_neon_mull_s16(tcg_passres, tcg_op, tcg_idx); - } else { - gen_helper_neon_mull_u16(tcg_passres, tcg_op, tcg_idx); - } - if (satop) { - gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, - tcg_passres, tcg_passres); - } - tcg_temp_free_i32(tcg_op); - - if (opcode == 0xa || opcode == 0xb) { - continue; - } - - /* Accumulating op: handle accumulate step */ - read_vec_element(s, tcg_res[pass], rd, pass, MO_64); - - switch (opcode) { - case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass], - tcg_passres); - break; - case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass], - tcg_passres); - break; - case 0x7: /* SQDMLSL, SQDMLSL2 */ - gen_helper_neon_negl_u32(tcg_passres, tcg_passres); - /* fall through */ - case 0x3: /* SQDMLAL, SQDMLAL2 */ - gen_helper_neon_addl_saturate_s32(tcg_res[pass], cpu_env, - tcg_res[pass], - tcg_passres); - break; - default: - g_assert_not_reached(); - } - tcg_temp_free_i64(tcg_passres); - } - tcg_temp_free_i32(tcg_idx); - - if (is_scalar) { - tcg_gen_ext32u_i64(tcg_res[0], tcg_res[0]); - } - } - - if (is_scalar) { - tcg_res[1] = tcg_constant_i64(0); - } - - for (pass = 0; pass < 2; pass++) { - write_vec_element(s, tcg_res[pass], rd, pass, MO_64); - tcg_temp_free_i64(tcg_res[pass]); - } - } - - if (fpst) { - tcg_temp_free_ptr(fpst); - } -} - -/* Crypto AES - * 31 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +-----------------+------+-----------+--------+-----+------+------+ - * | 0 1 0 0 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd | - * +-----------------+------+-----------+--------+-----+------+------+ - */ -static void disas_crypto_aes(DisasContext *s, uint32_t insn) -{ - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - int decrypt; - gen_helper_gvec_2 *genfn2 = NULL; - gen_helper_gvec_3 *genfn3 = NULL; - - if (!dc_isar_feature(aa64_aes, s) || size != 0) { - unallocated_encoding(s); - return; - } - - switch (opcode) { - case 0x4: /* AESE */ - decrypt = 0; - genfn3 = gen_helper_crypto_aese; - break; - case 0x6: /* AESMC */ - decrypt = 0; - genfn2 = gen_helper_crypto_aesmc; - break; - case 0x5: /* AESD */ - decrypt = 1; - genfn3 = gen_helper_crypto_aese; - break; - case 0x7: /* AESIMC */ - decrypt = 1; - genfn2 = gen_helper_crypto_aesmc; - break; - default: - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - if (genfn2) { - gen_gvec_op2_ool(s, true, rd, rn, decrypt, genfn2); - } else { - gen_gvec_op3_ool(s, true, rd, rd, rn, decrypt, genfn3); - } -} - -/* Crypto three-reg SHA - * 31 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0 - * +-----------------+------+---+------+---+--------+-----+------+------+ - * | 0 1 0 1 1 1 1 0 | size | 0 | Rm | 0 | opcode | 0 0 | Rn | Rd | - * +-----------------+------+---+------+---+--------+-----+------+------+ - */ -static void disas_crypto_three_reg_sha(DisasContext *s, uint32_t insn) -{ - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 
12, 3); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - gen_helper_gvec_3 *genfn; - bool feature; - - if (size != 0) { - unallocated_encoding(s); - return; - } - - switch (opcode) { - case 0: /* SHA1C */ - genfn = gen_helper_crypto_sha1c; - feature = dc_isar_feature(aa64_sha1, s); - break; - case 1: /* SHA1P */ - genfn = gen_helper_crypto_sha1p; - feature = dc_isar_feature(aa64_sha1, s); - break; - case 2: /* SHA1M */ - genfn = gen_helper_crypto_sha1m; - feature = dc_isar_feature(aa64_sha1, s); - break; - case 3: /* SHA1SU0 */ - genfn = gen_helper_crypto_sha1su0; - feature = dc_isar_feature(aa64_sha1, s); - break; - case 4: /* SHA256H */ - genfn = gen_helper_crypto_sha256h; - feature = dc_isar_feature(aa64_sha256, s); - break; - case 5: /* SHA256H2 */ - genfn = gen_helper_crypto_sha256h2; - feature = dc_isar_feature(aa64_sha256, s); - break; - case 6: /* SHA256SU1 */ - genfn = gen_helper_crypto_sha256su1; - feature = dc_isar_feature(aa64_sha256, s); - break; - default: - unallocated_encoding(s); - return; - } - - if (!feature) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - gen_gvec_op3_ool(s, true, rd, rn, rm, 0, genfn); -} - -/* Crypto two-reg SHA - * 31 24 23 22 21 17 16 12 11 10 9 5 4 0 - * +-----------------+------+-----------+--------+-----+------+------+ - * | 0 1 0 1 1 1 1 0 | size | 1 0 1 0 0 | opcode | 1 0 | Rn | Rd | - * +-----------------+------+-----------+--------+-----+------+------+ - */ -static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn) -{ - int size = extract32(insn, 22, 2); - int opcode = extract32(insn, 12, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - gen_helper_gvec_2 *genfn; - bool feature; - - if (size != 0) { - unallocated_encoding(s); - return; - } - - switch (opcode) { - case 0: /* SHA1H */ - feature = dc_isar_feature(aa64_sha1, s); - genfn = gen_helper_crypto_sha1h; - break; - case 1: /* SHA1SU1 */ - feature = dc_isar_feature(aa64_sha1, s); - genfn = gen_helper_crypto_sha1su1; - break; - case 2: /* SHA256SU0 */ - feature = dc_isar_feature(aa64_sha256, s); - genfn = gen_helper_crypto_sha256su0; - break; - default: - unallocated_encoding(s); - return; - } - - if (!feature) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - gen_gvec_op2_ool(s, true, rd, rn, 0, genfn); -} - -static void gen_rax1_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) -{ - tcg_gen_rotli_i64(d, m, 1); - tcg_gen_xor_i64(d, d, n); -} - -static void gen_rax1_vec(unsigned vece, TCGv_vec d, TCGv_vec n, TCGv_vec m) -{ - tcg_gen_rotli_vec(vece, d, m, 1); - tcg_gen_xor_vec(vece, d, d, n); -} - -void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; - static const GVecGen3 op = { - .fni8 = gen_rax1_i64, - .fniv = gen_rax1_vec, - .opt_opc = vecop_list, - .fno = gen_helper_crypto_rax1, - .vece = MO_64, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &op); -} - -/* Crypto three-reg SHA512 - * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 - * +-----------------------+------+---+---+-----+--------+------+------+ - * | 1 1 0 0 1 1 1 0 0 1 1 | Rm | 1 | O | 0 0 | opcode | Rn | Rd | - * +-----------------------+------+---+---+-----+--------+------+------+ - */ -static void disas_crypto_three_reg_sha512(DisasContext *s, uint32_t insn) -{ - int opcode = extract32(insn, 10, 2); - int o = 
extract32(insn, 14, 1); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - bool feature; - gen_helper_gvec_3 *oolfn = NULL; - GVecGen3Fn *gvecfn = NULL; - - if (o == 0) { - switch (opcode) { - case 0: /* SHA512H */ - feature = dc_isar_feature(aa64_sha512, s); - oolfn = gen_helper_crypto_sha512h; - break; - case 1: /* SHA512H2 */ - feature = dc_isar_feature(aa64_sha512, s); - oolfn = gen_helper_crypto_sha512h2; - break; - case 2: /* SHA512SU1 */ - feature = dc_isar_feature(aa64_sha512, s); - oolfn = gen_helper_crypto_sha512su1; - break; - case 3: /* RAX1 */ - feature = dc_isar_feature(aa64_sha3, s); - gvecfn = gen_gvec_rax1; - break; - default: - g_assert_not_reached(); - } - } else { - switch (opcode) { - case 0: /* SM3PARTW1 */ - feature = dc_isar_feature(aa64_sm3, s); - oolfn = gen_helper_crypto_sm3partw1; - break; - case 1: /* SM3PARTW2 */ - feature = dc_isar_feature(aa64_sm3, s); - oolfn = gen_helper_crypto_sm3partw2; - break; - case 2: /* SM4EKEY */ - feature = dc_isar_feature(aa64_sm4, s); - oolfn = gen_helper_crypto_sm4ekey; - break; - default: - unallocated_encoding(s); - return; - } - } - - if (!feature) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (oolfn) { - gen_gvec_op3_ool(s, true, rd, rn, rm, 0, oolfn); - } else { - gen_gvec_fn3(s, true, rd, rn, rm, gvecfn, MO_64); - } -} - -/* Crypto two-reg SHA512 - * 31 12 11 10 9 5 4 0 - * +-----------------------------------------+--------+------+------+ - * | 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 | opcode | Rn | Rd | - * +-----------------------------------------+--------+------+------+ - */ -static void disas_crypto_two_reg_sha512(DisasContext *s, uint32_t insn) -{ - int opcode = extract32(insn, 10, 2); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - bool feature; - - switch (opcode) { - case 0: /* SHA512SU0 */ - feature = dc_isar_feature(aa64_sha512, s); - break; - case 1: /* SM4E */ - feature = dc_isar_feature(aa64_sm4, s); - break; - default: - unallocated_encoding(s); - return; - } - - if (!feature) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - switch (opcode) { - case 0: /* SHA512SU0 */ - gen_gvec_op2_ool(s, true, rd, rn, 0, gen_helper_crypto_sha512su0); - break; - case 1: /* SM4E */ - gen_gvec_op3_ool(s, true, rd, rd, rn, 0, gen_helper_crypto_sm4e); - break; - default: - g_assert_not_reached(); - } -} - -/* Crypto four-register - * 31 23 22 21 20 16 15 14 10 9 5 4 0 - * +-------------------+-----+------+---+------+------+------+ - * | 1 1 0 0 1 1 1 0 0 | Op0 | Rm | 0 | Ra | Rn | Rd | - * +-------------------+-----+------+---+------+------+------+ - */ -static void disas_crypto_four_reg(DisasContext *s, uint32_t insn) -{ - int op0 = extract32(insn, 21, 2); - int rm = extract32(insn, 16, 5); - int ra = extract32(insn, 10, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - bool feature; - - switch (op0) { - case 0: /* EOR3 */ - case 1: /* BCAX */ - feature = dc_isar_feature(aa64_sha3, s); - break; - case 2: /* SM3SS1 */ - feature = dc_isar_feature(aa64_sm3, s); - break; - default: - unallocated_encoding(s); - return; - } - - if (!feature) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - if (op0 < 2) { - TCGv_i64 tcg_op1, tcg_op2, tcg_op3, tcg_res[2]; - int pass; - - tcg_op1 = tcg_temp_new_i64(); - tcg_op2 = tcg_temp_new_i64(); - tcg_op3 = tcg_temp_new_i64(); - tcg_res[0] = tcg_temp_new_i64(); - 
tcg_res[1] = tcg_temp_new_i64(); - - for (pass = 0; pass < 2; pass++) { - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element(s, tcg_op2, rm, pass, MO_64); - read_vec_element(s, tcg_op3, ra, pass, MO_64); - - if (op0 == 0) { - /* EOR3 */ - tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op3); - } else { - /* BCAX */ - tcg_gen_andc_i64(tcg_res[pass], tcg_op2, tcg_op3); - } - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); - } - write_vec_element(s, tcg_res[0], rd, 0, MO_64); - write_vec_element(s, tcg_res[1], rd, 1, MO_64); - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_op3); - tcg_temp_free_i64(tcg_res[0]); - tcg_temp_free_i64(tcg_res[1]); - } else { - TCGv_i32 tcg_op1, tcg_op2, tcg_op3, tcg_res, tcg_zero; - - tcg_op1 = tcg_temp_new_i32(); - tcg_op2 = tcg_temp_new_i32(); - tcg_op3 = tcg_temp_new_i32(); - tcg_res = tcg_temp_new_i32(); - tcg_zero = tcg_constant_i32(0); - - read_vec_element_i32(s, tcg_op1, rn, 3, MO_32); - read_vec_element_i32(s, tcg_op2, rm, 3, MO_32); - read_vec_element_i32(s, tcg_op3, ra, 3, MO_32); - - tcg_gen_rotri_i32(tcg_res, tcg_op1, 20); - tcg_gen_add_i32(tcg_res, tcg_res, tcg_op2); - tcg_gen_add_i32(tcg_res, tcg_res, tcg_op3); - tcg_gen_rotri_i32(tcg_res, tcg_res, 25); - - write_vec_element_i32(s, tcg_zero, rd, 0, MO_32); - write_vec_element_i32(s, tcg_zero, rd, 1, MO_32); - write_vec_element_i32(s, tcg_zero, rd, 2, MO_32); - write_vec_element_i32(s, tcg_res, rd, 3, MO_32); - - tcg_temp_free_i32(tcg_op1); - tcg_temp_free_i32(tcg_op2); - tcg_temp_free_i32(tcg_op3); - tcg_temp_free_i32(tcg_res); - } -} - -/* Crypto XAR - * 31 21 20 16 15 10 9 5 4 0 - * +-----------------------+------+--------+------+------+ - * | 1 1 0 0 1 1 1 0 1 0 0 | Rm | imm6 | Rn | Rd | - * +-----------------------+------+--------+------+------+ - */ -static void disas_crypto_xar(DisasContext *s, uint32_t insn) -{ - int rm = extract32(insn, 16, 5); - int imm6 = extract32(insn, 10, 6); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - - if (!dc_isar_feature(aa64_sha3, s)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - gen_gvec_xar(MO_64, vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), imm6, 16, - vec_full_reg_size(s)); -} - -/* Crypto three-reg imm2 - * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 - * +-----------------------+------+-----+------+--------+------+------+ - * | 1 1 0 0 1 1 1 0 0 1 0 | Rm | 1 0 | imm2 | opcode | Rn | Rd | - * +-----------------------+------+-----+------+--------+------+------+ - */ -static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn) -{ - static gen_helper_gvec_3 * const fns[4] = { - gen_helper_crypto_sm3tt1a, gen_helper_crypto_sm3tt1b, - gen_helper_crypto_sm3tt2a, gen_helper_crypto_sm3tt2b, - }; - int opcode = extract32(insn, 10, 2); - int imm2 = extract32(insn, 12, 2); - int rm = extract32(insn, 16, 5); - int rn = extract32(insn, 5, 5); - int rd = extract32(insn, 0, 5); - - if (!dc_isar_feature(aa64_sm3, s)) { - unallocated_encoding(s); - return; - } - - if (!fp_access_check(s)) { - return; - } - - gen_gvec_op3_ool(s, true, rd, rn, rm, imm2, fns[opcode]); -} - -/* C3.6 Data processing - SIMD, inc Crypto - * - * As the decode gets a little complex we are using a table based - * approach for this part of the decode. 
- */ -static const AArch64DecodeTable data_proc_simd[] = { - /* pattern , mask , fn */ - { 0x0e200400, 0x9f200400, disas_simd_three_reg_same }, - { 0x0e008400, 0x9f208400, disas_simd_three_reg_same_extra }, - { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff }, - { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc }, - { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes }, - { 0x0e000400, 0x9fe08400, disas_simd_copy }, - { 0x0f000000, 0x9f000400, disas_simd_indexed }, /* vector indexed */ - /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */ - { 0x0f000400, 0x9ff80400, disas_simd_mod_imm }, - { 0x0f000400, 0x9f800400, disas_simd_shift_imm }, - { 0x0e000000, 0xbf208c00, disas_simd_tb }, - { 0x0e000800, 0xbf208c00, disas_simd_zip_trn }, - { 0x2e000000, 0xbf208400, disas_simd_ext }, - { 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same }, - { 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra }, - { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff }, - { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc }, - { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise }, - { 0x5e000400, 0xdfe08400, disas_simd_scalar_copy }, - { 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */ - { 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm }, - { 0x4e280800, 0xff3e0c00, disas_crypto_aes }, - { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha }, - { 0x5e280800, 0xff3e0c00, disas_crypto_two_reg_sha }, - { 0xce608000, 0xffe0b000, disas_crypto_three_reg_sha512 }, - { 0xcec08000, 0xfffff000, disas_crypto_two_reg_sha512 }, - { 0xce000000, 0xff808000, disas_crypto_four_reg }, - { 0xce800000, 0xffe00000, disas_crypto_xar }, - { 0xce408000, 0xffe0c000, disas_crypto_three_reg_imm2 }, - { 0x0e400400, 0x9f60c400, disas_simd_three_reg_same_fp16 }, - { 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 }, - { 0x5e400400, 0xdf60c400, disas_simd_scalar_three_reg_same_fp16 }, - { 0x00000000, 0x00000000, NULL } -}; - -static void disas_data_proc_simd(DisasContext *s, uint32_t insn) -{ - /* Note that this is called with all non-FP cases from - * table C3-6 so it must UNDEF for entries not specifically - * allocated to instructions in that table. - */ - AArch64DecodeFn *fn = lookup_disas_fn(&data_proc_simd[0], insn); - if (fn) { - fn(s, insn); - } else { - unallocated_encoding(s); - } -} - -/* C3.6 Data processing - SIMD and floating point */ -static void disas_data_proc_simd_fp(DisasContext *s, uint32_t insn) -{ - if (extract32(insn, 28, 1) == 1 && extract32(insn, 30, 1) == 0) { - disas_data_proc_fp(s, insn); - } else { - /* SIMD, including crypto */ - disas_data_proc_simd(s, insn); - } -} - -/* - * Include the generated SME FA64 decoder. - */ - -#include "decode-sme-fa64.c.inc" - -static bool trans_OK(DisasContext *s, arg_OK *a) -{ - return true; -} - -static bool trans_FAIL(DisasContext *s, arg_OK *a) -{ - s->is_nonstreaming = true; - return true; -} - -/** - * is_guarded_page: - * @env: The cpu environment - * @s: The DisasContext - * - * Return true if the page is guarded. - */ -static bool is_guarded_page(CPUARMState *env, DisasContext *s) -{ - uint64_t addr = s->base.pc_first; -#ifdef CONFIG_USER_ONLY - return page_get_flags(addr) & PAGE_BTI; -#else - CPUTLBEntryFull *full; - void *host; - int mmu_idx = arm_to_core_mmu_idx(s->mmu_idx); - int flags; - - /* - * We test this immediately after reading an insn, which means - * that the TLB entry must be present and valid, and thus this - * access will never raise an exception. 
- */ - flags = probe_access_full(env, addr, MMU_INST_FETCH, mmu_idx, - false, &host, &full, 0); - assert(!(flags & TLB_INVALID_MASK)); - - return full->guarded; -#endif -} - -/** - * btype_destination_ok: - * @insn: The instruction at the branch destination - * @bt: SCTLR_ELx.BT - * @btype: PSTATE.BTYPE, and is non-zero - * - * On a guarded page, there are a limited number of insns - * that may be present at the branch target: - * - branch target identifiers, - * - paciasp, pacibsp, - * - BRK insn - * - HLT insn - * Anything else causes a Branch Target Exception. - * - * Return true if the branch is compatible, false to raise BTITRAP. - */ -static bool btype_destination_ok(uint32_t insn, bool bt, int btype) -{ - if ((insn & 0xfffff01fu) == 0xd503201fu) { - /* HINT space */ - switch (extract32(insn, 5, 7)) { - case 0b011001: /* PACIASP */ - case 0b011011: /* PACIBSP */ - /* - * If SCTLR_ELx.BT, then PACI*SP are not compatible - * with btype == 3. Otherwise all btype are ok. - */ - return !bt || btype != 3; - case 0b100000: /* BTI */ - /* Not compatible with any btype. */ - return false; - case 0b100010: /* BTI c */ - /* Not compatible with btype == 3 */ - return btype != 3; - case 0b100100: /* BTI j */ - /* Not compatible with btype == 2 */ - return btype != 2; - case 0b100110: /* BTI jc */ - /* Compatible with any btype. */ - return true; - } - } else { - switch (insn & 0xffe0001fu) { - case 0xd4200000u: /* BRK */ - case 0xd4400000u: /* HLT */ - /* Give priority to the breakpoint exception. */ - return true; - } - } - return false; -} - -static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, - CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - CPUARMState *env = cpu->env_ptr; - ARMCPU *arm_cpu = env_archcpu(env); - CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb); - int bound, core_mmu_idx; - - dc->isar = &arm_cpu->isar; - dc->condjmp = 0; - dc->pc_save = dc->base.pc_first; - dc->aarch64 = true; - dc->thumb = false; - dc->sctlr_b = 0; - dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? 
MO_BE : MO_LE; - dc->condexec_mask = 0; - dc->condexec_cond = 0; - core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); - dc->mmu_idx = core_to_aa64_mmu_idx(core_mmu_idx); - dc->tbii = EX_TBFLAG_A64(tb_flags, TBII); - dc->tbid = EX_TBFLAG_A64(tb_flags, TBID); - dc->tcma = EX_TBFLAG_A64(tb_flags, TCMA); - dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); -#if !defined(CONFIG_USER_ONLY) - dc->user = (dc->current_el == 0); -#endif - dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL); - dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM); - dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL); - dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE); - dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC); - dc->fgt_eret = EX_TBFLAG_A64(tb_flags, FGT_ERET); - dc->sve_excp_el = EX_TBFLAG_A64(tb_flags, SVEEXC_EL); - dc->sme_excp_el = EX_TBFLAG_A64(tb_flags, SMEEXC_EL); - dc->vl = (EX_TBFLAG_A64(tb_flags, VL) + 1) * 16; - dc->svl = (EX_TBFLAG_A64(tb_flags, SVL) + 1) * 16; - dc->pauth_active = EX_TBFLAG_A64(tb_flags, PAUTH_ACTIVE); - dc->bt = EX_TBFLAG_A64(tb_flags, BT); - dc->btype = EX_TBFLAG_A64(tb_flags, BTYPE); - dc->unpriv = EX_TBFLAG_A64(tb_flags, UNPRIV); - dc->ata = EX_TBFLAG_A64(tb_flags, ATA); - dc->mte_active[0] = EX_TBFLAG_A64(tb_flags, MTE_ACTIVE); - dc->mte_active[1] = EX_TBFLAG_A64(tb_flags, MTE0_ACTIVE); - dc->pstate_sm = EX_TBFLAG_A64(tb_flags, PSTATE_SM); - dc->pstate_za = EX_TBFLAG_A64(tb_flags, PSTATE_ZA); - dc->sme_trap_nonstreaming = EX_TBFLAG_A64(tb_flags, SME_TRAP_NONSTREAMING); - dc->vec_len = 0; - dc->vec_stride = 0; - dc->cp_regs = arm_cpu->cp_regs; - dc->features = env->features; - dc->dcz_blocksize = arm_cpu->dcz_blocksize; - -#ifdef CONFIG_USER_ONLY - /* In sve_probe_page, we assume TBI is enabled. */ - tcg_debug_assert(dc->tbid & 1); -#endif - - /* Single step state. The code-generation logic here is: - * SS_ACTIVE == 0: - * generate code with no special handling for single-stepping (except - * that anything that can make us go to SS_ACTIVE == 1 must end the TB; - * this happens anyway because those changes are all system register or - * PSTATE writes). - * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending) - * emit code for one insn - * emit code to clear PSTATE.SS - * emit code to generate software step exception for completed step - * end TB (as usual for having generated an exception) - * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending) - * emit code to generate a software step exception - * end the TB - */ - dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE); - dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS); - dc->is_ldex = false; - - /* Bound the number of insns to execute to those left on the page. */ - bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4; - - /* If architectural single step active, limit to 1. 
*/ - if (dc->ss_active) { - bound = 1; - } - dc->base.max_insns = MIN(dc->base.max_insns, bound); - - init_tmp_a64_array(dc); -} - -static void aarch64_tr_tb_start(DisasContextBase *db, CPUState *cpu) -{ -} - -static void aarch64_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - target_ulong pc_arg = dc->base.pc_next; - - if (TARGET_TB_PCREL) { - pc_arg &= ~TARGET_PAGE_MASK; - } - tcg_gen_insn_start(pc_arg, 0, 0); - dc->insn_start = tcg_last_op(); -} - -static void aarch64_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *s = container_of(dcbase, DisasContext, base); - CPUARMState *env = cpu->env_ptr; - uint64_t pc = s->base.pc_next; - uint32_t insn; - - /* Singlestep exceptions have the highest priority. */ - if (s->ss_active && !s->pstate_ss) { - /* Singlestep state is Active-pending. - * If we're in this state at the start of a TB then either - * a) we just took an exception to an EL which is being debugged - * and this is the first insn in the exception handler - * b) debug exceptions were masked and we just unmasked them - * without changing EL (eg by clearing PSTATE.D) - * In either case we're going to take a swstep exception in the - * "did not step an insn" case, and so the syndrome ISV and EX - * bits should be zero. - */ - assert(s->base.num_insns == 1); - gen_swstep_exception(s, 0, 0); - s->base.is_jmp = DISAS_NORETURN; - s->base.pc_next = pc + 4; - return; - } - - if (pc & 3) { - /* - * PC alignment fault. This has priority over the instruction abort - * that we would receive from a translation fault via arm_ldl_code. - * This should only be possible after an indirect branch, at the - * start of the TB. - */ - assert(s->base.num_insns == 1); - gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc)); - s->base.is_jmp = DISAS_NORETURN; - s->base.pc_next = QEMU_ALIGN_UP(pc, 4); - return; - } - - s->pc_curr = pc; - insn = arm_ldl_code(env, &s->base, pc, s->sctlr_b); - s->insn = insn; - s->base.pc_next = pc + 4; - - s->fp_access_checked = false; - s->sve_access_checked = false; - - if (s->pstate_il) { - /* - * Illegal execution state. This has priority over BTI - * exceptions, but comes after instruction abort exceptions. - */ - gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate()); - return; - } - - if (dc_isar_feature(aa64_bti, s)) { - if (s->base.num_insns == 1) { - /* - * At the first insn of the TB, compute s->guarded_page. - * We delayed computing this until successfully reading - * the first insn of the TB, above. This (mostly) ensures - * that the softmmu tlb entry has been populated, and the - * page table GP bit is available. - * - * Note that we need to compute this even if btype == 0, - * because this value is used for BR instructions later - * where ENV is not available. - */ - s->guarded_page = is_guarded_page(env, s); - - /* First insn can have btype set to non-zero. */ - tcg_debug_assert(s->btype >= 0); - - /* - * Note that the Branch Target Exception has fairly high - * priority -- below debugging exceptions but above most - * everything else. This allows us to handle this now - * instead of waiting until the insn is otherwise decoded. - */ - if (s->btype != 0 - && s->guarded_page - && !btype_destination_ok(insn, s->bt, s->btype)) { - gen_exception_insn(s, 0, EXCP_UDEF, syn_btitrap(s->btype)); - return; - } - } else { - /* Not the first insn: btype must be 0. 
*/ - tcg_debug_assert(s->btype == 0); - } - } - - s->is_nonstreaming = false; - if (s->sme_trap_nonstreaming) { - disas_sme_fa64(s, insn); - } - - switch (extract32(insn, 25, 4)) { - case 0x0: - if (!extract32(insn, 31, 1) || !disas_sme(s, insn)) { - unallocated_encoding(s); - } - break; - case 0x1: case 0x3: /* UNALLOCATED */ - unallocated_encoding(s); - break; - case 0x2: - if (!disas_sve(s, insn)) { - unallocated_encoding(s); - } - break; - case 0x8: case 0x9: /* Data processing - immediate */ - disas_data_proc_imm(s, insn); - break; - case 0xa: case 0xb: /* Branch, exception generation and system insns */ - disas_b_exc_sys(s, insn); - break; - case 0x4: - case 0x6: - case 0xc: - case 0xe: /* Loads and stores */ - disas_ldst(s, insn); - break; - case 0x5: - case 0xd: /* Data processing - register */ - disas_data_proc_reg(s, insn); - break; - case 0x7: - case 0xf: /* Data processing - SIMD and floating point */ - disas_data_proc_simd_fp(s, insn); - break; - default: - assert(FALSE); /* all 15 cases should be handled above */ - break; - } - - /* if we allocated any temporaries, free them here */ - free_tmp_a64(s); - - /* - * After execution of most insns, btype is reset to 0. - * Note that we set btype == -1 when the insn sets btype. - */ - if (s->btype > 0 && s->base.is_jmp != DISAS_NORETURN) { - reset_btype(s); - } - - translator_loop_temp_check(&s->base); -} - -static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - - if (unlikely(dc->ss_active)) { - /* Note that this means single stepping WFI doesn't halt the CPU. - * For conditional branch insns this is harmless unreachable code as - * gen_goto_tb() has already handled emitting the debug exception - * (and thus a tb-jump is not possible when singlestepping). - */ - switch (dc->base.is_jmp) { - default: - gen_a64_update_pc(dc, 4); - /* fall through */ - case DISAS_EXIT: - case DISAS_JUMP: - gen_step_complete_exception(dc); - break; - case DISAS_NORETURN: - break; - } - } else { - switch (dc->base.is_jmp) { - case DISAS_NEXT: - case DISAS_TOO_MANY: - gen_goto_tb(dc, 1, 4); - break; - default: - case DISAS_UPDATE_EXIT: - gen_a64_update_pc(dc, 4); - /* fall through */ - case DISAS_EXIT: - tcg_gen_exit_tb(NULL, 0); - break; - case DISAS_UPDATE_NOCHAIN: - gen_a64_update_pc(dc, 4); - /* fall through */ - case DISAS_JUMP: - tcg_gen_lookup_and_goto_ptr(); - break; - case DISAS_NORETURN: - case DISAS_SWI: - break; - case DISAS_WFE: - gen_a64_update_pc(dc, 4); - gen_helper_wfe(cpu_env); - break; - case DISAS_YIELD: - gen_a64_update_pc(dc, 4); - gen_helper_yield(cpu_env); - break; - case DISAS_WFI: - /* - * This is a special case because we don't want to just halt - * the CPU if trying to debug across a WFI. - */ - gen_a64_update_pc(dc, 4); - gen_helper_wfi(cpu_env, tcg_constant_i32(4)); - /* - * The helper doesn't necessarily throw an exception, but we - * must go back to the main loop to check for interrupts anyway. 
- */ - tcg_gen_exit_tb(NULL, 0); - break; - } - } -} - -static void aarch64_tr_disas_log(const DisasContextBase *dcbase, - CPUState *cpu, FILE *logfile) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - - fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first)); - target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size); -} - -const TranslatorOps aarch64_translator_ops = { - .init_disas_context = aarch64_tr_init_disas_context, - .tb_start = aarch64_tr_tb_start, - .insn_start = aarch64_tr_insn_start, - .translate_insn = aarch64_tr_translate_insn, - .tb_stop = aarch64_tr_tb_stop, - .disas_log = aarch64_tr_disas_log, -}; diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h deleted file mode 100644 index ad3762d..0000000 --- a/target/arm/translate-a64.h +++ /dev/null @@ -1,201 +0,0 @@ -/* - * AArch64 translation, common definitions. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#ifndef TARGET_ARM_TRANSLATE_A64_H -#define TARGET_ARM_TRANSLATE_A64_H - -TCGv_i64 new_tmp_a64(DisasContext *s); -TCGv_i64 new_tmp_a64_local(DisasContext *s); -TCGv_i64 new_tmp_a64_zero(DisasContext *s); -TCGv_i64 cpu_reg(DisasContext *s, int reg); -TCGv_i64 cpu_reg_sp(DisasContext *s, int reg); -TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf); -TCGv_i64 read_cpu_reg_sp(DisasContext *s, int reg, int sf); -void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v); -bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, - unsigned int imms, unsigned int immr); -bool sve_access_check(DisasContext *s); -bool sme_enabled_check(DisasContext *s); -bool sme_enabled_check_with_svcr(DisasContext *s, unsigned); - -/* This function corresponds to CheckStreamingSVEEnabled. */ -static inline bool sme_sm_enabled_check(DisasContext *s) -{ - return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK); -} - -/* This function corresponds to CheckSMEAndZAEnabled. */ -static inline bool sme_za_enabled_check(DisasContext *s) -{ - return sme_enabled_check_with_svcr(s, R_SVCR_ZA_MASK); -} - -/* Note that this function corresponds to CheckStreamingSVEAndZAEnabled. */ -static inline bool sme_smza_enabled_check(DisasContext *s) -{ - return sme_enabled_check_with_svcr(s, R_SVCR_SM_MASK | R_SVCR_ZA_MASK); -} - -TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr); -TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int log2_size); -TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int size); - -/* We should have at some point before trying to access an FP register - * done the necessary access check, so assert that - * (a) we did the check and - * (b) we didn't then just plough ahead anyway if it failed. - * Print the instruction pattern in the abort message so we can figure - * out what we need to fix if a user encounters this problem in the wild. 
- */ -static inline void assert_fp_access_checked(DisasContext *s) -{ -#ifdef CONFIG_DEBUG_TCG - if (unlikely(!s->fp_access_checked || s->fp_excp_el)) { - fprintf(stderr, "target-arm: FP access check missing for " - "instruction 0x%08x\n", s->insn); - abort(); - } -#endif -} - -/* Return the offset into CPUARMState of an element of specified - * size, 'element' places in from the least significant end of - * the FP/vector register Qn. - */ -static inline int vec_reg_offset(DisasContext *s, int regno, - int element, MemOp size) -{ - int element_size = 1 << size; - int offs = element * element_size; -#if HOST_BIG_ENDIAN - /* This is complicated slightly because vfp.zregs[n].d[0] is - * still the lowest and vfp.zregs[n].d[15] the highest of the - * 256 byte vector, even on big endian systems. - * - * Calculate the offset assuming fully little-endian, - * then XOR to account for the order of the 8-byte units. - * - * For 16 byte elements, the two 8 byte halves will not form a - * host int128 if the host is bigendian, since they're in the - * wrong order. However the only 16 byte operation we have is - * a move, so we can ignore this for the moment. More complicated - * operations will have to special case loading and storing from - * the zregs array. - */ - if (element_size < 8) { - offs ^= 8 - element_size; - } -#endif - offs += offsetof(CPUARMState, vfp.zregs[regno]); - assert_fp_access_checked(s); - return offs; -} - -/* Return the offset info CPUARMState of the "whole" vector register Qn. */ -static inline int vec_full_reg_offset(DisasContext *s, int regno) -{ - assert_fp_access_checked(s); - return offsetof(CPUARMState, vfp.zregs[regno]); -} - -/* Return a newly allocated pointer to the vector register. */ -static inline TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno) -{ - TCGv_ptr ret = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ret, cpu_env, vec_full_reg_offset(s, regno)); - return ret; -} - -/* Return the byte size of the "whole" vector register, VL / 8. */ -static inline int vec_full_reg_size(DisasContext *s) -{ - return s->vl; -} - -/* Return the byte size of the vector register, SVL / 8. */ -static inline int streaming_vec_reg_size(DisasContext *s) -{ - return s->svl; -} - -/* - * Return the offset info CPUARMState of the predicate vector register Pn. - * Note for this purpose, FFR is P16. - */ -static inline int pred_full_reg_offset(DisasContext *s, int regno) -{ - return offsetof(CPUARMState, vfp.pregs[regno]); -} - -/* Return the byte size of the whole predicate register, VL / 64. */ -static inline int pred_full_reg_size(DisasContext *s) -{ - return s->vl >> 3; -} - -/* Return the byte size of the predicate register, SVL / 64. */ -static inline int streaming_pred_reg_size(DisasContext *s) -{ - return s->svl >> 3; -} - -/* - * Round up the size of a register to a size allowed by - * the tcg vector infrastructure. Any operation which uses this - * size may assume that the bits above pred_full_reg_size are zero, - * and must leave them the same way. - * - * Note that this is not needed for the vector registers as they - * are always properly sized for tcg vectors. - */ -static inline int size_for_gvec(int size) -{ - if (size <= 8) { - return 8; - } else { - return QEMU_ALIGN_UP(size, 16); - } -} - -static inline int pred_gvec_reg_size(DisasContext *s) -{ - return size_for_gvec(pred_full_reg_size(s)); -} - -/* Return a newly allocated pointer to the predicate register. 
*/ -static inline TCGv_ptr pred_full_reg_ptr(DisasContext *s, int regno) -{ - TCGv_ptr ret = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ret, cpu_env, pred_full_reg_offset(s, regno)); - return ret; -} - -bool disas_sve(DisasContext *, uint32_t); -bool disas_sme(DisasContext *, uint32_t); - -void gen_gvec_rax1(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, int64_t shift, - uint32_t opr_sz, uint32_t max_sz); - -void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); -void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int imm); - -#endif /* TARGET_ARM_TRANSLATE_A64_H */ diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c deleted file mode 100644 index 5df7d46..0000000 --- a/target/arm/translate-m-nocp.c +++ /dev/null @@ -1,788 +0,0 @@ -/* - * ARM translation: M-profile NOCP special-case instructions - * - * Copyright (c) 2020 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "translate.h" -#include "translate-a32.h" - -#include "decode-m-nocp.c.inc" - -/* - * Decode VLLDM and VLSTM are nonstandard because: - * * if there is no FPU then these insns must NOP in - * Secure state and UNDEF in Nonsecure state - * * if there is an FPU then these insns do not have - * the usual behaviour that vfp_access_check() provides of - * being controlled by CPACR/NSACR enable bits or the - * lazy-stacking logic. - */ -static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a) -{ - TCGv_i32 fptr; - - if (!arm_dc_feature(s, ARM_FEATURE_M) || - !arm_dc_feature(s, ARM_FEATURE_V8)) { - return false; - } - - if (a->op) { - /* - * T2 encoding ({D0-D31} reglist): v8.1M and up. We choose not - * to take the IMPDEF option to make memory accesses to the stack - * slots that correspond to the D16-D31 registers (discarding - * read data and writing UNKNOWN values), so for us the T2 - * encoding behaves identically to the T1 encoding. - */ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - } else { - /* - * T1 encoding ({D0-D15} reglist); undef if we have 32 Dregs. - * This is currently architecturally impossible, but we add the - * check to stay in line with the pseudocode. Note that we must - * emit code for the UNDEF so it takes precedence over the NOCP. - */ - if (dc_isar_feature(aa32_simd_r32, s)) { - unallocated_encoding(s); - return true; - } - } - - /* - * If not secure, UNDEF. We must emit code for this - * rather than returning false so that this takes - * precedence over the m-nocp.decode NOCP fallback. - */ - if (!s->v8m_secure) { - unallocated_encoding(s); - return true; - } - - s->eci_handled = true; - - /* If no fpu, NOP. 
*/ - if (!dc_isar_feature(aa32_vfp, s)) { - clear_eci_state(s); - return true; - } - - fptr = load_reg(s, a->rn); - if (a->l) { - gen_helper_v7m_vlldm(cpu_env, fptr); - } else { - gen_helper_v7m_vlstm(cpu_env, fptr); - } - tcg_temp_free_i32(fptr); - - clear_eci_state(s); - - /* - * End the TB, because we have updated FP control bits, - * and possibly VPR or LTPSIZE. - */ - s->base.is_jmp = DISAS_UPDATE_EXIT; - return true; -} - -static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a) -{ - int btmreg, topreg; - TCGv_i64 zero; - TCGv_i32 aspen, sfpa; - - if (!dc_isar_feature(aa32_m_sec_state, s)) { - /* Before v8.1M, fall through in decode to NOCP check */ - return false; - } - - /* Explicitly UNDEF because this takes precedence over NOCP */ - if (!arm_dc_feature(s, ARM_FEATURE_M_MAIN) || !s->v8m_secure) { - unallocated_encoding(s); - return true; - } - - s->eci_handled = true; - - if (!dc_isar_feature(aa32_vfp_simd, s)) { - /* NOP if we have neither FP nor MVE */ - clear_eci_state(s); - return true; - } - - /* - * If FPCCR.ASPEN != 0 && CONTROL_S.SFPA == 0 then there is no - * active floating point context so we must NOP (without doing - * any lazy state preservation or the NOCP check). - */ - aspen = load_cpu_field(v7m.fpccr[M_REG_S]); - sfpa = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_andi_i32(sfpa, sfpa, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_or_i32(sfpa, sfpa, aspen); - arm_gen_condlabel(s); - tcg_gen_brcondi_i32(TCG_COND_EQ, sfpa, 0, s->condlabel.label); - - if (s->fp_excp_el != 0) { - gen_exception_insn_el(s, 0, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - return true; - } - - topreg = a->vd + a->imm - 1; - btmreg = a->vd; - - /* Convert to Sreg numbers if the insn specified in Dregs */ - if (a->size == 3) { - topreg = topreg * 2 + 1; - btmreg *= 2; - } - - if (topreg > 63 || (topreg > 31 && !(topreg & 1))) { - /* UNPREDICTABLE: we choose to undef */ - unallocated_encoding(s); - return true; - } - - /* Silently ignore requests to clear D16-D31 if they don't exist */ - if (topreg > 31 && !dc_isar_feature(aa32_simd_r32, s)) { - topreg = 31; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* Zero the Sregs from btmreg to topreg inclusive. */ - zero = tcg_constant_i64(0); - if (btmreg & 1) { - write_neon_element64(zero, btmreg >> 1, 1, MO_32); - btmreg++; - } - for (; btmreg + 1 <= topreg; btmreg += 2) { - write_neon_element64(zero, btmreg >> 1, 0, MO_64); - } - if (btmreg == topreg) { - write_neon_element64(zero, btmreg >> 1, 0, MO_32); - btmreg++; - } - assert(btmreg == topreg + 1); - if (dc_isar_feature(aa32_mve, s)) { - store_cpu_field(tcg_constant_i32(0), v7m.vpr); - } - - clear_eci_state(s); - return true; -} - -/* - * M-profile provides two different sets of instructions that can - * access floating point system registers: VMSR/VMRS (which move - * to/from a general purpose register) and VLDR/VSTR sysreg (which - * move directly to/from memory). In some cases there are also side - * effects which must happen after any write to memory (which could - * cause an exception). So we implement the common logic for the - * sysreg access in gen_M_fp_sysreg_write() and gen_M_fp_sysreg_read(), - * which take pointers to callback functions which will perform the - * actual "read/write general purpose register" and "read/write - * memory" operations. 
- */ - -/* - * Emit code to store the sysreg to its final destination; frees the - * TCG temp 'value' it is passed. do_access is true to do the store, - * and false to skip it and only perform side-effects like base - * register writeback. - */ -typedef void fp_sysreg_storefn(DisasContext *s, void *opaque, TCGv_i32 value, - bool do_access); -/* - * Emit code to load the value to be copied to the sysreg; returns - * a new TCG temporary. do_access is true to do the store, - * and false to skip it and only perform side-effects like base - * register writeback. - */ -typedef TCGv_i32 fp_sysreg_loadfn(DisasContext *s, void *opaque, - bool do_access); - -/* Common decode/access checks for fp sysreg read/write */ -typedef enum FPSysRegCheckResult { - FPSysRegCheckFailed, /* caller should return false */ - FPSysRegCheckDone, /* caller should return true */ - FPSysRegCheckContinue, /* caller should continue generating code */ -} FPSysRegCheckResult; - -static FPSysRegCheckResult fp_sysreg_checks(DisasContext *s, int regno) -{ - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return FPSysRegCheckFailed; - } - - switch (regno) { - case ARM_VFP_FPSCR: - case QEMU_VFP_FPSCR_NZCV: - break; - case ARM_VFP_FPSCR_NZCVQC: - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return FPSysRegCheckFailed; - } - break; - case ARM_VFP_FPCXT_S: - case ARM_VFP_FPCXT_NS: - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return FPSysRegCheckFailed; - } - if (!s->v8m_secure) { - return FPSysRegCheckFailed; - } - break; - case ARM_VFP_VPR: - case ARM_VFP_P0: - if (!dc_isar_feature(aa32_mve, s)) { - return FPSysRegCheckFailed; - } - break; - default: - return FPSysRegCheckFailed; - } - - /* - * FPCXT_NS is a special case: it has specific handling for - * "current FP state is inactive", and must do the PreserveFPState() - * but not the usual full set of actions done by ExecuteFPCheck(). - * So we don't call vfp_access_check() and the callers must handle this. - */ - if (regno != ARM_VFP_FPCXT_NS && !vfp_access_check(s)) { - return FPSysRegCheckDone; - } - return FPSysRegCheckContinue; -} - -static void gen_branch_fpInactive(DisasContext *s, TCGCond cond, - TCGLabel *label) -{ - /* - * FPCXT_NS is a special case: it has specific handling for - * "current FP state is inactive", and must do the PreserveFPState() - * but not the usual full set of actions done by ExecuteFPCheck(). - * We don't have a TB flag that matches the fpInactive check, so we - * do it at runtime as we don't expect FPCXT_NS accesses to be frequent. 
- * - * Emit code that checks fpInactive and does a conditional - * branch to label based on it: - * if cond is TCG_COND_NE then branch if fpInactive != 0 (ie if inactive) - * if cond is TCG_COND_EQ then branch if fpInactive == 0 (ie if active) - */ - assert(cond == TCG_COND_EQ || cond == TCG_COND_NE); - - /* fpInactive = FPCCR_NS.ASPEN == 1 && CONTROL.FPCA == 0 */ - TCGv_i32 aspen, fpca; - aspen = load_cpu_field(v7m.fpccr[M_REG_NS]); - fpca = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK); - tcg_gen_andi_i32(fpca, fpca, R_V7M_CONTROL_FPCA_MASK); - tcg_gen_or_i32(fpca, fpca, aspen); - tcg_gen_brcondi_i32(tcg_invert_cond(cond), fpca, 0, label); - tcg_temp_free_i32(aspen); - tcg_temp_free_i32(fpca); -} - -static bool gen_M_fp_sysreg_write(DisasContext *s, int regno, - fp_sysreg_loadfn *loadfn, - void *opaque) -{ - /* Do a write to an M-profile floating point system register */ - TCGv_i32 tmp; - TCGLabel *lab_end = NULL; - - switch (fp_sysreg_checks(s, regno)) { - case FPSysRegCheckFailed: - return false; - case FPSysRegCheckDone: - return true; - case FPSysRegCheckContinue: - break; - } - - switch (regno) { - case ARM_VFP_FPSCR: - tmp = loadfn(s, opaque, true); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - tcg_temp_free_i32(tmp); - gen_lookup_tb(s); - break; - case ARM_VFP_FPSCR_NZCVQC: - { - TCGv_i32 fpscr; - tmp = loadfn(s, opaque, true); - if (dc_isar_feature(aa32_mve, s)) { - /* QC is only present for MVE; otherwise RES0 */ - TCGv_i32 qc = tcg_temp_new_i32(); - tcg_gen_andi_i32(qc, tmp, FPCR_QC); - /* - * The 4 vfp.qc[] fields need only be "zero" vs "non-zero"; - * here writing the same value into all elements is simplest. - */ - tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUARMState, vfp.qc), - 16, 16, qc); - } - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - fpscr = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(fpscr, fpscr, ~FPCR_NZCV_MASK); - tcg_gen_or_i32(fpscr, fpscr, tmp); - store_cpu_field(fpscr, vfp.xregs[ARM_VFP_FPSCR]); - tcg_temp_free_i32(tmp); - break; - } - case ARM_VFP_FPCXT_NS: - { - TCGLabel *lab_active = gen_new_label(); - - lab_end = gen_new_label(); - gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); - /* - * fpInactive case: write is a NOP, so only do side effects - * like register writeback before we branch to end - */ - loadfn(s, opaque, false); - tcg_gen_br(lab_end); - - gen_set_label(lab_active); - /* - * !fpInactive: if FPU disabled, take NOCP exception; - * otherwise PreserveFPState(), and then FPCXT_NS writes - * behave the same as FPCXT_S writes. - */ - if (!vfp_access_check_m(s, true)) { - /* - * This was only a conditional exception, so override - * gen_exception_insn_el()'s default to DISAS_NORETURN - */ - s->base.is_jmp = DISAS_NEXT; - break; - } - } - /* fall through */ - case ARM_VFP_FPCXT_S: - { - TCGv_i32 sfpa, control; - /* - * Set FPSCR and CONTROL.SFPA from value; the new FPSCR takes - * bits [27:0] from value and zeroes bits [31:28]. 
- */ - tmp = loadfn(s, opaque, true); - sfpa = tcg_temp_new_i32(); - tcg_gen_shri_i32(sfpa, tmp, 31); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_deposit_i32(control, control, sfpa, - R_V7M_CONTROL_SFPA_SHIFT, 1); - store_cpu_field(control, v7m.control[M_REG_S]); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(sfpa); - break; - } - case ARM_VFP_VPR: - /* Behaves as NOP if not privileged */ - if (IS_USER(s)) { - loadfn(s, opaque, false); - break; - } - tmp = loadfn(s, opaque, true); - store_cpu_field(tmp, v7m.vpr); - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - break; - case ARM_VFP_P0: - { - TCGv_i32 vpr; - tmp = loadfn(s, opaque, true); - vpr = load_cpu_field(v7m.vpr); - tcg_gen_deposit_i32(vpr, vpr, tmp, - R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - store_cpu_field(vpr, v7m.vpr); - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - tcg_temp_free_i32(tmp); - break; - } - default: - g_assert_not_reached(); - } - if (lab_end) { - gen_set_label(lab_end); - } - return true; -} - -static bool gen_M_fp_sysreg_read(DisasContext *s, int regno, - fp_sysreg_storefn *storefn, - void *opaque) -{ - /* Do a read from an M-profile floating point system register */ - TCGv_i32 tmp; - TCGLabel *lab_end = NULL; - bool lookup_tb = false; - - switch (fp_sysreg_checks(s, regno)) { - case FPSysRegCheckFailed: - return false; - case FPSysRegCheckDone: - return true; - case FPSysRegCheckContinue: - break; - } - - if (regno == ARM_VFP_FPSCR_NZCVQC && !dc_isar_feature(aa32_mve, s)) { - /* QC is RES0 without MVE, so NZCVQC simplifies to NZCV */ - regno = QEMU_VFP_FPSCR_NZCV; - } - - switch (regno) { - case ARM_VFP_FPSCR: - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - storefn(s, opaque, tmp, true); - break; - case ARM_VFP_FPSCR_NZCVQC: - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCVQC_MASK); - storefn(s, opaque, tmp, true); - break; - case QEMU_VFP_FPSCR_NZCV: - /* - * Read just NZCV; this is a special case to avoid the - * helper call for the "VMRS to CPSR.NZCV" insn. - */ - tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - storefn(s, opaque, tmp, true); - break; - case ARM_VFP_FPCXT_S: - { - TCGv_i32 control, sfpa, fpscr; - /* Bits [27:0] from FPSCR, bit [31] from CONTROL.SFPA */ - tmp = tcg_temp_new_i32(); - sfpa = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); - tcg_gen_or_i32(tmp, tmp, sfpa); - tcg_temp_free_i32(sfpa); - /* - * Store result before updating FPSCR etc, in case - * it is a memory write which causes an exception. - */ - storefn(s, opaque, tmp, true); - /* - * Now we must reset FPSCR from FPDSCR_NS, and clear - * CONTROL.SFPA; so we'll end the TB here. 
- */ - tcg_gen_andi_i32(control, control, ~R_V7M_CONTROL_SFPA_MASK); - store_cpu_field(control, v7m.control[M_REG_S]); - fpscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(fpscr); - lookup_tb = true; - break; - } - case ARM_VFP_FPCXT_NS: - { - TCGv_i32 control, sfpa, fpscr, fpdscr; - TCGLabel *lab_active = gen_new_label(); - - lookup_tb = true; - - gen_branch_fpInactive(s, TCG_COND_EQ, lab_active); - /* fpInactive case: reads as FPDSCR_NS */ - TCGv_i32 tmp = load_cpu_field(v7m.fpdscr[M_REG_NS]); - storefn(s, opaque, tmp, true); - lab_end = gen_new_label(); - tcg_gen_br(lab_end); - - gen_set_label(lab_active); - /* - * !fpInactive: if FPU disabled, take NOCP exception; - * otherwise PreserveFPState(), and then FPCXT_NS - * reads the same as FPCXT_S. - */ - if (!vfp_access_check_m(s, true)) { - /* - * This was only a conditional exception, so override - * gen_exception_insn_el()'s default to DISAS_NORETURN - */ - s->base.is_jmp = DISAS_NEXT; - break; - } - tmp = tcg_temp_new_i32(); - sfpa = tcg_temp_new_i32(); - fpscr = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(fpscr, cpu_env); - tcg_gen_andi_i32(tmp, fpscr, ~FPCR_NZCV_MASK); - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_andi_i32(sfpa, control, R_V7M_CONTROL_SFPA_MASK); - tcg_gen_shli_i32(sfpa, sfpa, 31 - R_V7M_CONTROL_SFPA_SHIFT); - tcg_gen_or_i32(tmp, tmp, sfpa); - tcg_temp_free_i32(control); - /* Store result before updating FPSCR, in case it faults */ - storefn(s, opaque, tmp, true); - /* If SFPA is zero then set FPSCR from FPDSCR_NS */ - fpdscr = load_cpu_field(v7m.fpdscr[M_REG_NS]); - tcg_gen_movcond_i32(TCG_COND_EQ, fpscr, sfpa, tcg_constant_i32(0), - fpdscr, fpscr); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(sfpa); - tcg_temp_free_i32(fpdscr); - tcg_temp_free_i32(fpscr); - break; - } - case ARM_VFP_VPR: - /* Behaves as NOP if not privileged */ - if (IS_USER(s)) { - storefn(s, opaque, NULL, false); - break; - } - tmp = load_cpu_field(v7m.vpr); - storefn(s, opaque, tmp, true); - break; - case ARM_VFP_P0: - tmp = load_cpu_field(v7m.vpr); - tcg_gen_extract_i32(tmp, tmp, R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH); - storefn(s, opaque, tmp, true); - break; - default: - g_assert_not_reached(); - } - - if (lab_end) { - gen_set_label(lab_end); - } - if (lookup_tb) { - gen_lookup_tb(s); - } - return true; -} - -static void fp_sysreg_to_gpr(DisasContext *s, void *opaque, TCGv_i32 value, - bool do_access) -{ - arg_VMSR_VMRS *a = opaque; - - if (!do_access) { - return; - } - - if (a->rt == 15) { - /* Set the 4 flag bits in the CPSR */ - gen_set_nzcv(value); - tcg_temp_free_i32(value); - } else { - store_reg(s, a->rt, value); - } -} - -static TCGv_i32 gpr_to_fp_sysreg(DisasContext *s, void *opaque, bool do_access) -{ - arg_VMSR_VMRS *a = opaque; - - if (!do_access) { - return NULL; - } - return load_reg(s, a->rt); -} - -static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a) -{ - /* - * Accesses to R15 are UNPREDICTABLE; we choose to undef. - * FPSCR -> r15 is a special case which writes to the PSR flags; - * set a->reg to a special value to tell gen_M_fp_sysreg_read() - * we only care about the top 4 bits of FPSCR there. 
- */ - if (a->rt == 15) { - if (a->l && a->reg == ARM_VFP_FPSCR) { - a->reg = QEMU_VFP_FPSCR_NZCV; - } else { - return false; - } - } - - if (a->l) { - /* VMRS, move FP system register to gp register */ - return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_gpr, a); - } else { - /* VMSR, move gp register to FP system register */ - return gen_M_fp_sysreg_write(s, a->reg, gpr_to_fp_sysreg, a); - } -} - -static void fp_sysreg_to_memory(DisasContext *s, void *opaque, TCGv_i32 value, - bool do_access) -{ - arg_vldr_sysreg *a = opaque; - uint32_t offset = a->imm; - TCGv_i32 addr; - - if (!a->a) { - offset = -offset; - } - - if (!do_access && !a->w) { - return; - } - - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - if (do_access) { - gen_aa32_st_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - tcg_temp_free_i32(value); - } - - if (a->w) { - /* writeback */ - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } -} - -static TCGv_i32 memory_to_fp_sysreg(DisasContext *s, void *opaque, - bool do_access) -{ - arg_vldr_sysreg *a = opaque; - uint32_t offset = a->imm; - TCGv_i32 addr; - TCGv_i32 value = NULL; - - if (!a->a) { - offset = -offset; - } - - if (!do_access && !a->w) { - return NULL; - } - - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - if (do_access) { - value = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, value, addr, get_mem_index(s), - MO_UL | MO_ALIGN | s->be_data); - } - - if (a->w) { - /* writeback */ - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - return value; -} - -static bool trans_VLDR_sysreg(DisasContext *s, arg_vldr_sysreg *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - if (a->rn == 15) { - return false; - } - return gen_M_fp_sysreg_write(s, a->reg, memory_to_fp_sysreg, a); -} - -static bool trans_VSTR_sysreg(DisasContext *s, arg_vldr_sysreg *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - if (a->rn == 15) { - return false; - } - return gen_M_fp_sysreg_read(s, a->reg, fp_sysreg_to_memory, a); -} - -static bool trans_NOCP(DisasContext *s, arg_nocp *a) -{ - /* - * Handle M-profile early check for disabled coprocessor: - * all we need to do here is emit the NOCP exception if - * the coprocessor is disabled. Otherwise we return false - * and the real VFP/etc decode will handle the insn. 
- */ - assert(arm_dc_feature(s, ARM_FEATURE_M)); - - if (a->cp == 11) { - a->cp = 10; - } - if (arm_dc_feature(s, ARM_FEATURE_V8_1M) && - (a->cp == 8 || a->cp == 9 || a->cp == 14 || a->cp == 15)) { - /* in v8.1M cp 8, 9, 14, 15 also are governed by the cp10 enable */ - a->cp = 10; - } - - if (a->cp != 10) { - gen_exception_insn(s, 0, EXCP_NOCP, syn_uncategorized()); - return true; - } - - if (s->fp_excp_el != 0) { - gen_exception_insn_el(s, 0, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - return true; - } - - return false; -} - -static bool trans_NOCP_8_1(DisasContext *s, arg_nocp *a) -{ - /* This range needs a coprocessor check for v8.1M and later only */ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - return trans_NOCP(s, a); -} diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c deleted file mode 100644 index db7ea3f..0000000 --- a/target/arm/translate-mve.c +++ /dev/null @@ -1,2310 +0,0 @@ -/* - * ARM translation: M-profile MVE instructions - * - * Copyright (c) 2021 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "exec/exec-all.h" -#include "exec/gen-icount.h" -#include "translate.h" -#include "translate-a32.h" - -static inline int vidup_imm(DisasContext *s, int x) -{ - return 1 << x; -} - -/* Include the generated decoder */ -#include "decode-mve.c.inc" - -typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenLdStSGFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenLdStIlFn(TCGv_ptr, TCGv_i32, TCGv_i32); -typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); -typedef void MVEGenTwoOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr); -typedef void MVEGenTwoOpScalarFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenTwoOpShiftFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenLongDualAccOpFn(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64); -typedef void MVEGenVADDVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenOneOpImmFn(TCGv_ptr, TCGv_ptr, TCGv_i64); -typedef void MVEGenVIDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32); -typedef void MVEGenVIWDUPFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32); -typedef void MVEGenCmpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); -typedef void MVEGenScalarCmpFn(TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenVABAVFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenDualAccOpFn(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void MVEGenVCVTRmodeFn(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); - -/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */ -static inline long mve_qreg_offset(unsigned reg) -{ - return offsetof(CPUARMState, vfp.zregs[reg].d[0]); -} - -static TCGv_ptr mve_qreg_ptr(unsigned reg) -{ - TCGv_ptr ret = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ret, 
cpu_env, mve_qreg_offset(reg)); - return ret; -} - -static bool mve_no_predication(DisasContext *s) -{ - /* - * Return true if we are executing the entire MVE instruction - * with no predication or partial-execution, and so we can safely - * use an inline TCG vector implementation. - */ - return s->eci == 0 && s->mve_no_pred; -} - -static bool mve_check_qreg_bank(DisasContext *s, int qmask) -{ - /* - * Check whether Qregs are in range. For v8.1M only Q0..Q7 - * are supported, see VFPSmallRegisterBank(). - */ - return qmask < 8; -} - -bool mve_eci_check(DisasContext *s) -{ - /* - * This is a beatwise insn: check that ECI is valid (not a - * reserved value) and note that we are handling it. - * Return true if OK, false if we generated an exception. - */ - s->eci_handled = true; - switch (s->eci) { - case ECI_NONE: - case ECI_A0: - case ECI_A0A1: - case ECI_A0A1A2: - case ECI_A0A1A2B0: - return true; - default: - /* Reserved value: INVSTATE UsageFault */ - gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); - return false; - } -} - -void mve_update_eci(DisasContext *s) -{ - /* - * The helper function will always update the CPUState field, - * so we only need to update the DisasContext field. - */ - if (s->eci) { - s->eci = (s->eci == ECI_A0A1A2B0) ? ECI_A0 : ECI_NONE; - } -} - -void mve_update_and_store_eci(DisasContext *s) -{ - /* - * For insns which don't call a helper function that will call - * mve_advance_vpt(), this version updates s->eci and also stores - * it out to the CPUState field. - */ - if (s->eci) { - mve_update_eci(s); - store_cpu_field(tcg_constant_i32(s->eci << 4), condexec_bits); - } -} - -static bool mve_skip_first_beat(DisasContext *s) -{ - /* Return true if PSR.ECI says we must skip the first beat of this insn */ - switch (s->eci) { - case ECI_NONE: - return false; - case ECI_A0: - case ECI_A0A1: - case ECI_A0A1A2: - case ECI_A0A1A2B0: - return true; - default: - g_assert_not_reached(); - } -} - -static bool do_ldst(DisasContext *s, arg_VLDR_VSTR *a, MVEGenLdStFn *fn, - unsigned msize) -{ - TCGv_i32 addr; - uint32_t offset; - TCGv_ptr qreg; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd) || - !fn) { - return false; - } - - /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ - if (a->rn == 15 || (a->rn == 13 && a->w)) { - return false; - } - - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - offset = a->imm << msize; - if (!a->a) { - offset = -offset; - } - addr = load_reg(s, a->rn); - if (a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - - qreg = mve_qreg_ptr(a->qd); - fn(cpu_env, qreg, addr); - tcg_temp_free_ptr(qreg); - - /* - * Writeback always happens after the last beat of the insn, - * regardless of predication - */ - if (a->w) { - if (!a->p) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - mve_update_eci(s); - return true; -} - -static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a) -{ - static MVEGenLdStFn * const ldstfns[4][2] = { - { gen_helper_mve_vstrb, gen_helper_mve_vldrb }, - { gen_helper_mve_vstrh, gen_helper_mve_vldrh }, - { gen_helper_mve_vstrw, gen_helper_mve_vldrw }, - { NULL, NULL } - }; - return do_ldst(s, a, ldstfns[a->size][a->l], a->size); -} - -#define DO_VLDST_WIDE_NARROW(OP, SLD, ULD, ST, MSIZE) \ - static bool trans_##OP(DisasContext *s, arg_VLDR_VSTR *a) \ - { \ - static MVEGenLdStFn * const ldstfns[2][2] = { \ - { gen_helper_mve_##ST, gen_helper_mve_##SLD }, \ - { NULL, gen_helper_mve_##ULD }, \ - }; \ 
- return do_ldst(s, a, ldstfns[a->u][a->l], MSIZE); \ - } - -DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h, MO_8) -DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w, MO_8) -DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w, MO_16) - -static bool do_ldst_sg(DisasContext *s, arg_vldst_sg *a, MVEGenLdStSGFn fn) -{ - TCGv_i32 addr; - TCGv_ptr qd, qm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd | a->qm) || - !fn || a->rn == 15) { - /* Rn case is UNPREDICTABLE */ - return false; - } - - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - addr = load_reg(s, a->rn); - - qd = mve_qreg_ptr(a->qd); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qd, qm, addr); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qm); - tcg_temp_free_i32(addr); - mve_update_eci(s); - return true; -} - -/* - * The naming scheme here is "vldrb_sg_sh == in-memory byte loads - * signextended to halfword elements in register". _os_ indicates that - * the offsets in Qm should be scaled by the element size. - */ -/* This macro is just to make the arrays more compact in these functions */ -#define F(N) gen_helper_mve_##N - -/* VLDRB/VSTRB (ie msize 1) with OS=1 is UNPREDICTABLE; we UNDEF */ -static bool trans_VLDR_S_sg(DisasContext *s, arg_vldst_sg *a) -{ - static MVEGenLdStSGFn * const fns[2][4][4] = { { - { NULL, F(vldrb_sg_sh), F(vldrb_sg_sw), NULL }, - { NULL, NULL, F(vldrh_sg_sw), NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } - }, { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, F(vldrh_sg_os_sw), NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } - } - }; - if (a->qd == a->qm) { - return false; /* UNPREDICTABLE */ - } - return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); -} - -static bool trans_VLDR_U_sg(DisasContext *s, arg_vldst_sg *a) -{ - static MVEGenLdStSGFn * const fns[2][4][4] = { { - { F(vldrb_sg_ub), F(vldrb_sg_uh), F(vldrb_sg_uw), NULL }, - { NULL, F(vldrh_sg_uh), F(vldrh_sg_uw), NULL }, - { NULL, NULL, F(vldrw_sg_uw), NULL }, - { NULL, NULL, NULL, F(vldrd_sg_ud) } - }, { - { NULL, NULL, NULL, NULL }, - { NULL, F(vldrh_sg_os_uh), F(vldrh_sg_os_uw), NULL }, - { NULL, NULL, F(vldrw_sg_os_uw), NULL }, - { NULL, NULL, NULL, F(vldrd_sg_os_ud) } - } - }; - if (a->qd == a->qm) { - return false; /* UNPREDICTABLE */ - } - return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); -} - -static bool trans_VSTR_sg(DisasContext *s, arg_vldst_sg *a) -{ - static MVEGenLdStSGFn * const fns[2][4][4] = { { - { F(vstrb_sg_ub), F(vstrb_sg_uh), F(vstrb_sg_uw), NULL }, - { NULL, F(vstrh_sg_uh), F(vstrh_sg_uw), NULL }, - { NULL, NULL, F(vstrw_sg_uw), NULL }, - { NULL, NULL, NULL, F(vstrd_sg_ud) } - }, { - { NULL, NULL, NULL, NULL }, - { NULL, F(vstrh_sg_os_uh), F(vstrh_sg_os_uw), NULL }, - { NULL, NULL, F(vstrw_sg_os_uw), NULL }, - { NULL, NULL, NULL, F(vstrd_sg_os_ud) } - } - }; - return do_ldst_sg(s, a, fns[a->os][a->msize][a->size]); -} - -#undef F - -static bool do_ldst_sg_imm(DisasContext *s, arg_vldst_sg_imm *a, - MVEGenLdStSGFn *fn, unsigned msize) -{ - uint32_t offset; - TCGv_ptr qd, qm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd | a->qm) || - !fn) { - return false; - } - - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - offset = a->imm << msize; - if (!a->a) { - offset = -offset; - } - - qd = mve_qreg_ptr(a->qd); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qd, qm, tcg_constant_i32(offset)); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qm); - mve_update_eci(s); - return 
true; -} - -static bool trans_VLDRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) -{ - static MVEGenLdStSGFn * const fns[] = { - gen_helper_mve_vldrw_sg_uw, - gen_helper_mve_vldrw_sg_wb_uw, - }; - if (a->qd == a->qm) { - return false; /* UNPREDICTABLE */ - } - return do_ldst_sg_imm(s, a, fns[a->w], MO_32); -} - -static bool trans_VLDRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) -{ - static MVEGenLdStSGFn * const fns[] = { - gen_helper_mve_vldrd_sg_ud, - gen_helper_mve_vldrd_sg_wb_ud, - }; - if (a->qd == a->qm) { - return false; /* UNPREDICTABLE */ - } - return do_ldst_sg_imm(s, a, fns[a->w], MO_64); -} - -static bool trans_VSTRW_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) -{ - static MVEGenLdStSGFn * const fns[] = { - gen_helper_mve_vstrw_sg_uw, - gen_helper_mve_vstrw_sg_wb_uw, - }; - return do_ldst_sg_imm(s, a, fns[a->w], MO_32); -} - -static bool trans_VSTRD_sg_imm(DisasContext *s, arg_vldst_sg_imm *a) -{ - static MVEGenLdStSGFn * const fns[] = { - gen_helper_mve_vstrd_sg_ud, - gen_helper_mve_vstrd_sg_wb_ud, - }; - return do_ldst_sg_imm(s, a, fns[a->w], MO_64); -} - -static bool do_vldst_il(DisasContext *s, arg_vldst_il *a, MVEGenLdStIlFn *fn, - int addrinc) -{ - TCGv_i32 rn; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd) || - !fn || (a->rn == 13 && a->w) || a->rn == 15) { - /* Variously UNPREDICTABLE or UNDEF or related-encoding */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - rn = load_reg(s, a->rn); - /* - * We pass the index of Qd, not a pointer, because the helper must - * access multiple Q registers starting at Qd and working up. - */ - fn(cpu_env, tcg_constant_i32(a->qd), rn); - - if (a->w) { - tcg_gen_addi_i32(rn, rn, addrinc); - store_reg(s, a->rn, rn); - } else { - tcg_temp_free_i32(rn); - } - mve_update_and_store_eci(s); - return true; -} - -/* This macro is just to make the arrays more compact in these functions */ -#define F(N) gen_helper_mve_##N - -static bool trans_VLD2(DisasContext *s, arg_vldst_il *a) -{ - static MVEGenLdStIlFn * const fns[4][4] = { - { F(vld20b), F(vld20h), F(vld20w), NULL, }, - { F(vld21b), F(vld21h), F(vld21w), NULL, }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - }; - if (a->qd > 6) { - return false; - } - return do_vldst_il(s, a, fns[a->pat][a->size], 32); -} - -static bool trans_VLD4(DisasContext *s, arg_vldst_il *a) -{ - static MVEGenLdStIlFn * const fns[4][4] = { - { F(vld40b), F(vld40h), F(vld40w), NULL, }, - { F(vld41b), F(vld41h), F(vld41w), NULL, }, - { F(vld42b), F(vld42h), F(vld42w), NULL, }, - { F(vld43b), F(vld43h), F(vld43w), NULL, }, - }; - if (a->qd > 4) { - return false; - } - return do_vldst_il(s, a, fns[a->pat][a->size], 64); -} - -static bool trans_VST2(DisasContext *s, arg_vldst_il *a) -{ - static MVEGenLdStIlFn * const fns[4][4] = { - { F(vst20b), F(vst20h), F(vst20w), NULL, }, - { F(vst21b), F(vst21h), F(vst21w), NULL, }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - }; - if (a->qd > 6) { - return false; - } - return do_vldst_il(s, a, fns[a->pat][a->size], 32); -} - -static bool trans_VST4(DisasContext *s, arg_vldst_il *a) -{ - static MVEGenLdStIlFn * const fns[4][4] = { - { F(vst40b), F(vst40h), F(vst40w), NULL, }, - { F(vst41b), F(vst41h), F(vst41w), NULL, }, - { F(vst42b), F(vst42h), F(vst42w), NULL, }, - { F(vst43b), F(vst43h), F(vst43w), NULL, }, - }; - if (a->qd > 4) { - return false; - } - return do_vldst_il(s, a, fns[a->pat][a->size], 64); -} - -#undef F - -static bool trans_VDUP(DisasContext *s, arg_VDUP *a) -{ 
- TCGv_ptr qd; - TCGv_i32 rt; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd)) { - return false; - } - if (a->rt == 13 || a->rt == 15) { - /* UNPREDICTABLE; we choose to UNDEF */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - rt = load_reg(s, a->rt); - if (mve_no_predication(s)) { - tcg_gen_gvec_dup_i32(a->size, mve_qreg_offset(a->qd), 16, 16, rt); - } else { - qd = mve_qreg_ptr(a->qd); - tcg_gen_dup_i32(a->size, rt, rt); - gen_helper_mve_vdup(cpu_env, qd, rt); - tcg_temp_free_ptr(qd); - } - tcg_temp_free_i32(rt); - mve_update_eci(s); - return true; -} - -static bool do_1op_vec(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn, - GVecGen2Fn vecfn) -{ - TCGv_ptr qd, qm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd | a->qm) || - !fn) { - return false; - } - - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - if (vecfn && mve_no_predication(s)) { - vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), 16, 16); - } else { - qd = mve_qreg_ptr(a->qd); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qd, qm); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qm); - } - mve_update_eci(s); - return true; -} - -static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn) -{ - return do_1op_vec(s, a, fn, NULL); -} - -#define DO_1OP_VEC(INSN, FN, VECFN) \ - static bool trans_##INSN(DisasContext *s, arg_1op *a) \ - { \ - static MVEGenOneOpFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_1op_vec(s, a, fns[a->size], VECFN); \ - } - -#define DO_1OP(INSN, FN) DO_1OP_VEC(INSN, FN, NULL) - -DO_1OP(VCLZ, vclz) -DO_1OP(VCLS, vcls) -DO_1OP_VEC(VABS, vabs, tcg_gen_gvec_abs) -DO_1OP_VEC(VNEG, vneg, tcg_gen_gvec_neg) -DO_1OP(VQABS, vqabs) -DO_1OP(VQNEG, vqneg) -DO_1OP(VMAXA, vmaxa) -DO_1OP(VMINA, vmina) - -/* - * For simple float/int conversions we use the fixed-point - * conversion helpers with a zero shift count - */ -#define DO_VCVT(INSN, HFN, SFN) \ - static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ - { \ - gen_helper_mve_##HFN(env, qd, qm, tcg_constant_i32(0)); \ - } \ - static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ - { \ - gen_helper_mve_##SFN(env, qd, qm, tcg_constant_i32(0)); \ - } \ - static bool trans_##INSN(DisasContext *s, arg_1op *a) \ - { \ - static MVEGenOneOpFn * const fns[] = { \ - NULL, \ - gen_##INSN##h, \ - gen_##INSN##s, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_1op(s, a, fns[a->size]); \ - } - -DO_VCVT(VCVT_SF, vcvt_sh, vcvt_sf) -DO_VCVT(VCVT_UF, vcvt_uh, vcvt_uf) -DO_VCVT(VCVT_FS, vcvt_hs, vcvt_fs) -DO_VCVT(VCVT_FU, vcvt_hu, vcvt_fu) - -static bool do_vcvt_rmode(DisasContext *s, arg_1op *a, - enum arm_fprounding rmode, bool u) -{ - /* - * Handle VCVT fp to int with specified rounding mode. - * This is a 1op fn but we must pass the rounding mode as - * an immediate to the helper. 
- */ - TCGv_ptr qd, qm; - static MVEGenVCVTRmodeFn * const fns[4][2] = { - { NULL, NULL }, - { gen_helper_mve_vcvt_rm_sh, gen_helper_mve_vcvt_rm_uh }, - { gen_helper_mve_vcvt_rm_ss, gen_helper_mve_vcvt_rm_us }, - { NULL, NULL }, - }; - MVEGenVCVTRmodeFn *fn = fns[a->size][u]; - - if (!dc_isar_feature(aa32_mve_fp, s) || - !mve_check_qreg_bank(s, a->qd | a->qm) || - !fn) { - return false; - } - - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qd = mve_qreg_ptr(a->qd); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qd, qm, tcg_constant_i32(arm_rmode_to_sf(rmode))); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qm); - mve_update_eci(s); - return true; -} - -#define DO_VCVT_RMODE(INSN, RMODE, U) \ - static bool trans_##INSN(DisasContext *s, arg_1op *a) \ - { \ - return do_vcvt_rmode(s, a, RMODE, U); \ - } \ - -DO_VCVT_RMODE(VCVTAS, FPROUNDING_TIEAWAY, false) -DO_VCVT_RMODE(VCVTAU, FPROUNDING_TIEAWAY, true) -DO_VCVT_RMODE(VCVTNS, FPROUNDING_TIEEVEN, false) -DO_VCVT_RMODE(VCVTNU, FPROUNDING_TIEEVEN, true) -DO_VCVT_RMODE(VCVTPS, FPROUNDING_POSINF, false) -DO_VCVT_RMODE(VCVTPU, FPROUNDING_POSINF, true) -DO_VCVT_RMODE(VCVTMS, FPROUNDING_NEGINF, false) -DO_VCVT_RMODE(VCVTMU, FPROUNDING_NEGINF, true) - -#define DO_VCVT_SH(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_1op *a) \ - { \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_1op(s, a, gen_helper_mve_##FN); \ - } \ - -DO_VCVT_SH(VCVTB_SH, vcvtb_sh) -DO_VCVT_SH(VCVTT_SH, vcvtt_sh) -DO_VCVT_SH(VCVTB_HS, vcvtb_hs) -DO_VCVT_SH(VCVTT_HS, vcvtt_hs) - -#define DO_VRINT(INSN, RMODE) \ - static void gen_##INSN##h(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ - { \ - gen_helper_mve_vrint_rm_h(env, qd, qm, \ - tcg_constant_i32(arm_rmode_to_sf(RMODE))); \ - } \ - static void gen_##INSN##s(TCGv_ptr env, TCGv_ptr qd, TCGv_ptr qm) \ - { \ - gen_helper_mve_vrint_rm_s(env, qd, qm, \ - tcg_constant_i32(arm_rmode_to_sf(RMODE))); \ - } \ - static bool trans_##INSN(DisasContext *s, arg_1op *a) \ - { \ - static MVEGenOneOpFn * const fns[] = { \ - NULL, \ - gen_##INSN##h, \ - gen_##INSN##s, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_1op(s, a, fns[a->size]); \ - } - -DO_VRINT(VRINTN, FPROUNDING_TIEEVEN) -DO_VRINT(VRINTA, FPROUNDING_TIEAWAY) -DO_VRINT(VRINTZ, FPROUNDING_ZERO) -DO_VRINT(VRINTM, FPROUNDING_NEGINF) -DO_VRINT(VRINTP, FPROUNDING_POSINF) - -static bool trans_VRINTX(DisasContext *s, arg_1op *a) -{ - static MVEGenOneOpFn * const fns[] = { - NULL, - gen_helper_mve_vrintx_h, - gen_helper_mve_vrintx_s, - NULL, - }; - if (!dc_isar_feature(aa32_mve_fp, s)) { - return false; - } - return do_1op(s, a, fns[a->size]); -} - -/* Narrowing moves: only size 0 and 1 are valid */ -#define DO_VMOVN(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_1op *a) \ - { \ - static MVEGenOneOpFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - NULL, \ - NULL, \ - }; \ - return do_1op(s, a, fns[a->size]); \ - } - -DO_VMOVN(VMOVNB, vmovnb) -DO_VMOVN(VMOVNT, vmovnt) -DO_VMOVN(VQMOVUNB, vqmovunb) -DO_VMOVN(VQMOVUNT, vqmovunt) -DO_VMOVN(VQMOVN_BS, vqmovnbs) -DO_VMOVN(VQMOVN_TS, vqmovnts) -DO_VMOVN(VQMOVN_BU, vqmovnbu) -DO_VMOVN(VQMOVN_TU, vqmovntu) - -static bool trans_VREV16(DisasContext *s, arg_1op *a) -{ - static MVEGenOneOpFn * const fns[] = { - gen_helper_mve_vrev16b, - NULL, - NULL, - NULL, - }; - return do_1op(s, a, fns[a->size]); -} - -static bool trans_VREV32(DisasContext *s, arg_1op *a) -{ - static MVEGenOneOpFn * const fns[] = { 
- gen_helper_mve_vrev32b, - gen_helper_mve_vrev32h, - NULL, - NULL, - }; - return do_1op(s, a, fns[a->size]); -} - -static bool trans_VREV64(DisasContext *s, arg_1op *a) -{ - static MVEGenOneOpFn * const fns[] = { - gen_helper_mve_vrev64b, - gen_helper_mve_vrev64h, - gen_helper_mve_vrev64w, - NULL, - }; - return do_1op(s, a, fns[a->size]); -} - -static bool trans_VMVN(DisasContext *s, arg_1op *a) -{ - return do_1op_vec(s, a, gen_helper_mve_vmvn, tcg_gen_gvec_not); -} - -static bool trans_VABS_fp(DisasContext *s, arg_1op *a) -{ - static MVEGenOneOpFn * const fns[] = { - NULL, - gen_helper_mve_vfabsh, - gen_helper_mve_vfabss, - NULL, - }; - if (!dc_isar_feature(aa32_mve_fp, s)) { - return false; - } - return do_1op(s, a, fns[a->size]); -} - -static bool trans_VNEG_fp(DisasContext *s, arg_1op *a) -{ - static MVEGenOneOpFn * const fns[] = { - NULL, - gen_helper_mve_vfnegh, - gen_helper_mve_vfnegs, - NULL, - }; - if (!dc_isar_feature(aa32_mve_fp, s)) { - return false; - } - return do_1op(s, a, fns[a->size]); -} - -static bool do_2op_vec(DisasContext *s, arg_2op *a, MVEGenTwoOpFn fn, - GVecGen3Fn *vecfn) -{ - TCGv_ptr qd, qn, qm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd | a->qn | a->qm) || - !fn) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - if (vecfn && mve_no_predication(s)) { - vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qn), - mve_qreg_offset(a->qm), 16, 16); - } else { - qd = mve_qreg_ptr(a->qd); - qn = mve_qreg_ptr(a->qn); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qd, qn, qm); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qn); - tcg_temp_free_ptr(qm); - } - mve_update_eci(s); - return true; -} - -static bool do_2op(DisasContext *s, arg_2op *a, MVEGenTwoOpFn *fn) -{ - return do_2op_vec(s, a, fn, NULL); -} - -#define DO_LOGIC(INSN, HELPER, VECFN) \ - static bool trans_##INSN(DisasContext *s, arg_2op *a) \ - { \ - return do_2op_vec(s, a, HELPER, VECFN); \ - } - -DO_LOGIC(VAND, gen_helper_mve_vand, tcg_gen_gvec_and) -DO_LOGIC(VBIC, gen_helper_mve_vbic, tcg_gen_gvec_andc) -DO_LOGIC(VORR, gen_helper_mve_vorr, tcg_gen_gvec_or) -DO_LOGIC(VORN, gen_helper_mve_vorn, tcg_gen_gvec_orc) -DO_LOGIC(VEOR, gen_helper_mve_veor, tcg_gen_gvec_xor) - -static bool trans_VPSEL(DisasContext *s, arg_2op *a) -{ - /* This insn updates predication bits */ - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - return do_2op(s, a, gen_helper_mve_vpsel); -} - -#define DO_2OP_VEC(INSN, FN, VECFN) \ - static bool trans_##INSN(DisasContext *s, arg_2op *a) \ - { \ - static MVEGenTwoOpFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_2op_vec(s, a, fns[a->size], VECFN); \ - } - -#define DO_2OP(INSN, FN) DO_2OP_VEC(INSN, FN, NULL) - -DO_2OP_VEC(VADD, vadd, tcg_gen_gvec_add) -DO_2OP_VEC(VSUB, vsub, tcg_gen_gvec_sub) -DO_2OP_VEC(VMUL, vmul, tcg_gen_gvec_mul) -DO_2OP(VMULH_S, vmulhs) -DO_2OP(VMULH_U, vmulhu) -DO_2OP(VRMULH_S, vrmulhs) -DO_2OP(VRMULH_U, vrmulhu) -DO_2OP_VEC(VMAX_S, vmaxs, tcg_gen_gvec_smax) -DO_2OP_VEC(VMAX_U, vmaxu, tcg_gen_gvec_umax) -DO_2OP_VEC(VMIN_S, vmins, tcg_gen_gvec_smin) -DO_2OP_VEC(VMIN_U, vminu, tcg_gen_gvec_umin) -DO_2OP(VABD_S, vabds) -DO_2OP(VABD_U, vabdu) -DO_2OP(VHADD_S, vhadds) -DO_2OP(VHADD_U, vhaddu) -DO_2OP(VHSUB_S, vhsubs) -DO_2OP(VHSUB_U, vhsubu) -DO_2OP(VMULL_BS, vmullbs) -DO_2OP(VMULL_BU, vmullbu) -DO_2OP(VMULL_TS, vmullts) -DO_2OP(VMULL_TU, vmulltu) -DO_2OP(VQDMULH, vqdmulh) -DO_2OP(VQRDMULH, vqrdmulh) -DO_2OP(VQADD_S, 
vqadds) -DO_2OP(VQADD_U, vqaddu) -DO_2OP(VQSUB_S, vqsubs) -DO_2OP(VQSUB_U, vqsubu) -DO_2OP(VSHL_S, vshls) -DO_2OP(VSHL_U, vshlu) -DO_2OP(VRSHL_S, vrshls) -DO_2OP(VRSHL_U, vrshlu) -DO_2OP(VQSHL_S, vqshls) -DO_2OP(VQSHL_U, vqshlu) -DO_2OP(VQRSHL_S, vqrshls) -DO_2OP(VQRSHL_U, vqrshlu) -DO_2OP(VQDMLADH, vqdmladh) -DO_2OP(VQDMLADHX, vqdmladhx) -DO_2OP(VQRDMLADH, vqrdmladh) -DO_2OP(VQRDMLADHX, vqrdmladhx) -DO_2OP(VQDMLSDH, vqdmlsdh) -DO_2OP(VQDMLSDHX, vqdmlsdhx) -DO_2OP(VQRDMLSDH, vqrdmlsdh) -DO_2OP(VQRDMLSDHX, vqrdmlsdhx) -DO_2OP(VRHADD_S, vrhadds) -DO_2OP(VRHADD_U, vrhaddu) -/* - * VCADD Qd == Qm at size MO_32 is UNPREDICTABLE; we choose not to diagnose - * so we can reuse the DO_2OP macro. (Our implementation calculates the - * "expected" results in this case.) Similarly for VHCADD. - */ -DO_2OP(VCADD90, vcadd90) -DO_2OP(VCADD270, vcadd270) -DO_2OP(VHCADD90, vhcadd90) -DO_2OP(VHCADD270, vhcadd270) - -static bool trans_VQDMULLB(DisasContext *s, arg_2op *a) -{ - static MVEGenTwoOpFn * const fns[] = { - NULL, - gen_helper_mve_vqdmullbh, - gen_helper_mve_vqdmullbw, - NULL, - }; - if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { - /* UNPREDICTABLE; we choose to undef */ - return false; - } - return do_2op(s, a, fns[a->size]); -} - -static bool trans_VQDMULLT(DisasContext *s, arg_2op *a) -{ - static MVEGenTwoOpFn * const fns[] = { - NULL, - gen_helper_mve_vqdmullth, - gen_helper_mve_vqdmulltw, - NULL, - }; - if (a->size == MO_32 && (a->qd == a->qm || a->qd == a->qn)) { - /* UNPREDICTABLE; we choose to undef */ - return false; - } - return do_2op(s, a, fns[a->size]); -} - -static bool trans_VMULLP_B(DisasContext *s, arg_2op *a) -{ - /* - * Note that a->size indicates the output size, ie VMULL.P8 - * is the 8x8->16 operation and a->size is MO_16; VMULL.P16 - * is the 16x16->32 operation and a->size is MO_32. - */ - static MVEGenTwoOpFn * const fns[] = { - NULL, - gen_helper_mve_vmullpbh, - gen_helper_mve_vmullpbw, - NULL, - }; - return do_2op(s, a, fns[a->size]); -} - -static bool trans_VMULLP_T(DisasContext *s, arg_2op *a) -{ - /* a->size is as for trans_VMULLP_B */ - static MVEGenTwoOpFn * const fns[] = { - NULL, - gen_helper_mve_vmullpth, - gen_helper_mve_vmullptw, - NULL, - }; - return do_2op(s, a, fns[a->size]); -} - -/* - * VADC and VSBC: these perform an add-with-carry or subtract-with-carry - * of the 32-bit elements in each lane of the input vectors, where the - * carry-out of each add is the carry-in of the next. The initial carry - * input is either fixed (0 for VADCI, 1 for VSBCI) or is from FPSCR.C - * (for VADC and VSBC); the carry out at the end is written back to FPSCR.C. - * These insns are subject to beat-wise execution. Partial execution - * of an I=1 (initial carry input fixed) insn which does not - * execute the first beat must start with the current FPSCR.NZCV - * value, not the fixed constant input. 
- */ -static bool trans_VADC(DisasContext *s, arg_2op *a) -{ - return do_2op(s, a, gen_helper_mve_vadc); -} - -static bool trans_VADCI(DisasContext *s, arg_2op *a) -{ - if (mve_skip_first_beat(s)) { - return trans_VADC(s, a); - } - return do_2op(s, a, gen_helper_mve_vadci); -} - -static bool trans_VSBC(DisasContext *s, arg_2op *a) -{ - return do_2op(s, a, gen_helper_mve_vsbc); -} - -static bool trans_VSBCI(DisasContext *s, arg_2op *a) -{ - if (mve_skip_first_beat(s)) { - return trans_VSBC(s, a); - } - return do_2op(s, a, gen_helper_mve_vsbci); -} - -#define DO_2OP_FP(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2op *a) \ - { \ - static MVEGenTwoOpFn * const fns[] = { \ - NULL, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##s, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_2op(s, a, fns[a->size]); \ - } - -DO_2OP_FP(VADD_fp, vfadd) -DO_2OP_FP(VSUB_fp, vfsub) -DO_2OP_FP(VMUL_fp, vfmul) -DO_2OP_FP(VABD_fp, vfabd) -DO_2OP_FP(VMAXNM, vmaxnm) -DO_2OP_FP(VMINNM, vminnm) -DO_2OP_FP(VCADD90_fp, vfcadd90) -DO_2OP_FP(VCADD270_fp, vfcadd270) -DO_2OP_FP(VFMA, vfma) -DO_2OP_FP(VFMS, vfms) -DO_2OP_FP(VCMUL0, vcmul0) -DO_2OP_FP(VCMUL90, vcmul90) -DO_2OP_FP(VCMUL180, vcmul180) -DO_2OP_FP(VCMUL270, vcmul270) -DO_2OP_FP(VCMLA0, vcmla0) -DO_2OP_FP(VCMLA90, vcmla90) -DO_2OP_FP(VCMLA180, vcmla180) -DO_2OP_FP(VCMLA270, vcmla270) -DO_2OP_FP(VMAXNMA, vmaxnma) -DO_2OP_FP(VMINNMA, vminnma) - -static bool do_2op_scalar(DisasContext *s, arg_2scalar *a, - MVEGenTwoOpScalarFn fn) -{ - TCGv_ptr qd, qn; - TCGv_i32 rm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd | a->qn) || - !fn) { - return false; - } - if (a->rm == 13 || a->rm == 15) { - /* UNPREDICTABLE */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qd = mve_qreg_ptr(a->qd); - qn = mve_qreg_ptr(a->qn); - rm = load_reg(s, a->rm); - fn(cpu_env, qd, qn, rm); - tcg_temp_free_i32(rm); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qn); - mve_update_eci(s); - return true; -} - -#define DO_2OP_SCALAR(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ - { \ - static MVEGenTwoOpScalarFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_2op_scalar(s, a, fns[a->size]); \ - } - -DO_2OP_SCALAR(VADD_scalar, vadd_scalar) -DO_2OP_SCALAR(VSUB_scalar, vsub_scalar) -DO_2OP_SCALAR(VMUL_scalar, vmul_scalar) -DO_2OP_SCALAR(VHADD_S_scalar, vhadds_scalar) -DO_2OP_SCALAR(VHADD_U_scalar, vhaddu_scalar) -DO_2OP_SCALAR(VHSUB_S_scalar, vhsubs_scalar) -DO_2OP_SCALAR(VHSUB_U_scalar, vhsubu_scalar) -DO_2OP_SCALAR(VQADD_S_scalar, vqadds_scalar) -DO_2OP_SCALAR(VQADD_U_scalar, vqaddu_scalar) -DO_2OP_SCALAR(VQSUB_S_scalar, vqsubs_scalar) -DO_2OP_SCALAR(VQSUB_U_scalar, vqsubu_scalar) -DO_2OP_SCALAR(VQDMULH_scalar, vqdmulh_scalar) -DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar) -DO_2OP_SCALAR(VBRSR, vbrsr) -DO_2OP_SCALAR(VMLA, vmla) -DO_2OP_SCALAR(VMLAS, vmlas) -DO_2OP_SCALAR(VQDMLAH, vqdmlah) -DO_2OP_SCALAR(VQRDMLAH, vqrdmlah) -DO_2OP_SCALAR(VQDMLASH, vqdmlash) -DO_2OP_SCALAR(VQRDMLASH, vqrdmlash) - -static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a) -{ - static MVEGenTwoOpScalarFn * const fns[] = { - NULL, - gen_helper_mve_vqdmullb_scalarh, - gen_helper_mve_vqdmullb_scalarw, - NULL, - }; - if (a->qd == a->qn && a->size == MO_32) { - /* UNPREDICTABLE; we choose to undef */ - return false; - } - return do_2op_scalar(s, a, fns[a->size]); -} 
- -static bool trans_VQDMULLT_scalar(DisasContext *s, arg_2scalar *a) -{ - static MVEGenTwoOpScalarFn * const fns[] = { - NULL, - gen_helper_mve_vqdmullt_scalarh, - gen_helper_mve_vqdmullt_scalarw, - NULL, - }; - if (a->qd == a->qn && a->size == MO_32) { - /* UNPREDICTABLE; we choose to undef */ - return false; - } - return do_2op_scalar(s, a, fns[a->size]); -} - - -#define DO_2OP_FP_SCALAR(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2scalar *a) \ - { \ - static MVEGenTwoOpScalarFn * const fns[] = { \ - NULL, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##s, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_2op_scalar(s, a, fns[a->size]); \ - } - -DO_2OP_FP_SCALAR(VADD_fp_scalar, vfadd_scalar) -DO_2OP_FP_SCALAR(VSUB_fp_scalar, vfsub_scalar) -DO_2OP_FP_SCALAR(VMUL_fp_scalar, vfmul_scalar) -DO_2OP_FP_SCALAR(VFMA_scalar, vfma_scalar) -DO_2OP_FP_SCALAR(VFMAS_scalar, vfmas_scalar) - -static bool do_long_dual_acc(DisasContext *s, arg_vmlaldav *a, - MVEGenLongDualAccOpFn *fn) -{ - TCGv_ptr qn, qm; - TCGv_i64 rda; - TCGv_i32 rdalo, rdahi; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qn | a->qm) || - !fn) { - return false; - } - /* - * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related - * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. - */ - if (a->rdahi == 13 || a->rdahi == 15) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qn = mve_qreg_ptr(a->qn); - qm = mve_qreg_ptr(a->qm); - - /* - * This insn is subject to beat-wise execution. Partial execution - * of an A=0 (no-accumulate) insn which does not execute the first - * beat must start with the current rda value, not 0. - */ - if (a->a || mve_skip_first_beat(s)) { - rda = tcg_temp_new_i64(); - rdalo = load_reg(s, a->rdalo); - rdahi = load_reg(s, a->rdahi); - tcg_gen_concat_i32_i64(rda, rdalo, rdahi); - tcg_temp_free_i32(rdalo); - tcg_temp_free_i32(rdahi); - } else { - rda = tcg_const_i64(0); - } - - fn(rda, cpu_env, qn, qm, rda); - tcg_temp_free_ptr(qn); - tcg_temp_free_ptr(qm); - - rdalo = tcg_temp_new_i32(); - rdahi = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(rdalo, rda); - tcg_gen_extrh_i64_i32(rdahi, rda); - store_reg(s, a->rdalo, rdalo); - store_reg(s, a->rdahi, rdahi); - tcg_temp_free_i64(rda); - mve_update_eci(s); - return true; -} - -static bool trans_VMLALDAV_S(DisasContext *s, arg_vmlaldav *a) -{ - static MVEGenLongDualAccOpFn * const fns[4][2] = { - { NULL, NULL }, - { gen_helper_mve_vmlaldavsh, gen_helper_mve_vmlaldavxsh }, - { gen_helper_mve_vmlaldavsw, gen_helper_mve_vmlaldavxsw }, - { NULL, NULL }, - }; - return do_long_dual_acc(s, a, fns[a->size][a->x]); -} - -static bool trans_VMLALDAV_U(DisasContext *s, arg_vmlaldav *a) -{ - static MVEGenLongDualAccOpFn * const fns[4][2] = { - { NULL, NULL }, - { gen_helper_mve_vmlaldavuh, NULL }, - { gen_helper_mve_vmlaldavuw, NULL }, - { NULL, NULL }, - }; - return do_long_dual_acc(s, a, fns[a->size][a->x]); -} - -static bool trans_VMLSLDAV(DisasContext *s, arg_vmlaldav *a) -{ - static MVEGenLongDualAccOpFn * const fns[4][2] = { - { NULL, NULL }, - { gen_helper_mve_vmlsldavsh, gen_helper_mve_vmlsldavxsh }, - { gen_helper_mve_vmlsldavsw, gen_helper_mve_vmlsldavxsw }, - { NULL, NULL }, - }; - return do_long_dual_acc(s, a, fns[a->size][a->x]); -} - -static bool trans_VRMLALDAVH_S(DisasContext *s, arg_vmlaldav *a) -{ - static MVEGenLongDualAccOpFn * const fns[] = { - gen_helper_mve_vrmlaldavhsw, gen_helper_mve_vrmlaldavhxsw, - }; - 
return do_long_dual_acc(s, a, fns[a->x]); -} - -static bool trans_VRMLALDAVH_U(DisasContext *s, arg_vmlaldav *a) -{ - static MVEGenLongDualAccOpFn * const fns[] = { - gen_helper_mve_vrmlaldavhuw, NULL, - }; - return do_long_dual_acc(s, a, fns[a->x]); -} - -static bool trans_VRMLSLDAVH(DisasContext *s, arg_vmlaldav *a) -{ - static MVEGenLongDualAccOpFn * const fns[] = { - gen_helper_mve_vrmlsldavhsw, gen_helper_mve_vrmlsldavhxsw, - }; - return do_long_dual_acc(s, a, fns[a->x]); -} - -static bool do_dual_acc(DisasContext *s, arg_vmladav *a, MVEGenDualAccOpFn *fn) -{ - TCGv_ptr qn, qm; - TCGv_i32 rda; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qn) || - !fn) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qn = mve_qreg_ptr(a->qn); - qm = mve_qreg_ptr(a->qm); - - /* - * This insn is subject to beat-wise execution. Partial execution - * of an A=0 (no-accumulate) insn which does not execute the first - * beat must start with the current rda value, not 0. - */ - if (a->a || mve_skip_first_beat(s)) { - rda = load_reg(s, a->rda); - } else { - rda = tcg_const_i32(0); - } - - fn(rda, cpu_env, qn, qm, rda); - store_reg(s, a->rda, rda); - tcg_temp_free_ptr(qn); - tcg_temp_free_ptr(qm); - - mve_update_eci(s); - return true; -} - -#define DO_DUAL_ACC(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_vmladav *a) \ - { \ - static MVEGenDualAccOpFn * const fns[4][2] = { \ - { gen_helper_mve_##FN##b, gen_helper_mve_##FN##xb }, \ - { gen_helper_mve_##FN##h, gen_helper_mve_##FN##xh }, \ - { gen_helper_mve_##FN##w, gen_helper_mve_##FN##xw }, \ - { NULL, NULL }, \ - }; \ - return do_dual_acc(s, a, fns[a->size][a->x]); \ - } - -DO_DUAL_ACC(VMLADAV_S, vmladavs) -DO_DUAL_ACC(VMLSDAV, vmlsdav) - -static bool trans_VMLADAV_U(DisasContext *s, arg_vmladav *a) -{ - static MVEGenDualAccOpFn * const fns[4][2] = { - { gen_helper_mve_vmladavub, NULL }, - { gen_helper_mve_vmladavuh, NULL }, - { gen_helper_mve_vmladavuw, NULL }, - { NULL, NULL }, - }; - return do_dual_acc(s, a, fns[a->size][a->x]); -} - -static void gen_vpst(DisasContext *s, uint32_t mask) -{ - /* - * Set the VPR mask fields. We take advantage of MASK01 and MASK23 - * being adjacent fields in the register. - * - * Updating the masks is not predicated, but it is subject to beat-wise - * execution, and the mask is updated on the odd-numbered beats. - * So if PSR.ECI says we should skip beat 1, we mustn't update the - * 01 mask field. - */ - TCGv_i32 vpr = load_cpu_field(v7m.vpr); - switch (s->eci) { - case ECI_NONE: - case ECI_A0: - /* Update both 01 and 23 fields */ - tcg_gen_deposit_i32(vpr, vpr, - tcg_constant_i32(mask | (mask << 4)), - R_V7M_VPR_MASK01_SHIFT, - R_V7M_VPR_MASK01_LENGTH + R_V7M_VPR_MASK23_LENGTH); - break; - case ECI_A0A1: - case ECI_A0A1A2: - case ECI_A0A1A2B0: - /* Update only the 23 mask field */ - tcg_gen_deposit_i32(vpr, vpr, - tcg_constant_i32(mask), - R_V7M_VPR_MASK23_SHIFT, R_V7M_VPR_MASK23_LENGTH); - break; - default: - g_assert_not_reached(); - } - store_cpu_field(vpr, v7m.vpr); -} - -static bool trans_VPST(DisasContext *s, arg_VPST *a) -{ - /* mask == 0 is a "related encoding" */ - if (!dc_isar_feature(aa32_mve, s) || !a->mask) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - gen_vpst(s, a->mask); - mve_update_and_store_eci(s); - return true; -} - -static bool trans_VPNOT(DisasContext *s, arg_VPNOT *a) -{ - /* - * Invert the predicate in VPR.P0. 
We have call out to - * a helper because this insn itself is beatwise and can - * be predicated. - */ - if (!dc_isar_feature(aa32_mve, s)) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - gen_helper_mve_vpnot(cpu_env); - /* This insn updates predication bits */ - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - mve_update_eci(s); - return true; -} - -static bool trans_VADDV(DisasContext *s, arg_VADDV *a) -{ - /* VADDV: vector add across vector */ - static MVEGenVADDVFn * const fns[4][2] = { - { gen_helper_mve_vaddvsb, gen_helper_mve_vaddvub }, - { gen_helper_mve_vaddvsh, gen_helper_mve_vaddvuh }, - { gen_helper_mve_vaddvsw, gen_helper_mve_vaddvuw }, - { NULL, NULL } - }; - TCGv_ptr qm; - TCGv_i32 rda; - - if (!dc_isar_feature(aa32_mve, s) || - a->size == 3) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - /* - * This insn is subject to beat-wise execution. Partial execution - * of an A=0 (no-accumulate) insn which does not execute the first - * beat must start with the current value of Rda, not zero. - */ - if (a->a || mve_skip_first_beat(s)) { - /* Accumulate input from Rda */ - rda = load_reg(s, a->rda); - } else { - /* Accumulate starting at zero */ - rda = tcg_const_i32(0); - } - - qm = mve_qreg_ptr(a->qm); - fns[a->size][a->u](rda, cpu_env, qm, rda); - store_reg(s, a->rda, rda); - tcg_temp_free_ptr(qm); - - mve_update_eci(s); - return true; -} - -static bool trans_VADDLV(DisasContext *s, arg_VADDLV *a) -{ - /* - * Vector Add Long Across Vector: accumulate the 32-bit - * elements of the vector into a 64-bit result stored in - * a pair of general-purpose registers. - * No need to check Qm's bank: it is only 3 bits in decode. - */ - TCGv_ptr qm; - TCGv_i64 rda; - TCGv_i32 rdalo, rdahi; - - if (!dc_isar_feature(aa32_mve, s)) { - return false; - } - /* - * rdahi == 13 is UNPREDICTABLE; rdahi == 15 is a related - * encoding; rdalo always has bit 0 clear so cannot be 13 or 15. - */ - if (a->rdahi == 13 || a->rdahi == 15) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - /* - * This insn is subject to beat-wise execution. Partial execution - * of an A=0 (no-accumulate) insn which does not execute the first - * beat must start with the current value of RdaHi:RdaLo, not zero. 
- */ - if (a->a || mve_skip_first_beat(s)) { - /* Accumulate input from RdaHi:RdaLo */ - rda = tcg_temp_new_i64(); - rdalo = load_reg(s, a->rdalo); - rdahi = load_reg(s, a->rdahi); - tcg_gen_concat_i32_i64(rda, rdalo, rdahi); - tcg_temp_free_i32(rdalo); - tcg_temp_free_i32(rdahi); - } else { - /* Accumulate starting at zero */ - rda = tcg_const_i64(0); - } - - qm = mve_qreg_ptr(a->qm); - if (a->u) { - gen_helper_mve_vaddlv_u(rda, cpu_env, qm, rda); - } else { - gen_helper_mve_vaddlv_s(rda, cpu_env, qm, rda); - } - tcg_temp_free_ptr(qm); - - rdalo = tcg_temp_new_i32(); - rdahi = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(rdalo, rda); - tcg_gen_extrh_i64_i32(rdahi, rda); - store_reg(s, a->rdalo, rdalo); - store_reg(s, a->rdahi, rdahi); - tcg_temp_free_i64(rda); - mve_update_eci(s); - return true; -} - -static bool do_1imm(DisasContext *s, arg_1imm *a, MVEGenOneOpImmFn *fn, - GVecGen2iFn *vecfn) -{ - TCGv_ptr qd; - uint64_t imm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd) || - !fn) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - imm = asimd_imm_const(a->imm, a->cmode, a->op); - - if (vecfn && mve_no_predication(s)) { - vecfn(MO_64, mve_qreg_offset(a->qd), mve_qreg_offset(a->qd), - imm, 16, 16); - } else { - qd = mve_qreg_ptr(a->qd); - fn(cpu_env, qd, tcg_constant_i64(imm)); - tcg_temp_free_ptr(qd); - } - mve_update_eci(s); - return true; -} - -static void gen_gvec_vmovi(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t c, uint32_t oprsz, uint32_t maxsz) -{ - tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, c); -} - -static bool trans_Vimm_1r(DisasContext *s, arg_1imm *a) -{ - /* Handle decode of cmode/op here between VORR/VBIC/VMOV */ - MVEGenOneOpImmFn *fn; - GVecGen2iFn *vecfn; - - if ((a->cmode & 1) && a->cmode < 12) { - if (a->op) { - /* - * For op=1, the immediate will be inverted by asimd_imm_const(), - * so the VBIC becomes a logical AND operation. - */ - fn = gen_helper_mve_vandi; - vecfn = tcg_gen_gvec_andi; - } else { - fn = gen_helper_mve_vorri; - vecfn = tcg_gen_gvec_ori; - } - } else { - /* There is one unallocated cmode/op combination in this space */ - if (a->cmode == 15 && a->op == 1) { - return false; - } - /* asimd_imm_const() sorts out VMVNI vs VMOVI for us */ - fn = gen_helper_mve_vmovi; - vecfn = gen_gvec_vmovi; - } - return do_1imm(s, a, fn, vecfn); -} - -static bool do_2shift_vec(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn, - bool negateshift, GVecGen2iFn vecfn) -{ - TCGv_ptr qd, qm; - int shift = a->shift; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qd | a->qm) || - !fn) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - /* - * When we handle a right shift insn using a left-shift helper - * which permits a negative shift count to indicate a right-shift, - * we must negate the shift count. 
- */ - if (negateshift) { - shift = -shift; - } - - if (vecfn && mve_no_predication(s)) { - vecfn(a->size, mve_qreg_offset(a->qd), mve_qreg_offset(a->qm), - shift, 16, 16); - } else { - qd = mve_qreg_ptr(a->qd); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qd, qm, tcg_constant_i32(shift)); - tcg_temp_free_ptr(qd); - tcg_temp_free_ptr(qm); - } - mve_update_eci(s); - return true; -} - -static bool do_2shift(DisasContext *s, arg_2shift *a, MVEGenTwoOpShiftFn fn, - bool negateshift) -{ - return do_2shift_vec(s, a, fn, negateshift, NULL); -} - -#define DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, VECFN) \ - static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ - { \ - static MVEGenTwoOpShiftFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_2shift_vec(s, a, fns[a->size], NEGATESHIFT, VECFN); \ - } - -#define DO_2SHIFT(INSN, FN, NEGATESHIFT) \ - DO_2SHIFT_VEC(INSN, FN, NEGATESHIFT, NULL) - -static void do_gvec_shri_s(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - /* - * We get here with a negated shift count, and we must handle - * shifts by the element size, which tcg_gen_gvec_sari() does not do. - */ - shift = -shift; - if (shift == (8 << vece)) { - shift--; - } - tcg_gen_gvec_sari(vece, dofs, aofs, shift, oprsz, maxsz); -} - -static void do_gvec_shri_u(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - /* - * We get here with a negated shift count, and we must handle - * shifts by the element size, which tcg_gen_gvec_shri() does not do. - */ - shift = -shift; - if (shift == (8 << vece)) { - tcg_gen_gvec_dup_imm(vece, dofs, oprsz, maxsz, 0); - } else { - tcg_gen_gvec_shri(vece, dofs, aofs, shift, oprsz, maxsz); - } -} - -DO_2SHIFT_VEC(VSHLI, vshli_u, false, tcg_gen_gvec_shli) -DO_2SHIFT(VQSHLI_S, vqshli_s, false) -DO_2SHIFT(VQSHLI_U, vqshli_u, false) -DO_2SHIFT(VQSHLUI, vqshlui_s, false) -/* These right shifts use a left-shift helper with negated shift count */ -DO_2SHIFT_VEC(VSHRI_S, vshli_s, true, do_gvec_shri_s) -DO_2SHIFT_VEC(VSHRI_U, vshli_u, true, do_gvec_shri_u) -DO_2SHIFT(VRSHRI_S, vrshli_s, true) -DO_2SHIFT(VRSHRI_U, vrshli_u, true) - -DO_2SHIFT_VEC(VSRI, vsri, false, gen_gvec_sri) -DO_2SHIFT_VEC(VSLI, vsli, false, gen_gvec_sli) - -#define DO_2SHIFT_FP(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ - { \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_2shift(s, a, gen_helper_mve_##FN, false); \ - } - -DO_2SHIFT_FP(VCVT_SH_fixed, vcvt_sh) -DO_2SHIFT_FP(VCVT_UH_fixed, vcvt_uh) -DO_2SHIFT_FP(VCVT_HS_fixed, vcvt_hs) -DO_2SHIFT_FP(VCVT_HU_fixed, vcvt_hu) -DO_2SHIFT_FP(VCVT_SF_fixed, vcvt_sf) -DO_2SHIFT_FP(VCVT_UF_fixed, vcvt_uf) -DO_2SHIFT_FP(VCVT_FS_fixed, vcvt_fs) -DO_2SHIFT_FP(VCVT_FU_fixed, vcvt_fu) - -static bool do_2shift_scalar(DisasContext *s, arg_shl_scalar *a, - MVEGenTwoOpShiftFn *fn) -{ - TCGv_ptr qda; - TCGv_i32 rm; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qda) || - a->rm == 13 || a->rm == 15 || !fn) { - /* Rm cases are UNPREDICTABLE */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qda = mve_qreg_ptr(a->qda); - rm = load_reg(s, a->rm); - fn(cpu_env, qda, qda, rm); - tcg_temp_free_ptr(qda); - tcg_temp_free_i32(rm); - mve_update_eci(s); - return true; -} - -#define DO_2SHIFT_SCALAR(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_shl_scalar *a) \ - { \ - 
static MVEGenTwoOpShiftFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_2shift_scalar(s, a, fns[a->size]); \ - } - -DO_2SHIFT_SCALAR(VSHL_S_scalar, vshli_s) -DO_2SHIFT_SCALAR(VSHL_U_scalar, vshli_u) -DO_2SHIFT_SCALAR(VRSHL_S_scalar, vrshli_s) -DO_2SHIFT_SCALAR(VRSHL_U_scalar, vrshli_u) -DO_2SHIFT_SCALAR(VQSHL_S_scalar, vqshli_s) -DO_2SHIFT_SCALAR(VQSHL_U_scalar, vqshli_u) -DO_2SHIFT_SCALAR(VQRSHL_S_scalar, vqrshli_s) -DO_2SHIFT_SCALAR(VQRSHL_U_scalar, vqrshli_u) - -#define DO_VSHLL(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ - { \ - static MVEGenTwoOpShiftFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - }; \ - return do_2shift_vec(s, a, fns[a->size], false, do_gvec_##FN); \ - } - -/* - * For the VSHLL vector helpers, the vece is the size of the input - * (ie MO_8 or MO_16); the helpers want to work in the output size. - * The shift count can be 0.., inclusive. (0 is VMOVL.) - */ -static void do_gvec_vshllbs(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - unsigned ovece = vece + 1; - unsigned ibits = vece == MO_8 ? 8 : 16; - tcg_gen_gvec_shli(ovece, dofs, aofs, ibits, oprsz, maxsz); - tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); -} - -static void do_gvec_vshllbu(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - unsigned ovece = vece + 1; - tcg_gen_gvec_andi(ovece, dofs, aofs, - ovece == MO_16 ? 0xff : 0xffff, oprsz, maxsz); - tcg_gen_gvec_shli(ovece, dofs, dofs, shift, oprsz, maxsz); -} - -static void do_gvec_vshllts(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - unsigned ovece = vece + 1; - unsigned ibits = vece == MO_8 ? 8 : 16; - if (shift == 0) { - tcg_gen_gvec_sari(ovece, dofs, aofs, ibits, oprsz, maxsz); - } else { - tcg_gen_gvec_andi(ovece, dofs, aofs, - ovece == MO_16 ? 0xff00 : 0xffff0000, oprsz, maxsz); - tcg_gen_gvec_sari(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); - } -} - -static void do_gvec_vshlltu(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - unsigned ovece = vece + 1; - unsigned ibits = vece == MO_8 ? 8 : 16; - if (shift == 0) { - tcg_gen_gvec_shri(ovece, dofs, aofs, ibits, oprsz, maxsz); - } else { - tcg_gen_gvec_andi(ovece, dofs, aofs, - ovece == MO_16 ? 
0xff00 : 0xffff0000, oprsz, maxsz); - tcg_gen_gvec_shri(ovece, dofs, dofs, ibits - shift, oprsz, maxsz); - } -} - -DO_VSHLL(VSHLL_BS, vshllbs) -DO_VSHLL(VSHLL_BU, vshllbu) -DO_VSHLL(VSHLL_TS, vshllts) -DO_VSHLL(VSHLL_TU, vshlltu) - -#define DO_2SHIFT_N(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2shift *a) \ - { \ - static MVEGenTwoOpShiftFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - }; \ - return do_2shift(s, a, fns[a->size], false); \ - } - -DO_2SHIFT_N(VSHRNB, vshrnb) -DO_2SHIFT_N(VSHRNT, vshrnt) -DO_2SHIFT_N(VRSHRNB, vrshrnb) -DO_2SHIFT_N(VRSHRNT, vrshrnt) -DO_2SHIFT_N(VQSHRNB_S, vqshrnb_s) -DO_2SHIFT_N(VQSHRNT_S, vqshrnt_s) -DO_2SHIFT_N(VQSHRNB_U, vqshrnb_u) -DO_2SHIFT_N(VQSHRNT_U, vqshrnt_u) -DO_2SHIFT_N(VQSHRUNB, vqshrunb) -DO_2SHIFT_N(VQSHRUNT, vqshrunt) -DO_2SHIFT_N(VQRSHRNB_S, vqrshrnb_s) -DO_2SHIFT_N(VQRSHRNT_S, vqrshrnt_s) -DO_2SHIFT_N(VQRSHRNB_U, vqrshrnb_u) -DO_2SHIFT_N(VQRSHRNT_U, vqrshrnt_u) -DO_2SHIFT_N(VQRSHRUNB, vqrshrunb) -DO_2SHIFT_N(VQRSHRUNT, vqrshrunt) - -static bool trans_VSHLC(DisasContext *s, arg_VSHLC *a) -{ - /* - * Whole Vector Left Shift with Carry. The carry is taken - * from a general purpose register and written back there. - * An imm of 0 means "shift by 32". - */ - TCGv_ptr qd; - TCGv_i32 rdm; - - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) { - return false; - } - if (a->rdm == 13 || a->rdm == 15) { - /* CONSTRAINED UNPREDICTABLE: we UNDEF */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qd = mve_qreg_ptr(a->qd); - rdm = load_reg(s, a->rdm); - gen_helper_mve_vshlc(rdm, cpu_env, qd, rdm, tcg_constant_i32(a->imm)); - store_reg(s, a->rdm, rdm); - tcg_temp_free_ptr(qd); - mve_update_eci(s); - return true; -} - -static bool do_vidup(DisasContext *s, arg_vidup *a, MVEGenVIDUPFn *fn) -{ - TCGv_ptr qd; - TCGv_i32 rn; - - /* - * Vector increment/decrement with wrap and duplicate (VIDUP, VDDUP). - * This fills the vector with elements of successively increasing - * or decreasing values, starting from Rn. - */ - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) { - return false; - } - if (a->size == MO_64) { - /* size 0b11 is another encoding */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qd = mve_qreg_ptr(a->qd); - rn = load_reg(s, a->rn); - fn(rn, cpu_env, qd, rn, tcg_constant_i32(a->imm)); - store_reg(s, a->rn, rn); - tcg_temp_free_ptr(qd); - mve_update_eci(s); - return true; -} - -static bool do_viwdup(DisasContext *s, arg_viwdup *a, MVEGenVIWDUPFn *fn) -{ - TCGv_ptr qd; - TCGv_i32 rn, rm; - - /* - * Vector increment/decrement with wrap and duplicate (VIWDUp, VDWDUP) - * This fills the vector with elements of successively increasing - * or decreasing values, starting from Rn. Rm specifies a point where - * the count wraps back around to 0. The updated offset is written back - * to Rn. - */ - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd)) { - return false; - } - if (!fn || a->rm == 13 || a->rm == 15) { - /* - * size 0b11 is another encoding; Rm == 13 is UNPREDICTABLE; - * Rm == 13 is VIWDUP, VDWDUP. 
- */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qd = mve_qreg_ptr(a->qd); - rn = load_reg(s, a->rn); - rm = load_reg(s, a->rm); - fn(rn, cpu_env, qd, rn, rm, tcg_constant_i32(a->imm)); - store_reg(s, a->rn, rn); - tcg_temp_free_ptr(qd); - tcg_temp_free_i32(rm); - mve_update_eci(s); - return true; -} - -static bool trans_VIDUP(DisasContext *s, arg_vidup *a) -{ - static MVEGenVIDUPFn * const fns[] = { - gen_helper_mve_vidupb, - gen_helper_mve_viduph, - gen_helper_mve_vidupw, - NULL, - }; - return do_vidup(s, a, fns[a->size]); -} - -static bool trans_VDDUP(DisasContext *s, arg_vidup *a) -{ - static MVEGenVIDUPFn * const fns[] = { - gen_helper_mve_vidupb, - gen_helper_mve_viduph, - gen_helper_mve_vidupw, - NULL, - }; - /* VDDUP is just like VIDUP but with a negative immediate */ - a->imm = -a->imm; - return do_vidup(s, a, fns[a->size]); -} - -static bool trans_VIWDUP(DisasContext *s, arg_viwdup *a) -{ - static MVEGenVIWDUPFn * const fns[] = { - gen_helper_mve_viwdupb, - gen_helper_mve_viwduph, - gen_helper_mve_viwdupw, - NULL, - }; - return do_viwdup(s, a, fns[a->size]); -} - -static bool trans_VDWDUP(DisasContext *s, arg_viwdup *a) -{ - static MVEGenVIWDUPFn * const fns[] = { - gen_helper_mve_vdwdupb, - gen_helper_mve_vdwduph, - gen_helper_mve_vdwdupw, - NULL, - }; - return do_viwdup(s, a, fns[a->size]); -} - -static bool do_vcmp(DisasContext *s, arg_vcmp *a, MVEGenCmpFn *fn) -{ - TCGv_ptr qn, qm; - - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) || - !fn) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qn = mve_qreg_ptr(a->qn); - qm = mve_qreg_ptr(a->qm); - fn(cpu_env, qn, qm); - tcg_temp_free_ptr(qn); - tcg_temp_free_ptr(qm); - if (a->mask) { - /* VPT */ - gen_vpst(s, a->mask); - } - /* This insn updates predication bits */ - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - mve_update_eci(s); - return true; -} - -static bool do_vcmp_scalar(DisasContext *s, arg_vcmp_scalar *a, - MVEGenScalarCmpFn *fn) -{ - TCGv_ptr qn; - TCGv_i32 rm; - - if (!dc_isar_feature(aa32_mve, s) || !fn || a->rm == 13) { - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qn = mve_qreg_ptr(a->qn); - if (a->rm == 15) { - /* Encoding Rm=0b1111 means "constant zero" */ - rm = tcg_constant_i32(0); - } else { - rm = load_reg(s, a->rm); - } - fn(cpu_env, qn, rm); - tcg_temp_free_ptr(qn); - tcg_temp_free_i32(rm); - if (a->mask) { - /* VPT */ - gen_vpst(s, a->mask); - } - /* This insn updates predication bits */ - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - mve_update_eci(s); - return true; -} - -#define DO_VCMP(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \ - { \ - static MVEGenCmpFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_vcmp(s, a, fns[a->size]); \ - } \ - static bool trans_##INSN##_scalar(DisasContext *s, \ - arg_vcmp_scalar *a) \ - { \ - static MVEGenScalarCmpFn * const fns[] = { \ - gen_helper_mve_##FN##_scalarb, \ - gen_helper_mve_##FN##_scalarh, \ - gen_helper_mve_##FN##_scalarw, \ - NULL, \ - }; \ - return do_vcmp_scalar(s, a, fns[a->size]); \ - } - -DO_VCMP(VCMPEQ, vcmpeq) -DO_VCMP(VCMPNE, vcmpne) -DO_VCMP(VCMPCS, vcmpcs) -DO_VCMP(VCMPHI, vcmphi) -DO_VCMP(VCMPGE, vcmpge) -DO_VCMP(VCMPLT, vcmplt) -DO_VCMP(VCMPGT, vcmpgt) -DO_VCMP(VCMPLE, vcmple) - -#define DO_VCMP_FP(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_vcmp *a) \ - { \ - static 
MVEGenCmpFn * const fns[] = { \ - NULL, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##s, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_vcmp(s, a, fns[a->size]); \ - } \ - static bool trans_##INSN##_scalar(DisasContext *s, \ - arg_vcmp_scalar *a) \ - { \ - static MVEGenScalarCmpFn * const fns[] = { \ - NULL, \ - gen_helper_mve_##FN##_scalarh, \ - gen_helper_mve_##FN##_scalars, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_vcmp_scalar(s, a, fns[a->size]); \ - } - -DO_VCMP_FP(VCMPEQ_fp, vfcmpeq) -DO_VCMP_FP(VCMPNE_fp, vfcmpne) -DO_VCMP_FP(VCMPGE_fp, vfcmpge) -DO_VCMP_FP(VCMPLT_fp, vfcmplt) -DO_VCMP_FP(VCMPGT_fp, vfcmpgt) -DO_VCMP_FP(VCMPLE_fp, vfcmple) - -static bool do_vmaxv(DisasContext *s, arg_vmaxv *a, MVEGenVADDVFn fn) -{ - /* - * MIN/MAX operations across a vector: compute the min or - * max of the initial value in a general purpose register - * and all the elements in the vector, and store it back - * into the general purpose register. - */ - TCGv_ptr qm; - TCGv_i32 rda; - - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qm) || - !fn || a->rda == 13 || a->rda == 15) { - /* Rda cases are UNPREDICTABLE */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qm = mve_qreg_ptr(a->qm); - rda = load_reg(s, a->rda); - fn(rda, cpu_env, qm, rda); - store_reg(s, a->rda, rda); - tcg_temp_free_ptr(qm); - mve_update_eci(s); - return true; -} - -#define DO_VMAXV(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \ - { \ - static MVEGenVADDVFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_vmaxv(s, a, fns[a->size]); \ - } - -DO_VMAXV(VMAXV_S, vmaxvs) -DO_VMAXV(VMAXV_U, vmaxvu) -DO_VMAXV(VMAXAV, vmaxav) -DO_VMAXV(VMINV_S, vminvs) -DO_VMAXV(VMINV_U, vminvu) -DO_VMAXV(VMINAV, vminav) - -#define DO_VMAXV_FP(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_vmaxv *a) \ - { \ - static MVEGenVADDVFn * const fns[] = { \ - NULL, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##s, \ - NULL, \ - }; \ - if (!dc_isar_feature(aa32_mve_fp, s)) { \ - return false; \ - } \ - return do_vmaxv(s, a, fns[a->size]); \ - } - -DO_VMAXV_FP(VMAXNMV, vmaxnmv) -DO_VMAXV_FP(VMINNMV, vminnmv) -DO_VMAXV_FP(VMAXNMAV, vmaxnmav) -DO_VMAXV_FP(VMINNMAV, vminnmav) - -static bool do_vabav(DisasContext *s, arg_vabav *a, MVEGenVABAVFn *fn) -{ - /* Absolute difference accumulated across vector */ - TCGv_ptr qn, qm; - TCGv_i32 rda; - - if (!dc_isar_feature(aa32_mve, s) || - !mve_check_qreg_bank(s, a->qm | a->qn) || - !fn || a->rda == 13 || a->rda == 15) { - /* Rda cases are UNPREDICTABLE */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - qm = mve_qreg_ptr(a->qm); - qn = mve_qreg_ptr(a->qn); - rda = load_reg(s, a->rda); - fn(rda, cpu_env, qn, qm, rda); - store_reg(s, a->rda, rda); - tcg_temp_free_ptr(qm); - tcg_temp_free_ptr(qn); - mve_update_eci(s); - return true; -} - -#define DO_VABAV(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_vabav *a) \ - { \ - static MVEGenVABAVFn * const fns[] = { \ - gen_helper_mve_##FN##b, \ - gen_helper_mve_##FN##h, \ - gen_helper_mve_##FN##w, \ - NULL, \ - }; \ - return do_vabav(s, a, fns[a->size]); \ - } - -DO_VABAV(VABAV_S, vabavs) -DO_VABAV(VABAV_U, vabavu) - -static bool trans_VMOV_to_2gp(DisasContext *s, arg_VMOV_to_2gp *a) -{ - /* - * VMOV two 32-bit vector lanes 
to two general-purpose registers. - * This insn is not predicated but it is subject to beat-wise - * execution if it is not in an IT block. For us this means - * only that if PSR.ECI says we should not be executing the beat - * corresponding to the lane of the vector register being accessed - * then we should skip perfoming the move, and that we need to do - * the usual check for bad ECI state and advance of ECI state. - * (If PSR.ECI is non-zero then we cannot be in an IT block.) - */ - TCGv_i32 tmp; - int vd; - - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) || - a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15 || - a->rt == a->rt2) { - /* Rt/Rt2 cases are UNPREDICTABLE */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - /* Convert Qreg index to Dreg for read_neon_element32() etc */ - vd = a->qd * 2; - - if (!mve_skip_vmov(s, vd, a->idx, MO_32)) { - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, vd, a->idx, MO_32); - store_reg(s, a->rt, tmp); - } - if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) { - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, vd + 1, a->idx, MO_32); - store_reg(s, a->rt2, tmp); - } - - mve_update_and_store_eci(s); - return true; -} - -static bool trans_VMOV_from_2gp(DisasContext *s, arg_VMOV_to_2gp *a) -{ - /* - * VMOV two general-purpose registers to two 32-bit vector lanes. - * This insn is not predicated but it is subject to beat-wise - * execution if it is not in an IT block. For us this means - * only that if PSR.ECI says we should not be executing the beat - * corresponding to the lane of the vector register being accessed - * then we should skip perfoming the move, and that we need to do - * the usual check for bad ECI state and advance of ECI state. - * (If PSR.ECI is non-zero then we cannot be in an IT block.) - */ - TCGv_i32 tmp; - int vd; - - if (!dc_isar_feature(aa32_mve, s) || !mve_check_qreg_bank(s, a->qd) || - a->rt == 13 || a->rt == 15 || a->rt2 == 13 || a->rt2 == 15) { - /* Rt/Rt2 cases are UNPREDICTABLE */ - return false; - } - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - /* Convert Qreg idx to Dreg for read_neon_element32() etc */ - vd = a->qd * 2; - - if (!mve_skip_vmov(s, vd, a->idx, MO_32)) { - tmp = load_reg(s, a->rt); - write_neon_element32(tmp, vd, a->idx, MO_32); - tcg_temp_free_i32(tmp); - } - if (!mve_skip_vmov(s, vd + 1, a->idx, MO_32)) { - tmp = load_reg(s, a->rt2); - write_neon_element32(tmp, vd + 1, a->idx, MO_32); - tcg_temp_free_i32(tmp); - } - - mve_update_and_store_eci(s); - return true; -} diff --git a/target/arm/translate-neon.c b/target/arm/translate-neon.c deleted file mode 100644 index 4016339..0000000 --- a/target/arm/translate-neon.c +++ /dev/null @@ -1,4064 +0,0 @@ -/* - * ARM translation: AArch32 Neon instructions - * - * Copyright (c) 2003 Fabrice Bellard - * Copyright (c) 2005-2007 CodeSourcery - * Copyright (c) 2007 OpenedHand, Ltd. - * Copyright (c) 2020 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "exec/exec-all.h" -#include "exec/gen-icount.h" -#include "translate.h" -#include "translate-a32.h" - -/* Include the generated Neon decoder */ -#include "decode-neon-dp.c.inc" -#include "decode-neon-ls.c.inc" -#include "decode-neon-shared.c.inc" - -static TCGv_ptr vfp_reg_ptr(bool dp, int reg) -{ - TCGv_ptr ret = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg)); - return ret; -} - -static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop) -{ - long offset = neon_element_offset(reg, ele, mop & MO_SIZE); - - switch (mop) { - case MO_UB: - tcg_gen_ld8u_i32(var, cpu_env, offset); - break; - case MO_UW: - tcg_gen_ld16u_i32(var, cpu_env, offset); - break; - case MO_UL: - tcg_gen_ld_i32(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - -static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop) -{ - long offset = neon_element_offset(reg, ele, mop & MO_SIZE); - - switch (mop) { - case MO_UB: - tcg_gen_ld8u_i64(var, cpu_env, offset); - break; - case MO_UW: - tcg_gen_ld16u_i64(var, cpu_env, offset); - break; - case MO_UL: - tcg_gen_ld32u_i64(var, cpu_env, offset); - break; - case MO_UQ: - tcg_gen_ld_i64(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - -static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var) -{ - long offset = neon_element_offset(reg, ele, size); - - switch (size) { - case MO_8: - tcg_gen_st8_i32(var, cpu_env, offset); - break; - case MO_16: - tcg_gen_st16_i32(var, cpu_env, offset); - break; - case MO_32: - tcg_gen_st_i32(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - -static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var) -{ - long offset = neon_element_offset(reg, ele, size); - - switch (size) { - case MO_8: - tcg_gen_st8_i64(var, cpu_env, offset); - break; - case MO_16: - tcg_gen_st16_i64(var, cpu_env, offset); - break; - case MO_32: - tcg_gen_st32_i64(var, cpu_env, offset); - break; - case MO_64: - tcg_gen_st_i64(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - -static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm, - int data, gen_helper_gvec_4 *fn_gvec) -{ - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { - return false; - } - - /* - * UNDEF accesses to odd registers for each bit of Q. - * Q will be 0b111 for all Q-reg instructions, otherwise - * when we have mixed Q- and D-reg inputs. - */ - if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - int opr_sz = q ? 16 : 8; - tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd), - vfp_reg_offset(1, vn), - vfp_reg_offset(1, vm), - vfp_reg_offset(1, vd), - opr_sz, opr_sz, data, fn_gvec); - return true; -} - -static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm, - int data, ARMFPStatusFlavour fp_flavour, - gen_helper_gvec_4_ptr *fn_gvec_ptr) -{ - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) { - return false; - } - - /* - * UNDEF accesses to odd registers for each bit of Q. 
- * Q will be 0b111 for all Q-reg instructions, otherwise - * when we have mixed Q- and D-reg inputs. - */ - if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - int opr_sz = q ? 16 : 8; - TCGv_ptr fpst = fpstatus_ptr(fp_flavour); - - tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd), - vfp_reg_offset(1, vn), - vfp_reg_offset(1, vm), - vfp_reg_offset(1, vd), - fpst, opr_sz, opr_sz, data, fn_gvec_ptr); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a) -{ - if (!dc_isar_feature(aa32_vcma, s)) { - return false; - } - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot, - FPST_STD_F16, gen_helper_gvec_fcmlah); - } - return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot, - FPST_STD, gen_helper_gvec_fcmlas); -} - -static bool trans_VCADD(DisasContext *s, arg_VCADD *a) -{ - int opr_sz; - TCGv_ptr fpst; - gen_helper_gvec_3_ptr *fn_gvec_ptr; - - if (!dc_isar_feature(aa32_vcma, s) - || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if ((a->vn | a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - opr_sz = (1 + a->q) * 8; - fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD); - fn_gvec_ptr = (a->size == MO_16) ? - gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds; - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), - vfp_reg_offset(1, a->vn), - vfp_reg_offset(1, a->vm), - fpst, opr_sz, opr_sz, a->rot, - fn_gvec_ptr); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a) -{ - if (!dc_isar_feature(aa32_dp, s)) { - return false; - } - return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_sdot_b); -} - -static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a) -{ - if (!dc_isar_feature(aa32_dp, s)) { - return false; - } - return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_udot_b); -} - -static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a) -{ - if (!dc_isar_feature(aa32_i8mm, s)) { - return false; - } - return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_usdot_b); -} - -static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a) -{ - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_bfdot); -} - -static bool trans_VFML(DisasContext *s, arg_VFML *a) -{ - int opr_sz; - - if (!dc_isar_feature(aa32_fhm, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - (a->vd & 0x10)) { - return false; - } - - if (a->vd & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - opr_sz = (1 + a->q) * 8; - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), - vfp_reg_offset(a->q, a->vn), - vfp_reg_offset(a->q, a->vm), - cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */ - gen_helper_gvec_fmlal_a32); - return true; -} - -static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a) -{ - int data = (a->index << 2) | a->rot; - - if (!dc_isar_feature(aa32_vcma, s)) { - return false; - } - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data, - FPST_STD_F16, gen_helper_gvec_fcmlah_idx); - } - return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data, - FPST_STD, gen_helper_gvec_fcmlas_idx); -} - -static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a) -{ - if (!dc_isar_feature(aa32_dp, s)) { - return false; - } - return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_sdot_idx_b); -} - -static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a) -{ - if (!dc_isar_feature(aa32_dp, s)) { - return false; - } - return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_udot_idx_b); -} - -static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a) -{ - if (!dc_isar_feature(aa32_i8mm, s)) { - return false; - } - return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_usdot_idx_b); -} - -static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a) -{ - if (!dc_isar_feature(aa32_i8mm, s)) { - return false; - } - return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_sudot_idx_b); -} - -static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a) -{ - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index, - gen_helper_gvec_bfdot_idx); -} - -static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a) -{ - int opr_sz; - - if (!dc_isar_feature(aa32_fhm, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) { - return false; - } - - if (a->vd & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - opr_sz = (1 + a->q) * 8; - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), - vfp_reg_offset(a->q, a->vn), - vfp_reg_offset(a->q, a->rm), - cpu_env, opr_sz, opr_sz, - (a->index << 2) | a->s, /* is_2 == 0 */ - gen_helper_gvec_fmlal_idx_a32); - return true; -} - -static struct { - int nregs; - int interleave; - int spacing; -} const neon_ls_element_type[11] = { - {1, 4, 1}, - {1, 4, 2}, - {4, 1, 1}, - {2, 2, 2}, - {1, 3, 1}, - {1, 3, 2}, - {3, 1, 1}, - {1, 1, 1}, - {1, 2, 1}, - {1, 2, 2}, - {2, 1, 1} -}; - -static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn, - int stride) -{ - if (rm != 15) { - TCGv_i32 base; - - base = load_reg(s, rn); - if (rm == 13) { - tcg_gen_addi_i32(base, base, stride); - } else { - TCGv_i32 index; - index = load_reg(s, rm); - tcg_gen_add_i32(base, base, index); - tcg_temp_free_i32(index); - } - store_reg(s, rn, base); - } -} - -static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) -{ - /* Neon load/store multiple structures */ - int nregs, interleave, spacing, reg, n; - MemOp mop, align, endian; - int mmu_idx = get_mem_index(s); - int size = a->size; - TCGv_i64 tmp64; - TCGv_i32 addr; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - if (a->itype > 10) { - return false; - } - /* Catch UNDEF cases for bad values of align field */ - switch (a->itype & 0xc) { - case 4: - if (a->align >= 2) { - return false; - } - break; - case 8: - if (a->align == 3) { - return false; - } - break; - default: - break; - } - nregs = neon_ls_element_type[a->itype].nregs; - interleave = neon_ls_element_type[a->itype].interleave; - spacing = neon_ls_element_type[a->itype].spacing; - if (size == 3 && (interleave | spacing) != 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* For our purposes, bytes are always little-endian. */ - endian = s->be_data; - if (size == 0) { - endian = MO_LE; - } - - /* Enforce alignment requested by the instruction */ - if (a->align) { - align = pow2_align(a->align + 2); /* 4 ** a->align */ - } else { - align = s->align_mem ? MO_ALIGN : 0; - } - - /* - * Consecutive little-endian elements from a single register - * can be promoted to a larger little-endian operation. - */ - if (interleave == 1 && endian == MO_LE) { - /* Retain any natural alignment. 
*/ - if (align == MO_ALIGN) { - align = pow2_align(size); - } - size = 3; - } - - tmp64 = tcg_temp_new_i64(); - addr = tcg_temp_new_i32(); - load_reg_var(s, addr, a->rn); - - mop = endian | size | align; - for (reg = 0; reg < nregs; reg++) { - for (n = 0; n < 8 >> size; n++) { - int xs; - for (xs = 0; xs < interleave; xs++) { - int tt = a->vd + reg + spacing * xs; - - if (a->l) { - gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop); - neon_store_element64(tt, n, size, tmp64); - } else { - neon_load_element64(tmp64, tt, n, size); - gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop); - } - tcg_gen_addi_i32(addr, addr, 1 << size); - - /* Subsequent memory operations inherit alignment */ - mop &= ~MO_AMASK; - } - } - } - tcg_temp_free_i32(addr); - tcg_temp_free_i64(tmp64); - - gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8); - return true; -} - -static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a) -{ - /* Neon load single structure to all lanes */ - int reg, stride, vec_size; - int vd = a->vd; - int size = a->size; - int nregs = a->n + 1; - TCGv_i32 addr, tmp; - MemOp mop, align; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - align = 0; - if (size == 3) { - if (nregs != 4 || a->a == 0) { - return false; - } - /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */ - size = MO_32; - align = MO_ALIGN_16; - } else if (a->a) { - switch (nregs) { - case 1: - if (size == 0) { - return false; - } - align = MO_ALIGN; - break; - case 2: - align = pow2_align(size + 1); - break; - case 3: - return false; - case 4: - if (size == 2) { - align = pow2_align(3); - } else { - align = pow2_align(size + 2); - } - break; - default: - g_assert_not_reached(); - } - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * VLD1 to all lanes: T bit indicates how many Dregs to write. - * VLD2/3/4 to all lanes: T bit indicates register stride. - */ - stride = a->t ? 2 : 1; - vec_size = nregs == 1 ? stride * 8 : 8; - mop = size | align; - tmp = tcg_temp_new_i32(); - addr = tcg_temp_new_i32(); - load_reg_var(s, addr, a->rn); - for (reg = 0; reg < nregs; reg++) { - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop); - if ((vd & 1) && vec_size == 16) { - /* - * We cannot write 16 bytes at once because the - * destination is unaligned. - */ - tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd), - 8, 8, tmp); - tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1), - neon_full_reg_offset(vd), 8, 8); - } else { - tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd), - vec_size, vec_size, tmp); - } - tcg_gen_addi_i32(addr, addr, 1 << size); - vd += stride; - - /* Subsequent memory operations inherit alignment */ - mop &= ~MO_AMASK; - } - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(addr); - - gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs); - - return true; -} - -static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a) -{ - /* Neon load/store single structure to one lane */ - int reg; - int nregs = a->n + 1; - int vd = a->vd; - TCGv_i32 addr, tmp; - MemOp mop; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - /* Catch the UNDEF cases. This is unavoidably a bit messy. 
*/ - switch (nregs) { - case 1: - if (a->stride != 1) { - return false; - } - if (((a->align & (1 << a->size)) != 0) || - (a->size == 2 && (a->align == 1 || a->align == 2))) { - return false; - } - break; - case 2: - if (a->size == 2 && (a->align & 2) != 0) { - return false; - } - break; - case 3: - if (a->align != 0) { - return false; - } - break; - case 4: - if (a->size == 2 && a->align == 3) { - return false; - } - break; - default: - g_assert_not_reached(); - } - if ((vd + a->stride * (nregs - 1)) > 31) { - /* - * Attempts to write off the end of the register file are - * UNPREDICTABLE; we choose to UNDEF because otherwise we would - * access off the end of the array that holds the register data. - */ - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* Pick up SCTLR settings */ - mop = finalize_memop(s, a->size); - - if (a->align) { - MemOp align_op; - - switch (nregs) { - case 1: - /* For VLD1, use natural alignment. */ - align_op = MO_ALIGN; - break; - case 2: - /* For VLD2, use double alignment. */ - align_op = pow2_align(a->size + 1); - break; - case 4: - if (a->size == MO_32) { - /* - * For VLD4.32, align = 1 is double alignment, align = 2 is - * quad alignment; align = 3 is rejected above. - */ - align_op = pow2_align(a->size + a->align); - } else { - /* For VLD4.8 and VLD.16, we want quad alignment. */ - align_op = pow2_align(a->size + 2); - } - break; - default: - /* For VLD3, the alignment field is zero and rejected above. */ - g_assert_not_reached(); - } - - mop = (mop & ~MO_AMASK) | align_op; - } - - tmp = tcg_temp_new_i32(); - addr = tcg_temp_new_i32(); - load_reg_var(s, addr, a->rn); - - for (reg = 0; reg < nregs; reg++) { - if (a->l) { - gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop); - neon_store_element(vd, a->reg_idx, a->size, tmp); - } else { /* Store */ - neon_load_element(tmp, vd, a->reg_idx, a->size); - gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop); - } - vd += a->stride; - tcg_gen_addi_i32(addr, addr, 1 << a->size); - - /* Subsequent memory operations inherit alignment */ - mop &= ~MO_AMASK; - } - tcg_temp_free_i32(addr); - tcg_temp_free_i32(tmp); - - gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs); - - return true; -} - -static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn) -{ - int vec_size = a->q ? 16 : 8; - int rd_ofs = neon_full_reg_offset(a->vd); - int rn_ofs = neon_full_reg_offset(a->vn); - int rm_ofs = neon_full_reg_offset(a->vm); - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if ((a->vn | a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size); - return true; -} - -#define DO_3SAME(INSN, FUNC) \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - return do_3same(s, a, FUNC); \ - } - -DO_3SAME(VADD, tcg_gen_gvec_add) -DO_3SAME(VSUB, tcg_gen_gvec_sub) -DO_3SAME(VAND, tcg_gen_gvec_and) -DO_3SAME(VBIC, tcg_gen_gvec_andc) -DO_3SAME(VORR, tcg_gen_gvec_or) -DO_3SAME(VORN, tcg_gen_gvec_orc) -DO_3SAME(VEOR, tcg_gen_gvec_xor) -DO_3SAME(VSHL_S, gen_gvec_sshl) -DO_3SAME(VSHL_U, gen_gvec_ushl) -DO_3SAME(VQADD_S, gen_gvec_sqadd_qc) -DO_3SAME(VQADD_U, gen_gvec_uqadd_qc) -DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc) -DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc) - -/* These insns are all gvec_bitsel but with the inputs in various orders. */ -#define DO_3SAME_BITSEL(INSN, O1, O2, O3) \ - static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \ - } \ - DO_3SAME(INSN, gen_##INSN##_3s) - -DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs) -DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs) -DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs) - -#define DO_3SAME_NO_SZ_3(INSN, FUNC) \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - if (a->size == 3) { \ - return false; \ - } \ - return do_3same(s, a, FUNC); \ - } - -DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax) -DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax) -DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin) -DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin) -DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul) -DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla) -DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls) -DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst) -DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd) -DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba) -DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd) -DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba) - -#define DO_3SAME_CMP(INSN, COND) \ - static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \ - } \ - DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s) - -DO_3SAME_CMP(VCGT_S, TCG_COND_GT) -DO_3SAME_CMP(VCGT_U, TCG_COND_GTU) -DO_3SAME_CMP(VCGE_S, TCG_COND_GE) -DO_3SAME_CMP(VCGE_U, TCG_COND_GEU) -DO_3SAME_CMP(VCEQ, TCG_COND_EQ) - -#define WRAP_OOL_FN(WRAPNAME, FUNC) \ - static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \ - uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \ - { \ - tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \ - } - -WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b) - -static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a) -{ - if (a->size != 0) { - return false; - } - return do_3same(s, a, gen_VMUL_p_3s); -} - -#define DO_VQRDMLAH(INSN, FUNC) \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - if (!dc_isar_feature(aa32_rdm, s)) { \ - return false; \ - } \ - if (a->size != 1 && a->size != 2) { \ - return false; \ - } \ - return do_3same(s, a, FUNC); \ - } - -DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc) -DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc) - -#define DO_SHA1(NAME, FUNC) \ - WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \ - static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \ - { 
\ - if (!dc_isar_feature(aa32_sha1, s)) { \ - return false; \ - } \ - return do_3same(s, a, gen_##NAME##_3s); \ - } - -DO_SHA1(SHA1C, gen_helper_crypto_sha1c) -DO_SHA1(SHA1P, gen_helper_crypto_sha1p) -DO_SHA1(SHA1M, gen_helper_crypto_sha1m) -DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0) - -#define DO_SHA2(NAME, FUNC) \ - WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \ - static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \ - { \ - if (!dc_isar_feature(aa32_sha2, s)) { \ - return false; \ - } \ - return do_3same(s, a, gen_##NAME##_3s); \ - } - -DO_SHA2(SHA256H, gen_helper_crypto_sha256h) -DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2) -DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1) - -#define DO_3SAME_64(INSN, FUNC) \ - static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - static const GVecGen3 op = { .fni8 = FUNC }; \ - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \ - } \ - DO_3SAME(INSN, gen_##INSN##_3s) - -#define DO_3SAME_64_ENV(INSN, FUNC) \ - static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \ - { \ - FUNC(d, cpu_env, n, m); \ - } \ - DO_3SAME_64(INSN, gen_##INSN##_elt) - -DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64) -DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64) -DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64) -DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64) -DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64) -DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64) - -#define DO_3SAME_32(INSN, FUNC) \ - static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - static const GVecGen3 ops[4] = { \ - { .fni4 = gen_helper_neon_##FUNC##8 }, \ - { .fni4 = gen_helper_neon_##FUNC##16 }, \ - { .fni4 = gen_helper_neon_##FUNC##32 }, \ - { 0 }, \ - }; \ - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \ - } \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - if (a->size > 2) { \ - return false; \ - } \ - return do_3same(s, a, gen_##INSN##_3s); \ - } - -/* - * Some helper functions need to be passed the cpu_env. In order - * to use those with the gvec APIs like tcg_gen_gvec_3() we need - * to create wrapper functions whose prototype is a NeonGenTwoOpFn() - * and which call a NeonGenTwoOpEnvFn(). 
- */ -#define WRAP_ENV_FN(WRAPNAME, FUNC) \ - static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \ - { \ - FUNC(d, cpu_env, n, m); \ - } - -#define DO_3SAME_32_ENV(INSN, FUNC) \ - WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \ - WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \ - WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \ - static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - static const GVecGen3 ops[4] = { \ - { .fni4 = gen_##INSN##_tramp8 }, \ - { .fni4 = gen_##INSN##_tramp16 }, \ - { .fni4 = gen_##INSN##_tramp32 }, \ - { 0 }, \ - }; \ - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \ - } \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - if (a->size > 2) { \ - return false; \ - } \ - return do_3same(s, a, gen_##INSN##_3s); \ - } - -DO_3SAME_32(VHADD_S, hadd_s) -DO_3SAME_32(VHADD_U, hadd_u) -DO_3SAME_32(VHSUB_S, hsub_s) -DO_3SAME_32(VHSUB_U, hsub_u) -DO_3SAME_32(VRHADD_S, rhadd_s) -DO_3SAME_32(VRHADD_U, rhadd_u) -DO_3SAME_32(VRSHL_S, rshl_s) -DO_3SAME_32(VRSHL_U, rshl_u) - -DO_3SAME_32_ENV(VQSHL_S, qshl_s) -DO_3SAME_32_ENV(VQSHL_U, qshl_u) -DO_3SAME_32_ENV(VQRSHL_S, qrshl_s) -DO_3SAME_32_ENV(VQRSHL_U, qrshl_u) - -static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn) -{ - /* Operations handled pairwise 32 bits at a time */ - TCGv_i32 tmp, tmp2, tmp3; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (a->size == 3) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - assert(a->q == 0); /* enforced by decode patterns */ - - /* - * Note that we have to be careful not to clobber the source operands - * in the "vm == vd" case by storing the result of the first pass too - * early. Since Q is 0 there are always just two passes, so instead - * of a complicated loop over each pass we just unroll. - */ - tmp = tcg_temp_new_i32(); - tmp2 = tcg_temp_new_i32(); - tmp3 = tcg_temp_new_i32(); - - read_neon_element32(tmp, a->vn, 0, MO_32); - read_neon_element32(tmp2, a->vn, 1, MO_32); - fn(tmp, tmp, tmp2); - - read_neon_element32(tmp3, a->vm, 0, MO_32); - read_neon_element32(tmp2, a->vm, 1, MO_32); - fn(tmp3, tmp3, tmp2); - - write_neon_element32(tmp, a->vd, 0, MO_32); - write_neon_element32(tmp3, a->vd, 1, MO_32); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(tmp3); - return true; -} - -#define DO_3SAME_PAIR(INSN, func) \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - static NeonGenTwoOpFn * const fns[] = { \ - gen_helper_neon_##func##8, \ - gen_helper_neon_##func##16, \ - gen_helper_neon_##func##32, \ - }; \ - if (a->size > 2) { \ - return false; \ - } \ - return do_3same_pair(s, a, fns[a->size]); \ - } - -/* 32-bit pairwise ops end up the same as the elementwise versions. 
*/ -#define gen_helper_neon_pmax_s32 tcg_gen_smax_i32 -#define gen_helper_neon_pmax_u32 tcg_gen_umax_i32 -#define gen_helper_neon_pmin_s32 tcg_gen_smin_i32 -#define gen_helper_neon_pmin_u32 tcg_gen_umin_i32 -#define gen_helper_neon_padd_u32 tcg_gen_add_i32 - -DO_3SAME_PAIR(VPMAX_S, pmax_s) -DO_3SAME_PAIR(VPMIN_S, pmin_s) -DO_3SAME_PAIR(VPMAX_U, pmax_u) -DO_3SAME_PAIR(VPMIN_U, pmin_u) -DO_3SAME_PAIR(VPADD, padd_u) - -#define DO_3SAME_VQDMULH(INSN, FUNC) \ - WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \ - WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \ - static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - static const GVecGen3 ops[2] = { \ - { .fni4 = gen_##INSN##_tramp16 }, \ - { .fni4 = gen_##INSN##_tramp32 }, \ - }; \ - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \ - } \ - static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \ - { \ - if (a->size != 1 && a->size != 2) { \ - return false; \ - } \ - return do_3same(s, a, gen_##INSN##_3s); \ - } - -DO_3SAME_VQDMULH(VQDMULH, qdmulh) -DO_3SAME_VQDMULH(VQRDMULH, qrdmulh) - -#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \ - static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ - uint32_t rn_ofs, uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - TCGv_ptr fpst = fpstatus_ptr(FPST); \ - tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \ - oprsz, maxsz, 0, FUNC); \ - tcg_temp_free_ptr(fpst); \ - } - -#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \ - WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \ - WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \ - static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \ - { \ - if (a->size == MO_16) { \ - if (!dc_isar_feature(aa32_fp16_arith, s)) { \ - return false; \ - } \ - return do_3same(s, a, gen_##INSN##_fp16_3s); \ - } \ - return do_3same(s, a, gen_##INSN##_fp32_3s); \ - } - - -DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h) -DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h) -DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h) -DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h) -DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h) -DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h) -DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h) -DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h) -DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h) -DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h) -DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h) -DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h) -DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h) -DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h) -DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h) -DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h) -DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h) - -WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s) -WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h) -WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s) -WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h) - -static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a) -{ - if 
(!arm_dc_feature(s, ARM_FEATURE_V8)) { - return false; - } - - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - return do_3same(s, a, gen_VMAXNM_fp16_3s); - } - return do_3same(s, a, gen_VMAXNM_fp32_3s); -} - -static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8)) { - return false; - } - - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - return do_3same(s, a, gen_VMINNM_fp16_3s); - } - return do_3same(s, a, gen_VMINNM_fp32_3s); -} - -static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, - gen_helper_gvec_3_ptr *fn) -{ - /* FP pairwise operations */ - TCGv_ptr fpstatus; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - assert(a->q == 0); /* enforced by decode patterns */ - - - fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD); - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd), - vfp_reg_offset(1, a->vn), - vfp_reg_offset(1, a->vm), - fpstatus, 8, 8, 0, fn); - tcg_temp_free_ptr(fpstatus); - - return true; -} - -/* - * For all the functions using this macro, size == 1 means fp16, - * which is an architecture extension we don't implement yet. - */ -#define DO_3S_FP_PAIR(INSN,FUNC) \ - static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \ - { \ - if (a->size == MO_16) { \ - if (!dc_isar_feature(aa32_fp16_arith, s)) { \ - return false; \ - } \ - return do_3same_fp_pair(s, a, FUNC##h); \ - } \ - return do_3same_fp_pair(s, a, FUNC##s); \ - } - -DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd) -DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax) -DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin) - -static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn) -{ - /* Handle a 2-reg-shift insn which can be vectorized. */ - int vec_size = a->q ? 16 : 8; - int rd_ofs = neon_full_reg_offset(a->vd); - int rm_ofs = neon_full_reg_offset(a->vm); - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size); - return true; -} - -#define DO_2SH(INSN, FUNC) \ - static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ - { \ - return do_vector_2sh(s, a, FUNC); \ - } \ - -DO_2SH(VSHL, tcg_gen_gvec_shli) -DO_2SH(VSLI, gen_gvec_sli) -DO_2SH(VSRI, gen_gvec_sri) -DO_2SH(VSRA_S, gen_gvec_ssra) -DO_2SH(VSRA_U, gen_gvec_usra) -DO_2SH(VRSHR_S, gen_gvec_srshr) -DO_2SH(VRSHR_U, gen_gvec_urshr) -DO_2SH(VRSRA_S, gen_gvec_srsra) -DO_2SH(VRSRA_U, gen_gvec_ursra) - -static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a) -{ - /* Signed shift out of range results in all-sign-bits */ - a->shift = MIN(a->shift, (8 << a->size) - 1); - return do_vector_2sh(s, a, tcg_gen_gvec_sari); -} - -static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t oprsz, uint32_t maxsz) -{ - tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0); -} - -static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a) -{ - /* Shift out of range is architecturally valid and results in zero. */ - if (a->shift >= (8 << a->size)) { - return do_vector_2sh(s, a, gen_zero_rd_2sh); - } else { - return do_vector_2sh(s, a, tcg_gen_gvec_shri); - } -} - -static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a, - NeonGenTwo64OpEnvFn *fn) -{ - /* - * 2-reg-and-shift operations, size == 3 case, where the - * function needs to be passed cpu_env. - */ - TCGv_i64 constimm; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * To avoid excessive duplication of ops we implement shift - * by immediate using the variable shift operations. - */ - constimm = tcg_constant_i64(dup_const(a->size, a->shift)); - - for (pass = 0; pass < a->q + 1; pass++) { - TCGv_i64 tmp = tcg_temp_new_i64(); - - read_neon_element64(tmp, a->vm, pass, MO_64); - fn(tmp, cpu_env, tmp, constimm); - write_neon_element64(tmp, a->vd, pass, MO_64); - tcg_temp_free_i64(tmp); - } - return true; -} - -static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a, - NeonGenTwoOpEnvFn *fn) -{ - /* - * 2-reg-and-shift operations, size < 3 case, where the - * helper needs to be passed cpu_env. - */ - TCGv_i32 constimm, tmp; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * To avoid excessive duplication of ops we implement shift - * by immediate using the variable shift operations. - */ - constimm = tcg_constant_i32(dup_const(a->size, a->shift)); - tmp = tcg_temp_new_i32(); - - for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { - read_neon_element32(tmp, a->vm, pass, MO_32); - fn(tmp, cpu_env, tmp, constimm); - write_neon_element32(tmp, a->vd, pass, MO_32); - } - tcg_temp_free_i32(tmp); - return true; -} - -#define DO_2SHIFT_ENV(INSN, FUNC) \ - static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \ - { \ - return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \ - } \ - static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ - { \ - static NeonGenTwoOpEnvFn * const fns[] = { \ - gen_helper_neon_##FUNC##8, \ - gen_helper_neon_##FUNC##16, \ - gen_helper_neon_##FUNC##32, \ - }; \ - assert(a->size < ARRAY_SIZE(fns)); \ - return do_2shift_env_32(s, a, fns[a->size]); \ - } - -DO_2SHIFT_ENV(VQSHLU, qshlu_s) -DO_2SHIFT_ENV(VQSHL_U, qshl_u) -DO_2SHIFT_ENV(VQSHL_S, qshl_s) - -static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a, - NeonGenTwo64OpFn *shiftfn, - NeonGenNarrowEnvFn *narrowfn) -{ - /* 2-reg-and-shift narrowing-shift operations, size == 3 case */ - TCGv_i64 constimm, rm1, rm2; - TCGv_i32 rd; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->vm & 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * This is always a right shift, and the shiftfn is always a - * left-shift helper, which thus needs the negated shift count. - */ - constimm = tcg_constant_i64(-a->shift); - rm1 = tcg_temp_new_i64(); - rm2 = tcg_temp_new_i64(); - rd = tcg_temp_new_i32(); - - /* Load both inputs first to avoid potential overwrite if rm == rd */ - read_neon_element64(rm1, a->vm, 0, MO_64); - read_neon_element64(rm2, a->vm, 1, MO_64); - - shiftfn(rm1, rm1, constimm); - narrowfn(rd, cpu_env, rm1); - write_neon_element32(rd, a->vd, 0, MO_32); - - shiftfn(rm2, rm2, constimm); - narrowfn(rd, cpu_env, rm2); - write_neon_element32(rd, a->vd, 1, MO_32); - - tcg_temp_free_i32(rd); - tcg_temp_free_i64(rm1); - tcg_temp_free_i64(rm2); - - return true; -} - -static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a, - NeonGenTwoOpFn *shiftfn, - NeonGenNarrowEnvFn *narrowfn) -{ - /* 2-reg-and-shift narrowing-shift operations, size < 3 case */ - TCGv_i32 constimm, rm1, rm2, rm3, rm4; - TCGv_i64 rtmp; - uint32_t imm; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->vm & 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * This is always a right shift, and the shiftfn is always a - * left-shift helper, which thus needs the negated shift count - * duplicated into each lane of the immediate value. 
- */ - if (a->size == 1) { - imm = (uint16_t)(-a->shift); - imm |= imm << 16; - } else { - /* size == 2 */ - imm = -a->shift; - } - constimm = tcg_constant_i32(imm); - - /* Load all inputs first to avoid potential overwrite */ - rm1 = tcg_temp_new_i32(); - rm2 = tcg_temp_new_i32(); - rm3 = tcg_temp_new_i32(); - rm4 = tcg_temp_new_i32(); - read_neon_element32(rm1, a->vm, 0, MO_32); - read_neon_element32(rm2, a->vm, 1, MO_32); - read_neon_element32(rm3, a->vm, 2, MO_32); - read_neon_element32(rm4, a->vm, 3, MO_32); - rtmp = tcg_temp_new_i64(); - - shiftfn(rm1, rm1, constimm); - shiftfn(rm2, rm2, constimm); - - tcg_gen_concat_i32_i64(rtmp, rm1, rm2); - tcg_temp_free_i32(rm2); - - narrowfn(rm1, cpu_env, rtmp); - write_neon_element32(rm1, a->vd, 0, MO_32); - tcg_temp_free_i32(rm1); - - shiftfn(rm3, rm3, constimm); - shiftfn(rm4, rm4, constimm); - - tcg_gen_concat_i32_i64(rtmp, rm3, rm4); - tcg_temp_free_i32(rm4); - - narrowfn(rm3, cpu_env, rtmp); - tcg_temp_free_i64(rtmp); - write_neon_element32(rm3, a->vd, 1, MO_32); - tcg_temp_free_i32(rm3); - return true; -} - -#define DO_2SN_64(INSN, FUNC, NARROWFUNC) \ - static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ - { \ - return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \ - } -#define DO_2SN_32(INSN, FUNC, NARROWFUNC) \ - static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ - { \ - return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \ - } - -static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) -{ - tcg_gen_extrl_i64_i32(dest, src); -} - -static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) -{ - gen_helper_neon_narrow_u16(dest, src); -} - -static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src) -{ - gen_helper_neon_narrow_u8(dest, src); -} - -DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32) -DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16) -DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8) - -DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32) -DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16) -DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8) - -DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32) -DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16) -DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8) - -DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32) -DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16) -DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8) -DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32) -DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16) -DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8) - -DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32) -DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16) -DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8) - -DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32) -DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16) -DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8) - -DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32) -DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16) -DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, 
gen_helper_neon_narrow_sat_u8) - -static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a, - NeonGenWidenFn *widenfn, bool u) -{ - TCGv_i64 tmp; - TCGv_i32 rm0, rm1; - uint64_t widen_mask = 0; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->vd & 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* - * This is a widen-and-shift operation. The shift is always less - * than the width of the source type, so after widening the input - * vector we can simply shift the whole 64-bit widened register, - * and then clear the potential overflow bits resulting from left - * bits of the narrow input appearing as right bits of the left - * neighbour narrow input. Calculate a mask of bits to clear. - */ - if ((a->shift != 0) && (a->size < 2 || u)) { - int esize = 8 << a->size; - widen_mask = MAKE_64BIT_MASK(0, esize); - widen_mask >>= esize - a->shift; - widen_mask = dup_const(a->size + 1, widen_mask); - } - - rm0 = tcg_temp_new_i32(); - rm1 = tcg_temp_new_i32(); - read_neon_element32(rm0, a->vm, 0, MO_32); - read_neon_element32(rm1, a->vm, 1, MO_32); - tmp = tcg_temp_new_i64(); - - widenfn(tmp, rm0); - tcg_temp_free_i32(rm0); - if (a->shift != 0) { - tcg_gen_shli_i64(tmp, tmp, a->shift); - tcg_gen_andi_i64(tmp, tmp, ~widen_mask); - } - write_neon_element64(tmp, a->vd, 0, MO_64); - - widenfn(tmp, rm1); - tcg_temp_free_i32(rm1); - if (a->shift != 0) { - tcg_gen_shli_i64(tmp, tmp, a->shift); - tcg_gen_andi_i64(tmp, tmp, ~widen_mask); - } - write_neon_element64(tmp, a->vd, 1, MO_64); - tcg_temp_free_i64(tmp); - return true; -} - -static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a) -{ - static NeonGenWidenFn * const widenfn[] = { - gen_helper_neon_widen_s8, - gen_helper_neon_widen_s16, - tcg_gen_ext_i32_i64, - }; - return do_vshll_2sh(s, a, widenfn[a->size], false); -} - -static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a) -{ - static NeonGenWidenFn * const widenfn[] = { - gen_helper_neon_widen_u8, - gen_helper_neon_widen_u16, - tcg_gen_extu_i32_i64, - }; - return do_vshll_2sh(s, a, widenfn[a->size], true); -} - -static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a, - gen_helper_gvec_2_ptr *fn) -{ - /* FP operations in 2-reg-and-shift group */ - int vec_size = a->q ? 16 : 8; - int rd_ofs = neon_full_reg_offset(a->vd); - int rm_ofs = neon_full_reg_offset(a->vm); - TCGv_ptr fpst; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm | a->vd) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(a->size == MO_16 ? 
FPST_STD_F16 : FPST_STD); - tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn); - tcg_temp_free_ptr(fpst); - return true; -} - -#define DO_FP_2SH(INSN, FUNC) \ - static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \ - { \ - return do_fp_2sh(s, a, FUNC); \ - } - -DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf) -DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf) -DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs) -DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu) - -DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh) -DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh) -DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs) -DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu) - -static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a, - GVecGen2iFn *fn) -{ - uint64_t imm; - int reg_ofs, vec_size; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - if (a->vd & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - reg_ofs = neon_full_reg_offset(a->vd); - vec_size = a->q ? 16 : 8; - imm = asimd_imm_const(a->imm, a->cmode, a->op); - - fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size); - return true; -} - -static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs, - int64_t c, uint32_t oprsz, uint32_t maxsz) -{ - tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c); -} - -static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a) -{ - /* Handle decode of cmode/op here between VORR/VBIC/VMOV */ - GVecGen2iFn *fn; - - if ((a->cmode & 1) && a->cmode < 12) { - /* for op=1, the imm will be inverted, so BIC becomes AND. */ - fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori; - } else { - /* There is one unallocated cmode/op combination in this space */ - if (a->cmode == 15 && a->op == 1) { - return false; - } - fn = gen_VMOV_1r; - } - return do_1reg_imm(s, a, fn); -} - -static bool do_prewiden_3d(DisasContext *s, arg_3diff *a, - NeonGenWidenFn *widenfn, - NeonGenTwo64OpFn *opfn, - int src1_mop, int src2_mop) -{ - /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VAADW/VSUBW) */ - TCGv_i64 rn0_64, rn1_64, rm_64; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!opfn) { - /* size == 3 case, which is an entirely different insn group */ - return false; - } - - if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - rn0_64 = tcg_temp_new_i64(); - rn1_64 = tcg_temp_new_i64(); - rm_64 = tcg_temp_new_i64(); - - if (src1_mop >= 0) { - read_neon_element64(rn0_64, a->vn, 0, src1_mop); - } else { - TCGv_i32 tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vn, 0, MO_32); - widenfn(rn0_64, tmp); - tcg_temp_free_i32(tmp); - } - if (src2_mop >= 0) { - read_neon_element64(rm_64, a->vm, 0, src2_mop); - } else { - TCGv_i32 tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vm, 0, MO_32); - widenfn(rm_64, tmp); - tcg_temp_free_i32(tmp); - } - - opfn(rn0_64, rn0_64, rm_64); - - /* - * Load second pass inputs before storing the first pass result, to - * avoid incorrect results if a narrow input overlaps with the result. 
- */ - if (src1_mop >= 0) { - read_neon_element64(rn1_64, a->vn, 1, src1_mop); - } else { - TCGv_i32 tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vn, 1, MO_32); - widenfn(rn1_64, tmp); - tcg_temp_free_i32(tmp); - } - if (src2_mop >= 0) { - read_neon_element64(rm_64, a->vm, 1, src2_mop); - } else { - TCGv_i32 tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vm, 1, MO_32); - widenfn(rm_64, tmp); - tcg_temp_free_i32(tmp); - } - - write_neon_element64(rn0_64, a->vd, 0, MO_64); - - opfn(rn1_64, rn1_64, rm_64); - write_neon_element64(rn1_64, a->vd, 1, MO_64); - - tcg_temp_free_i64(rn0_64); - tcg_temp_free_i64(rn1_64); - tcg_temp_free_i64(rm_64); - - return true; -} - -#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \ - static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ - { \ - static NeonGenWidenFn * const widenfn[] = { \ - gen_helper_neon_widen_##S##8, \ - gen_helper_neon_widen_##S##16, \ - NULL, NULL, \ - }; \ - static NeonGenTwo64OpFn * const addfn[] = { \ - gen_helper_neon_##OP##l_u16, \ - gen_helper_neon_##OP##l_u32, \ - tcg_gen_##OP##_i64, \ - NULL, \ - }; \ - int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \ - return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \ - SRC1WIDE ? MO_UQ : narrow_mop, \ - narrow_mop); \ - } - -DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN) -DO_PREWIDEN(VADDL_U, u, add, false, 0) -DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN) -DO_PREWIDEN(VSUBL_U, u, sub, false, 0) -DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN) -DO_PREWIDEN(VADDW_U, u, add, true, 0) -DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN) -DO_PREWIDEN(VSUBW_U, u, sub, true, 0) - -static bool do_narrow_3d(DisasContext *s, arg_3diff *a, - NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn) -{ - /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */ - TCGv_i64 rn_64, rm_64; - TCGv_i32 rd0, rd1; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!opfn || !narrowfn) { - /* size == 3 case, which is an entirely different insn group */ - return false; - } - - if ((a->vn | a->vm) & 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - rn_64 = tcg_temp_new_i64(); - rm_64 = tcg_temp_new_i64(); - rd0 = tcg_temp_new_i32(); - rd1 = tcg_temp_new_i32(); - - read_neon_element64(rn_64, a->vn, 0, MO_64); - read_neon_element64(rm_64, a->vm, 0, MO_64); - - opfn(rn_64, rn_64, rm_64); - - narrowfn(rd0, rn_64); - - read_neon_element64(rn_64, a->vn, 1, MO_64); - read_neon_element64(rm_64, a->vm, 1, MO_64); - - opfn(rn_64, rn_64, rm_64); - - narrowfn(rd1, rn_64); - - write_neon_element32(rd0, a->vd, 0, MO_32); - write_neon_element32(rd1, a->vd, 1, MO_32); - - tcg_temp_free_i32(rd0); - tcg_temp_free_i32(rd1); - tcg_temp_free_i64(rn_64); - tcg_temp_free_i64(rm_64); - - return true; -} - -#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP) \ - static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ - { \ - static NeonGenTwo64OpFn * const addfn[] = { \ - gen_helper_neon_##OP##l_u16, \ - gen_helper_neon_##OP##l_u32, \ - tcg_gen_##OP##_i64, \ - NULL, \ - }; \ - static NeonGenNarrowFn * const narrowfn[] = { \ - gen_helper_neon_##NARROWTYPE##_high_u8, \ - gen_helper_neon_##NARROWTYPE##_high_u16, \ - EXTOP, \ - NULL, \ - }; \ - return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]); \ - } - -static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn) -{ - tcg_gen_addi_i64(rn, rn, 1u << 31); - tcg_gen_extrh_i64_i32(rd, rn); -} - -DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32) -DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32) -DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32) -DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32) - -static bool do_long_3d(DisasContext *s, arg_3diff *a, - NeonGenTwoOpWidenFn *opfn, - NeonGenTwo64OpFn *accfn) -{ - /* - * 3-regs different lengths, long operations. - * These perform an operation on two inputs that returns a double-width - * result, and then possibly perform an accumulation operation of - * that result into the double-width destination. - */ - TCGv_i64 rd0, rd1, tmp; - TCGv_i32 rn, rm; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!opfn) { - /* size == 3 case, which is an entirely different insn group */ - return false; - } - - if (a->vd & 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - rd0 = tcg_temp_new_i64(); - rd1 = tcg_temp_new_i64(); - - rn = tcg_temp_new_i32(); - rm = tcg_temp_new_i32(); - read_neon_element32(rn, a->vn, 0, MO_32); - read_neon_element32(rm, a->vm, 0, MO_32); - opfn(rd0, rn, rm); - - read_neon_element32(rn, a->vn, 1, MO_32); - read_neon_element32(rm, a->vm, 1, MO_32); - opfn(rd1, rn, rm); - tcg_temp_free_i32(rn); - tcg_temp_free_i32(rm); - - /* Don't store results until after all loads: they might overlap */ - if (accfn) { - tmp = tcg_temp_new_i64(); - read_neon_element64(tmp, a->vd, 0, MO_64); - accfn(rd0, tmp, rd0); - read_neon_element64(tmp, a->vd, 1, MO_64); - accfn(rd1, tmp, rd1); - tcg_temp_free_i64(tmp); - } - - write_neon_element64(rd0, a->vd, 0, MO_64); - write_neon_element64(rd1, a->vd, 1, MO_64); - tcg_temp_free_i64(rd0); - tcg_temp_free_i64(rd1); - - return true; -} - -static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - gen_helper_neon_abdl_s16, - gen_helper_neon_abdl_s32, - gen_helper_neon_abdl_s64, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], NULL); -} - -static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - gen_helper_neon_abdl_u16, - gen_helper_neon_abdl_u32, - gen_helper_neon_abdl_u64, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], NULL); -} - -static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - gen_helper_neon_abdl_s16, - gen_helper_neon_abdl_s32, - gen_helper_neon_abdl_s64, - NULL, - }; - static NeonGenTwo64OpFn * const addfn[] = { - gen_helper_neon_addl_u16, - gen_helper_neon_addl_u32, - tcg_gen_add_i64, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], addfn[a->size]); -} - -static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - gen_helper_neon_abdl_u16, - gen_helper_neon_abdl_u32, - gen_helper_neon_abdl_u64, - NULL, - }; - static NeonGenTwo64OpFn * const addfn[] = { - gen_helper_neon_addl_u16, - gen_helper_neon_addl_u32, - tcg_gen_add_i64, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], addfn[a->size]); -} - -static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) -{ - TCGv_i32 lo = tcg_temp_new_i32(); - TCGv_i32 hi = tcg_temp_new_i32(); - - tcg_gen_muls2_i32(lo, hi, rn, rm); - tcg_gen_concat_i32_i64(rd, lo, hi); - - tcg_temp_free_i32(lo); - tcg_temp_free_i32(hi); -} - -static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) -{ - TCGv_i32 lo = tcg_temp_new_i32(); - TCGv_i32 hi = tcg_temp_new_i32(); - - tcg_gen_mulu2_i32(lo, hi, rn, rm); - tcg_gen_concat_i32_i64(rd, lo, hi); - - tcg_temp_free_i32(lo); - tcg_temp_free_i32(hi); -} - -static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - gen_helper_neon_mull_s8, - gen_helper_neon_mull_s16, - gen_mull_s32, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], NULL); -} - -static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - gen_helper_neon_mull_u8, - gen_helper_neon_mull_u16, - gen_mull_u32, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], NULL); -} - -#define 
DO_VMLAL(INSN,MULL,ACC) \ - static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ - { \ - static NeonGenTwoOpWidenFn * const opfn[] = { \ - gen_helper_neon_##MULL##8, \ - gen_helper_neon_##MULL##16, \ - gen_##MULL##32, \ - NULL, \ - }; \ - static NeonGenTwo64OpFn * const accfn[] = { \ - gen_helper_neon_##ACC##l_u16, \ - gen_helper_neon_##ACC##l_u32, \ - tcg_gen_##ACC##_i64, \ - NULL, \ - }; \ - return do_long_3d(s, a, opfn[a->size], accfn[a->size]); \ - } - -DO_VMLAL(VMLAL_S,mull_s,add) -DO_VMLAL(VMLAL_U,mull_u,add) -DO_VMLAL(VMLSL_S,mull_s,sub) -DO_VMLAL(VMLSL_U,mull_u,sub) - -static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) -{ - gen_helper_neon_mull_s16(rd, rn, rm); - gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd); -} - -static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm) -{ - gen_mull_s32(rd, rn, rm); - gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd); -} - -static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_VQDMULL_16, - gen_VQDMULL_32, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], NULL); -} - -static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) -{ - gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm); -} - -static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) -{ - gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm); -} - -static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_VQDMULL_16, - gen_VQDMULL_32, - NULL, - }; - static NeonGenTwo64OpFn * const accfn[] = { - NULL, - gen_VQDMLAL_acc_16, - gen_VQDMLAL_acc_32, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], accfn[a->size]); -} - -static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) -{ - gen_helper_neon_negl_u32(rm, rm); - gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm); -} - -static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) -{ - tcg_gen_neg_i64(rm, rm); - gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm); -} - -static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_VQDMULL_16, - gen_VQDMULL_32, - NULL, - }; - static NeonGenTwo64OpFn * const accfn[] = { - NULL, - gen_VQDMLSL_acc_16, - gen_VQDMLSL_acc_32, - NULL, - }; - - return do_long_3d(s, a, opfn[a->size], accfn[a->size]); -} - -static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a) -{ - gen_helper_gvec_3 *fn_gvec; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (a->vd & 1) { - return false; - } - - switch (a->size) { - case 0: - fn_gvec = gen_helper_neon_pmull_h; - break; - case 2: - if (!dc_isar_feature(aa32_pmull, s)) { - return false; - } - fn_gvec = gen_helper_gvec_pmull_q; - break; - default: - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd), - neon_full_reg_offset(a->vn), - neon_full_reg_offset(a->vm), - 16, 16, 0, fn_gvec); - return true; -} - -static void gen_neon_dup_low16(TCGv_i32 var) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_ext16u_i32(var, var); - tcg_gen_shli_i32(tmp, var, 16); - tcg_gen_or_i32(var, var, tmp); - tcg_temp_free_i32(tmp); -} - -static void gen_neon_dup_high16(TCGv_i32 var) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_andi_i32(var, var, 0xffff0000); - tcg_gen_shri_i32(tmp, var, 16); - tcg_gen_or_i32(var, var, tmp); - tcg_temp_free_i32(tmp); -} - -static inline TCGv_i32 neon_get_scalar(int size, int reg) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - if (size == MO_16) { - read_neon_element32(tmp, reg & 7, reg >> 4, MO_32); - if (reg & 8) { - gen_neon_dup_high16(tmp); - } else { - gen_neon_dup_low16(tmp); - } - } else { - read_neon_element32(tmp, reg & 15, reg >> 4, MO_32); - } - return tmp; -} - -static bool do_2scalar(DisasContext *s, arg_2scalar *a, - NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn) -{ - /* - * Two registers and a scalar: perform an operation between - * the input elements and the scalar, and then possibly - * perform an accumulation operation of that result into the - * destination. - */ - TCGv_i32 scalar, tmp; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!opfn) { - /* Bad size (including size == 3, which is a different insn group) */ - return false; - } - - if (a->q && ((a->vd | a->vn) & 1)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - scalar = neon_get_scalar(a->size, a->vm); - tmp = tcg_temp_new_i32(); - - for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { - read_neon_element32(tmp, a->vn, pass, MO_32); - opfn(tmp, tmp, scalar); - if (accfn) { - TCGv_i32 rd = tcg_temp_new_i32(); - read_neon_element32(rd, a->vd, pass, MO_32); - accfn(tmp, rd, tmp); - tcg_temp_free_i32(rd); - } - write_neon_element32(tmp, a->vd, pass, MO_32); - } - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(scalar); - return true; -} - -static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpFn * const opfn[] = { - NULL, - gen_helper_neon_mul_u16, - tcg_gen_mul_i32, - NULL, - }; - - return do_2scalar(s, a, opfn[a->size], NULL); -} - -static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpFn * const opfn[] = { - NULL, - gen_helper_neon_mul_u16, - tcg_gen_mul_i32, - NULL, - }; - static NeonGenTwoOpFn * const accfn[] = { - NULL, - gen_helper_neon_add_u16, - tcg_gen_add_i32, - NULL, - }; - - return do_2scalar(s, a, opfn[a->size], accfn[a->size]); -} - -static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpFn * const opfn[] = { - NULL, - gen_helper_neon_mul_u16, - tcg_gen_mul_i32, - NULL, - }; - static NeonGenTwoOpFn * const accfn[] = { - NULL, - gen_helper_neon_sub_u16, - tcg_gen_sub_i32, - NULL, - }; - - return do_2scalar(s, a, opfn[a->size], accfn[a->size]); -} - -static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a, - gen_helper_gvec_3_ptr *fn) -{ - /* Two registers and a scalar, using gvec */ - int vec_size = a->q ? 16 : 8; - int rd_ofs = neon_full_reg_offset(a->vd); - int rn_ofs = neon_full_reg_offset(a->vn); - int rm_ofs; - int idx; - TCGv_ptr fpstatus; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!fn) { - /* Bad size (including size == 3, which is a different insn group) */ - return false; - } - - if (a->q && ((a->vd | a->vn) & 1)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* a->vm is M:Vm, which encodes both register and index */ - idx = extract32(a->vm, a->size + 2, 2); - a->vm = extract32(a->vm, 0, a->size + 2); - rm_ofs = neon_full_reg_offset(a->vm); - - fpstatus = fpstatus_ptr(a->size == 1 ? 
FPST_STD_F16 : FPST_STD); - tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus, - vec_size, vec_size, idx, fn); - tcg_temp_free_ptr(fpstatus); - return true; -} - -#define DO_VMUL_F_2sc(NAME, FUNC) \ - static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \ - { \ - static gen_helper_gvec_3_ptr * const opfn[] = { \ - NULL, \ - gen_helper_##FUNC##_h, \ - gen_helper_##FUNC##_s, \ - NULL, \ - }; \ - if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \ - return false; \ - } \ - return do_2scalar_fp_vec(s, a, opfn[a->size]); \ - } - -DO_VMUL_F_2sc(VMUL, gvec_fmul_idx) -DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx) -DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx) - -WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16) -WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32) -WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16) -WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32) - -static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpFn * const opfn[] = { - NULL, - gen_VQDMULH_16, - gen_VQDMULH_32, - NULL, - }; - - return do_2scalar(s, a, opfn[a->size], NULL); -} - -static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpFn * const opfn[] = { - NULL, - gen_VQRDMULH_16, - gen_VQRDMULH_32, - NULL, - }; - - return do_2scalar(s, a, opfn[a->size], NULL); -} - -static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a, - NeonGenThreeOpEnvFn *opfn) -{ - /* - * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn - * performs a kind of fused op-then-accumulate using a helper - * function that takes all of rd, rn and the scalar at once. - */ - TCGv_i32 scalar, rn, rd; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - if (!dc_isar_feature(aa32_rdm, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!opfn) { - /* Bad size (including size == 3, which is a different insn group) */ - return false; - } - - if (a->q && ((a->vd | a->vn) & 1)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - scalar = neon_get_scalar(a->size, a->vm); - rn = tcg_temp_new_i32(); - rd = tcg_temp_new_i32(); - - for (pass = 0; pass < (a->q ? 4 : 2); pass++) { - read_neon_element32(rn, a->vn, pass, MO_32); - read_neon_element32(rd, a->vd, pass, MO_32); - opfn(rd, cpu_env, rn, scalar, rd); - write_neon_element32(rd, a->vd, pass, MO_32); - } - tcg_temp_free_i32(rn); - tcg_temp_free_i32(rd); - tcg_temp_free_i32(scalar); - - return true; -} - -static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenThreeOpEnvFn *opfn[] = { - NULL, - gen_helper_neon_qrdmlah_s16, - gen_helper_neon_qrdmlah_s32, - NULL, - }; - return do_vqrdmlah_2sc(s, a, opfn[a->size]); -} - -static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenThreeOpEnvFn *opfn[] = { - NULL, - gen_helper_neon_qrdmlsh_s16, - gen_helper_neon_qrdmlsh_s32, - NULL, - }; - return do_vqrdmlah_2sc(s, a, opfn[a->size]); -} - -static bool do_2scalar_long(DisasContext *s, arg_2scalar *a, - NeonGenTwoOpWidenFn *opfn, - NeonGenTwo64OpFn *accfn) -{ - /* - * Two registers and a scalar, long operations: perform an - * operation on the input elements and the scalar which produces - * a double-width result, and then possibly perform an accumulation - * operation of that result into the destination. 
- */ - TCGv_i32 scalar, rn; - TCGv_i64 rn0_64, rn1_64; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!opfn) { - /* Bad size (including size == 3, which is a different insn group) */ - return false; - } - - if (a->vd & 1) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - scalar = neon_get_scalar(a->size, a->vm); - - /* Load all inputs before writing any outputs, in case of overlap */ - rn = tcg_temp_new_i32(); - read_neon_element32(rn, a->vn, 0, MO_32); - rn0_64 = tcg_temp_new_i64(); - opfn(rn0_64, rn, scalar); - - read_neon_element32(rn, a->vn, 1, MO_32); - rn1_64 = tcg_temp_new_i64(); - opfn(rn1_64, rn, scalar); - tcg_temp_free_i32(rn); - tcg_temp_free_i32(scalar); - - if (accfn) { - TCGv_i64 t64 = tcg_temp_new_i64(); - read_neon_element64(t64, a->vd, 0, MO_64); - accfn(rn0_64, t64, rn0_64); - read_neon_element64(t64, a->vd, 1, MO_64); - accfn(rn1_64, t64, rn1_64); - tcg_temp_free_i64(t64); - } - - write_neon_element64(rn0_64, a->vd, 0, MO_64); - write_neon_element64(rn1_64, a->vd, 1, MO_64); - tcg_temp_free_i64(rn0_64); - tcg_temp_free_i64(rn1_64); - return true; -} - -static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_helper_neon_mull_s16, - gen_mull_s32, - NULL, - }; - - return do_2scalar_long(s, a, opfn[a->size], NULL); -} - -static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_helper_neon_mull_u16, - gen_mull_u32, - NULL, - }; - - return do_2scalar_long(s, a, opfn[a->size], NULL); -} - -#define DO_VMLAL_2SC(INSN, MULL, ACC) \ - static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \ - { \ - static NeonGenTwoOpWidenFn * const opfn[] = { \ - NULL, \ - gen_helper_neon_##MULL##16, \ - gen_##MULL##32, \ - NULL, \ - }; \ - static NeonGenTwo64OpFn * const accfn[] = { \ - NULL, \ - gen_helper_neon_##ACC##l_u32, \ - tcg_gen_##ACC##_i64, \ - NULL, \ - }; \ - return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \ - } - -DO_VMLAL_2SC(VMLAL_S, mull_s, add) -DO_VMLAL_2SC(VMLAL_U, mull_u, add) -DO_VMLAL_2SC(VMLSL_S, mull_s, sub) -DO_VMLAL_2SC(VMLSL_U, mull_u, sub) - -static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_VQDMULL_16, - gen_VQDMULL_32, - NULL, - }; - - return do_2scalar_long(s, a, opfn[a->size], NULL); -} - -static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_VQDMULL_16, - gen_VQDMULL_32, - NULL, - }; - static NeonGenTwo64OpFn * const accfn[] = { - NULL, - gen_VQDMLAL_acc_16, - gen_VQDMLAL_acc_32, - NULL, - }; - - return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); -} - -static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a) -{ - static NeonGenTwoOpWidenFn * const opfn[] = { - NULL, - gen_VQDMULL_16, - gen_VQDMULL_32, - NULL, - }; - static NeonGenTwo64OpFn * const accfn[] = { - NULL, - gen_VQDMLSL_acc_16, - gen_VQDMLSL_acc_32, - NULL, - }; - - return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); -} - -static bool trans_VEXT(DisasContext *s, arg_VEXT *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if ((a->vn | a->vm | a->vd) & a->q) { - return false; - } - - if (a->imm > 7 && !a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (!a->q) { - /* Extract 64 bits from */ - TCGv_i64 left, right, dest; - - left = tcg_temp_new_i64(); - right = tcg_temp_new_i64(); - dest = tcg_temp_new_i64(); - - read_neon_element64(right, a->vn, 0, MO_64); - read_neon_element64(left, a->vm, 0, MO_64); - tcg_gen_extract2_i64(dest, right, left, a->imm * 8); - write_neon_element64(dest, a->vd, 0, MO_64); - - tcg_temp_free_i64(left); - tcg_temp_free_i64(right); - tcg_temp_free_i64(dest); - } else { - /* Extract 128 bits from */ - TCGv_i64 left, middle, right, destleft, destright; - - left = tcg_temp_new_i64(); - middle = tcg_temp_new_i64(); - right = tcg_temp_new_i64(); - destleft = tcg_temp_new_i64(); - destright = tcg_temp_new_i64(); - - if (a->imm < 8) { - read_neon_element64(right, a->vn, 0, MO_64); - read_neon_element64(middle, a->vn, 1, MO_64); - tcg_gen_extract2_i64(destright, right, middle, a->imm * 8); - read_neon_element64(left, a->vm, 0, MO_64); - tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8); - } else { - read_neon_element64(right, a->vn, 1, MO_64); - read_neon_element64(middle, a->vm, 0, MO_64); - tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8); - read_neon_element64(left, a->vm, 1, MO_64); - tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8); - } - - write_neon_element64(destright, a->vd, 0, MO_64); - write_neon_element64(destleft, a->vd, 1, MO_64); - - tcg_temp_free_i64(destright); - tcg_temp_free_i64(destleft); - tcg_temp_free_i64(right); - tcg_temp_free_i64(middle); - tcg_temp_free_i64(left); - } - return true; -} - -static bool trans_VTBL(DisasContext *s, arg_VTBL *a) -{ - TCGv_i64 val, def; - TCGv_i32 desc; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if ((a->vn + a->len + 1) > 32) { - /* - * This is UNPREDICTABLE; we choose to UNDEF to avoid the - * helper function running off the end of the register file. - */ - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - desc = tcg_constant_i32((a->vn << 2) | a->len); - def = tcg_temp_new_i64(); - if (a->op) { - read_neon_element64(def, a->vd, 0, MO_64); - } else { - tcg_gen_movi_i64(def, 0); - } - val = tcg_temp_new_i64(); - read_neon_element64(val, a->vm, 0, MO_64); - - gen_helper_neon_tbl(val, cpu_env, desc, val, def); - write_neon_element64(val, a->vd, 0, MO_64); - - tcg_temp_free_i64(def); - tcg_temp_free_i64(val); - return true; -} - -static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->vd & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd), - neon_element_offset(a->vm, a->index, a->size), - a->q ? 16 : 8, a->q ? 
16 : 8); - return true; -} - -static bool trans_VREV64(DisasContext *s, arg_VREV64 *a) -{ - int pass, half; - TCGv_i32 tmp[2]; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (a->size == 3) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp[0] = tcg_temp_new_i32(); - tmp[1] = tcg_temp_new_i32(); - - for (pass = 0; pass < (a->q ? 2 : 1); pass++) { - for (half = 0; half < 2; half++) { - read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32); - switch (a->size) { - case 0: - tcg_gen_bswap32_i32(tmp[half], tmp[half]); - break; - case 1: - gen_swap_half(tmp[half], tmp[half]); - break; - case 2: - break; - default: - g_assert_not_reached(); - } - } - write_neon_element32(tmp[1], a->vd, pass * 2, MO_32); - write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32); - } - - tcg_temp_free_i32(tmp[0]); - tcg_temp_free_i32(tmp[1]); - return true; -} - -static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a, - NeonGenWidenFn *widenfn, - NeonGenTwo64OpFn *opfn, - NeonGenTwo64OpFn *accfn) -{ - /* - * Pairwise long operations: widen both halves of the pair, - * combine the pairs with the opfn, and then possibly accumulate - * into the destination with the accfn. - */ - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (!widenfn) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - for (pass = 0; pass < a->q + 1; pass++) { - TCGv_i32 tmp; - TCGv_i64 rm0_64, rm1_64, rd_64; - - rm0_64 = tcg_temp_new_i64(); - rm1_64 = tcg_temp_new_i64(); - rd_64 = tcg_temp_new_i64(); - - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vm, pass * 2, MO_32); - widenfn(rm0_64, tmp); - read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32); - widenfn(rm1_64, tmp); - tcg_temp_free_i32(tmp); - - opfn(rd_64, rm0_64, rm1_64); - tcg_temp_free_i64(rm0_64); - tcg_temp_free_i64(rm1_64); - - if (accfn) { - TCGv_i64 tmp64 = tcg_temp_new_i64(); - read_neon_element64(tmp64, a->vd, pass, MO_64); - accfn(rd_64, tmp64, rd_64); - tcg_temp_free_i64(tmp64); - } - write_neon_element64(rd_64, a->vd, pass, MO_64); - tcg_temp_free_i64(rd_64); - } - return true; -} - -static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a) -{ - static NeonGenWidenFn * const widenfn[] = { - gen_helper_neon_widen_s8, - gen_helper_neon_widen_s16, - tcg_gen_ext_i32_i64, - NULL, - }; - static NeonGenTwo64OpFn * const opfn[] = { - gen_helper_neon_paddl_u16, - gen_helper_neon_paddl_u32, - tcg_gen_add_i64, - NULL, - }; - - return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); -} - -static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a) -{ - static NeonGenWidenFn * const widenfn[] = { - gen_helper_neon_widen_u8, - gen_helper_neon_widen_u16, - tcg_gen_extu_i32_i64, - NULL, - }; - static NeonGenTwo64OpFn * const opfn[] = { - gen_helper_neon_paddl_u16, - gen_helper_neon_paddl_u32, - tcg_gen_add_i64, - NULL, - }; - - return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); -} - -static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a) -{ - static NeonGenWidenFn * const widenfn[] = { - gen_helper_neon_widen_s8, - 
gen_helper_neon_widen_s16, - tcg_gen_ext_i32_i64, - NULL, - }; - static NeonGenTwo64OpFn * const opfn[] = { - gen_helper_neon_paddl_u16, - gen_helper_neon_paddl_u32, - tcg_gen_add_i64, - NULL, - }; - static NeonGenTwo64OpFn * const accfn[] = { - gen_helper_neon_addl_u16, - gen_helper_neon_addl_u32, - tcg_gen_add_i64, - NULL, - }; - - return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], - accfn[a->size]); -} - -static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a) -{ - static NeonGenWidenFn * const widenfn[] = { - gen_helper_neon_widen_u8, - gen_helper_neon_widen_u16, - tcg_gen_extu_i32_i64, - NULL, - }; - static NeonGenTwo64OpFn * const opfn[] = { - gen_helper_neon_paddl_u16, - gen_helper_neon_paddl_u32, - tcg_gen_add_i64, - NULL, - }; - static NeonGenTwo64OpFn * const accfn[] = { - gen_helper_neon_addl_u16, - gen_helper_neon_addl_u32, - tcg_gen_add_i64, - NULL, - }; - - return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], - accfn[a->size]); -} - -typedef void ZipFn(TCGv_ptr, TCGv_ptr); - -static bool do_zip_uzp(DisasContext *s, arg_2misc *a, - ZipFn *fn) -{ - TCGv_ptr pd, pm; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (!fn) { - /* Bad size or size/q combination */ - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - pd = vfp_reg_ptr(true, a->vd); - pm = vfp_reg_ptr(true, a->vm); - fn(pd, pm); - tcg_temp_free_ptr(pd); - tcg_temp_free_ptr(pm); - return true; -} - -static bool trans_VUZP(DisasContext *s, arg_2misc *a) -{ - static ZipFn * const fn[2][4] = { - { - gen_helper_neon_unzip8, - gen_helper_neon_unzip16, - NULL, - NULL, - }, { - gen_helper_neon_qunzip8, - gen_helper_neon_qunzip16, - gen_helper_neon_qunzip32, - NULL, - } - }; - return do_zip_uzp(s, a, fn[a->q][a->size]); -} - -static bool trans_VZIP(DisasContext *s, arg_2misc *a) -{ - static ZipFn * const fn[2][4] = { - { - gen_helper_neon_zip8, - gen_helper_neon_zip16, - NULL, - NULL, - }, { - gen_helper_neon_qzip8, - gen_helper_neon_qzip16, - gen_helper_neon_qzip32, - NULL, - } - }; - return do_zip_uzp(s, a, fn[a->q][a->size]); -} - -static bool do_vmovn(DisasContext *s, arg_2misc *a, - NeonGenNarrowEnvFn *narrowfn) -{ - TCGv_i64 rm; - TCGv_i32 rd0, rd1; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->vm & 1) { - return false; - } - - if (!narrowfn) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - rm = tcg_temp_new_i64(); - rd0 = tcg_temp_new_i32(); - rd1 = tcg_temp_new_i32(); - - read_neon_element64(rm, a->vm, 0, MO_64); - narrowfn(rd0, cpu_env, rm); - read_neon_element64(rm, a->vm, 1, MO_64); - narrowfn(rd1, cpu_env, rm); - write_neon_element32(rd0, a->vd, 0, MO_32); - write_neon_element32(rd1, a->vd, 1, MO_32); - tcg_temp_free_i32(rd0); - tcg_temp_free_i32(rd1); - tcg_temp_free_i64(rm); - return true; -} - -#define DO_VMOVN(INSN, FUNC) \ - static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ - { \ - static NeonGenNarrowEnvFn * const narrowfn[] = { \ - FUNC##8, \ - FUNC##16, \ - FUNC##32, \ - NULL, \ - }; \ - return do_vmovn(s, a, narrowfn[a->size]); \ - } - -DO_VMOVN(VMOVN, gen_neon_narrow_u) -DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat) -DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s) -DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u) - -static bool trans_VSHLL(DisasContext *s, arg_2misc *a) -{ - TCGv_i32 rm0, rm1; - TCGv_i64 rd; - static NeonGenWidenFn * const widenfns[] = { - gen_helper_neon_widen_u8, - gen_helper_neon_widen_u16, - tcg_gen_extu_i32_i64, - NULL, - }; - NeonGenWidenFn *widenfn = widenfns[a->size]; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->vd & 1) { - return false; - } - - if (!widenfn) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - rd = tcg_temp_new_i64(); - rm0 = tcg_temp_new_i32(); - rm1 = tcg_temp_new_i32(); - - read_neon_element32(rm0, a->vm, 0, MO_32); - read_neon_element32(rm1, a->vm, 1, MO_32); - - widenfn(rd, rm0); - tcg_gen_shli_i64(rd, rd, 8 << a->size); - write_neon_element64(rd, a->vd, 0, MO_64); - widenfn(rd, rm1); - tcg_gen_shli_i64(rd, rd, 8 << a->size); - write_neon_element64(rd, a->vd, 1, MO_64); - - tcg_temp_free_i64(rd); - tcg_temp_free_i32(rm0); - tcg_temp_free_i32(rm1); - return true; -} - -static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a) -{ - TCGv_ptr fpst; - TCGv_i64 tmp; - TCGv_i32 dst0, dst1; - - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm & 1) || (a->size != 1)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_STD); - tmp = tcg_temp_new_i64(); - dst0 = tcg_temp_new_i32(); - dst1 = tcg_temp_new_i32(); - - read_neon_element64(tmp, a->vm, 0, MO_64); - gen_helper_bfcvt_pair(dst0, tmp, fpst); - - read_neon_element64(tmp, a->vm, 1, MO_64); - gen_helper_bfcvt_pair(dst1, tmp, fpst); - - write_neon_element32(dst0, a->vd, 0, MO_32); - write_neon_element32(dst1, a->vd, 1, MO_32); - - tcg_temp_free_i64(tmp); - tcg_temp_free_i32(dst0); - tcg_temp_free_i32(dst1); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a) -{ - TCGv_ptr fpst; - TCGv_i32 ahp, tmp, tmp2, tmp3; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON) || - !dc_isar_feature(aa32_fp16_spconv, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vm & 1) || (a->size != 1)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_STD); - ahp = get_ahp_flag(); - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vm, 0, MO_32); - gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); - tmp2 = tcg_temp_new_i32(); - read_neon_element32(tmp2, a->vm, 1, MO_32); - gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp); - tcg_gen_shli_i32(tmp2, tmp2, 16); - tcg_gen_or_i32(tmp2, tmp2, tmp); - read_neon_element32(tmp, a->vm, 2, MO_32); - gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); - tmp3 = tcg_temp_new_i32(); - read_neon_element32(tmp3, a->vm, 3, MO_32); - write_neon_element32(tmp2, a->vd, 0, MO_32); - tcg_temp_free_i32(tmp2); - gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp); - tcg_gen_shli_i32(tmp3, tmp3, 16); - tcg_gen_or_i32(tmp3, tmp3, tmp); - write_neon_element32(tmp3, a->vd, 1, MO_32); - tcg_temp_free_i32(tmp3); - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(ahp); - tcg_temp_free_ptr(fpst); - - return true; -} - -static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a) -{ - TCGv_ptr fpst; - TCGv_i32 ahp, tmp, tmp2, tmp3; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON) || - !dc_isar_feature(aa32_fp16_spconv, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vd & 1) || (a->size != 1)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_STD); - ahp = get_ahp_flag(); - tmp3 = tcg_temp_new_i32(); - tmp2 = tcg_temp_new_i32(); - tmp = tcg_temp_new_i32(); - read_neon_element32(tmp, a->vm, 0, MO_32); - read_neon_element32(tmp2, a->vm, 1, MO_32); - tcg_gen_ext16u_i32(tmp3, tmp); - gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); - write_neon_element32(tmp3, a->vd, 0, MO_32); - tcg_gen_shri_i32(tmp, tmp, 16); - gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp); - write_neon_element32(tmp, a->vd, 1, MO_32); - tcg_temp_free_i32(tmp); - tcg_gen_ext16u_i32(tmp3, tmp2); - gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); - write_neon_element32(tmp3, a->vd, 2, MO_32); - tcg_temp_free_i32(tmp3); - tcg_gen_shri_i32(tmp2, tmp2, 16); - gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp); - write_neon_element32(tmp2, a->vd, 3, MO_32); - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(ahp); - tcg_temp_free_ptr(fpst); - - return true; -} - -static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn) -{ - int vec_size = a->q ? 16 : 8; - int rd_ofs = neon_full_reg_offset(a->vd); - int rm_ofs = neon_full_reg_offset(a->vm); - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->size == 3) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size); - - return true; -} - -#define DO_2MISC_VEC(INSN, FN) \ - static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ - { \ - return do_2misc_vec(s, a, FN); \ - } - -DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg) -DO_2MISC_VEC(VABS, tcg_gen_gvec_abs) -DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0) -DO_2MISC_VEC(VCGT0, gen_gvec_cgt0) -DO_2MISC_VEC(VCLE0, gen_gvec_cle0) -DO_2MISC_VEC(VCGE0, gen_gvec_cge0) -DO_2MISC_VEC(VCLT0, gen_gvec_clt0) - -static bool trans_VMVN(DisasContext *s, arg_2misc *a) -{ - if (a->size != 0) { - return false; - } - return do_2misc_vec(s, a, tcg_gen_gvec_not); -} - -#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \ - static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ - uint32_t rm_ofs, uint32_t oprsz, \ - uint32_t maxsz) \ - { \ - tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \ - DATA, FUNC); \ - } - -#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \ - static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ - uint32_t rm_ofs, uint32_t oprsz, \ - uint32_t maxsz) \ - { \ - tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \ - } - -WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0) -WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1) -WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0) -WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1) -WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0) -WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0) -WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0) - -#define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \ - static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ - { \ - if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \ - return false; \ - } \ - return do_2misc_vec(s, a, gen_##INSN); \ - } - -DO_2M_CRYPTO(AESE, aa32_aes, 0) -DO_2M_CRYPTO(AESD, aa32_aes, 0) -DO_2M_CRYPTO(AESMC, aa32_aes, 0) -DO_2M_CRYPTO(AESIMC, aa32_aes, 0) -DO_2M_CRYPTO(SHA1H, aa32_sha1, 2) -DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2) -DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2) - -static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn) -{ - TCGv_i32 tmp; - int pass; - - /* Handle a 2-reg-misc operation by iterating 32 bits at a time */ - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (!fn) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { - read_neon_element32(tmp, a->vm, pass, MO_32); - fn(tmp, tmp); - write_neon_element32(tmp, a->vd, pass, MO_32); - } - tcg_temp_free_i32(tmp); - - return true; -} - -static bool trans_VREV32(DisasContext *s, arg_2misc *a) -{ - static NeonGenOneOpFn * const fn[] = { - tcg_gen_bswap32_i32, - gen_swap_half, - NULL, - NULL, - }; - return do_2misc(s, a, fn[a->size]); -} - -static bool trans_VREV16(DisasContext *s, arg_2misc *a) -{ - if (a->size != 0) { - return false; - } - return do_2misc(s, a, gen_rev16); -} - -static bool trans_VCLS(DisasContext *s, arg_2misc *a) -{ - static NeonGenOneOpFn * const fn[] = { - gen_helper_neon_cls_s8, - gen_helper_neon_cls_s16, - gen_helper_neon_cls_s32, - NULL, - }; - return do_2misc(s, a, fn[a->size]); -} - -static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm) -{ - tcg_gen_clzi_i32(rd, rm, 32); -} - -static bool trans_VCLZ(DisasContext *s, arg_2misc *a) -{ - static NeonGenOneOpFn * const fn[] = { - gen_helper_neon_clz_u8, - gen_helper_neon_clz_u16, - do_VCLZ_32, - NULL, - }; - return do_2misc(s, a, fn[a->size]); -} - -static bool trans_VCNT(DisasContext *s, arg_2misc *a) -{ - if (a->size != 0) { - return false; - } - return do_2misc(s, a, gen_helper_neon_cnt_u8); -} - -static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t oprsz, uint32_t maxsz) -{ - tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs, - vece == MO_16 ? 0x7fff : 0x7fffffff, - oprsz, maxsz); -} - -static bool trans_VABS_F(DisasContext *s, arg_2misc *a) -{ - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - } else if (a->size != MO_32) { - return false; - } - return do_2misc_vec(s, a, gen_VABS_F); -} - -static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t oprsz, uint32_t maxsz) -{ - tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs, - vece == MO_16 ? 
0x8000 : 0x80000000, - oprsz, maxsz); -} - -static bool trans_VNEG_F(DisasContext *s, arg_2misc *a) -{ - if (a->size == MO_16) { - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - } else if (a->size != MO_32) { - return false; - } - return do_2misc_vec(s, a, gen_VNEG_F); -} - -static bool trans_VRECPE(DisasContext *s, arg_2misc *a) -{ - if (a->size != 2) { - return false; - } - return do_2misc(s, a, gen_helper_recpe_u32); -} - -static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a) -{ - if (a->size != 2) { - return false; - } - return do_2misc(s, a, gen_helper_rsqrte_u32); -} - -#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \ - static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \ - { \ - FUNC(d, cpu_env, m); \ - } - -WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8) -WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16) -WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32) -WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8) -WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16) -WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32) - -static bool trans_VQABS(DisasContext *s, arg_2misc *a) -{ - static NeonGenOneOpFn * const fn[] = { - gen_VQABS_s8, - gen_VQABS_s16, - gen_VQABS_s32, - NULL, - }; - return do_2misc(s, a, fn[a->size]); -} - -static bool trans_VQNEG(DisasContext *s, arg_2misc *a) -{ - static NeonGenOneOpFn * const fn[] = { - gen_VQNEG_s8, - gen_VQNEG_s16, - gen_VQNEG_s32, - NULL, - }; - return do_2misc(s, a, fn[a->size]); -} - -#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \ - static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \ - uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - static gen_helper_gvec_2_ptr * const fns[4] = { \ - NULL, HFUNC, SFUNC, NULL, \ - }; \ - TCGv_ptr fpst; \ - fpst = fpstatus_ptr(vece == MO_16 ? 
FPST_STD_F16 : FPST_STD); \ - tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \ - fns[vece]); \ - tcg_temp_free_ptr(fpst); \ - } \ - static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ - { \ - if (a->size == MO_16) { \ - if (!dc_isar_feature(aa32_fp16_arith, s)) { \ - return false; \ - } \ - } else if (a->size != MO_32) { \ - return false; \ - } \ - return do_2misc_vec(s, a, gen_##INSN); \ - } - -DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s) -DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s) -DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s) -DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s) -DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s) -DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s) -DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s) -DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos) -DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos) -DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs) -DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs) - -DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s) - -static bool trans_VRINTX(DisasContext *s, arg_2misc *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8)) { - return false; - } - return trans_VRINTX_impl(s, a); -} - -#define DO_VEC_RMODE(INSN, RMODE, OP) \ - static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \ - uint32_t rm_ofs, \ - uint32_t oprsz, uint32_t maxsz) \ - { \ - static gen_helper_gvec_2_ptr * const fns[4] = { \ - NULL, \ - gen_helper_gvec_##OP##h, \ - gen_helper_gvec_##OP##s, \ - NULL, \ - }; \ - TCGv_ptr fpst; \ - fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD); \ - tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \ - arm_rmode_to_sf(RMODE), fns[vece]); \ - tcg_temp_free_ptr(fpst); \ - } \ - static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ - { \ - if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \ - return false; \ - } \ - if (a->size == MO_16) { \ - if (!dc_isar_feature(aa32_fp16_arith, s)) { \ - return false; \ - } \ - } else if (a->size != MO_32) { \ - return false; \ - } \ - return do_2misc_vec(s, a, gen_##INSN); \ - } - -DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u) -DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s) -DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u) -DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s) -DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u) -DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s) -DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u) -DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s) - -DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_) -DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_) -DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_) -DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_) -DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_) - -static bool trans_VSWP(DisasContext *s, arg_2misc *a) -{ - TCGv_i64 rm, rd; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (a->size != 0) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - rm = tcg_temp_new_i64(); - rd = tcg_temp_new_i64(); - for (pass = 0; pass < (a->q ? 2 : 1); pass++) { - read_neon_element64(rm, a->vm, pass, MO_64); - read_neon_element64(rd, a->vd, pass, MO_64); - write_neon_element64(rm, a->vd, pass, MO_64); - write_neon_element64(rd, a->vm, pass, MO_64); - } - tcg_temp_free_i64(rm); - tcg_temp_free_i64(rd); - - return true; -} -static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 rd, tmp; - - rd = tcg_temp_new_i32(); - tmp = tcg_temp_new_i32(); - - tcg_gen_shli_i32(rd, t0, 8); - tcg_gen_andi_i32(rd, rd, 0xff00ff00); - tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); - tcg_gen_or_i32(rd, rd, tmp); - - tcg_gen_shri_i32(t1, t1, 8); - tcg_gen_andi_i32(t1, t1, 0x00ff00ff); - tcg_gen_andi_i32(tmp, t0, 0xff00ff00); - tcg_gen_or_i32(t1, t1, tmp); - tcg_gen_mov_i32(t0, rd); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(rd); -} - -static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 rd, tmp; - - rd = tcg_temp_new_i32(); - tmp = tcg_temp_new_i32(); - - tcg_gen_shli_i32(rd, t0, 16); - tcg_gen_andi_i32(tmp, t1, 0xffff); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shri_i32(t1, t1, 16); - tcg_gen_andi_i32(tmp, t0, 0xffff0000); - tcg_gen_or_i32(t1, t1, tmp); - tcg_gen_mov_i32(t0, rd); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(rd); -} - -static bool trans_VTRN(DisasContext *s, arg_2misc *a) -{ - TCGv_i32 tmp, tmp2; - int pass; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vm) & 0x10)) { - return false; - } - - if ((a->vd | a->vm) & a->q) { - return false; - } - - if (a->size == 3) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - tmp2 = tcg_temp_new_i32(); - if (a->size == MO_32) { - for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) { - read_neon_element32(tmp, a->vm, pass, MO_32); - read_neon_element32(tmp2, a->vd, pass + 1, MO_32); - write_neon_element32(tmp2, a->vm, pass, MO_32); - write_neon_element32(tmp, a->vd, pass + 1, MO_32); - } - } else { - for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { - read_neon_element32(tmp, a->vm, pass, MO_32); - read_neon_element32(tmp2, a->vd, pass, MO_32); - if (a->size == MO_8) { - gen_neon_trn_u8(tmp, tmp2); - } else { - gen_neon_trn_u16(tmp, tmp2); - } - write_neon_element32(tmp2, a->vm, pass, MO_32); - write_neon_element32(tmp, a->vd, pass, MO_32); - } - } - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(tmp2); - return true; -} - -static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a) -{ - if (!dc_isar_feature(aa32_i8mm, s)) { - return false; - } - return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_smmla_b); -} - -static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a) -{ - if (!dc_isar_feature(aa32_i8mm, s)) { - return false; - } - return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_ummla_b); -} - -static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a) -{ - if (!dc_isar_feature(aa32_i8mm, s)) { - return false; - } - return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_usmmla_b); -} - -static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a) -{ - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0, - gen_helper_gvec_bfmmla); -} - -static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a) -{ - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD, - gen_helper_gvec_bfmlal); -} - -static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a) -{ - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm, - (a->index << 1) | a->q, FPST_STD, - gen_helper_gvec_bfmlal_idx); -} diff --git a/target/arm/translate-sme.c b/target/arm/translate-sme.c deleted file mode 100644 index 7b87a9d..0000000 --- a/target/arm/translate-sme.c +++ /dev/null @@ -1,373 +0,0 @@ -/* - * AArch64 SME translation - * - * Copyright (c) 2022 Linaro, Ltd - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "tcg/tcg-gvec-desc.h" -#include "translate.h" -#include "exec/helper-gen.h" -#include "translate-a64.h" -#include "fpu/softfloat.h" - - -/* - * Include the generated decoder. - */ - -#include "decode-sme.c.inc" - - -/* - * Resolve tile.size[index] to a host pointer, where tile and index - * are always decoded together, dependent on the element size. - */ -static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, - int tile_index, bool vertical) -{ - int tile = tile_index >> (4 - esz); - int index = esz == MO_128 ? 0 : extract32(tile_index, 0, 4 - esz); - int pos, len, offset; - TCGv_i32 tmp; - TCGv_ptr addr; - - /* Compute the final index, which is Rs+imm. 
*/ - tmp = tcg_temp_new_i32(); - tcg_gen_trunc_tl_i32(tmp, cpu_reg(s, rs)); - tcg_gen_addi_i32(tmp, tmp, index); - - /* Prepare a power-of-two modulo via extraction of @len bits. */ - len = ctz32(streaming_vec_reg_size(s)) - esz; - - if (vertical) { - /* - * Compute the byte offset of the index within the tile: - * (index % (svl / size)) * size - * = (index % (svl >> esz)) << esz - * Perform the power-of-two modulo via extraction of the low @len bits. - * Perform the multiply by shifting left by @pos bits. - * Perform these operations simultaneously via deposit into zero. - */ - pos = esz; - tcg_gen_deposit_z_i32(tmp, tmp, pos, len); - - /* - * For big-endian, adjust the indexed column byte offset within - * the uint64_t host words that make up env->zarray[]. - */ - if (HOST_BIG_ENDIAN && esz < MO_64) { - tcg_gen_xori_i32(tmp, tmp, 8 - (1 << esz)); - } - } else { - /* - * Compute the byte offset of the index within the tile: - * (index % (svl / size)) * (size * sizeof(row)) - * = (index % (svl >> esz)) << (esz + log2(sizeof(row))) - */ - pos = esz + ctz32(sizeof(ARMVectorReg)); - tcg_gen_deposit_z_i32(tmp, tmp, pos, len); - - /* Row slices are always aligned and need no endian adjustment. */ - } - - /* The tile byte offset within env->zarray is the row. */ - offset = tile * sizeof(ARMVectorReg); - - /* Include the byte offset of zarray to make this relative to env. */ - offset += offsetof(CPUARMState, zarray); - tcg_gen_addi_i32(tmp, tmp, offset); - - /* Add the byte offset to env to produce the final pointer. */ - addr = tcg_temp_new_ptr(); - tcg_gen_ext_i32_ptr(addr, tmp); - tcg_temp_free_i32(tmp); - tcg_gen_add_ptr(addr, addr, cpu_env); - - return addr; -} - -static bool trans_ZERO(DisasContext *s, arg_ZERO *a) -{ - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } - if (sme_za_enabled_check(s)) { - gen_helper_sme_zero(cpu_env, tcg_constant_i32(a->imm), - tcg_constant_i32(streaming_vec_reg_size(s))); - } - return true; -} - -static bool trans_MOVA(DisasContext *s, arg_MOVA *a) -{ - static gen_helper_gvec_4 * const h_fns[5] = { - gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, - gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d, - gen_helper_sve_sel_zpzz_q - }; - static gen_helper_gvec_3 * const cz_fns[5] = { - gen_helper_sme_mova_cz_b, gen_helper_sme_mova_cz_h, - gen_helper_sme_mova_cz_s, gen_helper_sme_mova_cz_d, - gen_helper_sme_mova_cz_q, - }; - static gen_helper_gvec_3 * const zc_fns[5] = { - gen_helper_sme_mova_zc_b, gen_helper_sme_mova_zc_h, - gen_helper_sme_mova_zc_s, gen_helper_sme_mova_zc_d, - gen_helper_sme_mova_zc_q, - }; - - TCGv_ptr t_za, t_zr, t_pg; - TCGv_i32 t_desc; - int svl; - - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } - if (!sme_smza_enabled_check(s)) { - return true; - } - - t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); - t_zr = vec_full_reg_ptr(s, a->zr); - t_pg = pred_full_reg_ptr(s, a->pg); - - svl = streaming_vec_reg_size(s); - t_desc = tcg_constant_i32(simd_desc(svl, svl, 0)); - - if (a->v) { - /* Vertical slice -- use sme mova helpers. */ - if (a->to_vec) { - zc_fns[a->esz](t_zr, t_za, t_pg, t_desc); - } else { - cz_fns[a->esz](t_za, t_zr, t_pg, t_desc); - } - } else { - /* Horizontal slice -- reuse sve sel helpers. 
*/ - if (a->to_vec) { - h_fns[a->esz](t_zr, t_za, t_zr, t_pg, t_desc); - } else { - h_fns[a->esz](t_za, t_zr, t_za, t_pg, t_desc); - } - } - - tcg_temp_free_ptr(t_za); - tcg_temp_free_ptr(t_zr); - tcg_temp_free_ptr(t_pg); - - return true; -} - -static bool trans_LDST1(DisasContext *s, arg_LDST1 *a) -{ - typedef void GenLdSt1(TCGv_env, TCGv_ptr, TCGv_ptr, TCGv, TCGv_i32); - - /* - * Indexed by [esz][be][v][mte][st], which is (except for load/store) - * also the order in which the elements appear in the function names, - * and so how we must concatenate the pieces. - */ - -#define FN_LS(F) { gen_helper_sme_ld1##F, gen_helper_sme_st1##F } -#define FN_MTE(F) { FN_LS(F), FN_LS(F##_mte) } -#define FN_HV(F) { FN_MTE(F##_h), FN_MTE(F##_v) } -#define FN_END(L, B) { FN_HV(L), FN_HV(B) } - - static GenLdSt1 * const fns[5][2][2][2][2] = { - FN_END(b, b), - FN_END(h_le, h_be), - FN_END(s_le, s_be), - FN_END(d_le, d_be), - FN_END(q_le, q_be), - }; - -#undef FN_LS -#undef FN_MTE -#undef FN_HV -#undef FN_END - - TCGv_ptr t_za, t_pg; - TCGv_i64 addr; - int svl, desc = 0; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } - if (!sme_smza_enabled_check(s)) { - return true; - } - - t_za = get_tile_rowcol(s, a->esz, a->rs, a->za_imm, a->v); - t_pg = pred_full_reg_ptr(s, a->pg); - addr = tcg_temp_new_i64(); - - tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->esz); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - - if (mte) { - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); - desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); - desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); - desc = FIELD_DP32(desc, MTEDESC, WRITE, a->st); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << a->esz) - 1); - desc <<= SVE_MTEDESC_SHIFT; - } else { - addr = clean_data_tbi(s, addr); - } - svl = streaming_vec_reg_size(s); - desc = simd_desc(svl, svl, desc); - - fns[a->esz][be][a->v][mte][a->st](cpu_env, t_za, t_pg, addr, - tcg_constant_i32(desc)); - - tcg_temp_free_ptr(t_za); - tcg_temp_free_ptr(t_pg); - tcg_temp_free_i64(addr); - return true; -} - -typedef void GenLdStR(DisasContext *, TCGv_ptr, int, int, int, int); - -static bool do_ldst_r(DisasContext *s, arg_ldstr *a, GenLdStR *fn) -{ - int svl = streaming_vec_reg_size(s); - int imm = a->imm; - TCGv_ptr base; - - if (!sme_za_enabled_check(s)) { - return true; - } - - /* ZA[n] equates to ZA0H.B[n]. */ - base = get_tile_rowcol(s, MO_8, a->rv, imm, false); - - fn(s, base, 0, svl, a->rn, imm * svl); - - tcg_temp_free_ptr(base); - return true; -} - -TRANS_FEAT(LDR, aa64_sme, do_ldst_r, a, gen_sve_ldr) -TRANS_FEAT(STR, aa64_sme, do_ldst_r, a, gen_sve_str) - -static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz, - gen_helper_gvec_4 *fn) -{ - int svl = streaming_vec_reg_size(s); - uint32_t desc = simd_desc(svl, svl, 0); - TCGv_ptr za, zn, pn, pm; - - if (!sme_smza_enabled_check(s)) { - return true; - } - - /* Sum XZR+zad to find ZAd. 
*/ - za = get_tile_rowcol(s, esz, 31, a->zad, false); - zn = vec_full_reg_ptr(s, a->zn); - pn = pred_full_reg_ptr(s, a->pn); - pm = pred_full_reg_ptr(s, a->pm); - - fn(za, zn, pn, pm, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(za); - tcg_temp_free_ptr(zn); - tcg_temp_free_ptr(pn); - tcg_temp_free_ptr(pm); - return true; -} - -TRANS_FEAT(ADDHA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addha_s) -TRANS_FEAT(ADDVA_s, aa64_sme, do_adda, a, MO_32, gen_helper_sme_addva_s) -TRANS_FEAT(ADDHA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addha_d) -TRANS_FEAT(ADDVA_d, aa64_sme_i16i64, do_adda, a, MO_64, gen_helper_sme_addva_d) - -static bool do_outprod(DisasContext *s, arg_op *a, MemOp esz, - gen_helper_gvec_5 *fn) -{ - int svl = streaming_vec_reg_size(s); - uint32_t desc = simd_desc(svl, svl, a->sub); - TCGv_ptr za, zn, zm, pn, pm; - - if (!sme_smza_enabled_check(s)) { - return true; - } - - /* Sum XZR+zad to find ZAd. */ - za = get_tile_rowcol(s, esz, 31, a->zad, false); - zn = vec_full_reg_ptr(s, a->zn); - zm = vec_full_reg_ptr(s, a->zm); - pn = pred_full_reg_ptr(s, a->pn); - pm = pred_full_reg_ptr(s, a->pm); - - fn(za, zn, zm, pn, pm, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(za); - tcg_temp_free_ptr(zn); - tcg_temp_free_ptr(pn); - tcg_temp_free_ptr(pm); - return true; -} - -static bool do_outprod_fpst(DisasContext *s, arg_op *a, MemOp esz, - gen_helper_gvec_5_ptr *fn) -{ - int svl = streaming_vec_reg_size(s); - uint32_t desc = simd_desc(svl, svl, a->sub); - TCGv_ptr za, zn, zm, pn, pm, fpst; - - if (!sme_smza_enabled_check(s)) { - return true; - } - - /* Sum XZR+zad to find ZAd. */ - za = get_tile_rowcol(s, esz, 31, a->zad, false); - zn = vec_full_reg_ptr(s, a->zn); - zm = vec_full_reg_ptr(s, a->zm); - pn = pred_full_reg_ptr(s, a->pn); - pm = pred_full_reg_ptr(s, a->pm); - fpst = fpstatus_ptr(FPST_FPCR); - - fn(za, zn, zm, pn, pm, fpst, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(za); - tcg_temp_free_ptr(zn); - tcg_temp_free_ptr(pn); - tcg_temp_free_ptr(pm); - tcg_temp_free_ptr(fpst); - return true; -} - -TRANS_FEAT(FMOPA_h, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_h) -TRANS_FEAT(FMOPA_s, aa64_sme, do_outprod_fpst, a, MO_32, gen_helper_sme_fmopa_s) -TRANS_FEAT(FMOPA_d, aa64_sme_f64f64, do_outprod_fpst, a, MO_64, gen_helper_sme_fmopa_d) - -/* TODO: FEAT_EBF16 */ -TRANS_FEAT(BFMOPA, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_bfmopa) - -TRANS_FEAT(SMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_smopa_s) -TRANS_FEAT(UMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_umopa_s) -TRANS_FEAT(SUMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_sumopa_s) -TRANS_FEAT(USMOPA_s, aa64_sme, do_outprod, a, MO_32, gen_helper_sme_usmopa_s) - -TRANS_FEAT(SMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_smopa_d) -TRANS_FEAT(UMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_umopa_d) -TRANS_FEAT(SUMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_sumopa_d) -TRANS_FEAT(USMOPA_d, aa64_sme_i16i64, do_outprod, a, MO_64, gen_helper_sme_usmopa_d) diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c deleted file mode 100644 index 621a2ab..0000000 --- a/target/arm/translate-sve.c +++ /dev/null @@ -1,7583 +0,0 @@ -/* - * AArch64 SVE translation - * - * Copyright (c) 2018 Linaro, Ltd - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, 
or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "exec/exec-all.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "tcg/tcg-gvec-desc.h" -#include "qemu/log.h" -#include "arm_ldst.h" -#include "translate.h" -#include "internals.h" -#include "exec/helper-proto.h" -#include "exec/helper-gen.h" -#include "exec/log.h" -#include "translate-a64.h" -#include "fpu/softfloat.h" - - -typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, - TCGv_i64, uint32_t, uint32_t); - -typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, - TCGv_ptr, TCGv_i32); -typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, - TCGv_ptr, TCGv_ptr, TCGv_i32); - -typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32); -typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr, - TCGv_ptr, TCGv_i64, TCGv_i32); - -/* - * Helpers for extracting complex instruction fields. - */ - -/* See e.g. ASR (immediate, predicated). - * Returns -1 for unallocated encoding; diagnose later. - */ -static int tszimm_esz(DisasContext *s, int x) -{ - x >>= 3; /* discard imm3 */ - return 31 - clz32(x); -} - -static int tszimm_shr(DisasContext *s, int x) -{ - return (16 << tszimm_esz(s, x)) - x; -} - -/* See e.g. LSL (immediate, predicated). */ -static int tszimm_shl(DisasContext *s, int x) -{ - return x - (8 << tszimm_esz(s, x)); -} - -/* The SH bit is in bit 8. Extract the low 8 and shift. */ -static inline int expand_imm_sh8s(DisasContext *s, int x) -{ - return (int8_t)x << (x & 0x100 ? 8 : 0); -} - -static inline int expand_imm_sh8u(DisasContext *s, int x) -{ - return (uint8_t)x << (x & 0x100 ? 8 : 0); -} - -/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype) - * with unsigned data. C.f. SVE Memory Contiguous Load Group. - */ -static inline int msz_dtype(DisasContext *s, int msz) -{ - static const uint8_t dtype[4] = { 0, 5, 10, 15 }; - return dtype[msz]; -} - -/* - * Include the generated decoder. - */ - -#include "decode-sve.c.inc" - -/* - * Implement all of the translator functions referenced by the decoder. - */ - -/* Invoke an out-of-line helper on 2 Zregs. */ -static bool gen_gvec_ool_zz(DisasContext *s, gen_helper_gvec_2 *fn, - int rd, int rn, int data) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vsz, vsz, data, fn); - } - return true; -} - -static bool gen_gvec_fpst_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, - int rd, int rn, int data, - ARMFPStatusFlavour flavour) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = fpstatus_ptr(flavour); - - tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - status, vsz, vsz, data, fn); - tcg_temp_free_ptr(status); - } - return true; -} - -static bool gen_gvec_fpst_arg_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, - arg_rr_esz *a, int data) -{ - return gen_gvec_fpst_zz(s, fn, a->rd, a->rn, data, - a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); -} - -/* Invoke an out-of-line helper on 3 Zregs. */ -static bool gen_gvec_ool_zzz(DisasContext *s, gen_helper_gvec_3 *fn, - int rd, int rn, int rm, int data) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vsz, vsz, data, fn); - } - return true; -} - -static bool gen_gvec_ool_arg_zzz(DisasContext *s, gen_helper_gvec_3 *fn, - arg_rrr_esz *a, int data) -{ - return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, data); -} - -/* Invoke an out-of-line helper on 3 Zregs, plus float_status. */ -static bool gen_gvec_fpst_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, - int rd, int rn, int rm, - int data, ARMFPStatusFlavour flavour) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = fpstatus_ptr(flavour); - - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - status, vsz, vsz, data, fn); - - tcg_temp_free_ptr(status); - } - return true; -} - -static bool gen_gvec_fpst_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, - arg_rrr_esz *a, int data) -{ - return gen_gvec_fpst_zzz(s, fn, a->rd, a->rn, a->rm, data, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); -} - -/* Invoke an out-of-line helper on 4 Zregs. */ -static bool gen_gvec_ool_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, - int rd, int rn, int rm, int ra, int data) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, ra), - vsz, vsz, data, fn); - } - return true; -} - -static bool gen_gvec_ool_arg_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, - arg_rrrr_esz *a, int data) -{ - return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, data); -} - -static bool gen_gvec_ool_arg_zzxz(DisasContext *s, gen_helper_gvec_4 *fn, - arg_rrxr_esz *a) -{ - return gen_gvec_ool_zzzz(s, fn, a->rd, a->rn, a->rm, a->ra, a->index); -} - -/* Invoke an out-of-line helper on 4 Zregs, plus a pointer. */ -static bool gen_gvec_ptr_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, - int rd, int rn, int rm, int ra, - int data, TCGv_ptr ptr) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, ra), - ptr, vsz, vsz, data, fn); - } - return true; -} - -static bool gen_gvec_fpst_zzzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, - int rd, int rn, int rm, int ra, - int data, ARMFPStatusFlavour flavour) -{ - TCGv_ptr status = fpstatus_ptr(flavour); - bool ret = gen_gvec_ptr_zzzz(s, fn, rd, rn, rm, ra, data, status); - tcg_temp_free_ptr(status); - return ret; -} - -/* Invoke an out-of-line helper on 4 Zregs, 1 Preg, plus fpst. 
*/ -static bool gen_gvec_fpst_zzzzp(DisasContext *s, gen_helper_gvec_5_ptr *fn, - int rd, int rn, int rm, int ra, int pg, - int data, ARMFPStatusFlavour flavour) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = fpstatus_ptr(flavour); - - tcg_gen_gvec_5_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - vec_full_reg_offset(s, ra), - pred_full_reg_offset(s, pg), - status, vsz, vsz, data, fn); - - tcg_temp_free_ptr(status); - } - return true; -} - -/* Invoke an out-of-line helper on 2 Zregs and a predicate. */ -static bool gen_gvec_ool_zzp(DisasContext *s, gen_helper_gvec_3 *fn, - int rd, int rn, int pg, int data) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - pred_full_reg_offset(s, pg), - vsz, vsz, data, fn); - } - return true; -} - -static bool gen_gvec_ool_arg_zpz(DisasContext *s, gen_helper_gvec_3 *fn, - arg_rpr_esz *a, int data) -{ - return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, data); -} - -static bool gen_gvec_ool_arg_zpzi(DisasContext *s, gen_helper_gvec_3 *fn, - arg_rpri_esz *a) -{ - return gen_gvec_ool_zzp(s, fn, a->rd, a->rn, a->pg, a->imm); -} - -static bool gen_gvec_fpst_zzp(DisasContext *s, gen_helper_gvec_3_ptr *fn, - int rd, int rn, int pg, int data, - ARMFPStatusFlavour flavour) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = fpstatus_ptr(flavour); - - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - pred_full_reg_offset(s, pg), - status, vsz, vsz, data, fn); - tcg_temp_free_ptr(status); - } - return true; -} - -static bool gen_gvec_fpst_arg_zpz(DisasContext *s, gen_helper_gvec_3_ptr *fn, - arg_rpr_esz *a, int data, - ARMFPStatusFlavour flavour) -{ - return gen_gvec_fpst_zzp(s, fn, a->rd, a->rn, a->pg, data, flavour); -} - -/* Invoke an out-of-line helper on 3 Zregs and a predicate. */ -static bool gen_gvec_ool_zzzp(DisasContext *s, gen_helper_gvec_4 *fn, - int rd, int rn, int rm, int pg, int data) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - pred_full_reg_offset(s, pg), - vsz, vsz, data, fn); - } - return true; -} - -static bool gen_gvec_ool_arg_zpzz(DisasContext *s, gen_helper_gvec_4 *fn, - arg_rprr_esz *a, int data) -{ - return gen_gvec_ool_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, data); -} - -/* Invoke an out-of-line helper on 3 Zregs and a predicate. */ -static bool gen_gvec_fpst_zzzp(DisasContext *s, gen_helper_gvec_4_ptr *fn, - int rd, int rn, int rm, int pg, int data, - ARMFPStatusFlavour flavour) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = fpstatus_ptr(flavour); - - tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), - pred_full_reg_offset(s, pg), - status, vsz, vsz, data, fn); - tcg_temp_free_ptr(status); - } - return true; -} - -static bool gen_gvec_fpst_arg_zpzz(DisasContext *s, gen_helper_gvec_4_ptr *fn, - arg_rprr_esz *a) -{ - return gen_gvec_fpst_zzzp(s, fn, a->rd, a->rn, a->rm, a->pg, 0, - a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); -} - -/* Invoke a vector expander on two Zregs and an immediate. */ -static bool gen_gvec_fn_zzi(DisasContext *s, GVecGen2iFn *gvec_fn, - int esz, int rd, int rn, uint64_t imm) -{ - if (gvec_fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - gvec_fn(esz, vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), imm, vsz, vsz); - } - return true; -} - -static bool gen_gvec_fn_arg_zzi(DisasContext *s, GVecGen2iFn *gvec_fn, - arg_rri_esz *a) -{ - if (a->esz < 0) { - /* Invalid tsz encoding -- see tszimm_esz. */ - return false; - } - return gen_gvec_fn_zzi(s, gvec_fn, a->esz, a->rd, a->rn, a->imm); -} - -/* Invoke a vector expander on three Zregs. */ -static bool gen_gvec_fn_zzz(DisasContext *s, GVecGen3Fn *gvec_fn, - int esz, int rd, int rn, int rm) -{ - if (gvec_fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - gvec_fn(esz, vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_full_reg_offset(s, rm), vsz, vsz); - } - return true; -} - -static bool gen_gvec_fn_arg_zzz(DisasContext *s, GVecGen3Fn *fn, - arg_rrr_esz *a) -{ - return gen_gvec_fn_zzz(s, fn, a->esz, a->rd, a->rn, a->rm); -} - -/* Invoke a vector expander on four Zregs. */ -static bool gen_gvec_fn_arg_zzzz(DisasContext *s, GVecGen4Fn *gvec_fn, - arg_rrrr_esz *a) -{ - if (gvec_fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vec_full_reg_offset(s, a->rm), - vec_full_reg_offset(s, a->ra), vsz, vsz); - } - return true; -} - -/* Invoke a vector move on two Zregs. */ -static bool do_mov_z(DisasContext *s, int rd, int rn) -{ - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_mov(MO_8, vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), vsz, vsz); - } - return true; -} - -/* Initialize a Zreg with replications of a 64-bit immediate. */ -static void do_dupi_z(DisasContext *s, int rd, uint64_t word) -{ - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word); -} - -/* Invoke a vector expander on three Pregs. */ -static bool gen_gvec_fn_ppp(DisasContext *s, GVecGen3Fn *gvec_fn, - int rd, int rn, int rm) -{ - if (sve_access_check(s)) { - unsigned psz = pred_gvec_reg_size(s); - gvec_fn(MO_64, pred_full_reg_offset(s, rd), - pred_full_reg_offset(s, rn), - pred_full_reg_offset(s, rm), psz, psz); - } - return true; -} - -/* Invoke a vector move on two Pregs. */ -static bool do_mov_p(DisasContext *s, int rd, int rn) -{ - if (sve_access_check(s)) { - unsigned psz = pred_gvec_reg_size(s); - tcg_gen_gvec_mov(MO_8, pred_full_reg_offset(s, rd), - pred_full_reg_offset(s, rn), psz, psz); - } - return true; -} - -/* Set the cpu flags as per a return from an SVE helper. */ -static void do_pred_flags(TCGv_i32 t) -{ - tcg_gen_mov_i32(cpu_NF, t); - tcg_gen_andi_i32(cpu_ZF, t, 2); - tcg_gen_andi_i32(cpu_CF, t, 1); - tcg_gen_movi_i32(cpu_VF, 0); -} - -/* Subroutines computing the ARM PredTest psuedofunction. 
*/ -static void do_predtest1(TCGv_i64 d, TCGv_i64 g) -{ - TCGv_i32 t = tcg_temp_new_i32(); - - gen_helper_sve_predtest1(t, d, g); - do_pred_flags(t); - tcg_temp_free_i32(t); -} - -static void do_predtest(DisasContext *s, int dofs, int gofs, int words) -{ - TCGv_ptr dptr = tcg_temp_new_ptr(); - TCGv_ptr gptr = tcg_temp_new_ptr(); - TCGv_i32 t = tcg_temp_new_i32(); - - tcg_gen_addi_ptr(dptr, cpu_env, dofs); - tcg_gen_addi_ptr(gptr, cpu_env, gofs); - - gen_helper_sve_predtest(t, dptr, gptr, tcg_constant_i32(words)); - tcg_temp_free_ptr(dptr); - tcg_temp_free_ptr(gptr); - - do_pred_flags(t); - tcg_temp_free_i32(t); -} - -/* For each element size, the bits within a predicate word that are active. */ -const uint64_t pred_esz_masks[5] = { - 0xffffffffffffffffull, 0x5555555555555555ull, - 0x1111111111111111ull, 0x0101010101010101ull, - 0x0001000100010001ull, -}; - -static bool trans_INVALID(DisasContext *s, arg_INVALID *a) -{ - unallocated_encoding(s); - return true; -} - -/* - *** SVE Logical - Unpredicated Group - */ - -TRANS_FEAT(AND_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_and, a) -TRANS_FEAT(ORR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_or, a) -TRANS_FEAT(EOR_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_xor, a) -TRANS_FEAT(BIC_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_andc, a) - -static void gen_xar8_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - uint64_t mask = dup_const(MO_8, 0xff >> sh); - - tcg_gen_xor_i64(t, n, m); - tcg_gen_shri_i64(d, t, sh); - tcg_gen_shli_i64(t, t, 8 - sh); - tcg_gen_andi_i64(d, d, mask); - tcg_gen_andi_i64(t, t, ~mask); - tcg_gen_or_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_xar16_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - uint64_t mask = dup_const(MO_16, 0xffff >> sh); - - tcg_gen_xor_i64(t, n, m); - tcg_gen_shri_i64(d, t, sh); - tcg_gen_shli_i64(t, t, 16 - sh); - tcg_gen_andi_i64(d, d, mask); - tcg_gen_andi_i64(t, t, ~mask); - tcg_gen_or_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_xar_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, int32_t sh) -{ - tcg_gen_xor_i32(d, n, m); - tcg_gen_rotri_i32(d, d, sh); -} - -static void gen_xar_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, int64_t sh) -{ - tcg_gen_xor_i64(d, n, m); - tcg_gen_rotri_i64(d, d, sh); -} - -static void gen_xar_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, int64_t sh) -{ - tcg_gen_xor_vec(vece, d, n, m); - tcg_gen_rotri_vec(vece, d, d, sh); -} - -void gen_gvec_xar(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, int64_t shift, - uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop[] = { INDEX_op_rotli_vec, 0 }; - static const GVecGen3i ops[4] = { - { .fni8 = gen_xar8_i64, - .fniv = gen_xar_vec, - .fno = gen_helper_sve2_xar_b, - .opt_opc = vecop, - .vece = MO_8 }, - { .fni8 = gen_xar16_i64, - .fniv = gen_xar_vec, - .fno = gen_helper_sve2_xar_h, - .opt_opc = vecop, - .vece = MO_16 }, - { .fni4 = gen_xar_i32, - .fniv = gen_xar_vec, - .fno = gen_helper_sve2_xar_s, - .opt_opc = vecop, - .vece = MO_32 }, - { .fni8 = gen_xar_i64, - .fniv = gen_xar_vec, - .fno = gen_helper_gvec_xar_d, - .opt_opc = vecop, - .vece = MO_64 } - }; - int esize = 8 << vece; - - /* The SVE2 range is 1 .. esize; the AdvSIMD range is 0 .. esize-1. */ - tcg_debug_assert(shift >= 0); - tcg_debug_assert(shift <= esize); - shift &= esize - 1; - - if (shift == 0) { - /* xar with no rotate devolves to xor. 
*/ - tcg_gen_gvec_xor(vece, rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz); - } else { - tcg_gen_gvec_3i(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, - shift, &ops[vece]); - } -} - -static bool trans_XAR(DisasContext *s, arg_rrri_esz *a) -{ - if (a->esz < 0 || !dc_isar_feature(aa64_sve2, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - gen_gvec_xar(a->esz, vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vec_full_reg_offset(s, a->rm), a->imm, vsz, vsz); - } - return true; -} - -static void gen_eor3_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) -{ - tcg_gen_xor_i64(d, n, m); - tcg_gen_xor_i64(d, d, k); -} - -static void gen_eor3_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec k) -{ - tcg_gen_xor_vec(vece, d, n, m); - tcg_gen_xor_vec(vece, d, d, k); -} - -static void gen_eor3(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const GVecGen4 op = { - .fni8 = gen_eor3_i64, - .fniv = gen_eor3_vec, - .fno = gen_helper_sve2_eor3, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); -} - -TRANS_FEAT(EOR3, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_eor3, a) - -static void gen_bcax_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) -{ - tcg_gen_andc_i64(d, m, k); - tcg_gen_xor_i64(d, d, n); -} - -static void gen_bcax_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec k) -{ - tcg_gen_andc_vec(vece, d, m, k); - tcg_gen_xor_vec(vece, d, d, n); -} - -static void gen_bcax(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const GVecGen4 op = { - .fni8 = gen_bcax_i64, - .fniv = gen_bcax_vec, - .fno = gen_helper_sve2_bcax, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); -} - -TRANS_FEAT(BCAX, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bcax, a) - -static void gen_bsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - /* BSL differs from the generic bitsel in argument ordering. 
*/ - tcg_gen_gvec_bitsel(vece, d, a, n, m, oprsz, maxsz); -} - -TRANS_FEAT(BSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl, a) - -static void gen_bsl1n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) -{ - tcg_gen_andc_i64(n, k, n); - tcg_gen_andc_i64(m, m, k); - tcg_gen_or_i64(d, n, m); -} - -static void gen_bsl1n_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec k) -{ - if (TCG_TARGET_HAS_bitsel_vec) { - tcg_gen_not_vec(vece, n, n); - tcg_gen_bitsel_vec(vece, d, k, n, m); - } else { - tcg_gen_andc_vec(vece, n, k, n); - tcg_gen_andc_vec(vece, m, m, k); - tcg_gen_or_vec(vece, d, n, m); - } -} - -static void gen_bsl1n(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const GVecGen4 op = { - .fni8 = gen_bsl1n_i64, - .fniv = gen_bsl1n_vec, - .fno = gen_helper_sve2_bsl1n, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); -} - -TRANS_FEAT(BSL1N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl1n, a) - -static void gen_bsl2n_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) -{ - /* - * Z[dn] = (n & k) | (~m & ~k) - * = | ~(m | k) - */ - tcg_gen_and_i64(n, n, k); - if (TCG_TARGET_HAS_orc_i64) { - tcg_gen_or_i64(m, m, k); - tcg_gen_orc_i64(d, n, m); - } else { - tcg_gen_nor_i64(m, m, k); - tcg_gen_or_i64(d, n, m); - } -} - -static void gen_bsl2n_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec k) -{ - if (TCG_TARGET_HAS_bitsel_vec) { - tcg_gen_not_vec(vece, m, m); - tcg_gen_bitsel_vec(vece, d, k, n, m); - } else { - tcg_gen_and_vec(vece, n, n, k); - tcg_gen_or_vec(vece, m, m, k); - tcg_gen_orc_vec(vece, d, n, m); - } -} - -static void gen_bsl2n(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const GVecGen4 op = { - .fni8 = gen_bsl2n_i64, - .fniv = gen_bsl2n_vec, - .fno = gen_helper_sve2_bsl2n, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); -} - -TRANS_FEAT(BSL2N, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_bsl2n, a) - -static void gen_nbsl_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 k) -{ - tcg_gen_and_i64(n, n, k); - tcg_gen_andc_i64(m, m, k); - tcg_gen_nor_i64(d, n, m); -} - -static void gen_nbsl_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec k) -{ - tcg_gen_bitsel_vec(vece, d, k, n, m); - tcg_gen_not_vec(vece, d, d); -} - -static void gen_nbsl(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const GVecGen4 op = { - .fni8 = gen_nbsl_i64, - .fniv = gen_nbsl_vec, - .fno = gen_helper_sve2_nbsl, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op); -} - -TRANS_FEAT(NBSL, aa64_sve2, gen_gvec_fn_arg_zzzz, gen_nbsl, a) - -/* - *** SVE Integer Arithmetic - Unpredicated Group - */ - -TRANS_FEAT(ADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_add, a) -TRANS_FEAT(SUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sub, a) -TRANS_FEAT(SQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ssadd, a) -TRANS_FEAT(SQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_sssub, a) -TRANS_FEAT(UQADD_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_usadd, a) -TRANS_FEAT(UQSUB_zzz, aa64_sve, gen_gvec_fn_arg_zzz, tcg_gen_gvec_ussub, a) - -/* - *** SVE Integer Arithmetic - Binary Predicated Group - */ - -/* Select active elememnts from Zn and inactive elements from Zm, - * storing the result in Zd. 
- */ -static bool do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz) -{ - static gen_helper_gvec_4 * const fns[4] = { - gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, - gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d - }; - return gen_gvec_ool_zzzp(s, fns[esz], rd, rn, rm, pg, 0); -} - -#define DO_ZPZZ(NAME, FEAT, name) \ - static gen_helper_gvec_4 * const name##_zpzz_fns[4] = { \ - gen_helper_##name##_zpzz_b, gen_helper_##name##_zpzz_h, \ - gen_helper_##name##_zpzz_s, gen_helper_##name##_zpzz_d, \ - }; \ - TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpzz, \ - name##_zpzz_fns[a->esz], a, 0) - -DO_ZPZZ(AND_zpzz, aa64_sve, sve_and) -DO_ZPZZ(EOR_zpzz, aa64_sve, sve_eor) -DO_ZPZZ(ORR_zpzz, aa64_sve, sve_orr) -DO_ZPZZ(BIC_zpzz, aa64_sve, sve_bic) - -DO_ZPZZ(ADD_zpzz, aa64_sve, sve_add) -DO_ZPZZ(SUB_zpzz, aa64_sve, sve_sub) - -DO_ZPZZ(SMAX_zpzz, aa64_sve, sve_smax) -DO_ZPZZ(UMAX_zpzz, aa64_sve, sve_umax) -DO_ZPZZ(SMIN_zpzz, aa64_sve, sve_smin) -DO_ZPZZ(UMIN_zpzz, aa64_sve, sve_umin) -DO_ZPZZ(SABD_zpzz, aa64_sve, sve_sabd) -DO_ZPZZ(UABD_zpzz, aa64_sve, sve_uabd) - -DO_ZPZZ(MUL_zpzz, aa64_sve, sve_mul) -DO_ZPZZ(SMULH_zpzz, aa64_sve, sve_smulh) -DO_ZPZZ(UMULH_zpzz, aa64_sve, sve_umulh) - -DO_ZPZZ(ASR_zpzz, aa64_sve, sve_asr) -DO_ZPZZ(LSR_zpzz, aa64_sve, sve_lsr) -DO_ZPZZ(LSL_zpzz, aa64_sve, sve_lsl) - -static gen_helper_gvec_4 * const sdiv_fns[4] = { - NULL, NULL, gen_helper_sve_sdiv_zpzz_s, gen_helper_sve_sdiv_zpzz_d -}; -TRANS_FEAT(SDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, sdiv_fns[a->esz], a, 0) - -static gen_helper_gvec_4 * const udiv_fns[4] = { - NULL, NULL, gen_helper_sve_udiv_zpzz_s, gen_helper_sve_udiv_zpzz_d -}; -TRANS_FEAT(UDIV_zpzz, aa64_sve, gen_gvec_ool_arg_zpzz, udiv_fns[a->esz], a, 0) - -TRANS_FEAT(SEL_zpzz, aa64_sve, do_sel_z, a->rd, a->rn, a->rm, a->pg, a->esz) - -/* - *** SVE Integer Arithmetic - Unary Predicated Group - */ - -#define DO_ZPZ(NAME, FEAT, name) \ - static gen_helper_gvec_3 * const name##_fns[4] = { \ - gen_helper_##name##_b, gen_helper_##name##_h, \ - gen_helper_##name##_s, gen_helper_##name##_d, \ - }; \ - TRANS_FEAT(NAME, FEAT, gen_gvec_ool_arg_zpz, name##_fns[a->esz], a, 0) - -DO_ZPZ(CLS, aa64_sve, sve_cls) -DO_ZPZ(CLZ, aa64_sve, sve_clz) -DO_ZPZ(CNT_zpz, aa64_sve, sve_cnt_zpz) -DO_ZPZ(CNOT, aa64_sve, sve_cnot) -DO_ZPZ(NOT_zpz, aa64_sve, sve_not_zpz) -DO_ZPZ(ABS, aa64_sve, sve_abs) -DO_ZPZ(NEG, aa64_sve, sve_neg) -DO_ZPZ(RBIT, aa64_sve, sve_rbit) - -static gen_helper_gvec_3 * const fabs_fns[4] = { - NULL, gen_helper_sve_fabs_h, - gen_helper_sve_fabs_s, gen_helper_sve_fabs_d, -}; -TRANS_FEAT(FABS, aa64_sve, gen_gvec_ool_arg_zpz, fabs_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const fneg_fns[4] = { - NULL, gen_helper_sve_fneg_h, - gen_helper_sve_fneg_s, gen_helper_sve_fneg_d, -}; -TRANS_FEAT(FNEG, aa64_sve, gen_gvec_ool_arg_zpz, fneg_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const sxtb_fns[4] = { - NULL, gen_helper_sve_sxtb_h, - gen_helper_sve_sxtb_s, gen_helper_sve_sxtb_d, -}; -TRANS_FEAT(SXTB, aa64_sve, gen_gvec_ool_arg_zpz, sxtb_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const uxtb_fns[4] = { - NULL, gen_helper_sve_uxtb_h, - gen_helper_sve_uxtb_s, gen_helper_sve_uxtb_d, -}; -TRANS_FEAT(UXTB, aa64_sve, gen_gvec_ool_arg_zpz, uxtb_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const sxth_fns[4] = { - NULL, NULL, gen_helper_sve_sxth_s, gen_helper_sve_sxth_d -}; -TRANS_FEAT(SXTH, aa64_sve, gen_gvec_ool_arg_zpz, sxth_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const uxth_fns[4] = { - NULL, NULL, 
gen_helper_sve_uxth_s, gen_helper_sve_uxth_d -}; -TRANS_FEAT(UXTH, aa64_sve, gen_gvec_ool_arg_zpz, uxth_fns[a->esz], a, 0) - -TRANS_FEAT(SXTW, aa64_sve, gen_gvec_ool_arg_zpz, - a->esz == 3 ? gen_helper_sve_sxtw_d : NULL, a, 0) -TRANS_FEAT(UXTW, aa64_sve, gen_gvec_ool_arg_zpz, - a->esz == 3 ? gen_helper_sve_uxtw_d : NULL, a, 0) - -/* - *** SVE Integer Reduction Group - */ - -typedef void gen_helper_gvec_reduc(TCGv_i64, TCGv_ptr, TCGv_ptr, TCGv_i32); -static bool do_vpz_ool(DisasContext *s, arg_rpr_esz *a, - gen_helper_gvec_reduc *fn) -{ - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr t_zn, t_pg; - TCGv_i32 desc; - TCGv_i64 temp; - - if (fn == NULL) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - temp = tcg_temp_new_i64(); - t_zn = tcg_temp_new_ptr(); - t_pg = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); - fn(temp, t_zn, t_pg, desc); - tcg_temp_free_ptr(t_zn); - tcg_temp_free_ptr(t_pg); - - write_fp_dreg(s, a->rd, temp); - tcg_temp_free_i64(temp); - return true; -} - -#define DO_VPZ(NAME, name) \ - static gen_helper_gvec_reduc * const name##_fns[4] = { \ - gen_helper_sve_##name##_b, gen_helper_sve_##name##_h, \ - gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ - }; \ - TRANS_FEAT(NAME, aa64_sve, do_vpz_ool, a, name##_fns[a->esz]) - -DO_VPZ(ORV, orv) -DO_VPZ(ANDV, andv) -DO_VPZ(EORV, eorv) - -DO_VPZ(UADDV, uaddv) -DO_VPZ(SMAXV, smaxv) -DO_VPZ(UMAXV, umaxv) -DO_VPZ(SMINV, sminv) -DO_VPZ(UMINV, uminv) - -static gen_helper_gvec_reduc * const saddv_fns[4] = { - gen_helper_sve_saddv_b, gen_helper_sve_saddv_h, - gen_helper_sve_saddv_s, NULL -}; -TRANS_FEAT(SADDV, aa64_sve, do_vpz_ool, a, saddv_fns[a->esz]) - -#undef DO_VPZ - -/* - *** SVE Shift by Immediate - Predicated Group - */ - -/* - * Copy Zn into Zd, storing zeros into inactive elements. - * If invert, store zeros into the active elements. - */ -static bool do_movz_zpz(DisasContext *s, int rd, int rn, int pg, - int esz, bool invert) -{ - static gen_helper_gvec_3 * const fns[4] = { - gen_helper_sve_movz_b, gen_helper_sve_movz_h, - gen_helper_sve_movz_s, gen_helper_sve_movz_d, - }; - return gen_gvec_ool_zzp(s, fns[esz], rd, rn, pg, invert); -} - -static bool do_shift_zpzi(DisasContext *s, arg_rpri_esz *a, bool asr, - gen_helper_gvec_3 * const fns[4]) -{ - int max; - - if (a->esz < 0) { - /* Invalid tsz encoding -- see tszimm_esz. */ - return false; - } - - /* - * Shift by element size is architecturally valid. - * For arithmetic right-shift, it's the same as by one less. - * For logical shifts and ASRD, it is a zeroing operation. 
- */ - max = 8 << a->esz; - if (a->imm >= max) { - if (asr) { - a->imm = max - 1; - } else { - return do_movz_zpz(s, a->rd, a->rd, a->pg, a->esz, true); - } - } - return gen_gvec_ool_arg_zpzi(s, fns[a->esz], a); -} - -static gen_helper_gvec_3 * const asr_zpzi_fns[4] = { - gen_helper_sve_asr_zpzi_b, gen_helper_sve_asr_zpzi_h, - gen_helper_sve_asr_zpzi_s, gen_helper_sve_asr_zpzi_d, -}; -TRANS_FEAT(ASR_zpzi, aa64_sve, do_shift_zpzi, a, true, asr_zpzi_fns) - -static gen_helper_gvec_3 * const lsr_zpzi_fns[4] = { - gen_helper_sve_lsr_zpzi_b, gen_helper_sve_lsr_zpzi_h, - gen_helper_sve_lsr_zpzi_s, gen_helper_sve_lsr_zpzi_d, -}; -TRANS_FEAT(LSR_zpzi, aa64_sve, do_shift_zpzi, a, false, lsr_zpzi_fns) - -static gen_helper_gvec_3 * const lsl_zpzi_fns[4] = { - gen_helper_sve_lsl_zpzi_b, gen_helper_sve_lsl_zpzi_h, - gen_helper_sve_lsl_zpzi_s, gen_helper_sve_lsl_zpzi_d, -}; -TRANS_FEAT(LSL_zpzi, aa64_sve, do_shift_zpzi, a, false, lsl_zpzi_fns) - -static gen_helper_gvec_3 * const asrd_fns[4] = { - gen_helper_sve_asrd_b, gen_helper_sve_asrd_h, - gen_helper_sve_asrd_s, gen_helper_sve_asrd_d, -}; -TRANS_FEAT(ASRD, aa64_sve, do_shift_zpzi, a, false, asrd_fns) - -static gen_helper_gvec_3 * const sqshl_zpzi_fns[4] = { - gen_helper_sve2_sqshl_zpzi_b, gen_helper_sve2_sqshl_zpzi_h, - gen_helper_sve2_sqshl_zpzi_s, gen_helper_sve2_sqshl_zpzi_d, -}; -TRANS_FEAT(SQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi, - a->esz < 0 ? NULL : sqshl_zpzi_fns[a->esz], a) - -static gen_helper_gvec_3 * const uqshl_zpzi_fns[4] = { - gen_helper_sve2_uqshl_zpzi_b, gen_helper_sve2_uqshl_zpzi_h, - gen_helper_sve2_uqshl_zpzi_s, gen_helper_sve2_uqshl_zpzi_d, -}; -TRANS_FEAT(UQSHL_zpzi, aa64_sve2, gen_gvec_ool_arg_zpzi, - a->esz < 0 ? NULL : uqshl_zpzi_fns[a->esz], a) - -static gen_helper_gvec_3 * const srshr_fns[4] = { - gen_helper_sve2_srshr_b, gen_helper_sve2_srshr_h, - gen_helper_sve2_srshr_s, gen_helper_sve2_srshr_d, -}; -TRANS_FEAT(SRSHR, aa64_sve2, gen_gvec_ool_arg_zpzi, - a->esz < 0 ? NULL : srshr_fns[a->esz], a) - -static gen_helper_gvec_3 * const urshr_fns[4] = { - gen_helper_sve2_urshr_b, gen_helper_sve2_urshr_h, - gen_helper_sve2_urshr_s, gen_helper_sve2_urshr_d, -}; -TRANS_FEAT(URSHR, aa64_sve2, gen_gvec_ool_arg_zpzi, - a->esz < 0 ? NULL : urshr_fns[a->esz], a) - -static gen_helper_gvec_3 * const sqshlu_fns[4] = { - gen_helper_sve2_sqshlu_b, gen_helper_sve2_sqshlu_h, - gen_helper_sve2_sqshlu_s, gen_helper_sve2_sqshlu_d, -}; -TRANS_FEAT(SQSHLU, aa64_sve2, gen_gvec_ool_arg_zpzi, - a->esz < 0 ? NULL : sqshlu_fns[a->esz], a) - -/* - *** SVE Bitwise Shift - Predicated Group - */ - -#define DO_ZPZW(NAME, name) \ - static gen_helper_gvec_4 * const name##_zpzw_fns[4] = { \ - gen_helper_sve_##name##_zpzw_b, gen_helper_sve_##name##_zpzw_h, \ - gen_helper_sve_##name##_zpzw_s, NULL \ - }; \ - TRANS_FEAT(NAME##_zpzw, aa64_sve, gen_gvec_ool_arg_zpzz, \ - a->esz < 0 ? NULL : name##_zpzw_fns[a->esz], a, 0) - -DO_ZPZW(ASR, asr) -DO_ZPZW(LSR, lsr) -DO_ZPZW(LSL, lsl) - -#undef DO_ZPZW - -/* - *** SVE Bitwise Shift - Unpredicated Group - */ - -static bool do_shift_imm(DisasContext *s, arg_rri_esz *a, bool asr, - void (*gvec_fn)(unsigned, uint32_t, uint32_t, - int64_t, uint32_t, uint32_t)) -{ - if (a->esz < 0) { - /* Invalid tsz encoding -- see tszimm_esz. */ - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - /* Shift by element size is architecturally valid. For - arithmetic right-shift, it's the same as by one less. - Otherwise it is a zeroing operation. 
*/ - if (a->imm >= 8 << a->esz) { - if (asr) { - a->imm = (8 << a->esz) - 1; - } else { - do_dupi_z(s, a->rd, 0); - return true; - } - } - gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); - } - return true; -} - -TRANS_FEAT(ASR_zzi, aa64_sve, do_shift_imm, a, true, tcg_gen_gvec_sari) -TRANS_FEAT(LSR_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shri) -TRANS_FEAT(LSL_zzi, aa64_sve, do_shift_imm, a, false, tcg_gen_gvec_shli) - -#define DO_ZZW(NAME, name) \ - static gen_helper_gvec_3 * const name##_zzw_fns[4] = { \ - gen_helper_sve_##name##_zzw_b, gen_helper_sve_##name##_zzw_h, \ - gen_helper_sve_##name##_zzw_s, NULL \ - }; \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_arg_zzz, \ - name##_zzw_fns[a->esz], a, 0) - -DO_ZZW(ASR_zzw, asr) -DO_ZZW(LSR_zzw, lsr) -DO_ZZW(LSL_zzw, lsl) - -#undef DO_ZZW - -/* - *** SVE Integer Multiply-Add Group - */ - -static bool do_zpzzz_ool(DisasContext *s, arg_rprrr_esz *a, - gen_helper_gvec_5 *fn) -{ - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_5_ool(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->ra), - vec_full_reg_offset(s, a->rn), - vec_full_reg_offset(s, a->rm), - pred_full_reg_offset(s, a->pg), - vsz, vsz, 0, fn); - } - return true; -} - -static gen_helper_gvec_5 * const mla_fns[4] = { - gen_helper_sve_mla_b, gen_helper_sve_mla_h, - gen_helper_sve_mla_s, gen_helper_sve_mla_d, -}; -TRANS_FEAT(MLA, aa64_sve, do_zpzzz_ool, a, mla_fns[a->esz]) - -static gen_helper_gvec_5 * const mls_fns[4] = { - gen_helper_sve_mls_b, gen_helper_sve_mls_h, - gen_helper_sve_mls_s, gen_helper_sve_mls_d, -}; -TRANS_FEAT(MLS, aa64_sve, do_zpzzz_ool, a, mls_fns[a->esz]) - -/* - *** SVE Index Generation Group - */ - -static bool do_index(DisasContext *s, int esz, int rd, - TCGv_i64 start, TCGv_i64 incr) -{ - unsigned vsz; - TCGv_i32 desc; - TCGv_ptr t_zd; - - if (!sve_access_check(s)) { - return true; - } - - vsz = vec_full_reg_size(s); - desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - t_zd = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); - if (esz == 3) { - gen_helper_sve_index_d(t_zd, start, incr, desc); - } else { - typedef void index_fn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32); - static index_fn * const fns[3] = { - gen_helper_sve_index_b, - gen_helper_sve_index_h, - gen_helper_sve_index_s, - }; - TCGv_i32 s32 = tcg_temp_new_i32(); - TCGv_i32 i32 = tcg_temp_new_i32(); - - tcg_gen_extrl_i64_i32(s32, start); - tcg_gen_extrl_i64_i32(i32, incr); - fns[esz](t_zd, s32, i32, desc); - - tcg_temp_free_i32(s32); - tcg_temp_free_i32(i32); - } - tcg_temp_free_ptr(t_zd); - return true; -} - -TRANS_FEAT(INDEX_ii, aa64_sve, do_index, a->esz, a->rd, - tcg_constant_i64(a->imm1), tcg_constant_i64(a->imm2)) -TRANS_FEAT(INDEX_ir, aa64_sve, do_index, a->esz, a->rd, - tcg_constant_i64(a->imm), cpu_reg(s, a->rm)) -TRANS_FEAT(INDEX_ri, aa64_sve, do_index, a->esz, a->rd, - cpu_reg(s, a->rn), tcg_constant_i64(a->imm)) -TRANS_FEAT(INDEX_rr, aa64_sve, do_index, a->esz, a->rd, - cpu_reg(s, a->rn), cpu_reg(s, a->rm)) - -/* - *** SVE Stack Allocation Group - */ - -static bool trans_ADDVL(DisasContext *s, arg_ADDVL *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 rd = cpu_reg_sp(s, a->rd); - TCGv_i64 rn = cpu_reg_sp(s, a->rn); - tcg_gen_addi_i64(rd, rn, a->imm * vec_full_reg_size(s)); - } - return true; -} - -static bool trans_ADDSVL(DisasContext *s, arg_ADDSVL *a) -{ - if (!dc_isar_feature(aa64_sme, s)) { - return 
false; - } - if (sme_enabled_check(s)) { - TCGv_i64 rd = cpu_reg_sp(s, a->rd); - TCGv_i64 rn = cpu_reg_sp(s, a->rn); - tcg_gen_addi_i64(rd, rn, a->imm * streaming_vec_reg_size(s)); - } - return true; -} - -static bool trans_ADDPL(DisasContext *s, arg_ADDPL *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 rd = cpu_reg_sp(s, a->rd); - TCGv_i64 rn = cpu_reg_sp(s, a->rn); - tcg_gen_addi_i64(rd, rn, a->imm * pred_full_reg_size(s)); - } - return true; -} - -static bool trans_ADDSPL(DisasContext *s, arg_ADDSPL *a) -{ - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } - if (sme_enabled_check(s)) { - TCGv_i64 rd = cpu_reg_sp(s, a->rd); - TCGv_i64 rn = cpu_reg_sp(s, a->rn); - tcg_gen_addi_i64(rd, rn, a->imm * streaming_pred_reg_size(s)); - } - return true; -} - -static bool trans_RDVL(DisasContext *s, arg_RDVL *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 reg = cpu_reg(s, a->rd); - tcg_gen_movi_i64(reg, a->imm * vec_full_reg_size(s)); - } - return true; -} - -static bool trans_RDSVL(DisasContext *s, arg_RDSVL *a) -{ - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } - if (sme_enabled_check(s)) { - TCGv_i64 reg = cpu_reg(s, a->rd); - tcg_gen_movi_i64(reg, a->imm * streaming_vec_reg_size(s)); - } - return true; -} - -/* - *** SVE Compute Vector Address Group - */ - -static bool do_adr(DisasContext *s, arg_rrri *a, gen_helper_gvec_3 *fn) -{ - return gen_gvec_ool_zzz(s, fn, a->rd, a->rn, a->rm, a->imm); -} - -TRANS_FEAT_NONSTREAMING(ADR_p32, aa64_sve, do_adr, a, gen_helper_sve_adr_p32) -TRANS_FEAT_NONSTREAMING(ADR_p64, aa64_sve, do_adr, a, gen_helper_sve_adr_p64) -TRANS_FEAT_NONSTREAMING(ADR_s32, aa64_sve, do_adr, a, gen_helper_sve_adr_s32) -TRANS_FEAT_NONSTREAMING(ADR_u32, aa64_sve, do_adr, a, gen_helper_sve_adr_u32) - -/* - *** SVE Integer Misc - Unpredicated Group - */ - -static gen_helper_gvec_2 * const fexpa_fns[4] = { - NULL, gen_helper_sve_fexpa_h, - gen_helper_sve_fexpa_s, gen_helper_sve_fexpa_d, -}; -TRANS_FEAT_NONSTREAMING(FEXPA, aa64_sve, gen_gvec_ool_zz, - fexpa_fns[a->esz], a->rd, a->rn, 0) - -static gen_helper_gvec_3 * const ftssel_fns[4] = { - NULL, gen_helper_sve_ftssel_h, - gen_helper_sve_ftssel_s, gen_helper_sve_ftssel_d, -}; -TRANS_FEAT_NONSTREAMING(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz, - ftssel_fns[a->esz], a, 0) - -/* - *** SVE Predicate Logical Operations Group - */ - -static bool do_pppp_flags(DisasContext *s, arg_rprr_s *a, - const GVecGen4 *gvec_op) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned psz = pred_gvec_reg_size(s); - int dofs = pred_full_reg_offset(s, a->rd); - int nofs = pred_full_reg_offset(s, a->rn); - int mofs = pred_full_reg_offset(s, a->rm); - int gofs = pred_full_reg_offset(s, a->pg); - - if (!a->s) { - tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op); - return true; - } - - if (psz == 8) { - /* Do the operation and the flags generation in temps. */ - TCGv_i64 pd = tcg_temp_new_i64(); - TCGv_i64 pn = tcg_temp_new_i64(); - TCGv_i64 pm = tcg_temp_new_i64(); - TCGv_i64 pg = tcg_temp_new_i64(); - - tcg_gen_ld_i64(pn, cpu_env, nofs); - tcg_gen_ld_i64(pm, cpu_env, mofs); - tcg_gen_ld_i64(pg, cpu_env, gofs); - - gvec_op->fni8(pd, pn, pm, pg); - tcg_gen_st_i64(pd, cpu_env, dofs); - - do_predtest1(pd, pg); - - tcg_temp_free_i64(pd); - tcg_temp_free_i64(pn); - tcg_temp_free_i64(pm); - tcg_temp_free_i64(pg); - } else { - /* The operation and flags generation is large. 
The computation - * of the flags depends on the original contents of the guarding - * predicate. If the destination overwrites the guarding predicate, - * then the easiest way to get this right is to save a copy. - */ - int tofs = gofs; - if (a->rd == a->pg) { - tofs = offsetof(CPUARMState, vfp.preg_tmp); - tcg_gen_gvec_mov(0, tofs, gofs, psz, psz); - } - - tcg_gen_gvec_4(dofs, nofs, mofs, gofs, psz, psz, gvec_op); - do_predtest(s, dofs, tofs, psz / 8); - } - return true; -} - -static void gen_and_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_and_i64(pd, pn, pm); - tcg_gen_and_i64(pd, pd, pg); -} - -static void gen_and_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_and_vec(vece, pd, pn, pm); - tcg_gen_and_vec(vece, pd, pd, pg); -} - -static bool trans_AND_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_and_pg_i64, - .fniv = gen_and_pg_vec, - .fno = gen_helper_sve_and_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!a->s) { - if (a->rn == a->rm) { - if (a->pg == a->rn) { - return do_mov_p(s, a->rd, a->rn); - } - return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->pg); - } else if (a->pg == a->rn || a->pg == a->rm) { - return gen_gvec_fn_ppp(s, tcg_gen_gvec_and, a->rd, a->rn, a->rm); - } - } - return do_pppp_flags(s, a, &op); -} - -static void gen_bic_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_andc_i64(pd, pn, pm); - tcg_gen_and_i64(pd, pd, pg); -} - -static void gen_bic_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_andc_vec(vece, pd, pn, pm); - tcg_gen_and_vec(vece, pd, pd, pg); -} - -static bool trans_BIC_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_bic_pg_i64, - .fniv = gen_bic_pg_vec, - .fno = gen_helper_sve_bic_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!a->s && a->pg == a->rn) { - return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->rn, a->rm); - } - return do_pppp_flags(s, a, &op); -} - -static void gen_eor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_xor_i64(pd, pn, pm); - tcg_gen_and_i64(pd, pd, pg); -} - -static void gen_eor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_xor_vec(vece, pd, pn, pm); - tcg_gen_and_vec(vece, pd, pd, pg); -} - -static bool trans_EOR_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_eor_pg_i64, - .fniv = gen_eor_pg_vec, - .fno = gen_helper_sve_eor_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - /* Alias NOT (predicate) is EOR Pd.B, Pg/Z, Pn.B, Pg.B */ - if (!a->s && a->pg == a->rm) { - return gen_gvec_fn_ppp(s, tcg_gen_gvec_andc, a->rd, a->pg, a->rn); - } - return do_pppp_flags(s, a, &op); -} - -static bool trans_SEL_pppp(DisasContext *s, arg_rprr_s *a) -{ - if (a->s || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned psz = pred_gvec_reg_size(s); - tcg_gen_gvec_bitsel(MO_8, pred_full_reg_offset(s, a->rd), - pred_full_reg_offset(s, a->pg), - pred_full_reg_offset(s, a->rn), - pred_full_reg_offset(s, a->rm), psz, psz); - } - return true; -} - -static void gen_orr_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_or_i64(pd, pn, pm); - 
tcg_gen_and_i64(pd, pd, pg); -} - -static void gen_orr_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_or_vec(vece, pd, pn, pm); - tcg_gen_and_vec(vece, pd, pd, pg); -} - -static bool trans_ORR_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_orr_pg_i64, - .fniv = gen_orr_pg_vec, - .fno = gen_helper_sve_orr_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!a->s && a->pg == a->rn && a->rn == a->rm) { - return do_mov_p(s, a->rd, a->rn); - } - return do_pppp_flags(s, a, &op); -} - -static void gen_orn_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_orc_i64(pd, pn, pm); - tcg_gen_and_i64(pd, pd, pg); -} - -static void gen_orn_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_orc_vec(vece, pd, pn, pm); - tcg_gen_and_vec(vece, pd, pd, pg); -} - -static bool trans_ORN_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_orn_pg_i64, - .fniv = gen_orn_pg_vec, - .fno = gen_helper_sve_orn_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - return do_pppp_flags(s, a, &op); -} - -static void gen_nor_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_or_i64(pd, pn, pm); - tcg_gen_andc_i64(pd, pg, pd); -} - -static void gen_nor_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_or_vec(vece, pd, pn, pm); - tcg_gen_andc_vec(vece, pd, pg, pd); -} - -static bool trans_NOR_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_nor_pg_i64, - .fniv = gen_nor_pg_vec, - .fno = gen_helper_sve_nor_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - return do_pppp_flags(s, a, &op); -} - -static void gen_nand_pg_i64(TCGv_i64 pd, TCGv_i64 pn, TCGv_i64 pm, TCGv_i64 pg) -{ - tcg_gen_and_i64(pd, pn, pm); - tcg_gen_andc_i64(pd, pg, pd); -} - -static void gen_nand_pg_vec(unsigned vece, TCGv_vec pd, TCGv_vec pn, - TCGv_vec pm, TCGv_vec pg) -{ - tcg_gen_and_vec(vece, pd, pn, pm); - tcg_gen_andc_vec(vece, pd, pg, pd); -} - -static bool trans_NAND_pppp(DisasContext *s, arg_rprr_s *a) -{ - static const GVecGen4 op = { - .fni8 = gen_nand_pg_i64, - .fniv = gen_nand_pg_vec, - .fno = gen_helper_sve_nand_pppp, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - return do_pppp_flags(s, a, &op); -} - -/* - *** SVE Predicate Misc Group - */ - -static bool trans_PTEST(DisasContext *s, arg_PTEST *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int nofs = pred_full_reg_offset(s, a->rn); - int gofs = pred_full_reg_offset(s, a->pg); - int words = DIV_ROUND_UP(pred_full_reg_size(s), 8); - - if (words == 1) { - TCGv_i64 pn = tcg_temp_new_i64(); - TCGv_i64 pg = tcg_temp_new_i64(); - - tcg_gen_ld_i64(pn, cpu_env, nofs); - tcg_gen_ld_i64(pg, cpu_env, gofs); - do_predtest1(pn, pg); - - tcg_temp_free_i64(pn); - tcg_temp_free_i64(pg); - } else { - do_predtest(s, nofs, gofs, words); - } - } - return true; -} - -/* See the ARM pseudocode DecodePredCount. 
*/ -static unsigned decode_pred_count(unsigned fullsz, int pattern, int esz) -{ - unsigned elements = fullsz >> esz; - unsigned bound; - - switch (pattern) { - case 0x0: /* POW2 */ - return pow2floor(elements); - case 0x1: /* VL1 */ - case 0x2: /* VL2 */ - case 0x3: /* VL3 */ - case 0x4: /* VL4 */ - case 0x5: /* VL5 */ - case 0x6: /* VL6 */ - case 0x7: /* VL7 */ - case 0x8: /* VL8 */ - bound = pattern; - break; - case 0x9: /* VL16 */ - case 0xa: /* VL32 */ - case 0xb: /* VL64 */ - case 0xc: /* VL128 */ - case 0xd: /* VL256 */ - bound = 16 << (pattern - 9); - break; - case 0x1d: /* MUL4 */ - return elements - elements % 4; - case 0x1e: /* MUL3 */ - return elements - elements % 3; - case 0x1f: /* ALL */ - return elements; - default: /* #uimm5 */ - return 0; - } - return elements >= bound ? bound : 0; -} - -/* This handles all of the predicate initialization instructions, - * PTRUE, PFALSE, SETFFR. For PFALSE, we will have set PAT == 32 - * so that decode_pred_count returns 0. For SETFFR, we will have - * set RD == 16 == FFR. - */ -static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned fullsz = vec_full_reg_size(s); - unsigned ofs = pred_full_reg_offset(s, rd); - unsigned numelem, setsz, i; - uint64_t word, lastword; - TCGv_i64 t; - - numelem = decode_pred_count(fullsz, pat, esz); - - /* Determine what we must store into each bit, and how many. */ - if (numelem == 0) { - lastword = word = 0; - setsz = fullsz; - } else { - setsz = numelem << esz; - lastword = word = pred_esz_masks[esz]; - if (setsz % 64) { - lastword &= MAKE_64BIT_MASK(0, setsz % 64); - } - } - - t = tcg_temp_new_i64(); - if (fullsz <= 64) { - tcg_gen_movi_i64(t, lastword); - tcg_gen_st_i64(t, cpu_env, ofs); - goto done; - } - - if (word == lastword) { - unsigned maxsz = size_for_gvec(fullsz / 8); - unsigned oprsz = size_for_gvec(setsz / 8); - - if (oprsz * 8 == setsz) { - tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word); - goto done; - } - } - - setsz /= 8; - fullsz /= 8; - - tcg_gen_movi_i64(t, word); - for (i = 0; i < QEMU_ALIGN_DOWN(setsz, 8); i += 8) { - tcg_gen_st_i64(t, cpu_env, ofs + i); - } - if (lastword != word) { - tcg_gen_movi_i64(t, lastword); - tcg_gen_st_i64(t, cpu_env, ofs + i); - i += 8; - } - if (i < fullsz) { - tcg_gen_movi_i64(t, 0); - for (; i < fullsz; i += 8) { - tcg_gen_st_i64(t, cpu_env, ofs + i); - } - } - - done: - tcg_temp_free_i64(t); - - /* PTRUES */ - if (setflag) { - tcg_gen_movi_i32(cpu_NF, -(word != 0)); - tcg_gen_movi_i32(cpu_CF, word == 0); - tcg_gen_movi_i32(cpu_VF, 0); - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - } - return true; -} - -TRANS_FEAT(PTRUE, aa64_sve, do_predset, a->esz, a->rd, a->pat, a->s) - -/* Note pat == 31 is #all, to set all elements. */ -TRANS_FEAT_NONSTREAMING(SETFFR, aa64_sve, - do_predset, 0, FFR_PRED_NUM, 31, false) - -/* Note pat == 32 is #unimp, to set no elements. */ -TRANS_FEAT(PFALSE, aa64_sve, do_predset, 0, a->rd, 32, false) - -static bool trans_RDFFR_p(DisasContext *s, arg_RDFFR_p *a) -{ - /* The path through do_pppp_flags is complicated enough to want to avoid - * duplication. Frob the arguments into the form of a predicated AND. 
- */ - arg_rprr_s alt_a = { - .rd = a->rd, .pg = a->pg, .s = a->s, - .rn = FFR_PRED_NUM, .rm = FFR_PRED_NUM, - }; - - s->is_nonstreaming = true; - return trans_AND_pppp(s, &alt_a); -} - -TRANS_FEAT_NONSTREAMING(RDFFR, aa64_sve, do_mov_p, a->rd, FFR_PRED_NUM) -TRANS_FEAT_NONSTREAMING(WRFFR, aa64_sve, do_mov_p, FFR_PRED_NUM, a->rn) - -static bool do_pfirst_pnext(DisasContext *s, arg_rr_esz *a, - void (*gen_fn)(TCGv_i32, TCGv_ptr, - TCGv_ptr, TCGv_i32)) -{ - if (!sve_access_check(s)) { - return true; - } - - TCGv_ptr t_pd = tcg_temp_new_ptr(); - TCGv_ptr t_pg = tcg_temp_new_ptr(); - TCGv_i32 t; - unsigned desc = 0; - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); - desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); - - tcg_gen_addi_ptr(t_pd, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->rn)); - t = tcg_temp_new_i32(); - - gen_fn(t, t_pd, t_pg, tcg_constant_i32(desc)); - tcg_temp_free_ptr(t_pd); - tcg_temp_free_ptr(t_pg); - - do_pred_flags(t); - tcg_temp_free_i32(t); - return true; -} - -TRANS_FEAT(PFIRST, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pfirst) -TRANS_FEAT(PNEXT, aa64_sve, do_pfirst_pnext, a, gen_helper_sve_pnext) - -/* - *** SVE Element Count Group - */ - -/* Perform an inline saturating addition of a 32-bit value within - * a 64-bit register. The second operand is known to be positive, - * which halves the comparisions we must perform to bound the result. - */ -static void do_sat_addsub_32(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) -{ - int64_t ibound; - - /* Use normal 64-bit arithmetic to detect 32-bit overflow. */ - if (u) { - tcg_gen_ext32u_i64(reg, reg); - } else { - tcg_gen_ext32s_i64(reg, reg); - } - if (d) { - tcg_gen_sub_i64(reg, reg, val); - ibound = (u ? 0 : INT32_MIN); - tcg_gen_smax_i64(reg, reg, tcg_constant_i64(ibound)); - } else { - tcg_gen_add_i64(reg, reg, val); - ibound = (u ? UINT32_MAX : INT32_MAX); - tcg_gen_smin_i64(reg, reg, tcg_constant_i64(ibound)); - } -} - -/* Similarly with 64-bit values. */ -static void do_sat_addsub_64(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) -{ - TCGv_i64 t0 = tcg_temp_new_i64(); - TCGv_i64 t2; - - if (u) { - if (d) { - tcg_gen_sub_i64(t0, reg, val); - t2 = tcg_constant_i64(0); - tcg_gen_movcond_i64(TCG_COND_LTU, reg, reg, val, t2, t0); - } else { - tcg_gen_add_i64(t0, reg, val); - t2 = tcg_constant_i64(-1); - tcg_gen_movcond_i64(TCG_COND_LTU, reg, t0, reg, t2, t0); - } - } else { - TCGv_i64 t1 = tcg_temp_new_i64(); - if (d) { - /* Detect signed overflow for subtraction. */ - tcg_gen_xor_i64(t0, reg, val); - tcg_gen_sub_i64(t1, reg, val); - tcg_gen_xor_i64(reg, reg, t1); - tcg_gen_and_i64(t0, t0, reg); - - /* Bound the result. */ - tcg_gen_movi_i64(reg, INT64_MIN); - t2 = tcg_constant_i64(0); - tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, reg, t1); - } else { - /* Detect signed overflow for addition. */ - tcg_gen_xor_i64(t0, reg, val); - tcg_gen_add_i64(reg, reg, val); - tcg_gen_xor_i64(t1, reg, val); - tcg_gen_andc_i64(t0, t1, t0); - - /* Bound the result. */ - tcg_gen_movi_i64(t1, INT64_MAX); - t2 = tcg_constant_i64(0); - tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, t1, reg); - } - tcg_temp_free_i64(t1); - } - tcg_temp_free_i64(t0); -} - -/* Similarly with a vector and a scalar operand. 
*/ -static void do_sat_addsub_vec(DisasContext *s, int esz, int rd, int rn, - TCGv_i64 val, bool u, bool d) -{ - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr dptr, nptr; - TCGv_i32 t32, desc; - TCGv_i64 t64; - - dptr = tcg_temp_new_ptr(); - nptr = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(dptr, cpu_env, vec_full_reg_offset(s, rd)); - tcg_gen_addi_ptr(nptr, cpu_env, vec_full_reg_offset(s, rn)); - desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - - switch (esz) { - case MO_8: - t32 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t32, val); - if (d) { - tcg_gen_neg_i32(t32, t32); - } - if (u) { - gen_helper_sve_uqaddi_b(dptr, nptr, t32, desc); - } else { - gen_helper_sve_sqaddi_b(dptr, nptr, t32, desc); - } - tcg_temp_free_i32(t32); - break; - - case MO_16: - t32 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t32, val); - if (d) { - tcg_gen_neg_i32(t32, t32); - } - if (u) { - gen_helper_sve_uqaddi_h(dptr, nptr, t32, desc); - } else { - gen_helper_sve_sqaddi_h(dptr, nptr, t32, desc); - } - tcg_temp_free_i32(t32); - break; - - case MO_32: - t64 = tcg_temp_new_i64(); - if (d) { - tcg_gen_neg_i64(t64, val); - } else { - tcg_gen_mov_i64(t64, val); - } - if (u) { - gen_helper_sve_uqaddi_s(dptr, nptr, t64, desc); - } else { - gen_helper_sve_sqaddi_s(dptr, nptr, t64, desc); - } - tcg_temp_free_i64(t64); - break; - - case MO_64: - if (u) { - if (d) { - gen_helper_sve_uqsubi_d(dptr, nptr, val, desc); - } else { - gen_helper_sve_uqaddi_d(dptr, nptr, val, desc); - } - } else if (d) { - t64 = tcg_temp_new_i64(); - tcg_gen_neg_i64(t64, val); - gen_helper_sve_sqaddi_d(dptr, nptr, t64, desc); - tcg_temp_free_i64(t64); - } else { - gen_helper_sve_sqaddi_d(dptr, nptr, val, desc); - } - break; - - default: - g_assert_not_reached(); - } - - tcg_temp_free_ptr(dptr); - tcg_temp_free_ptr(nptr); -} - -static bool trans_CNT_r(DisasContext *s, arg_CNT_r *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned fullsz = vec_full_reg_size(s); - unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); - tcg_gen_movi_i64(cpu_reg(s, a->rd), numelem * a->imm); - } - return true; -} - -static bool trans_INCDEC_r(DisasContext *s, arg_incdec_cnt *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned fullsz = vec_full_reg_size(s); - unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); - int inc = numelem * a->imm * (a->d ? -1 : 1); - TCGv_i64 reg = cpu_reg(s, a->rd); - - tcg_gen_addi_i64(reg, reg, inc); - } - return true; -} - -static bool trans_SINCDEC_r_32(DisasContext *s, arg_incdec_cnt *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - unsigned fullsz = vec_full_reg_size(s); - unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); - int inc = numelem * a->imm; - TCGv_i64 reg = cpu_reg(s, a->rd); - - /* Use normal 64-bit arithmetic to detect 32-bit overflow. 
*/ - if (inc == 0) { - if (a->u) { - tcg_gen_ext32u_i64(reg, reg); - } else { - tcg_gen_ext32s_i64(reg, reg); - } - } else { - do_sat_addsub_32(reg, tcg_constant_i64(inc), a->u, a->d); - } - return true; -} - -static bool trans_SINCDEC_r_64(DisasContext *s, arg_incdec_cnt *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - unsigned fullsz = vec_full_reg_size(s); - unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); - int inc = numelem * a->imm; - TCGv_i64 reg = cpu_reg(s, a->rd); - - if (inc != 0) { - do_sat_addsub_64(reg, tcg_constant_i64(inc), a->u, a->d); - } - return true; -} - -static bool trans_INCDEC_v(DisasContext *s, arg_incdec2_cnt *a) -{ - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - - unsigned fullsz = vec_full_reg_size(s); - unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); - int inc = numelem * a->imm; - - if (inc != 0) { - if (sve_access_check(s)) { - tcg_gen_gvec_adds(a->esz, vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - tcg_constant_i64(a->d ? -inc : inc), - fullsz, fullsz); - } - } else { - do_mov_z(s, a->rd, a->rn); - } - return true; -} - -static bool trans_SINCDEC_v(DisasContext *s, arg_incdec2_cnt *a) -{ - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - - unsigned fullsz = vec_full_reg_size(s); - unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); - int inc = numelem * a->imm; - - if (inc != 0) { - if (sve_access_check(s)) { - do_sat_addsub_vec(s, a->esz, a->rd, a->rn, - tcg_constant_i64(inc), a->u, a->d); - } - } else { - do_mov_z(s, a->rd, a->rn); - } - return true; -} - -/* - *** SVE Bitwise Immediate Group - */ - -static bool do_zz_dbm(DisasContext *s, arg_rr_dbm *a, GVecGen2iFn *gvec_fn) -{ - uint64_t imm; - if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), - extract32(a->dbm, 0, 6), - extract32(a->dbm, 6, 6))) { - return false; - } - return gen_gvec_fn_zzi(s, gvec_fn, MO_64, a->rd, a->rn, imm); -} - -TRANS_FEAT(AND_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_andi) -TRANS_FEAT(ORR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_ori) -TRANS_FEAT(EOR_zzi, aa64_sve, do_zz_dbm, a, tcg_gen_gvec_xori) - -static bool trans_DUPM(DisasContext *s, arg_DUPM *a) -{ - uint64_t imm; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), - extract32(a->dbm, 0, 6), - extract32(a->dbm, 6, 6))) { - return false; - } - if (sve_access_check(s)) { - do_dupi_z(s, a->rd, imm); - } - return true; -} - -/* - *** SVE Integer Wide Immediate - Predicated Group - */ - -/* Implement all merging copies. This is used for CPY (immediate), - * FCPY, CPY (scalar), CPY (SIMD&FP scalar). 
- */ -static void do_cpy_m(DisasContext *s, int esz, int rd, int rn, int pg, - TCGv_i64 val) -{ - typedef void gen_cpy(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); - static gen_cpy * const fns[4] = { - gen_helper_sve_cpy_m_b, gen_helper_sve_cpy_m_h, - gen_helper_sve_cpy_m_s, gen_helper_sve_cpy_m_d, - }; - unsigned vsz = vec_full_reg_size(s); - TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - TCGv_ptr t_zd = tcg_temp_new_ptr(); - TCGv_ptr t_zn = tcg_temp_new_ptr(); - TCGv_ptr t_pg = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); - tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, rn)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); - - fns[esz](t_zd, t_zn, t_pg, val, desc); - - tcg_temp_free_ptr(t_zd); - tcg_temp_free_ptr(t_zn); - tcg_temp_free_ptr(t_pg); -} - -static bool trans_FCPY(DisasContext *s, arg_FCPY *a) -{ - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - /* Decode the VFP immediate. */ - uint64_t imm = vfp_expand_imm(a->esz, a->imm); - do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(imm)); - } - return true; -} - -static bool trans_CPY_m_i(DisasContext *s, arg_rpri_esz *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, tcg_constant_i64(a->imm)); - } - return true; -} - -static bool trans_CPY_z_i(DisasContext *s, arg_CPY_z_i *a) -{ - static gen_helper_gvec_2i * const fns[4] = { - gen_helper_sve_cpy_z_b, gen_helper_sve_cpy_z_h, - gen_helper_sve_cpy_z_s, gen_helper_sve_cpy_z_d, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), - pred_full_reg_offset(s, a->pg), - tcg_constant_i64(a->imm), - vsz, vsz, 0, fns[a->esz]); - } - return true; -} - -/* - *** SVE Permute Extract Group - */ - -static bool do_EXT(DisasContext *s, int rd, int rn, int rm, int imm) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned vsz = vec_full_reg_size(s); - unsigned n_ofs = imm >= vsz ? 0 : imm; - unsigned n_siz = vsz - n_ofs; - unsigned d = vec_full_reg_offset(s, rd); - unsigned n = vec_full_reg_offset(s, rn); - unsigned m = vec_full_reg_offset(s, rm); - - /* Use host vector move insns if we have appropriate sizes - * and no unfortunate overlap. 
- */ - if (m != d - && n_ofs == size_for_gvec(n_ofs) - && n_siz == size_for_gvec(n_siz) - && (d != n || n_siz <= n_ofs)) { - tcg_gen_gvec_mov(0, d, n + n_ofs, n_siz, n_siz); - if (n_ofs != 0) { - tcg_gen_gvec_mov(0, d + n_siz, m, n_ofs, n_ofs); - } - } else { - tcg_gen_gvec_3_ool(d, n, m, vsz, vsz, n_ofs, gen_helper_sve_ext); - } - return true; -} - -TRANS_FEAT(EXT, aa64_sve, do_EXT, a->rd, a->rn, a->rm, a->imm) -TRANS_FEAT(EXT_sve2, aa64_sve2, do_EXT, a->rd, a->rn, (a->rn + 1) % 32, a->imm) - -/* - *** SVE Permute - Unpredicated Group - */ - -static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd), - vsz, vsz, cpu_reg_sp(s, a->rn)); - } - return true; -} - -static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if ((a->imm & 0x1f) == 0) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - unsigned dofs = vec_full_reg_offset(s, a->rd); - unsigned esz, index; - - esz = ctz32(a->imm); - index = a->imm >> (esz + 1); - - if ((index << esz) < vsz) { - unsigned nofs = vec_reg_offset(s, a->rn, index, esz); - tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz); - } else { - /* - * While dup_mem handles 128-bit elements, dup_imm does not. - * Thankfully element size doesn't matter for splatting zero. - */ - tcg_gen_gvec_dup_imm(MO_64, dofs, vsz, vsz, 0); - } - } - return true; -} - -static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) -{ - typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); - static gen_insr * const fns[4] = { - gen_helper_sve_insr_b, gen_helper_sve_insr_h, - gen_helper_sve_insr_s, gen_helper_sve_insr_d, - }; - unsigned vsz = vec_full_reg_size(s); - TCGv_i32 desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - TCGv_ptr t_zd = tcg_temp_new_ptr(); - TCGv_ptr t_zn = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); - - fns[a->esz](t_zd, t_zn, val, desc); - - tcg_temp_free_ptr(t_zd); - tcg_temp_free_ptr(t_zn); -} - -static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 t = tcg_temp_new_i64(); - tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64)); - do_insr_i64(s, a, t); - tcg_temp_free_i64(t); - } - return true; -} - -static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - do_insr_i64(s, a, cpu_reg(s, a->rm)); - } - return true; -} - -static gen_helper_gvec_2 * const rev_fns[4] = { - gen_helper_sve_rev_b, gen_helper_sve_rev_h, - gen_helper_sve_rev_s, gen_helper_sve_rev_d -}; -TRANS_FEAT(REV_v, aa64_sve, gen_gvec_ool_zz, rev_fns[a->esz], a->rd, a->rn, 0) - -static gen_helper_gvec_3 * const sve_tbl_fns[4] = { - gen_helper_sve_tbl_b, gen_helper_sve_tbl_h, - gen_helper_sve_tbl_s, gen_helper_sve_tbl_d -}; -TRANS_FEAT(TBL, aa64_sve, gen_gvec_ool_arg_zzz, sve_tbl_fns[a->esz], a, 0) - -static gen_helper_gvec_4 * const sve2_tbl_fns[4] = { - gen_helper_sve2_tbl_b, gen_helper_sve2_tbl_h, - gen_helper_sve2_tbl_s, gen_helper_sve2_tbl_d -}; -TRANS_FEAT(TBL_sve2, aa64_sve2, gen_gvec_ool_zzzz, sve2_tbl_fns[a->esz], - a->rd, a->rn, (a->rn + 1) % 32, a->rm, 0) - -static 
gen_helper_gvec_3 * const tbx_fns[4] = { - gen_helper_sve2_tbx_b, gen_helper_sve2_tbx_h, - gen_helper_sve2_tbx_s, gen_helper_sve2_tbx_d -}; -TRANS_FEAT(TBX, aa64_sve2, gen_gvec_ool_arg_zzz, tbx_fns[a->esz], a, 0) - -static bool trans_UNPK(DisasContext *s, arg_UNPK *a) -{ - static gen_helper_gvec_2 * const fns[4][2] = { - { NULL, NULL }, - { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h }, - { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s }, - { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d }, - }; - - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn) - + (a->h ? vsz / 2 : 0), - vsz, vsz, 0, fns[a->esz][a->u]); - } - return true; -} - -/* - *** SVE Permute - Predicates Group - */ - -static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd, - gen_helper_gvec_3 *fn) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned vsz = pred_full_reg_size(s); - - TCGv_ptr t_d = tcg_temp_new_ptr(); - TCGv_ptr t_n = tcg_temp_new_ptr(); - TCGv_ptr t_m = tcg_temp_new_ptr(); - uint32_t desc = 0; - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz); - desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); - desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd); - - tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm)); - - fn(t_d, t_n, t_m, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(t_d); - tcg_temp_free_ptr(t_n); - tcg_temp_free_ptr(t_m); - return true; -} - -static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd, - gen_helper_gvec_2 *fn) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned vsz = pred_full_reg_size(s); - TCGv_ptr t_d = tcg_temp_new_ptr(); - TCGv_ptr t_n = tcg_temp_new_ptr(); - uint32_t desc = 0; - - tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz); - desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); - desc = FIELD_DP32(desc, PREDDESC, DATA, high_odd); - - fn(t_d, t_n, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(t_d); - tcg_temp_free_ptr(t_n); - return true; -} - -TRANS_FEAT(ZIP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_zip_p) -TRANS_FEAT(ZIP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_zip_p) -TRANS_FEAT(UZP1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_uzp_p) -TRANS_FEAT(UZP2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_uzp_p) -TRANS_FEAT(TRN1_p, aa64_sve, do_perm_pred3, a, 0, gen_helper_sve_trn_p) -TRANS_FEAT(TRN2_p, aa64_sve, do_perm_pred3, a, 1, gen_helper_sve_trn_p) - -TRANS_FEAT(REV_p, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_rev_p) -TRANS_FEAT(PUNPKLO, aa64_sve, do_perm_pred2, a, 0, gen_helper_sve_punpk_p) -TRANS_FEAT(PUNPKHI, aa64_sve, do_perm_pred2, a, 1, gen_helper_sve_punpk_p) - -/* - *** SVE Permute - Interleaving Group - */ - -static gen_helper_gvec_3 * const zip_fns[4] = { - gen_helper_sve_zip_b, gen_helper_sve_zip_h, - gen_helper_sve_zip_s, gen_helper_sve_zip_d, -}; -TRANS_FEAT(ZIP1_z, aa64_sve, gen_gvec_ool_arg_zzz, - zip_fns[a->esz], a, 0) -TRANS_FEAT(ZIP2_z, aa64_sve, gen_gvec_ool_arg_zzz, - zip_fns[a->esz], a, vec_full_reg_size(s) / 2) - -TRANS_FEAT(ZIP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_zip_q, a, 0) -TRANS_FEAT(ZIP2_q, aa64_sve_f64mm, 
gen_gvec_ool_arg_zzz, - gen_helper_sve2_zip_q, a, - QEMU_ALIGN_DOWN(vec_full_reg_size(s), 32) / 2) - -static gen_helper_gvec_3 * const uzp_fns[4] = { - gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, - gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, -}; - -TRANS_FEAT(UZP1_z, aa64_sve, gen_gvec_ool_arg_zzz, - uzp_fns[a->esz], a, 0) -TRANS_FEAT(UZP2_z, aa64_sve, gen_gvec_ool_arg_zzz, - uzp_fns[a->esz], a, 1 << a->esz) - -TRANS_FEAT(UZP1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_uzp_q, a, 0) -TRANS_FEAT(UZP2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_uzp_q, a, 16) - -static gen_helper_gvec_3 * const trn_fns[4] = { - gen_helper_sve_trn_b, gen_helper_sve_trn_h, - gen_helper_sve_trn_s, gen_helper_sve_trn_d, -}; - -TRANS_FEAT(TRN1_z, aa64_sve, gen_gvec_ool_arg_zzz, - trn_fns[a->esz], a, 0) -TRANS_FEAT(TRN2_z, aa64_sve, gen_gvec_ool_arg_zzz, - trn_fns[a->esz], a, 1 << a->esz) - -TRANS_FEAT(TRN1_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_trn_q, a, 0) -TRANS_FEAT(TRN2_q, aa64_sve_f64mm, gen_gvec_ool_arg_zzz, - gen_helper_sve2_trn_q, a, 16) - -/* - *** SVE Permute Vector - Predicated Group - */ - -static gen_helper_gvec_3 * const compact_fns[4] = { - NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d -}; -TRANS_FEAT_NONSTREAMING(COMPACT, aa64_sve, gen_gvec_ool_arg_zpz, - compact_fns[a->esz], a, 0) - -/* Call the helper that computes the ARM LastActiveElement pseudocode - * function, scaled by the element size. This includes the not found - * indication; e.g. not found for esz=3 is -8. - */ -static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg) -{ - /* Predicate sizes may be smaller and cannot use simd_desc. We cannot - * round up, as we do elsewhere, because we need the exact size. - */ - TCGv_ptr t_p = tcg_temp_new_ptr(); - unsigned desc = 0; - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, pred_full_reg_size(s)); - desc = FIELD_DP32(desc, PREDDESC, ESZ, esz); - - tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg)); - - gen_helper_sve_last_active_element(ret, t_p, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(t_p); -} - -/* Increment LAST to the offset of the next element in the vector, - * wrapping around to 0. - */ -static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz) -{ - unsigned vsz = vec_full_reg_size(s); - - tcg_gen_addi_i32(last, last, 1 << esz); - if (is_power_of_2(vsz)) { - tcg_gen_andi_i32(last, last, vsz - 1); - } else { - TCGv_i32 max = tcg_constant_i32(vsz); - TCGv_i32 zero = tcg_constant_i32(0); - tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last); - } -} - -/* If LAST < 0, set LAST to the offset of the last element in the vector. */ -static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz) -{ - unsigned vsz = vec_full_reg_size(s); - - if (is_power_of_2(vsz)) { - tcg_gen_andi_i32(last, last, vsz - 1); - } else { - TCGv_i32 max = tcg_constant_i32(vsz - (1 << esz)); - TCGv_i32 zero = tcg_constant_i32(0); - tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last); - } -} - -/* Load an unsigned element of ESZ from BASE+OFS. */ -static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz) -{ - TCGv_i64 r = tcg_temp_new_i64(); - - switch (esz) { - case 0: - tcg_gen_ld8u_i64(r, base, ofs); - break; - case 1: - tcg_gen_ld16u_i64(r, base, ofs); - break; - case 2: - tcg_gen_ld32u_i64(r, base, ofs); - break; - case 3: - tcg_gen_ld_i64(r, base, ofs); - break; - default: - g_assert_not_reached(); - } - return r; -} - -/* Load an unsigned element of ESZ from RM[LAST]. 
*/ -static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last, - int rm, int esz) -{ - TCGv_ptr p = tcg_temp_new_ptr(); - TCGv_i64 r; - - /* Convert offset into vector into offset into ENV. - * The final adjustment for the vector register base - * is added via constant offset to the load. - */ -#if HOST_BIG_ENDIAN - /* Adjust for element ordering. See vec_reg_offset. */ - if (esz < 3) { - tcg_gen_xori_i32(last, last, 8 - (1 << esz)); - } -#endif - tcg_gen_ext_i32_ptr(p, last); - tcg_gen_add_ptr(p, p, cpu_env); - - r = load_esz(p, vec_full_reg_offset(s, rm), esz); - tcg_temp_free_ptr(p); - - return r; -} - -/* Compute CLAST for a Zreg. */ -static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before) -{ - TCGv_i32 last; - TCGLabel *over; - TCGv_i64 ele; - unsigned vsz, esz = a->esz; - - if (!sve_access_check(s)) { - return true; - } - - last = tcg_temp_local_new_i32(); - over = gen_new_label(); - - find_last_active(s, last, esz, a->pg); - - /* There is of course no movcond for a 2048-bit vector, - * so we must branch over the actual store. - */ - tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over); - - if (!before) { - incr_last_active(s, last, esz); - } - - ele = load_last_active(s, last, a->rm, esz); - tcg_temp_free_i32(last); - - vsz = vec_full_reg_size(s); - tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele); - tcg_temp_free_i64(ele); - - /* If this insn used MOVPRFX, we may need a second move. */ - if (a->rd != a->rn) { - TCGLabel *done = gen_new_label(); - tcg_gen_br(done); - - gen_set_label(over); - do_mov_z(s, a->rd, a->rn); - - gen_set_label(done); - } else { - gen_set_label(over); - } - return true; -} - -TRANS_FEAT(CLASTA_z, aa64_sve, do_clast_vector, a, false) -TRANS_FEAT(CLASTB_z, aa64_sve, do_clast_vector, a, true) - -/* Compute CLAST for a scalar. */ -static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm, - bool before, TCGv_i64 reg_val) -{ - TCGv_i32 last = tcg_temp_new_i32(); - TCGv_i64 ele, cmp; - - find_last_active(s, last, esz, pg); - - /* Extend the original value of last prior to incrementing. */ - cmp = tcg_temp_new_i64(); - tcg_gen_ext_i32_i64(cmp, last); - - if (!before) { - incr_last_active(s, last, esz); - } - - /* The conceit here is that while last < 0 indicates not found, after - * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address - * from which we can load garbage. We then discard the garbage with - * a conditional move. - */ - ele = load_last_active(s, last, rm, esz); - tcg_temp_free_i32(last); - - tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, tcg_constant_i64(0), - ele, reg_val); - - tcg_temp_free_i64(cmp); - tcg_temp_free_i64(ele); -} - -/* Compute CLAST for a Vreg. */ -static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before) -{ - if (sve_access_check(s)) { - int esz = a->esz; - int ofs = vec_reg_offset(s, a->rd, 0, esz); - TCGv_i64 reg = load_esz(cpu_env, ofs, esz); - - do_clast_scalar(s, esz, a->pg, a->rn, before, reg); - write_fp_dreg(s, a->rd, reg); - tcg_temp_free_i64(reg); - } - return true; -} - -TRANS_FEAT(CLASTA_v, aa64_sve, do_clast_fp, a, false) -TRANS_FEAT(CLASTB_v, aa64_sve, do_clast_fp, a, true) - -/* Compute CLAST for a Xreg. 
*/ -static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before) -{ - TCGv_i64 reg; - - if (!sve_access_check(s)) { - return true; - } - - reg = cpu_reg(s, a->rd); - switch (a->esz) { - case 0: - tcg_gen_ext8u_i64(reg, reg); - break; - case 1: - tcg_gen_ext16u_i64(reg, reg); - break; - case 2: - tcg_gen_ext32u_i64(reg, reg); - break; - case 3: - break; - default: - g_assert_not_reached(); - } - - do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg); - return true; -} - -TRANS_FEAT(CLASTA_r, aa64_sve, do_clast_general, a, false) -TRANS_FEAT(CLASTB_r, aa64_sve, do_clast_general, a, true) - -/* Compute LAST for a scalar. */ -static TCGv_i64 do_last_scalar(DisasContext *s, int esz, - int pg, int rm, bool before) -{ - TCGv_i32 last = tcg_temp_new_i32(); - TCGv_i64 ret; - - find_last_active(s, last, esz, pg); - if (before) { - wrap_last_active(s, last, esz); - } else { - incr_last_active(s, last, esz); - } - - ret = load_last_active(s, last, rm, esz); - tcg_temp_free_i32(last); - return ret; -} - -/* Compute LAST for a Vreg. */ -static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before) -{ - if (sve_access_check(s)) { - TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); - write_fp_dreg(s, a->rd, val); - tcg_temp_free_i64(val); - } - return true; -} - -TRANS_FEAT(LASTA_v, aa64_sve, do_last_fp, a, false) -TRANS_FEAT(LASTB_v, aa64_sve, do_last_fp, a, true) - -/* Compute LAST for a Xreg. */ -static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before) -{ - if (sve_access_check(s)) { - TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); - tcg_gen_mov_i64(cpu_reg(s, a->rd), val); - tcg_temp_free_i64(val); - } - return true; -} - -TRANS_FEAT(LASTA_r, aa64_sve, do_last_general, a, false) -TRANS_FEAT(LASTB_r, aa64_sve, do_last_general, a, true) - -static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn)); - } - return true; -} - -static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int ofs = vec_reg_offset(s, a->rn, 0, a->esz); - TCGv_i64 t = load_esz(cpu_env, ofs, a->esz); - do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t); - tcg_temp_free_i64(t); - } - return true; -} - -static gen_helper_gvec_3 * const revb_fns[4] = { - NULL, gen_helper_sve_revb_h, - gen_helper_sve_revb_s, gen_helper_sve_revb_d, -}; -TRANS_FEAT(REVB, aa64_sve, gen_gvec_ool_arg_zpz, revb_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const revh_fns[4] = { - NULL, NULL, gen_helper_sve_revh_s, gen_helper_sve_revh_d, -}; -TRANS_FEAT(REVH, aa64_sve, gen_gvec_ool_arg_zpz, revh_fns[a->esz], a, 0) - -TRANS_FEAT(REVW, aa64_sve, gen_gvec_ool_arg_zpz, - a->esz == 3 ? 
gen_helper_sve_revw_d : NULL, a, 0) - -TRANS_FEAT(REVD, aa64_sme, gen_gvec_ool_arg_zpz, gen_helper_sme_revd_q, a, 0) - -TRANS_FEAT(SPLICE, aa64_sve, gen_gvec_ool_arg_zpzz, - gen_helper_sve_splice, a, a->esz) - -TRANS_FEAT(SPLICE_sve2, aa64_sve2, gen_gvec_ool_zzzp, gen_helper_sve_splice, - a->rd, a->rn, (a->rn + 1) % 32, a->pg, a->esz) - -/* - *** SVE Integer Compare - Vectors Group - */ - -static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a, - gen_helper_gvec_flags_4 *gen_fn) -{ - TCGv_ptr pd, zn, zm, pg; - unsigned vsz; - TCGv_i32 t; - - if (gen_fn == NULL) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - vsz = vec_full_reg_size(s); - t = tcg_temp_new_i32(); - pd = tcg_temp_new_ptr(); - zn = tcg_temp_new_ptr(); - zm = tcg_temp_new_ptr(); - pg = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm)); - tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); - - gen_fn(t, pd, zn, zm, pg, tcg_constant_i32(simd_desc(vsz, vsz, 0))); - - tcg_temp_free_ptr(pd); - tcg_temp_free_ptr(zn); - tcg_temp_free_ptr(zm); - tcg_temp_free_ptr(pg); - - do_pred_flags(t); - - tcg_temp_free_i32(t); - return true; -} - -#define DO_PPZZ(NAME, name) \ - static gen_helper_gvec_flags_4 * const name##_ppzz_fns[4] = { \ - gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \ - gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \ - }; \ - TRANS_FEAT(NAME##_ppzz, aa64_sve, do_ppzz_flags, \ - a, name##_ppzz_fns[a->esz]) - -DO_PPZZ(CMPEQ, cmpeq) -DO_PPZZ(CMPNE, cmpne) -DO_PPZZ(CMPGT, cmpgt) -DO_PPZZ(CMPGE, cmpge) -DO_PPZZ(CMPHI, cmphi) -DO_PPZZ(CMPHS, cmphs) - -#undef DO_PPZZ - -#define DO_PPZW(NAME, name) \ - static gen_helper_gvec_flags_4 * const name##_ppzw_fns[4] = { \ - gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \ - gen_helper_sve_##name##_ppzw_s, NULL \ - }; \ - TRANS_FEAT(NAME##_ppzw, aa64_sve, do_ppzz_flags, \ - a, name##_ppzw_fns[a->esz]) - -DO_PPZW(CMPEQ, cmpeq) -DO_PPZW(CMPNE, cmpne) -DO_PPZW(CMPGT, cmpgt) -DO_PPZW(CMPGE, cmpge) -DO_PPZW(CMPHI, cmphi) -DO_PPZW(CMPHS, cmphs) -DO_PPZW(CMPLT, cmplt) -DO_PPZW(CMPLE, cmple) -DO_PPZW(CMPLO, cmplo) -DO_PPZW(CMPLS, cmpls) - -#undef DO_PPZW - -/* - *** SVE Integer Compare - Immediate Groups - */ - -static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a, - gen_helper_gvec_flags_3 *gen_fn) -{ - TCGv_ptr pd, zn, pg; - unsigned vsz; - TCGv_i32 t; - - if (gen_fn == NULL) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - vsz = vec_full_reg_size(s); - t = tcg_temp_new_i32(); - pd = tcg_temp_new_ptr(); - zn = tcg_temp_new_ptr(); - pg = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); - - gen_fn(t, pd, zn, pg, tcg_constant_i32(simd_desc(vsz, vsz, a->imm))); - - tcg_temp_free_ptr(pd); - tcg_temp_free_ptr(zn); - tcg_temp_free_ptr(pg); - - do_pred_flags(t); - - tcg_temp_free_i32(t); - return true; -} - -#define DO_PPZI(NAME, name) \ - static gen_helper_gvec_flags_3 * const name##_ppzi_fns[4] = { \ - gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \ - gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \ - }; \ - TRANS_FEAT(NAME##_ppzi, aa64_sve, do_ppzi_flags, a, \ - name##_ppzi_fns[a->esz]) - -DO_PPZI(CMPEQ, cmpeq) 
-DO_PPZI(CMPNE, cmpne) -DO_PPZI(CMPGT, cmpgt) -DO_PPZI(CMPGE, cmpge) -DO_PPZI(CMPHI, cmphi) -DO_PPZI(CMPHS, cmphs) -DO_PPZI(CMPLT, cmplt) -DO_PPZI(CMPLE, cmple) -DO_PPZI(CMPLO, cmplo) -DO_PPZI(CMPLS, cmpls) - -#undef DO_PPZI - -/* - *** SVE Partition Break Group - */ - -static bool do_brk3(DisasContext *s, arg_rprr_s *a, - gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned vsz = pred_full_reg_size(s); - - /* Predicate sizes may be smaller and cannot use simd_desc. */ - TCGv_ptr d = tcg_temp_new_ptr(); - TCGv_ptr n = tcg_temp_new_ptr(); - TCGv_ptr m = tcg_temp_new_ptr(); - TCGv_ptr g = tcg_temp_new_ptr(); - TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz)); - - tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm)); - tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); - - if (a->s) { - TCGv_i32 t = tcg_temp_new_i32(); - fn_s(t, d, n, m, g, desc); - do_pred_flags(t); - tcg_temp_free_i32(t); - } else { - fn(d, n, m, g, desc); - } - tcg_temp_free_ptr(d); - tcg_temp_free_ptr(n); - tcg_temp_free_ptr(m); - tcg_temp_free_ptr(g); - return true; -} - -static bool do_brk2(DisasContext *s, arg_rpr_s *a, - gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s) -{ - if (!sve_access_check(s)) { - return true; - } - - unsigned vsz = pred_full_reg_size(s); - - /* Predicate sizes may be smaller and cannot use simd_desc. */ - TCGv_ptr d = tcg_temp_new_ptr(); - TCGv_ptr n = tcg_temp_new_ptr(); - TCGv_ptr g = tcg_temp_new_ptr(); - TCGv_i32 desc = tcg_constant_i32(FIELD_DP32(0, PREDDESC, OPRSZ, vsz)); - - tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); - tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); - - if (a->s) { - TCGv_i32 t = tcg_temp_new_i32(); - fn_s(t, d, n, g, desc); - do_pred_flags(t); - tcg_temp_free_i32(t); - } else { - fn(d, n, g, desc); - } - tcg_temp_free_ptr(d); - tcg_temp_free_ptr(n); - tcg_temp_free_ptr(g); - return true; -} - -TRANS_FEAT(BRKPA, aa64_sve, do_brk3, a, - gen_helper_sve_brkpa, gen_helper_sve_brkpas) -TRANS_FEAT(BRKPB, aa64_sve, do_brk3, a, - gen_helper_sve_brkpb, gen_helper_sve_brkpbs) - -TRANS_FEAT(BRKA_m, aa64_sve, do_brk2, a, - gen_helper_sve_brka_m, gen_helper_sve_brkas_m) -TRANS_FEAT(BRKB_m, aa64_sve, do_brk2, a, - gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m) - -TRANS_FEAT(BRKA_z, aa64_sve, do_brk2, a, - gen_helper_sve_brka_z, gen_helper_sve_brkas_z) -TRANS_FEAT(BRKB_z, aa64_sve, do_brk2, a, - gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z) - -TRANS_FEAT(BRKN, aa64_sve, do_brk2, a, - gen_helper_sve_brkn, gen_helper_sve_brkns) - -/* - *** SVE Predicate Count Group - */ - -static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg) -{ - unsigned psz = pred_full_reg_size(s); - - if (psz <= 8) { - uint64_t psz_mask; - - tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn)); - if (pn != pg) { - TCGv_i64 g = tcg_temp_new_i64(); - tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg)); - tcg_gen_and_i64(val, val, g); - tcg_temp_free_i64(g); - } - - /* Reduce the pred_esz_masks value simply to reduce the - * size of the code generated here. 
- */ - psz_mask = MAKE_64BIT_MASK(0, psz * 8); - tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask); - - tcg_gen_ctpop_i64(val, val); - } else { - TCGv_ptr t_pn = tcg_temp_new_ptr(); - TCGv_ptr t_pg = tcg_temp_new_ptr(); - unsigned desc = 0; - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, psz); - desc = FIELD_DP32(desc, PREDDESC, ESZ, esz); - - tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); - - gen_helper_sve_cntp(val, t_pn, t_pg, tcg_constant_i32(desc)); - tcg_temp_free_ptr(t_pn); - tcg_temp_free_ptr(t_pg); - } -} - -static bool trans_CNTP(DisasContext *s, arg_CNTP *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg); - } - return true; -} - -static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 reg = cpu_reg(s, a->rd); - TCGv_i64 val = tcg_temp_new_i64(); - - do_cntp(s, val, a->esz, a->pg, a->pg); - if (a->d) { - tcg_gen_sub_i64(reg, reg, val); - } else { - tcg_gen_add_i64(reg, reg, val); - } - tcg_temp_free_i64(val); - } - return true; -} - -static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a) -{ - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_i64 val = tcg_temp_new_i64(); - GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds; - - do_cntp(s, val, a->esz, a->pg, a->pg); - gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), val, vsz, vsz); - } - return true; -} - -static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 reg = cpu_reg(s, a->rd); - TCGv_i64 val = tcg_temp_new_i64(); - - do_cntp(s, val, a->esz, a->pg, a->pg); - do_sat_addsub_32(reg, val, a->u, a->d); - } - return true; -} - -static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 reg = cpu_reg(s, a->rd); - TCGv_i64 val = tcg_temp_new_i64(); - - do_cntp(s, val, a->esz, a->pg, a->pg); - do_sat_addsub_64(reg, val, a->u, a->d); - } - return true; -} - -static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a) -{ - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 val = tcg_temp_new_i64(); - do_cntp(s, val, a->esz, a->pg, a->pg); - do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d); - } - return true; -} - -/* - *** SVE Integer Compare Scalars Group - */ - -static bool trans_CTERM(DisasContext *s, arg_CTERM *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ); - TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf); - TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf); - TCGv_i64 cmp = tcg_temp_new_i64(); - - tcg_gen_setcond_i64(cond, cmp, rn, rm); - tcg_gen_extrl_i64_i32(cpu_NF, cmp); - tcg_temp_free_i64(cmp); - - /* VF = !NF & !CF. */ - tcg_gen_xori_i32(cpu_VF, cpu_NF, 1); - tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF); - - /* Both NF and VF actually look at bit 31. 
*/ - tcg_gen_neg_i32(cpu_NF, cpu_NF); - tcg_gen_neg_i32(cpu_VF, cpu_VF); - return true; -} - -static bool trans_WHILE(DisasContext *s, arg_WHILE *a) -{ - TCGv_i64 op0, op1, t0, t1, tmax; - TCGv_i32 t2; - TCGv_ptr ptr; - unsigned vsz = vec_full_reg_size(s); - unsigned desc = 0; - TCGCond cond; - uint64_t maxval; - /* Note that GE/HS has a->eq == 0 and GT/HI has a->eq == 1. */ - bool eq = a->eq == a->lt; - - /* The greater-than conditions are all SVE2. */ - if (a->lt - ? !dc_isar_feature(aa64_sve, s) - : !dc_isar_feature(aa64_sve2, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - op0 = read_cpu_reg(s, a->rn, 1); - op1 = read_cpu_reg(s, a->rm, 1); - - if (!a->sf) { - if (a->u) { - tcg_gen_ext32u_i64(op0, op0); - tcg_gen_ext32u_i64(op1, op1); - } else { - tcg_gen_ext32s_i64(op0, op0); - tcg_gen_ext32s_i64(op1, op1); - } - } - - /* For the helper, compress the different conditions into a computation - * of how many iterations for which the condition is true. - */ - t0 = tcg_temp_new_i64(); - t1 = tcg_temp_new_i64(); - - if (a->lt) { - tcg_gen_sub_i64(t0, op1, op0); - if (a->u) { - maxval = a->sf ? UINT64_MAX : UINT32_MAX; - cond = eq ? TCG_COND_LEU : TCG_COND_LTU; - } else { - maxval = a->sf ? INT64_MAX : INT32_MAX; - cond = eq ? TCG_COND_LE : TCG_COND_LT; - } - } else { - tcg_gen_sub_i64(t0, op0, op1); - if (a->u) { - maxval = 0; - cond = eq ? TCG_COND_GEU : TCG_COND_GTU; - } else { - maxval = a->sf ? INT64_MIN : INT32_MIN; - cond = eq ? TCG_COND_GE : TCG_COND_GT; - } - } - - tmax = tcg_constant_i64(vsz >> a->esz); - if (eq) { - /* Equality means one more iteration. */ - tcg_gen_addi_i64(t0, t0, 1); - - /* - * For the less-than while, if op1 is maxval (and the only time - * the addition above could overflow), then we produce an all-true - * predicate by setting the count to the vector length. This is - * because the pseudocode is described as an increment + compare - * loop, and the maximum integer would always compare true. - * Similarly, the greater-than while has the same issue with the - * minimum integer due to the decrement + compare loop. - */ - tcg_gen_movi_i64(t1, maxval); - tcg_gen_movcond_i64(TCG_COND_EQ, t0, op1, t1, tmax, t0); - } - - /* Bound to the maximum. */ - tcg_gen_umin_i64(t0, t0, tmax); - - /* Set the count to zero if the condition is false. */ - tcg_gen_movi_i64(t1, 0); - tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); - tcg_temp_free_i64(t1); - - /* Since we're bounded, pass as a 32-bit type. */ - t2 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t2, t0); - tcg_temp_free_i64(t0); - - /* Scale elements to bits. 
*/ - tcg_gen_shli_i32(t2, t2, a->esz); - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); - desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); - - ptr = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); - - if (a->lt) { - gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); - } else { - gen_helper_sve_whileg(t2, ptr, t2, tcg_constant_i32(desc)); - } - do_pred_flags(t2); - - tcg_temp_free_ptr(ptr); - tcg_temp_free_i32(t2); - return true; -} - -static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a) -{ - TCGv_i64 op0, op1, diff, t1, tmax; - TCGv_i32 t2; - TCGv_ptr ptr; - unsigned vsz = vec_full_reg_size(s); - unsigned desc = 0; - - if (!dc_isar_feature(aa64_sve2, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - op0 = read_cpu_reg(s, a->rn, 1); - op1 = read_cpu_reg(s, a->rm, 1); - - tmax = tcg_constant_i64(vsz); - diff = tcg_temp_new_i64(); - - if (a->rw) { - /* WHILERW */ - /* diff = abs(op1 - op0), noting that op0/1 are unsigned. */ - t1 = tcg_temp_new_i64(); - tcg_gen_sub_i64(diff, op0, op1); - tcg_gen_sub_i64(t1, op1, op0); - tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, diff, t1); - tcg_temp_free_i64(t1); - /* Round down to a multiple of ESIZE. */ - tcg_gen_andi_i64(diff, diff, -1 << a->esz); - /* If op1 == op0, diff == 0, and the condition is always true. */ - tcg_gen_movcond_i64(TCG_COND_EQ, diff, op0, op1, tmax, diff); - } else { - /* WHILEWR */ - tcg_gen_sub_i64(diff, op1, op0); - /* Round down to a multiple of ESIZE. */ - tcg_gen_andi_i64(diff, diff, -1 << a->esz); - /* If op0 >= op1, diff <= 0, the condition is always true. */ - tcg_gen_movcond_i64(TCG_COND_GEU, diff, op0, op1, tmax, diff); - } - - /* Bound to the maximum. */ - tcg_gen_umin_i64(diff, diff, tmax); - - /* Since we're bounded, pass as a 32-bit type. */ - t2 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t2, diff); - tcg_temp_free_i64(diff); - - desc = FIELD_DP32(desc, PREDDESC, OPRSZ, vsz / 8); - desc = FIELD_DP32(desc, PREDDESC, ESZ, a->esz); - - ptr = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); - - gen_helper_sve_whilel(t2, ptr, t2, tcg_constant_i32(desc)); - do_pred_flags(t2); - - tcg_temp_free_ptr(ptr); - tcg_temp_free_i32(t2); - return true; -} - -/* - *** SVE Integer Wide Immediate - Unpredicated Group - */ - -static bool trans_FDUP(DisasContext *s, arg_FDUP *a) -{ - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - int dofs = vec_full_reg_offset(s, a->rd); - uint64_t imm; - - /* Decode the VFP immediate. 
*/ - imm = vfp_expand_imm(a->esz, a->imm); - tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm); - } - return true; -} - -static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - int dofs = vec_full_reg_offset(s, a->rd); - tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm); - } - return true; -} - -TRANS_FEAT(ADD_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_addi, a) - -static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a) -{ - a->imm = -a->imm; - return trans_ADD_zzi(s, a); -} - -static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a) -{ - static const TCGOpcode vecop_list[] = { INDEX_op_sub_vec, 0 }; - static const GVecGen2s op[4] = { - { .fni8 = tcg_gen_vec_sub8_i64, - .fniv = tcg_gen_sub_vec, - .fno = gen_helper_sve_subri_b, - .opt_opc = vecop_list, - .vece = MO_8, - .scalar_first = true }, - { .fni8 = tcg_gen_vec_sub16_i64, - .fniv = tcg_gen_sub_vec, - .fno = gen_helper_sve_subri_h, - .opt_opc = vecop_list, - .vece = MO_16, - .scalar_first = true }, - { .fni4 = tcg_gen_sub_i32, - .fniv = tcg_gen_sub_vec, - .fno = gen_helper_sve_subri_s, - .opt_opc = vecop_list, - .vece = MO_32, - .scalar_first = true }, - { .fni8 = tcg_gen_sub_i64, - .fniv = tcg_gen_sub_vec, - .fno = gen_helper_sve_subri_d, - .opt_opc = vecop_list, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .vece = MO_64, - .scalar_first = true } - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vsz, vsz, tcg_constant_i64(a->imm), &op[a->esz]); - } - return true; -} - -TRANS_FEAT(MUL_zzi, aa64_sve, gen_gvec_fn_arg_zzi, tcg_gen_gvec_muli, a) - -static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, bool u, bool d) -{ - if (sve_access_check(s)) { - do_sat_addsub_vec(s, a->esz, a->rd, a->rn, - tcg_constant_i64(a->imm), u, d); - } - return true; -} - -TRANS_FEAT(SQADD_zzi, aa64_sve, do_zzi_sat, a, false, false) -TRANS_FEAT(UQADD_zzi, aa64_sve, do_zzi_sat, a, true, false) -TRANS_FEAT(SQSUB_zzi, aa64_sve, do_zzi_sat, a, false, true) -TRANS_FEAT(UQSUB_zzi, aa64_sve, do_zzi_sat, a, true, true) - -static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn) -{ - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - tcg_constant_i64(a->imm), vsz, vsz, 0, fn); - } - return true; -} - -#define DO_ZZI(NAME, name) \ - static gen_helper_gvec_2i * const name##i_fns[4] = { \ - gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \ - gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \ - }; \ - TRANS_FEAT(NAME##_zzi, aa64_sve, do_zzi_ool, a, name##i_fns[a->esz]) - -DO_ZZI(SMAX, smax) -DO_ZZI(UMAX, umax) -DO_ZZI(SMIN, smin) -DO_ZZI(UMIN, umin) - -#undef DO_ZZI - -static gen_helper_gvec_4 * const dot_fns[2][2] = { - { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h }, - { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h } -}; -TRANS_FEAT(DOT_zzzz, aa64_sve, gen_gvec_ool_zzzz, - dot_fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0) - -/* - * SVE Multiply - Indexed - */ - -TRANS_FEAT(SDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_sdot_idx_b, a) -TRANS_FEAT(SDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_sdot_idx_h, a) -TRANS_FEAT(UDOT_zzxw_s, aa64_sve, gen_gvec_ool_arg_zzxz, - 
gen_helper_gvec_udot_idx_b, a) -TRANS_FEAT(UDOT_zzxw_d, aa64_sve, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_udot_idx_h, a) - -TRANS_FEAT(SUDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_sudot_idx_b, a) -TRANS_FEAT(USDOT_zzxw_s, aa64_sve_i8mm, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_usdot_idx_b, a) - -#define DO_SVE2_RRX(NAME, FUNC) \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ - a->rd, a->rn, a->rm, a->index) - -DO_SVE2_RRX(MUL_zzx_h, gen_helper_gvec_mul_idx_h) -DO_SVE2_RRX(MUL_zzx_s, gen_helper_gvec_mul_idx_s) -DO_SVE2_RRX(MUL_zzx_d, gen_helper_gvec_mul_idx_d) - -DO_SVE2_RRX(SQDMULH_zzx_h, gen_helper_sve2_sqdmulh_idx_h) -DO_SVE2_RRX(SQDMULH_zzx_s, gen_helper_sve2_sqdmulh_idx_s) -DO_SVE2_RRX(SQDMULH_zzx_d, gen_helper_sve2_sqdmulh_idx_d) - -DO_SVE2_RRX(SQRDMULH_zzx_h, gen_helper_sve2_sqrdmulh_idx_h) -DO_SVE2_RRX(SQRDMULH_zzx_s, gen_helper_sve2_sqrdmulh_idx_s) -DO_SVE2_RRX(SQRDMULH_zzx_d, gen_helper_sve2_sqrdmulh_idx_d) - -#undef DO_SVE2_RRX - -#define DO_SVE2_RRX_TB(NAME, FUNC, TOP) \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_ool_zzz, FUNC, \ - a->rd, a->rn, a->rm, (a->index << 1) | TOP) - -DO_SVE2_RRX_TB(SQDMULLB_zzx_s, gen_helper_sve2_sqdmull_idx_s, false) -DO_SVE2_RRX_TB(SQDMULLB_zzx_d, gen_helper_sve2_sqdmull_idx_d, false) -DO_SVE2_RRX_TB(SQDMULLT_zzx_s, gen_helper_sve2_sqdmull_idx_s, true) -DO_SVE2_RRX_TB(SQDMULLT_zzx_d, gen_helper_sve2_sqdmull_idx_d, true) - -DO_SVE2_RRX_TB(SMULLB_zzx_s, gen_helper_sve2_smull_idx_s, false) -DO_SVE2_RRX_TB(SMULLB_zzx_d, gen_helper_sve2_smull_idx_d, false) -DO_SVE2_RRX_TB(SMULLT_zzx_s, gen_helper_sve2_smull_idx_s, true) -DO_SVE2_RRX_TB(SMULLT_zzx_d, gen_helper_sve2_smull_idx_d, true) - -DO_SVE2_RRX_TB(UMULLB_zzx_s, gen_helper_sve2_umull_idx_s, false) -DO_SVE2_RRX_TB(UMULLB_zzx_d, gen_helper_sve2_umull_idx_d, false) -DO_SVE2_RRX_TB(UMULLT_zzx_s, gen_helper_sve2_umull_idx_s, true) -DO_SVE2_RRX_TB(UMULLT_zzx_d, gen_helper_sve2_umull_idx_d, true) - -#undef DO_SVE2_RRX_TB - -#define DO_SVE2_RRXR(NAME, FUNC) \ - TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzxz, FUNC, a) - -DO_SVE2_RRXR(MLA_zzxz_h, gen_helper_gvec_mla_idx_h) -DO_SVE2_RRXR(MLA_zzxz_s, gen_helper_gvec_mla_idx_s) -DO_SVE2_RRXR(MLA_zzxz_d, gen_helper_gvec_mla_idx_d) - -DO_SVE2_RRXR(MLS_zzxz_h, gen_helper_gvec_mls_idx_h) -DO_SVE2_RRXR(MLS_zzxz_s, gen_helper_gvec_mls_idx_s) -DO_SVE2_RRXR(MLS_zzxz_d, gen_helper_gvec_mls_idx_d) - -DO_SVE2_RRXR(SQRDMLAH_zzxz_h, gen_helper_sve2_sqrdmlah_idx_h) -DO_SVE2_RRXR(SQRDMLAH_zzxz_s, gen_helper_sve2_sqrdmlah_idx_s) -DO_SVE2_RRXR(SQRDMLAH_zzxz_d, gen_helper_sve2_sqrdmlah_idx_d) - -DO_SVE2_RRXR(SQRDMLSH_zzxz_h, gen_helper_sve2_sqrdmlsh_idx_h) -DO_SVE2_RRXR(SQRDMLSH_zzxz_s, gen_helper_sve2_sqrdmlsh_idx_s) -DO_SVE2_RRXR(SQRDMLSH_zzxz_d, gen_helper_sve2_sqrdmlsh_idx_d) - -#undef DO_SVE2_RRXR - -#define DO_SVE2_RRXR_TB(NAME, FUNC, TOP) \ - TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \ - a->rd, a->rn, a->rm, a->ra, (a->index << 1) | TOP) - -DO_SVE2_RRXR_TB(SQDMLALB_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, false) -DO_SVE2_RRXR_TB(SQDMLALB_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, false) -DO_SVE2_RRXR_TB(SQDMLALT_zzxw_s, gen_helper_sve2_sqdmlal_idx_s, true) -DO_SVE2_RRXR_TB(SQDMLALT_zzxw_d, gen_helper_sve2_sqdmlal_idx_d, true) - -DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, false) -DO_SVE2_RRXR_TB(SQDMLSLB_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, false) -DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_s, gen_helper_sve2_sqdmlsl_idx_s, true) -DO_SVE2_RRXR_TB(SQDMLSLT_zzxw_d, gen_helper_sve2_sqdmlsl_idx_d, true) - 
-DO_SVE2_RRXR_TB(SMLALB_zzxw_s, gen_helper_sve2_smlal_idx_s, false) -DO_SVE2_RRXR_TB(SMLALB_zzxw_d, gen_helper_sve2_smlal_idx_d, false) -DO_SVE2_RRXR_TB(SMLALT_zzxw_s, gen_helper_sve2_smlal_idx_s, true) -DO_SVE2_RRXR_TB(SMLALT_zzxw_d, gen_helper_sve2_smlal_idx_d, true) - -DO_SVE2_RRXR_TB(UMLALB_zzxw_s, gen_helper_sve2_umlal_idx_s, false) -DO_SVE2_RRXR_TB(UMLALB_zzxw_d, gen_helper_sve2_umlal_idx_d, false) -DO_SVE2_RRXR_TB(UMLALT_zzxw_s, gen_helper_sve2_umlal_idx_s, true) -DO_SVE2_RRXR_TB(UMLALT_zzxw_d, gen_helper_sve2_umlal_idx_d, true) - -DO_SVE2_RRXR_TB(SMLSLB_zzxw_s, gen_helper_sve2_smlsl_idx_s, false) -DO_SVE2_RRXR_TB(SMLSLB_zzxw_d, gen_helper_sve2_smlsl_idx_d, false) -DO_SVE2_RRXR_TB(SMLSLT_zzxw_s, gen_helper_sve2_smlsl_idx_s, true) -DO_SVE2_RRXR_TB(SMLSLT_zzxw_d, gen_helper_sve2_smlsl_idx_d, true) - -DO_SVE2_RRXR_TB(UMLSLB_zzxw_s, gen_helper_sve2_umlsl_idx_s, false) -DO_SVE2_RRXR_TB(UMLSLB_zzxw_d, gen_helper_sve2_umlsl_idx_d, false) -DO_SVE2_RRXR_TB(UMLSLT_zzxw_s, gen_helper_sve2_umlsl_idx_s, true) -DO_SVE2_RRXR_TB(UMLSLT_zzxw_d, gen_helper_sve2_umlsl_idx_d, true) - -#undef DO_SVE2_RRXR_TB - -#define DO_SVE2_RRXR_ROT(NAME, FUNC) \ - TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_zzzz, FUNC, \ - a->rd, a->rn, a->rm, a->ra, (a->index << 2) | a->rot) - -DO_SVE2_RRXR_ROT(CMLA_zzxz_h, gen_helper_sve2_cmla_idx_h) -DO_SVE2_RRXR_ROT(CMLA_zzxz_s, gen_helper_sve2_cmla_idx_s) - -DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_h, gen_helper_sve2_sqrdcmlah_idx_h) -DO_SVE2_RRXR_ROT(SQRDCMLAH_zzxz_s, gen_helper_sve2_sqrdcmlah_idx_s) - -DO_SVE2_RRXR_ROT(CDOT_zzxw_s, gen_helper_sve2_cdot_idx_s) -DO_SVE2_RRXR_ROT(CDOT_zzxw_d, gen_helper_sve2_cdot_idx_d) - -#undef DO_SVE2_RRXR_ROT - -/* - *** SVE Floating Point Multiply-Add Indexed Group - */ - -static bool do_FMLA_zzxz(DisasContext *s, arg_rrxr_esz *a, bool sub) -{ - static gen_helper_gvec_4_ptr * const fns[4] = { - NULL, - gen_helper_gvec_fmla_idx_h, - gen_helper_gvec_fmla_idx_s, - gen_helper_gvec_fmla_idx_d, - }; - return gen_gvec_fpst_zzzz(s, fns[a->esz], a->rd, a->rn, a->rm, a->ra, - (a->index << 1) | sub, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); -} - -TRANS_FEAT(FMLA_zzxz, aa64_sve, do_FMLA_zzxz, a, false) -TRANS_FEAT(FMLS_zzxz, aa64_sve, do_FMLA_zzxz, a, true) - -/* - *** SVE Floating Point Multiply Indexed Group - */ - -static gen_helper_gvec_3_ptr * const fmul_idx_fns[4] = { - NULL, gen_helper_gvec_fmul_idx_h, - gen_helper_gvec_fmul_idx_s, gen_helper_gvec_fmul_idx_d, -}; -TRANS_FEAT(FMUL_zzx, aa64_sve, gen_gvec_fpst_zzz, - fmul_idx_fns[a->esz], a->rd, a->rn, a->rm, a->index, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -/* - *** SVE Floating Point Fast Reduction Group - */ - -typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr, - TCGv_ptr, TCGv_i32); - -static bool do_reduce(DisasContext *s, arg_rpr_esz *a, - gen_helper_fp_reduce *fn) -{ - unsigned vsz, p2vsz; - TCGv_i32 t_desc; - TCGv_ptr t_zn, t_pg, status; - TCGv_i64 temp; - - if (fn == NULL) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - vsz = vec_full_reg_size(s); - p2vsz = pow2ceil(vsz); - t_desc = tcg_constant_i32(simd_desc(vsz, vsz, p2vsz)); - temp = tcg_temp_new_i64(); - t_zn = tcg_temp_new_ptr(); - t_pg = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); - status = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); - - fn(temp, t_zn, t_pg, status, t_desc); - tcg_temp_free_ptr(t_zn); - tcg_temp_free_ptr(t_pg); - tcg_temp_free_ptr(status); - - write_fp_dreg(s, a->rd, temp); - tcg_temp_free_i64(temp); - return true; -} - -#define DO_VPZ(NAME, name) \ - static gen_helper_fp_reduce * const name##_fns[4] = { \ - NULL, gen_helper_sve_##name##_h, \ - gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ - }; \ - TRANS_FEAT(NAME, aa64_sve, do_reduce, a, name##_fns[a->esz]) - -DO_VPZ(FADDV, faddv) -DO_VPZ(FMINNMV, fminnmv) -DO_VPZ(FMAXNMV, fmaxnmv) -DO_VPZ(FMINV, fminv) -DO_VPZ(FMAXV, fmaxv) - -#undef DO_VPZ - -/* - *** SVE Floating Point Unary Operations - Unpredicated Group - */ - -static gen_helper_gvec_2_ptr * const frecpe_fns[] = { - NULL, gen_helper_gvec_frecpe_h, - gen_helper_gvec_frecpe_s, gen_helper_gvec_frecpe_d, -}; -TRANS_FEAT(FRECPE, aa64_sve, gen_gvec_fpst_arg_zz, frecpe_fns[a->esz], a, 0) - -static gen_helper_gvec_2_ptr * const frsqrte_fns[] = { - NULL, gen_helper_gvec_frsqrte_h, - gen_helper_gvec_frsqrte_s, gen_helper_gvec_frsqrte_d, -}; -TRANS_FEAT(FRSQRTE, aa64_sve, gen_gvec_fpst_arg_zz, frsqrte_fns[a->esz], a, 0) - -/* - *** SVE Floating Point Compare with Zero Group - */ - -static bool do_ppz_fp(DisasContext *s, arg_rpr_esz *a, - gen_helper_gvec_3_ptr *fn) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = - fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - - tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - pred_full_reg_offset(s, a->pg), - status, vsz, vsz, 0, fn); - tcg_temp_free_ptr(status); - } - return true; -} - -#define DO_PPZ(NAME, name) \ - static gen_helper_gvec_3_ptr * const name##_fns[] = { \ - NULL, gen_helper_sve_##name##_h, \ - gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ - }; \ - TRANS_FEAT(NAME, aa64_sve, do_ppz_fp, a, name##_fns[a->esz]) - -DO_PPZ(FCMGE_ppz0, fcmge0) -DO_PPZ(FCMGT_ppz0, fcmgt0) -DO_PPZ(FCMLE_ppz0, fcmle0) -DO_PPZ(FCMLT_ppz0, fcmlt0) -DO_PPZ(FCMEQ_ppz0, fcmeq0) -DO_PPZ(FCMNE_ppz0, fcmne0) - -#undef DO_PPZ - -/* - *** SVE floating-point trig multiply-add coefficient - */ - -static gen_helper_gvec_3_ptr * const ftmad_fns[4] = { - NULL, gen_helper_sve_ftmad_h, - gen_helper_sve_ftmad_s, gen_helper_sve_ftmad_d, -}; -TRANS_FEAT_NONSTREAMING(FTMAD, aa64_sve, gen_gvec_fpst_zzz, - ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -/* - *** SVE Floating Point Accumulating Reduction Group - */ - -static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) -{ - typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr, - TCGv_ptr, TCGv_ptr, TCGv_i32); - static fadda_fn * const fns[3] = { - gen_helper_sve_fadda_h, - gen_helper_sve_fadda_s, - gen_helper_sve_fadda_d, - }; - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr t_rm, t_pg, t_fpst; - TCGv_i64 t_val; - TCGv_i32 t_desc; - - if (a->esz == 0 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - - t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz); - t_rm = tcg_temp_new_ptr(); - t_pg = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); - t_fpst = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); - t_desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - - fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc); - - tcg_temp_free_ptr(t_fpst); - tcg_temp_free_ptr(t_pg); - tcg_temp_free_ptr(t_rm); - - write_fp_dreg(s, a->rd, t_val); - tcg_temp_free_i64(t_val); - return true; -} - -/* - *** SVE Floating Point Arithmetic - Unpredicated Group - */ - -#define DO_FP3(NAME, name) \ - static gen_helper_gvec_3_ptr * const name##_fns[4] = { \ - NULL, gen_helper_gvec_##name##_h, \ - gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ - }; \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_arg_zzz, name##_fns[a->esz], a, 0) - -DO_FP3(FADD_zzz, fadd) -DO_FP3(FSUB_zzz, fsub) -DO_FP3(FMUL_zzz, fmul) -DO_FP3(FRECPS, recps) -DO_FP3(FRSQRTS, rsqrts) - -#undef DO_FP3 - -static gen_helper_gvec_3_ptr * const ftsmul_fns[4] = { - NULL, gen_helper_gvec_ftsmul_h, - gen_helper_gvec_ftsmul_s, gen_helper_gvec_ftsmul_d -}; -TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz, - ftsmul_fns[a->esz], a, 0) - -/* - *** SVE Floating Point Arithmetic - Predicated Group - */ - -#define DO_ZPZZ_FP(NAME, FEAT, name) \ - static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \ - NULL, gen_helper_##name##_h, \ - gen_helper_##name##_s, gen_helper_##name##_d \ - }; \ - TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, name##_zpzz_fns[a->esz], a) - -DO_ZPZZ_FP(FADD_zpzz, aa64_sve, sve_fadd) -DO_ZPZZ_FP(FSUB_zpzz, aa64_sve, sve_fsub) -DO_ZPZZ_FP(FMUL_zpzz, aa64_sve, sve_fmul) -DO_ZPZZ_FP(FMIN_zpzz, aa64_sve, sve_fmin) -DO_ZPZZ_FP(FMAX_zpzz, aa64_sve, sve_fmax) -DO_ZPZZ_FP(FMINNM_zpzz, aa64_sve, sve_fminnum) -DO_ZPZZ_FP(FMAXNM_zpzz, aa64_sve, sve_fmaxnum) -DO_ZPZZ_FP(FABD, aa64_sve, sve_fabd) -DO_ZPZZ_FP(FSCALE, aa64_sve, sve_fscalbn) -DO_ZPZZ_FP(FDIV, aa64_sve, sve_fdiv) -DO_ZPZZ_FP(FMULX, aa64_sve, sve_fmulx) - -typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr, - TCGv_i64, TCGv_ptr, TCGv_i32); - -static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16, - TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn) -{ - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr t_zd, t_zn, t_pg, status; - TCGv_i32 desc; - - t_zd = tcg_temp_new_ptr(); - t_zn = tcg_temp_new_ptr(); - t_pg = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd)); - tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn)); - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); - - status = fpstatus_ptr(is_fp16 ? 
FPST_FPCR_F16 : FPST_FPCR); - desc = tcg_constant_i32(simd_desc(vsz, vsz, 0)); - fn(t_zd, t_zn, t_pg, scalar, status, desc); - - tcg_temp_free_ptr(status); - tcg_temp_free_ptr(t_pg); - tcg_temp_free_ptr(t_zn); - tcg_temp_free_ptr(t_zd); -} - -static bool do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm, - gen_helper_sve_fp2scalar *fn) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16, - tcg_constant_i64(imm), fn); - } - return true; -} - -#define DO_FP_IMM(NAME, name, const0, const1) \ - static gen_helper_sve_fp2scalar * const name##_fns[4] = { \ - NULL, gen_helper_sve_##name##_h, \ - gen_helper_sve_##name##_s, \ - gen_helper_sve_##name##_d \ - }; \ - static uint64_t const name##_const[4][2] = { \ - { -1, -1 }, \ - { float16_##const0, float16_##const1 }, \ - { float32_##const0, float32_##const1 }, \ - { float64_##const0, float64_##const1 }, \ - }; \ - TRANS_FEAT(NAME##_zpzi, aa64_sve, do_fp_imm, a, \ - name##_const[a->esz][a->imm], name##_fns[a->esz]) - -DO_FP_IMM(FADD, fadds, half, one) -DO_FP_IMM(FSUB, fsubs, half, one) -DO_FP_IMM(FMUL, fmuls, half, two) -DO_FP_IMM(FSUBR, fsubrs, half, one) -DO_FP_IMM(FMAXNM, fmaxnms, zero, one) -DO_FP_IMM(FMINNM, fminnms, zero, one) -DO_FP_IMM(FMAX, fmaxs, zero, one) -DO_FP_IMM(FMIN, fmins, zero, one) - -#undef DO_FP_IMM - -static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a, - gen_helper_gvec_4_ptr *fn) -{ - if (fn == NULL) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr status = fpstatus_ptr(a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vec_full_reg_offset(s, a->rm), - pred_full_reg_offset(s, a->pg), - status, vsz, vsz, 0, fn); - tcg_temp_free_ptr(status); - } - return true; -} - -#define DO_FPCMP(NAME, name) \ - static gen_helper_gvec_4_ptr * const name##_fns[4] = { \ - NULL, gen_helper_sve_##name##_h, \ - gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ - }; \ - TRANS_FEAT(NAME##_ppzz, aa64_sve, do_fp_cmp, a, name##_fns[a->esz]) - -DO_FPCMP(FCMGE, fcmge) -DO_FPCMP(FCMGT, fcmgt) -DO_FPCMP(FCMEQ, fcmeq) -DO_FPCMP(FCMNE, fcmne) -DO_FPCMP(FCMUO, fcmuo) -DO_FPCMP(FACGE, facge) -DO_FPCMP(FACGT, facgt) - -#undef DO_FPCMP - -static gen_helper_gvec_4_ptr * const fcadd_fns[] = { - NULL, gen_helper_sve_fcadd_h, - gen_helper_sve_fcadd_s, gen_helper_sve_fcadd_d, -}; -TRANS_FEAT(FCADD, aa64_sve, gen_gvec_fpst_zzzp, fcadd_fns[a->esz], - a->rd, a->rn, a->rm, a->pg, a->rot, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -#define DO_FMLA(NAME, name) \ - static gen_helper_gvec_5_ptr * const name##_fns[4] = { \ - NULL, gen_helper_sve_##name##_h, \ - gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ - }; \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, name##_fns[a->esz], \ - a->rd, a->rn, a->rm, a->ra, a->pg, 0, \ - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -DO_FMLA(FMLA_zpzzz, fmla_zpzzz) -DO_FMLA(FMLS_zpzzz, fmls_zpzzz) -DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz) -DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz) - -#undef DO_FMLA - -static gen_helper_gvec_5_ptr * const fcmla_fns[4] = { - NULL, gen_helper_sve_fcmla_zpzzz_h, - gen_helper_sve_fcmla_zpzzz_s, gen_helper_sve_fcmla_zpzzz_d, -}; -TRANS_FEAT(FCMLA_zpzzz, aa64_sve, gen_gvec_fpst_zzzzp, fcmla_fns[a->esz], - a->rd, a->rn, a->rm, a->ra, a->pg, a->rot, - a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR) - -static gen_helper_gvec_4_ptr * const fcmla_idx_fns[4] = { - NULL, gen_helper_gvec_fcmlah_idx, gen_helper_gvec_fcmlas_idx, NULL -}; -TRANS_FEAT(FCMLA_zzxz, aa64_sve, gen_gvec_fpst_zzzz, fcmla_idx_fns[a->esz], - a->rd, a->rn, a->rm, a->ra, a->index * 4 + a->rot, - a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -/* - *** SVE Floating Point Unary Operations Predicated Group - */ - -TRANS_FEAT(FCVT_sh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvt_sh, a, 0, FPST_FPCR) -TRANS_FEAT(FCVT_hs, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvt_hs, a, 0, FPST_FPCR) - -TRANS_FEAT(BFCVT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, - gen_helper_sve_bfcvt, a, 0, FPST_FPCR) - -TRANS_FEAT(FCVT_dh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvt_dh, a, 0, FPST_FPCR) -TRANS_FEAT(FCVT_hd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvt_hd, a, 0, FPST_FPCR) -TRANS_FEAT(FCVT_ds, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvt_ds, a, 0, FPST_FPCR) -TRANS_FEAT(FCVT_sd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvt_sd, a, 0, FPST_FPCR) - -TRANS_FEAT(FCVTZS_hh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_hh, a, 0, FPST_FPCR_F16) -TRANS_FEAT(FCVTZU_hh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_hh, a, 0, FPST_FPCR_F16) -TRANS_FEAT(FCVTZS_hs, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_hs, a, 0, FPST_FPCR_F16) -TRANS_FEAT(FCVTZU_hs, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_hs, a, 0, FPST_FPCR_F16) -TRANS_FEAT(FCVTZS_hd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_hd, a, 0, FPST_FPCR_F16) -TRANS_FEAT(FCVTZU_hd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_hd, a, 0, FPST_FPCR_F16) - -TRANS_FEAT(FCVTZS_ss, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_ss, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTZU_ss, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_ss, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTZS_sd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_sd, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTZU_sd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_sd, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTZS_ds, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_ds, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTZU_ds, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_ds, a, 0, FPST_FPCR) - -TRANS_FEAT(FCVTZS_dd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzs_dd, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTZU_dd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_fcvtzu_dd, a, 0, FPST_FPCR) - -static gen_helper_gvec_3_ptr * const frint_fns[] = { - NULL, - gen_helper_sve_frint_h, - gen_helper_sve_frint_s, - gen_helper_sve_frint_d -}; -TRANS_FEAT(FRINTI, aa64_sve, gen_gvec_fpst_arg_zpz, frint_fns[a->esz], - a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -static gen_helper_gvec_3_ptr * const frintx_fns[] = { - NULL, - gen_helper_sve_frintx_h, - gen_helper_sve_frintx_s, - gen_helper_sve_frintx_d -}; -TRANS_FEAT(FRINTX, aa64_sve, gen_gvec_fpst_arg_zpz, frintx_fns[a->esz], - a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR); - -static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a, - int mode, gen_helper_gvec_3_ptr *fn) -{ - unsigned vsz; - TCGv_i32 tmode; - TCGv_ptr status; - - if (fn == NULL) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - vsz = vec_full_reg_size(s); - tmode = tcg_const_i32(mode); - status = fpstatus_ptr(a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR); - - gen_helper_set_rmode(tmode, tmode, status); - - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - pred_full_reg_offset(s, a->pg), - status, vsz, vsz, 0, fn); - - gen_helper_set_rmode(tmode, tmode, status); - tcg_temp_free_i32(tmode); - tcg_temp_free_ptr(status); - return true; -} - -TRANS_FEAT(FRINTN, aa64_sve, do_frint_mode, a, - float_round_nearest_even, frint_fns[a->esz]) -TRANS_FEAT(FRINTP, aa64_sve, do_frint_mode, a, - float_round_up, frint_fns[a->esz]) -TRANS_FEAT(FRINTM, aa64_sve, do_frint_mode, a, - float_round_down, frint_fns[a->esz]) -TRANS_FEAT(FRINTZ, aa64_sve, do_frint_mode, a, - float_round_to_zero, frint_fns[a->esz]) -TRANS_FEAT(FRINTA, aa64_sve, do_frint_mode, a, - float_round_ties_away, frint_fns[a->esz]) - -static gen_helper_gvec_3_ptr * const frecpx_fns[] = { - NULL, gen_helper_sve_frecpx_h, - gen_helper_sve_frecpx_s, gen_helper_sve_frecpx_d, -}; -TRANS_FEAT(FRECPX, aa64_sve, gen_gvec_fpst_arg_zpz, frecpx_fns[a->esz], - a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -static gen_helper_gvec_3_ptr * const fsqrt_fns[] = { - NULL, gen_helper_sve_fsqrt_h, - gen_helper_sve_fsqrt_s, gen_helper_sve_fsqrt_d, -}; -TRANS_FEAT(FSQRT, aa64_sve, gen_gvec_fpst_arg_zpz, fsqrt_fns[a->esz], - a, 0, a->esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR) - -TRANS_FEAT(SCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_hh, a, 0, FPST_FPCR_F16) -TRANS_FEAT(SCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_sh, a, 0, FPST_FPCR_F16) -TRANS_FEAT(SCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_dh, a, 0, FPST_FPCR_F16) - -TRANS_FEAT(SCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_ss, a, 0, FPST_FPCR) -TRANS_FEAT(SCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_ds, a, 0, FPST_FPCR) - -TRANS_FEAT(SCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_sd, a, 0, FPST_FPCR) -TRANS_FEAT(SCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_scvt_dd, a, 0, FPST_FPCR) - -TRANS_FEAT(UCVTF_hh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_hh, a, 0, FPST_FPCR_F16) -TRANS_FEAT(UCVTF_sh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_sh, a, 0, FPST_FPCR_F16) -TRANS_FEAT(UCVTF_dh, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_dh, a, 0, FPST_FPCR_F16) - -TRANS_FEAT(UCVTF_ss, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_ss, a, 0, FPST_FPCR) -TRANS_FEAT(UCVTF_ds, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_ds, a, 0, FPST_FPCR) -TRANS_FEAT(UCVTF_sd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_sd, a, 0, FPST_FPCR) - -TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, - gen_helper_sve_ucvt_dd, a, 0, FPST_FPCR) - -/* - *** SVE Memory - 32-bit Gather and Unsized Contiguous Group - */ - -/* Subroutine loading a vector register at VOFS of LEN bytes. - * The load should begin at the address Rn + IMM. 
- */ - -void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, - int len, int rn, int imm) -{ - int len_align = QEMU_ALIGN_DOWN(len, 8); - int len_remain = len % 8; - int nparts = len / 8 + ctpop8(len_remain); - int midx = get_mem_index(s); - TCGv_i64 dirty_addr, clean_addr, t0, t1; - - dirty_addr = tcg_temp_new_i64(); - tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); - clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); - tcg_temp_free_i64(dirty_addr); - - /* - * Note that unpredicated load/store of vector/predicate registers - * are defined as a stream of bytes, which equates to little-endian - * operations on larger quantities. - * Attempt to keep code expansion to a minimum by limiting the - * amount of unrolling done. - */ - if (nparts <= 4) { - int i; - - t0 = tcg_temp_new_i64(); - for (i = 0; i < len_align; i += 8) { - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_st_i64(t0, base, vofs + i); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); - } - tcg_temp_free_i64(t0); - } else { - TCGLabel *loop = gen_new_label(); - TCGv_ptr tp, i = tcg_const_local_ptr(0); - - /* Copy the clean address into a local temp, live across the loop. */ - t0 = clean_addr; - clean_addr = new_tmp_a64_local(s); - tcg_gen_mov_i64(clean_addr, t0); - - if (base != cpu_env) { - TCGv_ptr b = tcg_temp_local_new_ptr(); - tcg_gen_mov_ptr(b, base); - base = b; - } - - gen_set_label(loop); - - t0 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); - - tp = tcg_temp_new_ptr(); - tcg_gen_add_ptr(tp, base, i); - tcg_gen_addi_ptr(i, i, 8); - tcg_gen_st_i64(t0, tp, vofs); - tcg_temp_free_ptr(tp); - tcg_temp_free_i64(t0); - - tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); - tcg_temp_free_ptr(i); - - if (base != cpu_env) { - tcg_temp_free_ptr(base); - assert(len_remain == 0); - } - } - - /* - * Predicate register loads can be any multiple of 2. - * Note that we still store the entire 64-bit unit into cpu_env. - */ - if (len_remain) { - t0 = tcg_temp_new_i64(); - switch (len_remain) { - case 2: - case 4: - case 8: - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, - MO_LE | ctz32(len_remain)); - break; - - case 6: - t1 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL); - tcg_gen_addi_i64(clean_addr, clean_addr, 4); - tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW); - tcg_gen_deposit_i64(t0, t0, t1, 32, 32); - tcg_temp_free_i64(t1); - break; - - default: - g_assert_not_reached(); - } - tcg_gen_st_i64(t0, base, vofs + len_align); - tcg_temp_free_i64(t0); - } -} - -/* Similarly for stores. */ -void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, - int len, int rn, int imm) -{ - int len_align = QEMU_ALIGN_DOWN(len, 8); - int len_remain = len % 8; - int nparts = len / 8 + ctpop8(len_remain); - int midx = get_mem_index(s); - TCGv_i64 dirty_addr, clean_addr, t0; - - dirty_addr = tcg_temp_new_i64(); - tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); - clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); - tcg_temp_free_i64(dirty_addr); - - /* Note that unpredicated load/store of vector/predicate registers - * are defined as a stream of bytes, which equates to little-endian - * operations on larger quantities. There is no nice way to force - * a little-endian store for aarch64_be-linux-user out of line. - * - * Attempt to keep code expansion to a minimum by limiting the - * amount of unrolling done. 
- */ - if (nparts <= 4) { - int i; - - t0 = tcg_temp_new_i64(); - for (i = 0; i < len_align; i += 8) { - tcg_gen_ld_i64(t0, base, vofs + i); - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); - } - tcg_temp_free_i64(t0); - } else { - TCGLabel *loop = gen_new_label(); - TCGv_ptr tp, i = tcg_const_local_ptr(0); - - /* Copy the clean address into a local temp, live across the loop. */ - t0 = clean_addr; - clean_addr = new_tmp_a64_local(s); - tcg_gen_mov_i64(clean_addr, t0); - - if (base != cpu_env) { - TCGv_ptr b = tcg_temp_local_new_ptr(); - tcg_gen_mov_ptr(b, base); - base = b; - } - - gen_set_label(loop); - - t0 = tcg_temp_new_i64(); - tp = tcg_temp_new_ptr(); - tcg_gen_add_ptr(tp, base, i); - tcg_gen_ld_i64(t0, tp, vofs); - tcg_gen_addi_ptr(i, i, 8); - tcg_temp_free_ptr(tp); - - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); - tcg_temp_free_i64(t0); - - tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); - tcg_temp_free_ptr(i); - - if (base != cpu_env) { - tcg_temp_free_ptr(base); - assert(len_remain == 0); - } - } - - /* Predicate register stores can be any multiple of 2. */ - if (len_remain) { - t0 = tcg_temp_new_i64(); - tcg_gen_ld_i64(t0, base, vofs + len_align); - - switch (len_remain) { - case 2: - case 4: - case 8: - tcg_gen_qemu_st_i64(t0, clean_addr, midx, - MO_LE | ctz32(len_remain)); - break; - - case 6: - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL); - tcg_gen_addi_i64(clean_addr, clean_addr, 4); - tcg_gen_shri_i64(t0, t0, 32); - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW); - break; - - default: - g_assert_not_reached(); - } - tcg_temp_free_i64(t0); - } -} - -static bool trans_LDR_zri(DisasContext *s, arg_rri *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int size = vec_full_reg_size(s); - int off = vec_full_reg_offset(s, a->rd); - gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); - } - return true; -} - -static bool trans_LDR_pri(DisasContext *s, arg_rri *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int size = pred_full_reg_size(s); - int off = pred_full_reg_offset(s, a->rd); - gen_sve_ldr(s, cpu_env, off, size, a->rn, a->imm * size); - } - return true; -} - -static bool trans_STR_zri(DisasContext *s, arg_rri *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int size = vec_full_reg_size(s); - int off = vec_full_reg_offset(s, a->rd); - gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); - } - return true; -} - -static bool trans_STR_pri(DisasContext *s, arg_rri *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int size = pred_full_reg_size(s); - int off = pred_full_reg_offset(s, a->rd); - gen_sve_str(s, cpu_env, off, size, a->rn, a->imm * size); - } - return true; -} - -/* - *** SVE Memory - Contiguous Load Group - */ - -/* The memory mode of the dtype. */ -static const MemOp dtype_mop[16] = { - MO_UB, MO_UB, MO_UB, MO_UB, - MO_SL, MO_UW, MO_UW, MO_UW, - MO_SW, MO_SW, MO_UL, MO_UL, - MO_SB, MO_SB, MO_SB, MO_UQ -}; - -#define dtype_msz(x) (dtype_mop[x] & MO_SIZE) - -/* The vector element size of dtype. 
*/ -static const uint8_t dtype_esz[16] = { - 0, 1, 2, 3, - 3, 1, 2, 3, - 3, 2, 2, 3, - 3, 2, 1, 3 -}; - -static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, - int dtype, uint32_t mte_n, bool is_write, - gen_helper_gvec_mem *fn) -{ - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr t_pg; - int desc = 0; - - /* - * For e.g. LD4, there are not enough arguments to pass all 4 - * registers as pointers, so encode the regno into the data field. - * For consistency, do this even for LD1. - */ - if (s->mte_active[0]) { - int msz = dtype_msz(dtype); - - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); - desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); - desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); - desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1); - desc <<= SVE_MTEDESC_SHIFT; - } else { - addr = clean_data_tbi(s, addr); - } - - desc = simd_desc(vsz, vsz, zt | desc); - t_pg = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); - fn(cpu_env, t_pg, addr, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(t_pg); -} - -/* Indexed by [mte][be][dtype][nreg] */ -static gen_helper_gvec_mem * const ldr_fns[2][2][16][4] = { - { /* mte inactive, little-endian */ - { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, - gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, - { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1sds_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hh_le_r, gen_helper_sve_ld2hh_le_r, - gen_helper_sve_ld3hh_le_r, gen_helper_sve_ld4hh_le_r }, - { gen_helper_sve_ld1hsu_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hdu_le_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1hds_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hss_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1ss_le_r, gen_helper_sve_ld2ss_le_r, - gen_helper_sve_ld3ss_le_r, gen_helper_sve_ld4ss_le_r }, - { gen_helper_sve_ld1sdu_le_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r, - gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } }, - - /* mte inactive, big-endian */ - { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, - gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, - { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1sds_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hh_be_r, gen_helper_sve_ld2hh_be_r, - gen_helper_sve_ld3hh_be_r, gen_helper_sve_ld4hh_be_r }, - { gen_helper_sve_ld1hsu_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hdu_be_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1hds_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hss_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1ss_be_r, gen_helper_sve_ld2ss_be_r, - gen_helper_sve_ld3ss_be_r, gen_helper_sve_ld4ss_be_r }, - { gen_helper_sve_ld1sdu_be_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r, - gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } }, - - { /* mte active, little-endian */ - { { gen_helper_sve_ld1bb_r_mte, - 
gen_helper_sve_ld2bb_r_mte, - gen_helper_sve_ld3bb_r_mte, - gen_helper_sve_ld4bb_r_mte }, - { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, - - { gen_helper_sve_ld1sds_le_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1hh_le_r_mte, - gen_helper_sve_ld2hh_le_r_mte, - gen_helper_sve_ld3hh_le_r_mte, - gen_helper_sve_ld4hh_le_r_mte }, - { gen_helper_sve_ld1hsu_le_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1hdu_le_r_mte, NULL, NULL, NULL }, - - { gen_helper_sve_ld1hds_le_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1hss_le_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1ss_le_r_mte, - gen_helper_sve_ld2ss_le_r_mte, - gen_helper_sve_ld3ss_le_r_mte, - gen_helper_sve_ld4ss_le_r_mte }, - { gen_helper_sve_ld1sdu_le_r_mte, NULL, NULL, NULL }, - - { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1dd_le_r_mte, - gen_helper_sve_ld2dd_le_r_mte, - gen_helper_sve_ld3dd_le_r_mte, - gen_helper_sve_ld4dd_le_r_mte } }, - - /* mte active, big-endian */ - { { gen_helper_sve_ld1bb_r_mte, - gen_helper_sve_ld2bb_r_mte, - gen_helper_sve_ld3bb_r_mte, - gen_helper_sve_ld4bb_r_mte }, - { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, - - { gen_helper_sve_ld1sds_be_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1hh_be_r_mte, - gen_helper_sve_ld2hh_be_r_mte, - gen_helper_sve_ld3hh_be_r_mte, - gen_helper_sve_ld4hh_be_r_mte }, - { gen_helper_sve_ld1hsu_be_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1hdu_be_r_mte, NULL, NULL, NULL }, - - { gen_helper_sve_ld1hds_be_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1hss_be_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1ss_be_r_mte, - gen_helper_sve_ld2ss_be_r_mte, - gen_helper_sve_ld3ss_be_r_mte, - gen_helper_sve_ld4ss_be_r_mte }, - { gen_helper_sve_ld1sdu_be_r_mte, NULL, NULL, NULL }, - - { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, - { gen_helper_sve_ld1dd_be_r_mte, - gen_helper_sve_ld2dd_be_r_mte, - gen_helper_sve_ld3dd_be_r_mte, - gen_helper_sve_ld4dd_be_r_mte } } }, -}; - -static void do_ld_zpa(DisasContext *s, int zt, int pg, - TCGv_i64 addr, int dtype, int nreg) -{ - gen_helper_gvec_mem *fn - = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][nreg]; - - /* - * While there are holes in the table, they are not - * accessible via the instruction encoding. 
- */ - assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); -} - -static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) -{ - if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); - } - return true; -} - -static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int vsz = vec_full_reg_size(s); - int elements = vsz >> dtype_esz[a->dtype]; - TCGv_i64 addr = new_tmp_a64(s); - - tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), - (a->imm * elements * (a->nreg + 1)) - << dtype_msz(a->dtype)); - do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); - } - return true; -} - -static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a) -{ - static gen_helper_gvec_mem * const fns[2][2][16] = { - { /* mte inactive, little-endian */ - { gen_helper_sve_ldff1bb_r, - gen_helper_sve_ldff1bhu_r, - gen_helper_sve_ldff1bsu_r, - gen_helper_sve_ldff1bdu_r, - - gen_helper_sve_ldff1sds_le_r, - gen_helper_sve_ldff1hh_le_r, - gen_helper_sve_ldff1hsu_le_r, - gen_helper_sve_ldff1hdu_le_r, - - gen_helper_sve_ldff1hds_le_r, - gen_helper_sve_ldff1hss_le_r, - gen_helper_sve_ldff1ss_le_r, - gen_helper_sve_ldff1sdu_le_r, - - gen_helper_sve_ldff1bds_r, - gen_helper_sve_ldff1bss_r, - gen_helper_sve_ldff1bhs_r, - gen_helper_sve_ldff1dd_le_r }, - - /* mte inactive, big-endian */ - { gen_helper_sve_ldff1bb_r, - gen_helper_sve_ldff1bhu_r, - gen_helper_sve_ldff1bsu_r, - gen_helper_sve_ldff1bdu_r, - - gen_helper_sve_ldff1sds_be_r, - gen_helper_sve_ldff1hh_be_r, - gen_helper_sve_ldff1hsu_be_r, - gen_helper_sve_ldff1hdu_be_r, - - gen_helper_sve_ldff1hds_be_r, - gen_helper_sve_ldff1hss_be_r, - gen_helper_sve_ldff1ss_be_r, - gen_helper_sve_ldff1sdu_be_r, - - gen_helper_sve_ldff1bds_r, - gen_helper_sve_ldff1bss_r, - gen_helper_sve_ldff1bhs_r, - gen_helper_sve_ldff1dd_be_r } }, - - { /* mte active, little-endian */ - { gen_helper_sve_ldff1bb_r_mte, - gen_helper_sve_ldff1bhu_r_mte, - gen_helper_sve_ldff1bsu_r_mte, - gen_helper_sve_ldff1bdu_r_mte, - - gen_helper_sve_ldff1sds_le_r_mte, - gen_helper_sve_ldff1hh_le_r_mte, - gen_helper_sve_ldff1hsu_le_r_mte, - gen_helper_sve_ldff1hdu_le_r_mte, - - gen_helper_sve_ldff1hds_le_r_mte, - gen_helper_sve_ldff1hss_le_r_mte, - gen_helper_sve_ldff1ss_le_r_mte, - gen_helper_sve_ldff1sdu_le_r_mte, - - gen_helper_sve_ldff1bds_r_mte, - gen_helper_sve_ldff1bss_r_mte, - gen_helper_sve_ldff1bhs_r_mte, - gen_helper_sve_ldff1dd_le_r_mte }, - - /* mte active, big-endian */ - { gen_helper_sve_ldff1bb_r_mte, - gen_helper_sve_ldff1bhu_r_mte, - gen_helper_sve_ldff1bsu_r_mte, - gen_helper_sve_ldff1bdu_r_mte, - - gen_helper_sve_ldff1sds_be_r_mte, - gen_helper_sve_ldff1hh_be_r_mte, - gen_helper_sve_ldff1hsu_be_r_mte, - gen_helper_sve_ldff1hdu_be_r_mte, - - gen_helper_sve_ldff1hds_be_r_mte, - gen_helper_sve_ldff1hss_be_r_mte, - gen_helper_sve_ldff1ss_be_r_mte, - gen_helper_sve_ldff1sdu_be_r_mte, - - gen_helper_sve_ldff1bds_r_mte, - gen_helper_sve_ldff1bss_r_mte, - gen_helper_sve_ldff1bhs_r_mte, - gen_helper_sve_ldff1dd_be_r_mte } }, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (sve_access_check(s)) { - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), 
dtype_msz(a->dtype)); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false, - fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]); - } - return true; -} - -static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a) -{ - static gen_helper_gvec_mem * const fns[2][2][16] = { - { /* mte inactive, little-endian */ - { gen_helper_sve_ldnf1bb_r, - gen_helper_sve_ldnf1bhu_r, - gen_helper_sve_ldnf1bsu_r, - gen_helper_sve_ldnf1bdu_r, - - gen_helper_sve_ldnf1sds_le_r, - gen_helper_sve_ldnf1hh_le_r, - gen_helper_sve_ldnf1hsu_le_r, - gen_helper_sve_ldnf1hdu_le_r, - - gen_helper_sve_ldnf1hds_le_r, - gen_helper_sve_ldnf1hss_le_r, - gen_helper_sve_ldnf1ss_le_r, - gen_helper_sve_ldnf1sdu_le_r, - - gen_helper_sve_ldnf1bds_r, - gen_helper_sve_ldnf1bss_r, - gen_helper_sve_ldnf1bhs_r, - gen_helper_sve_ldnf1dd_le_r }, - - /* mte inactive, big-endian */ - { gen_helper_sve_ldnf1bb_r, - gen_helper_sve_ldnf1bhu_r, - gen_helper_sve_ldnf1bsu_r, - gen_helper_sve_ldnf1bdu_r, - - gen_helper_sve_ldnf1sds_be_r, - gen_helper_sve_ldnf1hh_be_r, - gen_helper_sve_ldnf1hsu_be_r, - gen_helper_sve_ldnf1hdu_be_r, - - gen_helper_sve_ldnf1hds_be_r, - gen_helper_sve_ldnf1hss_be_r, - gen_helper_sve_ldnf1ss_be_r, - gen_helper_sve_ldnf1sdu_be_r, - - gen_helper_sve_ldnf1bds_r, - gen_helper_sve_ldnf1bss_r, - gen_helper_sve_ldnf1bhs_r, - gen_helper_sve_ldnf1dd_be_r } }, - - { /* mte inactive, little-endian */ - { gen_helper_sve_ldnf1bb_r_mte, - gen_helper_sve_ldnf1bhu_r_mte, - gen_helper_sve_ldnf1bsu_r_mte, - gen_helper_sve_ldnf1bdu_r_mte, - - gen_helper_sve_ldnf1sds_le_r_mte, - gen_helper_sve_ldnf1hh_le_r_mte, - gen_helper_sve_ldnf1hsu_le_r_mte, - gen_helper_sve_ldnf1hdu_le_r_mte, - - gen_helper_sve_ldnf1hds_le_r_mte, - gen_helper_sve_ldnf1hss_le_r_mte, - gen_helper_sve_ldnf1ss_le_r_mte, - gen_helper_sve_ldnf1sdu_le_r_mte, - - gen_helper_sve_ldnf1bds_r_mte, - gen_helper_sve_ldnf1bss_r_mte, - gen_helper_sve_ldnf1bhs_r_mte, - gen_helper_sve_ldnf1dd_le_r_mte }, - - /* mte inactive, big-endian */ - { gen_helper_sve_ldnf1bb_r_mte, - gen_helper_sve_ldnf1bhu_r_mte, - gen_helper_sve_ldnf1bsu_r_mte, - gen_helper_sve_ldnf1bdu_r_mte, - - gen_helper_sve_ldnf1sds_be_r_mte, - gen_helper_sve_ldnf1hh_be_r_mte, - gen_helper_sve_ldnf1hsu_be_r_mte, - gen_helper_sve_ldnf1hdu_be_r_mte, - - gen_helper_sve_ldnf1hds_be_r_mte, - gen_helper_sve_ldnf1hss_be_r_mte, - gen_helper_sve_ldnf1ss_be_r_mte, - gen_helper_sve_ldnf1sdu_be_r_mte, - - gen_helper_sve_ldnf1bds_r_mte, - gen_helper_sve_ldnf1bss_r_mte, - gen_helper_sve_ldnf1bhs_r_mte, - gen_helper_sve_ldnf1dd_be_r_mte } }, - }; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (sve_access_check(s)) { - int vsz = vec_full_reg_size(s); - int elements = vsz >> dtype_esz[a->dtype]; - int off = (a->imm * elements) << dtype_msz(a->dtype); - TCGv_i64 addr = new_tmp_a64(s); - - tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); - do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false, - fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]); - } - return true; -} - -static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) -{ - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr t_pg; - int poff; - - /* Load the first quadword using the normal predicated load helpers. */ - poff = pred_full_reg_offset(s, pg); - if (vsz > 16) { - /* - * Zero-extend the first 16 bits of the predicate into a temporary. 
- * This avoids triggering an assert making sure we don't have bits - * set within a predicate beyond VQ, but we have lowered VQ to 1 - * for this load operation. - */ - TCGv_i64 tmp = tcg_temp_new_i64(); -#if HOST_BIG_ENDIAN - poff += 6; -#endif - tcg_gen_ld16u_i64(tmp, cpu_env, poff); - - poff = offsetof(CPUARMState, vfp.preg_tmp); - tcg_gen_st_i64(tmp, cpu_env, poff); - tcg_temp_free_i64(tmp); - } - - t_pg = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(t_pg, cpu_env, poff); - - gen_helper_gvec_mem *fn - = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; - fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(16, 16, zt))); - - tcg_temp_free_ptr(t_pg); - - /* Replicate that first quadword. */ - if (vsz > 16) { - int doff = vec_full_reg_offset(s, zt); - tcg_gen_gvec_dup_mem(4, doff + 16, doff, vsz - 16, vsz - 16); - } -} - -static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a) -{ - if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - int msz = dtype_msz(a->dtype); - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - do_ldrq(s, a->rd, a->pg, addr, a->dtype); - } - return true; -} - -static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16); - do_ldrq(s, a->rd, a->pg, addr, a->dtype); - } - return true; -} - -static void do_ldro(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype) -{ - unsigned vsz = vec_full_reg_size(s); - unsigned vsz_r32; - TCGv_ptr t_pg; - int poff, doff; - - if (vsz < 32) { - /* - * Note that this UNDEFINED check comes after CheckSVEEnabled() - * in the ARM pseudocode, which is the sve_access_check() done - * in our caller. We should not now return false from the caller. - */ - unallocated_encoding(s); - return; - } - - /* Load the first octaword using the normal predicated load helpers. */ - - poff = pred_full_reg_offset(s, pg); - if (vsz > 32) { - /* - * Zero-extend the first 32 bits of the predicate into a temporary. - * This avoids triggering an assert making sure we don't have bits - * set within a predicate beyond VQ, but we have lowered VQ to 2 - * for this load operation. - */ - TCGv_i64 tmp = tcg_temp_new_i64(); -#if HOST_BIG_ENDIAN - poff += 4; -#endif - tcg_gen_ld32u_i64(tmp, cpu_env, poff); - - poff = offsetof(CPUARMState, vfp.preg_tmp); - tcg_gen_st_i64(tmp, cpu_env, poff); - tcg_temp_free_i64(tmp); - } - - t_pg = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(t_pg, cpu_env, poff); - - gen_helper_gvec_mem *fn - = ldr_fns[s->mte_active[0]][s->be_data == MO_BE][dtype][0]; - fn(cpu_env, t_pg, addr, tcg_constant_i32(simd_desc(32, 32, zt))); - - tcg_temp_free_ptr(t_pg); - - /* - * Replicate that first octaword. - * The replication happens in units of 32; if the full vector size - * is not a multiple of 32, the final bits are zeroed. 
- */ - doff = vec_full_reg_offset(s, zt); - vsz_r32 = QEMU_ALIGN_DOWN(vsz, 32); - if (vsz >= 64) { - tcg_gen_gvec_dup_mem(5, doff + 32, doff, vsz_r32 - 32, vsz_r32 - 32); - } - vsz -= vsz_r32; - if (vsz) { - tcg_gen_gvec_dup_imm(MO_64, doff + vsz_r32, vsz, vsz, 0); - } -} - -static bool trans_LD1RO_zprr(DisasContext *s, arg_rprr_load *a) -{ - if (!dc_isar_feature(aa64_sve_f64mm, s)) { - return false; - } - if (a->rm == 31) { - return false; - } - s->is_nonstreaming = true; - if (sve_access_check(s)) { - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - do_ldro(s, a->rd, a->pg, addr, a->dtype); - } - return true; -} - -static bool trans_LD1RO_zpri(DisasContext *s, arg_rpri_load *a) -{ - if (!dc_isar_feature(aa64_sve_f64mm, s)) { - return false; - } - s->is_nonstreaming = true; - if (sve_access_check(s)) { - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 32); - do_ldro(s, a->rd, a->pg, addr, a->dtype); - } - return true; -} - -/* Load and broadcast element. */ -static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) -{ - unsigned vsz = vec_full_reg_size(s); - unsigned psz = pred_full_reg_size(s); - unsigned esz = dtype_esz[a->dtype]; - unsigned msz = dtype_msz(a->dtype); - TCGLabel *over; - TCGv_i64 temp, clean_addr; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - over = gen_new_label(); - - /* If the guarding predicate has no bits set, no load occurs. */ - if (psz <= 8) { - /* Reduce the pred_esz_masks value simply to reduce the - * size of the code generated here. - */ - uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8); - temp = tcg_temp_new_i64(); - tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg)); - tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask); - tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over); - tcg_temp_free_i64(temp); - } else { - TCGv_i32 t32 = tcg_temp_new_i32(); - find_last_active(s, t32, esz, a->pg); - tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over); - tcg_temp_free_i32(t32); - } - - /* Load the data. */ - temp = tcg_temp_new_i64(); - tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << msz); - clean_addr = gen_mte_check1(s, temp, false, true, msz); - - tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s), - finalize_memop(s, dtype_mop[a->dtype])); - - /* Broadcast to *all* elements. */ - tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), - vsz, vsz, temp); - tcg_temp_free_i64(temp); - - /* Zero the inactive elements. 
*/ - gen_set_label(over); - return do_movz_zpz(s, a->rd, a->rd, a->pg, esz, false); -} - -static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, - int msz, int esz, int nreg) -{ - static gen_helper_gvec_mem * const fn_single[2][2][4][4] = { - { { { gen_helper_sve_st1bb_r, - gen_helper_sve_st1bh_r, - gen_helper_sve_st1bs_r, - gen_helper_sve_st1bd_r }, - { NULL, - gen_helper_sve_st1hh_le_r, - gen_helper_sve_st1hs_le_r, - gen_helper_sve_st1hd_le_r }, - { NULL, NULL, - gen_helper_sve_st1ss_le_r, - gen_helper_sve_st1sd_le_r }, - { NULL, NULL, NULL, - gen_helper_sve_st1dd_le_r } }, - { { gen_helper_sve_st1bb_r, - gen_helper_sve_st1bh_r, - gen_helper_sve_st1bs_r, - gen_helper_sve_st1bd_r }, - { NULL, - gen_helper_sve_st1hh_be_r, - gen_helper_sve_st1hs_be_r, - gen_helper_sve_st1hd_be_r }, - { NULL, NULL, - gen_helper_sve_st1ss_be_r, - gen_helper_sve_st1sd_be_r }, - { NULL, NULL, NULL, - gen_helper_sve_st1dd_be_r } } }, - - { { { gen_helper_sve_st1bb_r_mte, - gen_helper_sve_st1bh_r_mte, - gen_helper_sve_st1bs_r_mte, - gen_helper_sve_st1bd_r_mte }, - { NULL, - gen_helper_sve_st1hh_le_r_mte, - gen_helper_sve_st1hs_le_r_mte, - gen_helper_sve_st1hd_le_r_mte }, - { NULL, NULL, - gen_helper_sve_st1ss_le_r_mte, - gen_helper_sve_st1sd_le_r_mte }, - { NULL, NULL, NULL, - gen_helper_sve_st1dd_le_r_mte } }, - { { gen_helper_sve_st1bb_r_mte, - gen_helper_sve_st1bh_r_mte, - gen_helper_sve_st1bs_r_mte, - gen_helper_sve_st1bd_r_mte }, - { NULL, - gen_helper_sve_st1hh_be_r_mte, - gen_helper_sve_st1hs_be_r_mte, - gen_helper_sve_st1hd_be_r_mte }, - { NULL, NULL, - gen_helper_sve_st1ss_be_r_mte, - gen_helper_sve_st1sd_be_r_mte }, - { NULL, NULL, NULL, - gen_helper_sve_st1dd_be_r_mte } } }, - }; - static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = { - { { { gen_helper_sve_st2bb_r, - gen_helper_sve_st2hh_le_r, - gen_helper_sve_st2ss_le_r, - gen_helper_sve_st2dd_le_r }, - { gen_helper_sve_st3bb_r, - gen_helper_sve_st3hh_le_r, - gen_helper_sve_st3ss_le_r, - gen_helper_sve_st3dd_le_r }, - { gen_helper_sve_st4bb_r, - gen_helper_sve_st4hh_le_r, - gen_helper_sve_st4ss_le_r, - gen_helper_sve_st4dd_le_r } }, - { { gen_helper_sve_st2bb_r, - gen_helper_sve_st2hh_be_r, - gen_helper_sve_st2ss_be_r, - gen_helper_sve_st2dd_be_r }, - { gen_helper_sve_st3bb_r, - gen_helper_sve_st3hh_be_r, - gen_helper_sve_st3ss_be_r, - gen_helper_sve_st3dd_be_r }, - { gen_helper_sve_st4bb_r, - gen_helper_sve_st4hh_be_r, - gen_helper_sve_st4ss_be_r, - gen_helper_sve_st4dd_be_r } } }, - { { { gen_helper_sve_st2bb_r_mte, - gen_helper_sve_st2hh_le_r_mte, - gen_helper_sve_st2ss_le_r_mte, - gen_helper_sve_st2dd_le_r_mte }, - { gen_helper_sve_st3bb_r_mte, - gen_helper_sve_st3hh_le_r_mte, - gen_helper_sve_st3ss_le_r_mte, - gen_helper_sve_st3dd_le_r_mte }, - { gen_helper_sve_st4bb_r_mte, - gen_helper_sve_st4hh_le_r_mte, - gen_helper_sve_st4ss_le_r_mte, - gen_helper_sve_st4dd_le_r_mte } }, - { { gen_helper_sve_st2bb_r_mte, - gen_helper_sve_st2hh_be_r_mte, - gen_helper_sve_st2ss_be_r_mte, - gen_helper_sve_st2dd_be_r_mte }, - { gen_helper_sve_st3bb_r_mte, - gen_helper_sve_st3hh_be_r_mte, - gen_helper_sve_st3ss_be_r_mte, - gen_helper_sve_st3dd_be_r_mte }, - { gen_helper_sve_st4bb_r_mte, - gen_helper_sve_st4hh_be_r_mte, - gen_helper_sve_st4ss_be_r_mte, - gen_helper_sve_st4dd_be_r_mte } } }, - }; - gen_helper_gvec_mem *fn; - int be = s->be_data == MO_BE; - - if (nreg == 0) { - /* ST1 */ - fn = fn_single[s->mte_active[0]][be][msz][esz]; - nreg = 1; - } else { - /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ - assert(msz == esz); - fn 
= fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; - } - assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); -} - -static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (a->rm == 31 || a->msz > a->esz) { - return false; - } - if (sve_access_check(s)) { - TCGv_i64 addr = new_tmp_a64(s); - tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), a->msz); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); - } - return true; -} - -static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - if (a->msz > a->esz) { - return false; - } - if (sve_access_check(s)) { - int vsz = vec_full_reg_size(s); - int elements = vsz >> a->esz; - TCGv_i64 addr = new_tmp_a64(s); - - tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), - (a->imm * elements * (a->nreg + 1)) << a->msz); - do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); - } - return true; -} - -/* - *** SVE gather loads / scatter stores - */ - -static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, - int scale, TCGv_i64 scalar, int msz, bool is_write, - gen_helper_gvec_mem_scatter *fn) -{ - unsigned vsz = vec_full_reg_size(s); - TCGv_ptr t_zm = tcg_temp_new_ptr(); - TCGv_ptr t_pg = tcg_temp_new_ptr(); - TCGv_ptr t_zt = tcg_temp_new_ptr(); - int desc = 0; - - if (s->mte_active[0]) { - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); - desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); - desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); - desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << msz) - 1); - desc <<= SVE_MTEDESC_SHIFT; - } - desc = simd_desc(vsz, vsz, desc | scale); - - tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); - tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm)); - tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt)); - fn(cpu_env, t_zt, t_pg, t_zm, scalar, tcg_constant_i32(desc)); - - tcg_temp_free_ptr(t_zt); - tcg_temp_free_ptr(t_zm); - tcg_temp_free_ptr(t_pg); -} - -/* Indexed by [mte][be][ff][xs][u][msz]. 
*/ -static gen_helper_gvec_mem_scatter * const -gather_load_fn32[2][2][2][2][2][3] = { - { /* MTE Inactive */ - { /* Little-endian */ - { { { gen_helper_sve_ldbss_zsu, - gen_helper_sve_ldhss_le_zsu, - NULL, }, - { gen_helper_sve_ldbsu_zsu, - gen_helper_sve_ldhsu_le_zsu, - gen_helper_sve_ldss_le_zsu, } }, - { { gen_helper_sve_ldbss_zss, - gen_helper_sve_ldhss_le_zss, - NULL, }, - { gen_helper_sve_ldbsu_zss, - gen_helper_sve_ldhsu_le_zss, - gen_helper_sve_ldss_le_zss, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbss_zsu, - gen_helper_sve_ldffhss_le_zsu, - NULL, }, - { gen_helper_sve_ldffbsu_zsu, - gen_helper_sve_ldffhsu_le_zsu, - gen_helper_sve_ldffss_le_zsu, } }, - { { gen_helper_sve_ldffbss_zss, - gen_helper_sve_ldffhss_le_zss, - NULL, }, - { gen_helper_sve_ldffbsu_zss, - gen_helper_sve_ldffhsu_le_zss, - gen_helper_sve_ldffss_le_zss, } } } }, - - { /* Big-endian */ - { { { gen_helper_sve_ldbss_zsu, - gen_helper_sve_ldhss_be_zsu, - NULL, }, - { gen_helper_sve_ldbsu_zsu, - gen_helper_sve_ldhsu_be_zsu, - gen_helper_sve_ldss_be_zsu, } }, - { { gen_helper_sve_ldbss_zss, - gen_helper_sve_ldhss_be_zss, - NULL, }, - { gen_helper_sve_ldbsu_zss, - gen_helper_sve_ldhsu_be_zss, - gen_helper_sve_ldss_be_zss, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbss_zsu, - gen_helper_sve_ldffhss_be_zsu, - NULL, }, - { gen_helper_sve_ldffbsu_zsu, - gen_helper_sve_ldffhsu_be_zsu, - gen_helper_sve_ldffss_be_zsu, } }, - { { gen_helper_sve_ldffbss_zss, - gen_helper_sve_ldffhss_be_zss, - NULL, }, - { gen_helper_sve_ldffbsu_zss, - gen_helper_sve_ldffhsu_be_zss, - gen_helper_sve_ldffss_be_zss, } } } } }, - { /* MTE Active */ - { /* Little-endian */ - { { { gen_helper_sve_ldbss_zsu_mte, - gen_helper_sve_ldhss_le_zsu_mte, - NULL, }, - { gen_helper_sve_ldbsu_zsu_mte, - gen_helper_sve_ldhsu_le_zsu_mte, - gen_helper_sve_ldss_le_zsu_mte, } }, - { { gen_helper_sve_ldbss_zss_mte, - gen_helper_sve_ldhss_le_zss_mte, - NULL, }, - { gen_helper_sve_ldbsu_zss_mte, - gen_helper_sve_ldhsu_le_zss_mte, - gen_helper_sve_ldss_le_zss_mte, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbss_zsu_mte, - gen_helper_sve_ldffhss_le_zsu_mte, - NULL, }, - { gen_helper_sve_ldffbsu_zsu_mte, - gen_helper_sve_ldffhsu_le_zsu_mte, - gen_helper_sve_ldffss_le_zsu_mte, } }, - { { gen_helper_sve_ldffbss_zss_mte, - gen_helper_sve_ldffhss_le_zss_mte, - NULL, }, - { gen_helper_sve_ldffbsu_zss_mte, - gen_helper_sve_ldffhsu_le_zss_mte, - gen_helper_sve_ldffss_le_zss_mte, } } } }, - - { /* Big-endian */ - { { { gen_helper_sve_ldbss_zsu_mte, - gen_helper_sve_ldhss_be_zsu_mte, - NULL, }, - { gen_helper_sve_ldbsu_zsu_mte, - gen_helper_sve_ldhsu_be_zsu_mte, - gen_helper_sve_ldss_be_zsu_mte, } }, - { { gen_helper_sve_ldbss_zss_mte, - gen_helper_sve_ldhss_be_zss_mte, - NULL, }, - { gen_helper_sve_ldbsu_zss_mte, - gen_helper_sve_ldhsu_be_zss_mte, - gen_helper_sve_ldss_be_zss_mte, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbss_zsu_mte, - gen_helper_sve_ldffhss_be_zsu_mte, - NULL, }, - { gen_helper_sve_ldffbsu_zsu_mte, - gen_helper_sve_ldffhsu_be_zsu_mte, - gen_helper_sve_ldffss_be_zsu_mte, } }, - { { gen_helper_sve_ldffbss_zss_mte, - gen_helper_sve_ldffhss_be_zss_mte, - NULL, }, - { gen_helper_sve_ldffbsu_zss_mte, - gen_helper_sve_ldffhsu_be_zss_mte, - gen_helper_sve_ldffss_be_zss_mte, } } } } }, -}; - -/* Note that we overload xs=2 to indicate 64-bit offset. 
*/ -static gen_helper_gvec_mem_scatter * const -gather_load_fn64[2][2][2][3][2][4] = { - { /* MTE Inactive */ - { /* Little-endian */ - { { { gen_helper_sve_ldbds_zsu, - gen_helper_sve_ldhds_le_zsu, - gen_helper_sve_ldsds_le_zsu, - NULL, }, - { gen_helper_sve_ldbdu_zsu, - gen_helper_sve_ldhdu_le_zsu, - gen_helper_sve_ldsdu_le_zsu, - gen_helper_sve_lddd_le_zsu, } }, - { { gen_helper_sve_ldbds_zss, - gen_helper_sve_ldhds_le_zss, - gen_helper_sve_ldsds_le_zss, - NULL, }, - { gen_helper_sve_ldbdu_zss, - gen_helper_sve_ldhdu_le_zss, - gen_helper_sve_ldsdu_le_zss, - gen_helper_sve_lddd_le_zss, } }, - { { gen_helper_sve_ldbds_zd, - gen_helper_sve_ldhds_le_zd, - gen_helper_sve_ldsds_le_zd, - NULL, }, - { gen_helper_sve_ldbdu_zd, - gen_helper_sve_ldhdu_le_zd, - gen_helper_sve_ldsdu_le_zd, - gen_helper_sve_lddd_le_zd, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbds_zsu, - gen_helper_sve_ldffhds_le_zsu, - gen_helper_sve_ldffsds_le_zsu, - NULL, }, - { gen_helper_sve_ldffbdu_zsu, - gen_helper_sve_ldffhdu_le_zsu, - gen_helper_sve_ldffsdu_le_zsu, - gen_helper_sve_ldffdd_le_zsu, } }, - { { gen_helper_sve_ldffbds_zss, - gen_helper_sve_ldffhds_le_zss, - gen_helper_sve_ldffsds_le_zss, - NULL, }, - { gen_helper_sve_ldffbdu_zss, - gen_helper_sve_ldffhdu_le_zss, - gen_helper_sve_ldffsdu_le_zss, - gen_helper_sve_ldffdd_le_zss, } }, - { { gen_helper_sve_ldffbds_zd, - gen_helper_sve_ldffhds_le_zd, - gen_helper_sve_ldffsds_le_zd, - NULL, }, - { gen_helper_sve_ldffbdu_zd, - gen_helper_sve_ldffhdu_le_zd, - gen_helper_sve_ldffsdu_le_zd, - gen_helper_sve_ldffdd_le_zd, } } } }, - { /* Big-endian */ - { { { gen_helper_sve_ldbds_zsu, - gen_helper_sve_ldhds_be_zsu, - gen_helper_sve_ldsds_be_zsu, - NULL, }, - { gen_helper_sve_ldbdu_zsu, - gen_helper_sve_ldhdu_be_zsu, - gen_helper_sve_ldsdu_be_zsu, - gen_helper_sve_lddd_be_zsu, } }, - { { gen_helper_sve_ldbds_zss, - gen_helper_sve_ldhds_be_zss, - gen_helper_sve_ldsds_be_zss, - NULL, }, - { gen_helper_sve_ldbdu_zss, - gen_helper_sve_ldhdu_be_zss, - gen_helper_sve_ldsdu_be_zss, - gen_helper_sve_lddd_be_zss, } }, - { { gen_helper_sve_ldbds_zd, - gen_helper_sve_ldhds_be_zd, - gen_helper_sve_ldsds_be_zd, - NULL, }, - { gen_helper_sve_ldbdu_zd, - gen_helper_sve_ldhdu_be_zd, - gen_helper_sve_ldsdu_be_zd, - gen_helper_sve_lddd_be_zd, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbds_zsu, - gen_helper_sve_ldffhds_be_zsu, - gen_helper_sve_ldffsds_be_zsu, - NULL, }, - { gen_helper_sve_ldffbdu_zsu, - gen_helper_sve_ldffhdu_be_zsu, - gen_helper_sve_ldffsdu_be_zsu, - gen_helper_sve_ldffdd_be_zsu, } }, - { { gen_helper_sve_ldffbds_zss, - gen_helper_sve_ldffhds_be_zss, - gen_helper_sve_ldffsds_be_zss, - NULL, }, - { gen_helper_sve_ldffbdu_zss, - gen_helper_sve_ldffhdu_be_zss, - gen_helper_sve_ldffsdu_be_zss, - gen_helper_sve_ldffdd_be_zss, } }, - { { gen_helper_sve_ldffbds_zd, - gen_helper_sve_ldffhds_be_zd, - gen_helper_sve_ldffsds_be_zd, - NULL, }, - { gen_helper_sve_ldffbdu_zd, - gen_helper_sve_ldffhdu_be_zd, - gen_helper_sve_ldffsdu_be_zd, - gen_helper_sve_ldffdd_be_zd, } } } } }, - { /* MTE Active */ - { /* Little-endian */ - { { { gen_helper_sve_ldbds_zsu_mte, - gen_helper_sve_ldhds_le_zsu_mte, - gen_helper_sve_ldsds_le_zsu_mte, - NULL, }, - { gen_helper_sve_ldbdu_zsu_mte, - gen_helper_sve_ldhdu_le_zsu_mte, - gen_helper_sve_ldsdu_le_zsu_mte, - gen_helper_sve_lddd_le_zsu_mte, } }, - { { gen_helper_sve_ldbds_zss_mte, - gen_helper_sve_ldhds_le_zss_mte, - gen_helper_sve_ldsds_le_zss_mte, - NULL, }, - { gen_helper_sve_ldbdu_zss_mte, - gen_helper_sve_ldhdu_le_zss_mte, 
- gen_helper_sve_ldsdu_le_zss_mte, - gen_helper_sve_lddd_le_zss_mte, } }, - { { gen_helper_sve_ldbds_zd_mte, - gen_helper_sve_ldhds_le_zd_mte, - gen_helper_sve_ldsds_le_zd_mte, - NULL, }, - { gen_helper_sve_ldbdu_zd_mte, - gen_helper_sve_ldhdu_le_zd_mte, - gen_helper_sve_ldsdu_le_zd_mte, - gen_helper_sve_lddd_le_zd_mte, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbds_zsu_mte, - gen_helper_sve_ldffhds_le_zsu_mte, - gen_helper_sve_ldffsds_le_zsu_mte, - NULL, }, - { gen_helper_sve_ldffbdu_zsu_mte, - gen_helper_sve_ldffhdu_le_zsu_mte, - gen_helper_sve_ldffsdu_le_zsu_mte, - gen_helper_sve_ldffdd_le_zsu_mte, } }, - { { gen_helper_sve_ldffbds_zss_mte, - gen_helper_sve_ldffhds_le_zss_mte, - gen_helper_sve_ldffsds_le_zss_mte, - NULL, }, - { gen_helper_sve_ldffbdu_zss_mte, - gen_helper_sve_ldffhdu_le_zss_mte, - gen_helper_sve_ldffsdu_le_zss_mte, - gen_helper_sve_ldffdd_le_zss_mte, } }, - { { gen_helper_sve_ldffbds_zd_mte, - gen_helper_sve_ldffhds_le_zd_mte, - gen_helper_sve_ldffsds_le_zd_mte, - NULL, }, - { gen_helper_sve_ldffbdu_zd_mte, - gen_helper_sve_ldffhdu_le_zd_mte, - gen_helper_sve_ldffsdu_le_zd_mte, - gen_helper_sve_ldffdd_le_zd_mte, } } } }, - { /* Big-endian */ - { { { gen_helper_sve_ldbds_zsu_mte, - gen_helper_sve_ldhds_be_zsu_mte, - gen_helper_sve_ldsds_be_zsu_mte, - NULL, }, - { gen_helper_sve_ldbdu_zsu_mte, - gen_helper_sve_ldhdu_be_zsu_mte, - gen_helper_sve_ldsdu_be_zsu_mte, - gen_helper_sve_lddd_be_zsu_mte, } }, - { { gen_helper_sve_ldbds_zss_mte, - gen_helper_sve_ldhds_be_zss_mte, - gen_helper_sve_ldsds_be_zss_mte, - NULL, }, - { gen_helper_sve_ldbdu_zss_mte, - gen_helper_sve_ldhdu_be_zss_mte, - gen_helper_sve_ldsdu_be_zss_mte, - gen_helper_sve_lddd_be_zss_mte, } }, - { { gen_helper_sve_ldbds_zd_mte, - gen_helper_sve_ldhds_be_zd_mte, - gen_helper_sve_ldsds_be_zd_mte, - NULL, }, - { gen_helper_sve_ldbdu_zd_mte, - gen_helper_sve_ldhdu_be_zd_mte, - gen_helper_sve_ldsdu_be_zd_mte, - gen_helper_sve_lddd_be_zd_mte, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbds_zsu_mte, - gen_helper_sve_ldffhds_be_zsu_mte, - gen_helper_sve_ldffsds_be_zsu_mte, - NULL, }, - { gen_helper_sve_ldffbdu_zsu_mte, - gen_helper_sve_ldffhdu_be_zsu_mte, - gen_helper_sve_ldffsdu_be_zsu_mte, - gen_helper_sve_ldffdd_be_zsu_mte, } }, - { { gen_helper_sve_ldffbds_zss_mte, - gen_helper_sve_ldffhds_be_zss_mte, - gen_helper_sve_ldffsds_be_zss_mte, - NULL, }, - { gen_helper_sve_ldffbdu_zss_mte, - gen_helper_sve_ldffhdu_be_zss_mte, - gen_helper_sve_ldffsdu_be_zss_mte, - gen_helper_sve_ldffdd_be_zss_mte, } }, - { { gen_helper_sve_ldffbds_zd_mte, - gen_helper_sve_ldffhds_be_zd_mte, - gen_helper_sve_ldffsds_be_zd_mte, - NULL, }, - { gen_helper_sve_ldffbdu_zd_mte, - gen_helper_sve_ldffhdu_be_zd_mte, - gen_helper_sve_ldffsdu_be_zd_mte, - gen_helper_sve_ldffdd_be_zd_mte, } } } } }, -}; - -static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) -{ - gen_helper_gvec_mem_scatter *fn = NULL; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - - switch (a->esz) { - case MO_32: - fn = gather_load_fn32[mte][be][a->ff][a->xs][a->u][a->msz]; - break; - case MO_64: - fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz]; - break; - } - assert(fn != NULL); - - do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, - cpu_reg_sp(s, a->rn), a->msz, false, fn); - return true; -} - -static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) -{ - 
gen_helper_gvec_mem_scatter *fn = NULL; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (a->esz < a->msz || (a->esz == a->msz && !a->u)) { - return false; - } - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - - switch (a->esz) { - case MO_32: - fn = gather_load_fn32[mte][be][a->ff][0][a->u][a->msz]; - break; - case MO_64: - fn = gather_load_fn64[mte][be][a->ff][2][a->u][a->msz]; - break; - } - assert(fn != NULL); - - /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x]) - * by loading the immediate into the scalar parameter. - */ - do_mem_zpz(s, a->rd, a->pg, a->rn, 0, - tcg_constant_i64(a->imm << a->msz), a->msz, false, fn); - return true; -} - -static bool trans_LDNT1_zprz(DisasContext *s, arg_LD1_zprz *a) -{ - gen_helper_gvec_mem_scatter *fn = NULL; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (a->esz < a->msz + !a->u) { - return false; - } - if (!dc_isar_feature(aa64_sve2, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - - switch (a->esz) { - case MO_32: - fn = gather_load_fn32[mte][be][0][0][a->u][a->msz]; - break; - case MO_64: - fn = gather_load_fn64[mte][be][0][2][a->u][a->msz]; - break; - } - assert(fn != NULL); - - do_mem_zpz(s, a->rd, a->pg, a->rn, 0, - cpu_reg(s, a->rm), a->msz, false, fn); - return true; -} - -/* Indexed by [mte][be][xs][msz]. */ -static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][2][2][3] = { - { /* MTE Inactive */ - { /* Little-endian */ - { gen_helper_sve_stbs_zsu, - gen_helper_sve_sths_le_zsu, - gen_helper_sve_stss_le_zsu, }, - { gen_helper_sve_stbs_zss, - gen_helper_sve_sths_le_zss, - gen_helper_sve_stss_le_zss, } }, - { /* Big-endian */ - { gen_helper_sve_stbs_zsu, - gen_helper_sve_sths_be_zsu, - gen_helper_sve_stss_be_zsu, }, - { gen_helper_sve_stbs_zss, - gen_helper_sve_sths_be_zss, - gen_helper_sve_stss_be_zss, } } }, - { /* MTE Active */ - { /* Little-endian */ - { gen_helper_sve_stbs_zsu_mte, - gen_helper_sve_sths_le_zsu_mte, - gen_helper_sve_stss_le_zsu_mte, }, - { gen_helper_sve_stbs_zss_mte, - gen_helper_sve_sths_le_zss_mte, - gen_helper_sve_stss_le_zss_mte, } }, - { /* Big-endian */ - { gen_helper_sve_stbs_zsu_mte, - gen_helper_sve_sths_be_zsu_mte, - gen_helper_sve_stss_be_zsu_mte, }, - { gen_helper_sve_stbs_zss_mte, - gen_helper_sve_sths_be_zss_mte, - gen_helper_sve_stss_be_zss_mte, } } }, -}; - -/* Note that we overload xs=2 to indicate 64-bit offset. 
*/ -static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = { - { /* MTE Inactive */ - { /* Little-endian */ - { gen_helper_sve_stbd_zsu, - gen_helper_sve_sthd_le_zsu, - gen_helper_sve_stsd_le_zsu, - gen_helper_sve_stdd_le_zsu, }, - { gen_helper_sve_stbd_zss, - gen_helper_sve_sthd_le_zss, - gen_helper_sve_stsd_le_zss, - gen_helper_sve_stdd_le_zss, }, - { gen_helper_sve_stbd_zd, - gen_helper_sve_sthd_le_zd, - gen_helper_sve_stsd_le_zd, - gen_helper_sve_stdd_le_zd, } }, - { /* Big-endian */ - { gen_helper_sve_stbd_zsu, - gen_helper_sve_sthd_be_zsu, - gen_helper_sve_stsd_be_zsu, - gen_helper_sve_stdd_be_zsu, }, - { gen_helper_sve_stbd_zss, - gen_helper_sve_sthd_be_zss, - gen_helper_sve_stsd_be_zss, - gen_helper_sve_stdd_be_zss, }, - { gen_helper_sve_stbd_zd, - gen_helper_sve_sthd_be_zd, - gen_helper_sve_stsd_be_zd, - gen_helper_sve_stdd_be_zd, } } }, - { /* MTE Inactive */ - { /* Little-endian */ - { gen_helper_sve_stbd_zsu_mte, - gen_helper_sve_sthd_le_zsu_mte, - gen_helper_sve_stsd_le_zsu_mte, - gen_helper_sve_stdd_le_zsu_mte, }, - { gen_helper_sve_stbd_zss_mte, - gen_helper_sve_sthd_le_zss_mte, - gen_helper_sve_stsd_le_zss_mte, - gen_helper_sve_stdd_le_zss_mte, }, - { gen_helper_sve_stbd_zd_mte, - gen_helper_sve_sthd_le_zd_mte, - gen_helper_sve_stsd_le_zd_mte, - gen_helper_sve_stdd_le_zd_mte, } }, - { /* Big-endian */ - { gen_helper_sve_stbd_zsu_mte, - gen_helper_sve_sthd_be_zsu_mte, - gen_helper_sve_stsd_be_zsu_mte, - gen_helper_sve_stdd_be_zsu_mte, }, - { gen_helper_sve_stbd_zss_mte, - gen_helper_sve_sthd_be_zss_mte, - gen_helper_sve_stsd_be_zss_mte, - gen_helper_sve_stdd_be_zss_mte, }, - { gen_helper_sve_stbd_zd_mte, - gen_helper_sve_sthd_be_zd_mte, - gen_helper_sve_stsd_be_zd_mte, - gen_helper_sve_stdd_be_zd_mte, } } }, -}; - -static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) -{ - gen_helper_gvec_mem_scatter *fn; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (a->esz < a->msz || (a->msz == 0 && a->scale)) { - return false; - } - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - switch (a->esz) { - case MO_32: - fn = scatter_store_fn32[mte][be][a->xs][a->msz]; - break; - case MO_64: - fn = scatter_store_fn64[mte][be][a->xs][a->msz]; - break; - default: - g_assert_not_reached(); - } - do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, - cpu_reg_sp(s, a->rn), a->msz, true, fn); - return true; -} - -static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) -{ - gen_helper_gvec_mem_scatter *fn = NULL; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (a->esz < a->msz) { - return false; - } - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - - switch (a->esz) { - case MO_32: - fn = scatter_store_fn32[mte][be][0][a->msz]; - break; - case MO_64: - fn = scatter_store_fn64[mte][be][2][a->msz]; - break; - } - assert(fn != NULL); - - /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x]) - * by loading the immediate into the scalar parameter. 
- */ - do_mem_zpz(s, a->rd, a->pg, a->rn, 0, - tcg_constant_i64(a->imm << a->msz), a->msz, true, fn); - return true; -} - -static bool trans_STNT1_zprz(DisasContext *s, arg_ST1_zprz *a) -{ - gen_helper_gvec_mem_scatter *fn; - bool be = s->be_data == MO_BE; - bool mte = s->mte_active[0]; - - if (a->esz < a->msz) { - return false; - } - if (!dc_isar_feature(aa64_sve2, s)) { - return false; - } - s->is_nonstreaming = true; - if (!sve_access_check(s)) { - return true; - } - - switch (a->esz) { - case MO_32: - fn = scatter_store_fn32[mte][be][0][a->msz]; - break; - case MO_64: - fn = scatter_store_fn64[mte][be][2][a->msz]; - break; - default: - g_assert_not_reached(); - } - - do_mem_zpz(s, a->rd, a->pg, a->rn, 0, - cpu_reg(s, a->rm), a->msz, true, fn); - return true; -} - -/* - * Prefetches - */ - -static bool trans_PRF(DisasContext *s, arg_PRF *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - /* Prefetch is a nop within QEMU. */ - (void)sve_access_check(s); - return true; -} - -static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a) -{ - if (a->rm == 31 || !dc_isar_feature(aa64_sve, s)) { - return false; - } - /* Prefetch is a nop within QEMU. */ - (void)sve_access_check(s); - return true; -} - -static bool trans_PRF_ns(DisasContext *s, arg_PRF_ns *a) -{ - if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - /* Prefetch is a nop within QEMU. */ - s->is_nonstreaming = true; - (void)sve_access_check(s); - return true; -} - -/* - * Move Prefix - * - * TODO: The implementation so far could handle predicated merging movprfx. - * The helper functions as written take an extra source register to - * use in the operation, but the result is only written when predication - * succeeds. For unpredicated movprfx, we need to rearrange the helpers - * to allow the final write back to the destination to be unconditional. - * For predicated zeroing movprfx, we need to rearrange the helpers to - * allow the final write back to zero inactives. - * - * In the meantime, just emit the moves. 
- */ - -TRANS_FEAT(MOVPRFX, aa64_sve, do_mov_z, a->rd, a->rn) -TRANS_FEAT(MOVPRFX_m, aa64_sve, do_sel_z, a->rd, a->rn, a->rd, a->pg, a->esz) -TRANS_FEAT(MOVPRFX_z, aa64_sve, do_movz_zpz, a->rd, a->rn, a->pg, a->esz, false) - -/* - * SVE2 Integer Multiply - Unpredicated - */ - -TRANS_FEAT(MUL_zzz, aa64_sve2, gen_gvec_fn_arg_zzz, tcg_gen_gvec_mul, a) - -static gen_helper_gvec_3 * const smulh_zzz_fns[4] = { - gen_helper_gvec_smulh_b, gen_helper_gvec_smulh_h, - gen_helper_gvec_smulh_s, gen_helper_gvec_smulh_d, -}; -TRANS_FEAT(SMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - smulh_zzz_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const umulh_zzz_fns[4] = { - gen_helper_gvec_umulh_b, gen_helper_gvec_umulh_h, - gen_helper_gvec_umulh_s, gen_helper_gvec_umulh_d, -}; -TRANS_FEAT(UMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - umulh_zzz_fns[a->esz], a, 0) - -TRANS_FEAT(PMUL_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - gen_helper_gvec_pmul_b, a, 0) - -static gen_helper_gvec_3 * const sqdmulh_zzz_fns[4] = { - gen_helper_sve2_sqdmulh_b, gen_helper_sve2_sqdmulh_h, - gen_helper_sve2_sqdmulh_s, gen_helper_sve2_sqdmulh_d, -}; -TRANS_FEAT(SQDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - sqdmulh_zzz_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const sqrdmulh_zzz_fns[4] = { - gen_helper_sve2_sqrdmulh_b, gen_helper_sve2_sqrdmulh_h, - gen_helper_sve2_sqrdmulh_s, gen_helper_sve2_sqrdmulh_d, -}; -TRANS_FEAT(SQRDMULH_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - sqrdmulh_zzz_fns[a->esz], a, 0) - -/* - * SVE2 Integer - Predicated - */ - -static gen_helper_gvec_4 * const sadlp_fns[4] = { - NULL, gen_helper_sve2_sadalp_zpzz_h, - gen_helper_sve2_sadalp_zpzz_s, gen_helper_sve2_sadalp_zpzz_d, -}; -TRANS_FEAT(SADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz, - sadlp_fns[a->esz], a, 0) - -static gen_helper_gvec_4 * const uadlp_fns[4] = { - NULL, gen_helper_sve2_uadalp_zpzz_h, - gen_helper_sve2_uadalp_zpzz_s, gen_helper_sve2_uadalp_zpzz_d, -}; -TRANS_FEAT(UADALP_zpzz, aa64_sve2, gen_gvec_ool_arg_zpzz, - uadlp_fns[a->esz], a, 0) - -/* - * SVE2 integer unary operations (predicated) - */ - -TRANS_FEAT(URECPE, aa64_sve2, gen_gvec_ool_arg_zpz, - a->esz == 2 ? gen_helper_sve2_urecpe_s : NULL, a, 0) - -TRANS_FEAT(URSQRTE, aa64_sve2, gen_gvec_ool_arg_zpz, - a->esz == 2 ? 
gen_helper_sve2_ursqrte_s : NULL, a, 0) - -static gen_helper_gvec_3 * const sqabs_fns[4] = { - gen_helper_sve2_sqabs_b, gen_helper_sve2_sqabs_h, - gen_helper_sve2_sqabs_s, gen_helper_sve2_sqabs_d, -}; -TRANS_FEAT(SQABS, aa64_sve2, gen_gvec_ool_arg_zpz, sqabs_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const sqneg_fns[4] = { - gen_helper_sve2_sqneg_b, gen_helper_sve2_sqneg_h, - gen_helper_sve2_sqneg_s, gen_helper_sve2_sqneg_d, -}; -TRANS_FEAT(SQNEG, aa64_sve2, gen_gvec_ool_arg_zpz, sqneg_fns[a->esz], a, 0) - -DO_ZPZZ(SQSHL, aa64_sve2, sve2_sqshl) -DO_ZPZZ(SQRSHL, aa64_sve2, sve2_sqrshl) -DO_ZPZZ(SRSHL, aa64_sve2, sve2_srshl) - -DO_ZPZZ(UQSHL, aa64_sve2, sve2_uqshl) -DO_ZPZZ(UQRSHL, aa64_sve2, sve2_uqrshl) -DO_ZPZZ(URSHL, aa64_sve2, sve2_urshl) - -DO_ZPZZ(SHADD, aa64_sve2, sve2_shadd) -DO_ZPZZ(SRHADD, aa64_sve2, sve2_srhadd) -DO_ZPZZ(SHSUB, aa64_sve2, sve2_shsub) - -DO_ZPZZ(UHADD, aa64_sve2, sve2_uhadd) -DO_ZPZZ(URHADD, aa64_sve2, sve2_urhadd) -DO_ZPZZ(UHSUB, aa64_sve2, sve2_uhsub) - -DO_ZPZZ(ADDP, aa64_sve2, sve2_addp) -DO_ZPZZ(SMAXP, aa64_sve2, sve2_smaxp) -DO_ZPZZ(UMAXP, aa64_sve2, sve2_umaxp) -DO_ZPZZ(SMINP, aa64_sve2, sve2_sminp) -DO_ZPZZ(UMINP, aa64_sve2, sve2_uminp) - -DO_ZPZZ(SQADD_zpzz, aa64_sve2, sve2_sqadd) -DO_ZPZZ(UQADD_zpzz, aa64_sve2, sve2_uqadd) -DO_ZPZZ(SQSUB_zpzz, aa64_sve2, sve2_sqsub) -DO_ZPZZ(UQSUB_zpzz, aa64_sve2, sve2_uqsub) -DO_ZPZZ(SUQADD, aa64_sve2, sve2_suqadd) -DO_ZPZZ(USQADD, aa64_sve2, sve2_usqadd) - -/* - * SVE2 Widening Integer Arithmetic - */ - -static gen_helper_gvec_3 * const saddl_fns[4] = { - NULL, gen_helper_sve2_saddl_h, - gen_helper_sve2_saddl_s, gen_helper_sve2_saddl_d, -}; -TRANS_FEAT(SADDLB, aa64_sve2, gen_gvec_ool_arg_zzz, - saddl_fns[a->esz], a, 0) -TRANS_FEAT(SADDLT, aa64_sve2, gen_gvec_ool_arg_zzz, - saddl_fns[a->esz], a, 3) -TRANS_FEAT(SADDLBT, aa64_sve2, gen_gvec_ool_arg_zzz, - saddl_fns[a->esz], a, 2) - -static gen_helper_gvec_3 * const ssubl_fns[4] = { - NULL, gen_helper_sve2_ssubl_h, - gen_helper_sve2_ssubl_s, gen_helper_sve2_ssubl_d, -}; -TRANS_FEAT(SSUBLB, aa64_sve2, gen_gvec_ool_arg_zzz, - ssubl_fns[a->esz], a, 0) -TRANS_FEAT(SSUBLT, aa64_sve2, gen_gvec_ool_arg_zzz, - ssubl_fns[a->esz], a, 3) -TRANS_FEAT(SSUBLBT, aa64_sve2, gen_gvec_ool_arg_zzz, - ssubl_fns[a->esz], a, 2) -TRANS_FEAT(SSUBLTB, aa64_sve2, gen_gvec_ool_arg_zzz, - ssubl_fns[a->esz], a, 1) - -static gen_helper_gvec_3 * const sabdl_fns[4] = { - NULL, gen_helper_sve2_sabdl_h, - gen_helper_sve2_sabdl_s, gen_helper_sve2_sabdl_d, -}; -TRANS_FEAT(SABDLB, aa64_sve2, gen_gvec_ool_arg_zzz, - sabdl_fns[a->esz], a, 0) -TRANS_FEAT(SABDLT, aa64_sve2, gen_gvec_ool_arg_zzz, - sabdl_fns[a->esz], a, 3) - -static gen_helper_gvec_3 * const uaddl_fns[4] = { - NULL, gen_helper_sve2_uaddl_h, - gen_helper_sve2_uaddl_s, gen_helper_sve2_uaddl_d, -}; -TRANS_FEAT(UADDLB, aa64_sve2, gen_gvec_ool_arg_zzz, - uaddl_fns[a->esz], a, 0) -TRANS_FEAT(UADDLT, aa64_sve2, gen_gvec_ool_arg_zzz, - uaddl_fns[a->esz], a, 3) - -static gen_helper_gvec_3 * const usubl_fns[4] = { - NULL, gen_helper_sve2_usubl_h, - gen_helper_sve2_usubl_s, gen_helper_sve2_usubl_d, -}; -TRANS_FEAT(USUBLB, aa64_sve2, gen_gvec_ool_arg_zzz, - usubl_fns[a->esz], a, 0) -TRANS_FEAT(USUBLT, aa64_sve2, gen_gvec_ool_arg_zzz, - usubl_fns[a->esz], a, 3) - -static gen_helper_gvec_3 * const uabdl_fns[4] = { - NULL, gen_helper_sve2_uabdl_h, - gen_helper_sve2_uabdl_s, gen_helper_sve2_uabdl_d, -}; -TRANS_FEAT(UABDLB, aa64_sve2, gen_gvec_ool_arg_zzz, - uabdl_fns[a->esz], a, 0) -TRANS_FEAT(UABDLT, aa64_sve2, gen_gvec_ool_arg_zzz, - uabdl_fns[a->esz], a, 
3) - -static gen_helper_gvec_3 * const sqdmull_fns[4] = { - NULL, gen_helper_sve2_sqdmull_zzz_h, - gen_helper_sve2_sqdmull_zzz_s, gen_helper_sve2_sqdmull_zzz_d, -}; -TRANS_FEAT(SQDMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - sqdmull_fns[a->esz], a, 0) -TRANS_FEAT(SQDMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - sqdmull_fns[a->esz], a, 3) - -static gen_helper_gvec_3 * const smull_fns[4] = { - NULL, gen_helper_sve2_smull_zzz_h, - gen_helper_sve2_smull_zzz_s, gen_helper_sve2_smull_zzz_d, -}; -TRANS_FEAT(SMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - smull_fns[a->esz], a, 0) -TRANS_FEAT(SMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - smull_fns[a->esz], a, 3) - -static gen_helper_gvec_3 * const umull_fns[4] = { - NULL, gen_helper_sve2_umull_zzz_h, - gen_helper_sve2_umull_zzz_s, gen_helper_sve2_umull_zzz_d, -}; -TRANS_FEAT(UMULLB_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - umull_fns[a->esz], a, 0) -TRANS_FEAT(UMULLT_zzz, aa64_sve2, gen_gvec_ool_arg_zzz, - umull_fns[a->esz], a, 3) - -static gen_helper_gvec_3 * const eoril_fns[4] = { - gen_helper_sve2_eoril_b, gen_helper_sve2_eoril_h, - gen_helper_sve2_eoril_s, gen_helper_sve2_eoril_d, -}; -TRANS_FEAT(EORBT, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 2) -TRANS_FEAT(EORTB, aa64_sve2, gen_gvec_ool_arg_zzz, eoril_fns[a->esz], a, 1) - -static bool do_trans_pmull(DisasContext *s, arg_rrr_esz *a, bool sel) -{ - static gen_helper_gvec_3 * const fns[4] = { - gen_helper_gvec_pmull_q, gen_helper_sve2_pmull_h, - NULL, gen_helper_sve2_pmull_d, - }; - - if (a->esz == 0) { - if (!dc_isar_feature(aa64_sve2_pmull128, s)) { - return false; - } - s->is_nonstreaming = true; - } else if (!dc_isar_feature(aa64_sve, s)) { - return false; - } - return gen_gvec_ool_arg_zzz(s, fns[a->esz], a, sel); -} - -TRANS_FEAT(PMULLB, aa64_sve2, do_trans_pmull, a, false) -TRANS_FEAT(PMULLT, aa64_sve2, do_trans_pmull, a, true) - -static gen_helper_gvec_3 * const saddw_fns[4] = { - NULL, gen_helper_sve2_saddw_h, - gen_helper_sve2_saddw_s, gen_helper_sve2_saddw_d, -}; -TRANS_FEAT(SADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 0) -TRANS_FEAT(SADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, saddw_fns[a->esz], a, 1) - -static gen_helper_gvec_3 * const ssubw_fns[4] = { - NULL, gen_helper_sve2_ssubw_h, - gen_helper_sve2_ssubw_s, gen_helper_sve2_ssubw_d, -}; -TRANS_FEAT(SSUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 0) -TRANS_FEAT(SSUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, ssubw_fns[a->esz], a, 1) - -static gen_helper_gvec_3 * const uaddw_fns[4] = { - NULL, gen_helper_sve2_uaddw_h, - gen_helper_sve2_uaddw_s, gen_helper_sve2_uaddw_d, -}; -TRANS_FEAT(UADDWB, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 0) -TRANS_FEAT(UADDWT, aa64_sve2, gen_gvec_ool_arg_zzz, uaddw_fns[a->esz], a, 1) - -static gen_helper_gvec_3 * const usubw_fns[4] = { - NULL, gen_helper_sve2_usubw_h, - gen_helper_sve2_usubw_s, gen_helper_sve2_usubw_d, -}; -TRANS_FEAT(USUBWB, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 0) -TRANS_FEAT(USUBWT, aa64_sve2, gen_gvec_ool_arg_zzz, usubw_fns[a->esz], a, 1) - -static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm) -{ - int top = imm & 1; - int shl = imm >> 1; - int halfbits = 4 << vece; - - if (top) { - if (shl == halfbits) { - TCGv_vec t = tcg_temp_new_vec_matching(d); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits)); - tcg_gen_and_vec(vece, d, n, t); - tcg_temp_free_vec(t); - } else { - tcg_gen_sari_vec(vece, d, n, halfbits); - tcg_gen_shli_vec(vece, d, d, shl); - } - } else { - 
tcg_gen_shli_vec(vece, d, n, halfbits); - tcg_gen_sari_vec(vece, d, d, halfbits - shl); - } -} - -static void gen_ushll_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int imm) -{ - int halfbits = 4 << vece; - int top = imm & 1; - int shl = (imm >> 1); - int shift; - uint64_t mask; - - mask = MAKE_64BIT_MASK(0, halfbits); - mask <<= shl; - mask = dup_const(vece, mask); - - shift = shl - top * halfbits; - if (shift < 0) { - tcg_gen_shri_i64(d, n, -shift); - } else { - tcg_gen_shli_i64(d, n, shift); - } - tcg_gen_andi_i64(d, d, mask); -} - -static void gen_ushll16_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) -{ - gen_ushll_i64(MO_16, d, n, imm); -} - -static void gen_ushll32_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) -{ - gen_ushll_i64(MO_32, d, n, imm); -} - -static void gen_ushll64_i64(TCGv_i64 d, TCGv_i64 n, int64_t imm) -{ - gen_ushll_i64(MO_64, d, n, imm); -} - -static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm) -{ - int halfbits = 4 << vece; - int top = imm & 1; - int shl = imm >> 1; - - if (top) { - if (shl == halfbits) { - TCGv_vec t = tcg_temp_new_vec_matching(d); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits)); - tcg_gen_and_vec(vece, d, n, t); - tcg_temp_free_vec(t); - } else { - tcg_gen_shri_vec(vece, d, n, halfbits); - tcg_gen_shli_vec(vece, d, d, shl); - } - } else { - if (shl == 0) { - TCGv_vec t = tcg_temp_new_vec_matching(d); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_and_vec(vece, d, n, t); - tcg_temp_free_vec(t); - } else { - tcg_gen_shli_vec(vece, d, n, halfbits); - tcg_gen_shri_vec(vece, d, d, halfbits - shl); - } - } -} - -static bool do_shll_tb(DisasContext *s, arg_rri_esz *a, - const GVecGen2i ops[3], bool sel) -{ - - if (a->esz < 0 || a->esz > 2) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vsz, vsz, (a->imm << 1) | sel, - &ops[a->esz]); - } - return true; -} - -static const TCGOpcode sshll_list[] = { - INDEX_op_shli_vec, INDEX_op_sari_vec, 0 -}; -static const GVecGen2i sshll_ops[3] = { - { .fniv = gen_sshll_vec, - .opt_opc = sshll_list, - .fno = gen_helper_sve2_sshll_h, - .vece = MO_16 }, - { .fniv = gen_sshll_vec, - .opt_opc = sshll_list, - .fno = gen_helper_sve2_sshll_s, - .vece = MO_32 }, - { .fniv = gen_sshll_vec, - .opt_opc = sshll_list, - .fno = gen_helper_sve2_sshll_d, - .vece = MO_64 } -}; -TRANS_FEAT(SSHLLB, aa64_sve2, do_shll_tb, a, sshll_ops, false) -TRANS_FEAT(SSHLLT, aa64_sve2, do_shll_tb, a, sshll_ops, true) - -static const TCGOpcode ushll_list[] = { - INDEX_op_shli_vec, INDEX_op_shri_vec, 0 -}; -static const GVecGen2i ushll_ops[3] = { - { .fni8 = gen_ushll16_i64, - .fniv = gen_ushll_vec, - .opt_opc = ushll_list, - .fno = gen_helper_sve2_ushll_h, - .vece = MO_16 }, - { .fni8 = gen_ushll32_i64, - .fniv = gen_ushll_vec, - .opt_opc = ushll_list, - .fno = gen_helper_sve2_ushll_s, - .vece = MO_32 }, - { .fni8 = gen_ushll64_i64, - .fniv = gen_ushll_vec, - .opt_opc = ushll_list, - .fno = gen_helper_sve2_ushll_d, - .vece = MO_64 }, -}; -TRANS_FEAT(USHLLB, aa64_sve2, do_shll_tb, a, ushll_ops, false) -TRANS_FEAT(USHLLT, aa64_sve2, do_shll_tb, a, ushll_ops, true) - -static gen_helper_gvec_3 * const bext_fns[4] = { - gen_helper_sve2_bext_b, gen_helper_sve2_bext_h, - gen_helper_sve2_bext_s, gen_helper_sve2_bext_d, -}; -TRANS_FEAT_NONSTREAMING(BEXT, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, - bext_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const bdep_fns[4] = { - 
gen_helper_sve2_bdep_b, gen_helper_sve2_bdep_h, - gen_helper_sve2_bdep_s, gen_helper_sve2_bdep_d, -}; -TRANS_FEAT_NONSTREAMING(BDEP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, - bdep_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const bgrp_fns[4] = { - gen_helper_sve2_bgrp_b, gen_helper_sve2_bgrp_h, - gen_helper_sve2_bgrp_s, gen_helper_sve2_bgrp_d, -}; -TRANS_FEAT_NONSTREAMING(BGRP, aa64_sve2_bitperm, gen_gvec_ool_arg_zzz, - bgrp_fns[a->esz], a, 0) - -static gen_helper_gvec_3 * const cadd_fns[4] = { - gen_helper_sve2_cadd_b, gen_helper_sve2_cadd_h, - gen_helper_sve2_cadd_s, gen_helper_sve2_cadd_d, -}; -TRANS_FEAT(CADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz, - cadd_fns[a->esz], a, 0) -TRANS_FEAT(CADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz, - cadd_fns[a->esz], a, 1) - -static gen_helper_gvec_3 * const sqcadd_fns[4] = { - gen_helper_sve2_sqcadd_b, gen_helper_sve2_sqcadd_h, - gen_helper_sve2_sqcadd_s, gen_helper_sve2_sqcadd_d, -}; -TRANS_FEAT(SQCADD_rot90, aa64_sve2, gen_gvec_ool_arg_zzz, - sqcadd_fns[a->esz], a, 0) -TRANS_FEAT(SQCADD_rot270, aa64_sve2, gen_gvec_ool_arg_zzz, - sqcadd_fns[a->esz], a, 1) - -static gen_helper_gvec_4 * const sabal_fns[4] = { - NULL, gen_helper_sve2_sabal_h, - gen_helper_sve2_sabal_s, gen_helper_sve2_sabal_d, -}; -TRANS_FEAT(SABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 0) -TRANS_FEAT(SABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, sabal_fns[a->esz], a, 1) - -static gen_helper_gvec_4 * const uabal_fns[4] = { - NULL, gen_helper_sve2_uabal_h, - gen_helper_sve2_uabal_s, gen_helper_sve2_uabal_d, -}; -TRANS_FEAT(UABALB, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 0) -TRANS_FEAT(UABALT, aa64_sve2, gen_gvec_ool_arg_zzzz, uabal_fns[a->esz], a, 1) - -static bool do_adcl(DisasContext *s, arg_rrrr_esz *a, bool sel) -{ - static gen_helper_gvec_4 * const fns[2] = { - gen_helper_sve2_adcl_s, - gen_helper_sve2_adcl_d, - }; - /* - * Note that in this case the ESZ field encodes both size and sign. - * Split out 'subtract' into bit 1 of the data field for the helper. 
- */ - return gen_gvec_ool_arg_zzzz(s, fns[a->esz & 1], a, (a->esz & 2) | sel); -} - -TRANS_FEAT(ADCLB, aa64_sve2, do_adcl, a, false) -TRANS_FEAT(ADCLT, aa64_sve2, do_adcl, a, true) - -TRANS_FEAT(SSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ssra, a) -TRANS_FEAT(USRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_usra, a) -TRANS_FEAT(SRSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_srsra, a) -TRANS_FEAT(URSRA, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_ursra, a) -TRANS_FEAT(SRI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sri, a) -TRANS_FEAT(SLI, aa64_sve2, gen_gvec_fn_arg_zzi, gen_gvec_sli, a) - -TRANS_FEAT(SABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_saba, a) -TRANS_FEAT(UABA, aa64_sve2, gen_gvec_fn_arg_zzz, gen_gvec_uaba, a) - -static bool do_narrow_extract(DisasContext *s, arg_rri_esz *a, - const GVecGen2 ops[3]) -{ - if (a->esz < 0 || a->esz > MO_32 || a->imm != 0) { - return false; - } - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vsz, vsz, &ops[a->esz]); - } - return true; -} - -static const TCGOpcode sqxtn_list[] = { - INDEX_op_shli_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 -}; - -static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t mask = (1ull << halfbits) - 1; - int64_t min = -1ull << (halfbits - 1); - int64_t max = -min - 1; - - tcg_gen_dupi_vec(vece, t, min); - tcg_gen_smax_vec(vece, d, n, t); - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_smin_vec(vece, d, d, t); - tcg_gen_dupi_vec(vece, t, mask); - tcg_gen_and_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -static const GVecGen2 sqxtnb_ops[3] = { - { .fniv = gen_sqxtnb_vec, - .opt_opc = sqxtn_list, - .fno = gen_helper_sve2_sqxtnb_h, - .vece = MO_16 }, - { .fniv = gen_sqxtnb_vec, - .opt_opc = sqxtn_list, - .fno = gen_helper_sve2_sqxtnb_s, - .vece = MO_32 }, - { .fniv = gen_sqxtnb_vec, - .opt_opc = sqxtn_list, - .fno = gen_helper_sve2_sqxtnb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops) - -static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t mask = (1ull << halfbits) - 1; - int64_t min = -1ull << (halfbits - 1); - int64_t max = -min - 1; - - tcg_gen_dupi_vec(vece, t, min); - tcg_gen_smax_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_smin_vec(vece, n, n, t); - tcg_gen_shli_vec(vece, n, n, halfbits); - tcg_gen_dupi_vec(vece, t, mask); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const GVecGen2 sqxtnt_ops[3] = { - { .fniv = gen_sqxtnt_vec, - .opt_opc = sqxtn_list, - .load_dest = true, - .fno = gen_helper_sve2_sqxtnt_h, - .vece = MO_16 }, - { .fniv = gen_sqxtnt_vec, - .opt_opc = sqxtn_list, - .load_dest = true, - .fno = gen_helper_sve2_sqxtnt_s, - .vece = MO_32 }, - { .fniv = gen_sqxtnt_vec, - .opt_opc = sqxtn_list, - .load_dest = true, - .fno = gen_helper_sve2_sqxtnt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQXTNT, aa64_sve2, do_narrow_extract, a, sqxtnt_ops) - -static const TCGOpcode uqxtn_list[] = { - INDEX_op_shli_vec, INDEX_op_umin_vec, 0 -}; - -static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t max = (1ull << halfbits) - 1; - - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_umin_vec(vece, d, n, t); - tcg_temp_free_vec(t); -} - -static const GVecGen2 
uqxtnb_ops[3] = { - { .fniv = gen_uqxtnb_vec, - .opt_opc = uqxtn_list, - .fno = gen_helper_sve2_uqxtnb_h, - .vece = MO_16 }, - { .fniv = gen_uqxtnb_vec, - .opt_opc = uqxtn_list, - .fno = gen_helper_sve2_uqxtnb_s, - .vece = MO_32 }, - { .fniv = gen_uqxtnb_vec, - .opt_opc = uqxtn_list, - .fno = gen_helper_sve2_uqxtnb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops) - -static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t max = (1ull << halfbits) - 1; - - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_umin_vec(vece, n, n, t); - tcg_gen_shli_vec(vece, n, n, halfbits); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const GVecGen2 uqxtnt_ops[3] = { - { .fniv = gen_uqxtnt_vec, - .opt_opc = uqxtn_list, - .load_dest = true, - .fno = gen_helper_sve2_uqxtnt_h, - .vece = MO_16 }, - { .fniv = gen_uqxtnt_vec, - .opt_opc = uqxtn_list, - .load_dest = true, - .fno = gen_helper_sve2_uqxtnt_s, - .vece = MO_32 }, - { .fniv = gen_uqxtnt_vec, - .opt_opc = uqxtn_list, - .load_dest = true, - .fno = gen_helper_sve2_uqxtnt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(UQXTNT, aa64_sve2, do_narrow_extract, a, uqxtnt_ops) - -static const TCGOpcode sqxtun_list[] = { - INDEX_op_shli_vec, INDEX_op_umin_vec, INDEX_op_smax_vec, 0 -}; - -static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t max = (1ull << halfbits) - 1; - - tcg_gen_dupi_vec(vece, t, 0); - tcg_gen_smax_vec(vece, d, n, t); - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_umin_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -static const GVecGen2 sqxtunb_ops[3] = { - { .fniv = gen_sqxtunb_vec, - .opt_opc = sqxtun_list, - .fno = gen_helper_sve2_sqxtunb_h, - .vece = MO_16 }, - { .fniv = gen_sqxtunb_vec, - .opt_opc = sqxtun_list, - .fno = gen_helper_sve2_sqxtunb_s, - .vece = MO_32 }, - { .fniv = gen_sqxtunb_vec, - .opt_opc = sqxtun_list, - .fno = gen_helper_sve2_sqxtunb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops) - -static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t max = (1ull << halfbits) - 1; - - tcg_gen_dupi_vec(vece, t, 0); - tcg_gen_smax_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_umin_vec(vece, n, n, t); - tcg_gen_shli_vec(vece, n, n, halfbits); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const GVecGen2 sqxtunt_ops[3] = { - { .fniv = gen_sqxtunt_vec, - .opt_opc = sqxtun_list, - .load_dest = true, - .fno = gen_helper_sve2_sqxtunt_h, - .vece = MO_16 }, - { .fniv = gen_sqxtunt_vec, - .opt_opc = sqxtun_list, - .load_dest = true, - .fno = gen_helper_sve2_sqxtunt_s, - .vece = MO_32 }, - { .fniv = gen_sqxtunt_vec, - .opt_opc = sqxtun_list, - .load_dest = true, - .fno = gen_helper_sve2_sqxtunt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQXTUNT, aa64_sve2, do_narrow_extract, a, sqxtunt_ops) - -static bool do_shr_narrow(DisasContext *s, arg_rri_esz *a, - const GVecGen2i ops[3]) -{ - if (a->esz < 0 || a->esz > MO_32) { - return false; - } - assert(a->imm > 0 && a->imm <= (8 << a->esz)); - if (sve_access_check(s)) { - unsigned vsz = vec_full_reg_size(s); - tcg_gen_gvec_2i(vec_full_reg_offset(s, a->rd), - vec_full_reg_offset(s, a->rn), - vsz, vsz, a->imm, &ops[a->esz]); - } - return true; -} - -static void 
gen_shrnb_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr) -{ - int halfbits = 4 << vece; - uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits)); - - tcg_gen_shri_i64(d, n, shr); - tcg_gen_andi_i64(d, d, mask); -} - -static void gen_shrnb16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) -{ - gen_shrnb_i64(MO_16, d, n, shr); -} - -static void gen_shrnb32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) -{ - gen_shrnb_i64(MO_32, d, n, shr); -} - -static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) -{ - gen_shrnb_i64(MO_64, d, n, shr); -} - -static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - uint64_t mask = MAKE_64BIT_MASK(0, halfbits); - - tcg_gen_shri_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, mask); - tcg_gen_and_vec(vece, d, n, t); - tcg_temp_free_vec(t); -} - -static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 }; -static const GVecGen2i shrnb_ops[3] = { - { .fni8 = gen_shrnb16_i64, - .fniv = gen_shrnb_vec, - .opt_opc = shrnb_vec_list, - .fno = gen_helper_sve2_shrnb_h, - .vece = MO_16 }, - { .fni8 = gen_shrnb32_i64, - .fniv = gen_shrnb_vec, - .opt_opc = shrnb_vec_list, - .fno = gen_helper_sve2_shrnb_s, - .vece = MO_32 }, - { .fni8 = gen_shrnb64_i64, - .fniv = gen_shrnb_vec, - .opt_opc = shrnb_vec_list, - .fno = gen_helper_sve2_shrnb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SHRNB, aa64_sve2, do_shr_narrow, a, shrnb_ops) - -static void gen_shrnt_i64(unsigned vece, TCGv_i64 d, TCGv_i64 n, int shr) -{ - int halfbits = 4 << vece; - uint64_t mask = dup_const(vece, MAKE_64BIT_MASK(0, halfbits)); - - tcg_gen_shli_i64(n, n, halfbits - shr); - tcg_gen_andi_i64(n, n, ~mask); - tcg_gen_andi_i64(d, d, mask); - tcg_gen_or_i64(d, d, n); -} - -static void gen_shrnt16_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) -{ - gen_shrnt_i64(MO_16, d, n, shr); -} - -static void gen_shrnt32_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) -{ - gen_shrnt_i64(MO_32, d, n, shr); -} - -static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr) -{ - tcg_gen_shri_i64(n, n, shr); - tcg_gen_deposit_i64(d, d, n, 32, 32); -} - -static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - uint64_t mask = MAKE_64BIT_MASK(0, halfbits); - - tcg_gen_shli_vec(vece, n, n, halfbits - shr); - tcg_gen_dupi_vec(vece, t, mask); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 }; -static const GVecGen2i shrnt_ops[3] = { - { .fni8 = gen_shrnt16_i64, - .fniv = gen_shrnt_vec, - .opt_opc = shrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_shrnt_h, - .vece = MO_16 }, - { .fni8 = gen_shrnt32_i64, - .fniv = gen_shrnt_vec, - .opt_opc = shrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_shrnt_s, - .vece = MO_32 }, - { .fni8 = gen_shrnt64_i64, - .fniv = gen_shrnt_vec, - .opt_opc = shrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_shrnt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SHRNT, aa64_sve2, do_shr_narrow, a, shrnt_ops) - -static const GVecGen2i rshrnb_ops[3] = { - { .fno = gen_helper_sve2_rshrnb_h }, - { .fno = gen_helper_sve2_rshrnb_s }, - { .fno = gen_helper_sve2_rshrnb_d }, -}; -TRANS_FEAT(RSHRNB, aa64_sve2, do_shr_narrow, a, rshrnb_ops) - -static const GVecGen2i rshrnt_ops[3] = { - { .fno = gen_helper_sve2_rshrnt_h }, - { .fno = gen_helper_sve2_rshrnt_s }, - { .fno = gen_helper_sve2_rshrnt_d }, -}; 
-TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops) - -static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d, - TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - - tcg_gen_sari_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, 0); - tcg_gen_smax_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_umin_vec(vece, d, n, t); - tcg_temp_free_vec(t); -} - -static const TCGOpcode sqshrunb_vec_list[] = { - INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_umin_vec, 0 -}; -static const GVecGen2i sqshrunb_ops[3] = { - { .fniv = gen_sqshrunb_vec, - .opt_opc = sqshrunb_vec_list, - .fno = gen_helper_sve2_sqshrunb_h, - .vece = MO_16 }, - { .fniv = gen_sqshrunb_vec, - .opt_opc = sqshrunb_vec_list, - .fno = gen_helper_sve2_sqshrunb_s, - .vece = MO_32 }, - { .fniv = gen_sqshrunb_vec, - .opt_opc = sqshrunb_vec_list, - .fno = gen_helper_sve2_sqshrunb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops) - -static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d, - TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - - tcg_gen_sari_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, 0); - tcg_gen_smax_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_umin_vec(vece, n, n, t); - tcg_gen_shli_vec(vece, n, n, halfbits); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const TCGOpcode sqshrunt_vec_list[] = { - INDEX_op_shli_vec, INDEX_op_sari_vec, - INDEX_op_smax_vec, INDEX_op_umin_vec, 0 -}; -static const GVecGen2i sqshrunt_ops[3] = { - { .fniv = gen_sqshrunt_vec, - .opt_opc = sqshrunt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_sqshrunt_h, - .vece = MO_16 }, - { .fniv = gen_sqshrunt_vec, - .opt_opc = sqshrunt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_sqshrunt_s, - .vece = MO_32 }, - { .fniv = gen_sqshrunt_vec, - .opt_opc = sqshrunt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_sqshrunt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQSHRUNT, aa64_sve2, do_shr_narrow, a, sqshrunt_ops) - -static const GVecGen2i sqrshrunb_ops[3] = { - { .fno = gen_helper_sve2_sqrshrunb_h }, - { .fno = gen_helper_sve2_sqrshrunb_s }, - { .fno = gen_helper_sve2_sqrshrunb_d }, -}; -TRANS_FEAT(SQRSHRUNB, aa64_sve2, do_shr_narrow, a, sqrshrunb_ops) - -static const GVecGen2i sqrshrunt_ops[3] = { - { .fno = gen_helper_sve2_sqrshrunt_h }, - { .fno = gen_helper_sve2_sqrshrunt_s }, - { .fno = gen_helper_sve2_sqrshrunt_d }, -}; -TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops) - -static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d, - TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t max = MAKE_64BIT_MASK(0, halfbits - 1); - int64_t min = -max - 1; - - tcg_gen_sari_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, min); - tcg_gen_smax_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_smin_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_and_vec(vece, d, n, t); - tcg_temp_free_vec(t); -} - -static const TCGOpcode sqshrnb_vec_list[] = { - INDEX_op_sari_vec, INDEX_op_smax_vec, INDEX_op_smin_vec, 0 -}; -static const GVecGen2i sqshrnb_ops[3] = { - { .fniv = gen_sqshrnb_vec, - .opt_opc = sqshrnb_vec_list, - .fno = gen_helper_sve2_sqshrnb_h, - .vece = MO_16 }, - { .fniv = gen_sqshrnb_vec, - .opt_opc = sqshrnb_vec_list, - .fno = 
gen_helper_sve2_sqshrnb_s, - .vece = MO_32 }, - { .fniv = gen_sqshrnb_vec, - .opt_opc = sqshrnb_vec_list, - .fno = gen_helper_sve2_sqshrnb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops) - -static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d, - TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - int64_t max = MAKE_64BIT_MASK(0, halfbits - 1); - int64_t min = -max - 1; - - tcg_gen_sari_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, min); - tcg_gen_smax_vec(vece, n, n, t); - tcg_gen_dupi_vec(vece, t, max); - tcg_gen_smin_vec(vece, n, n, t); - tcg_gen_shli_vec(vece, n, n, halfbits); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const TCGOpcode sqshrnt_vec_list[] = { - INDEX_op_shli_vec, INDEX_op_sari_vec, - INDEX_op_smax_vec, INDEX_op_smin_vec, 0 -}; -static const GVecGen2i sqshrnt_ops[3] = { - { .fniv = gen_sqshrnt_vec, - .opt_opc = sqshrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_sqshrnt_h, - .vece = MO_16 }, - { .fniv = gen_sqshrnt_vec, - .opt_opc = sqshrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_sqshrnt_s, - .vece = MO_32 }, - { .fniv = gen_sqshrnt_vec, - .opt_opc = sqshrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_sqshrnt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(SQSHRNT, aa64_sve2, do_shr_narrow, a, sqshrnt_ops) - -static const GVecGen2i sqrshrnb_ops[3] = { - { .fno = gen_helper_sve2_sqrshrnb_h }, - { .fno = gen_helper_sve2_sqrshrnb_s }, - { .fno = gen_helper_sve2_sqrshrnb_d }, -}; -TRANS_FEAT(SQRSHRNB, aa64_sve2, do_shr_narrow, a, sqrshrnb_ops) - -static const GVecGen2i sqrshrnt_ops[3] = { - { .fno = gen_helper_sve2_sqrshrnt_h }, - { .fno = gen_helper_sve2_sqrshrnt_s }, - { .fno = gen_helper_sve2_sqrshrnt_d }, -}; -TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops) - -static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d, - TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - - tcg_gen_shri_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_umin_vec(vece, d, n, t); - tcg_temp_free_vec(t); -} - -static const TCGOpcode uqshrnb_vec_list[] = { - INDEX_op_shri_vec, INDEX_op_umin_vec, 0 -}; -static const GVecGen2i uqshrnb_ops[3] = { - { .fniv = gen_uqshrnb_vec, - .opt_opc = uqshrnb_vec_list, - .fno = gen_helper_sve2_uqshrnb_h, - .vece = MO_16 }, - { .fniv = gen_uqshrnb_vec, - .opt_opc = uqshrnb_vec_list, - .fno = gen_helper_sve2_uqshrnb_s, - .vece = MO_32 }, - { .fniv = gen_uqshrnb_vec, - .opt_opc = uqshrnb_vec_list, - .fno = gen_helper_sve2_uqshrnb_d, - .vece = MO_64 }, -}; -TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops) - -static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d, - TCGv_vec n, int64_t shr) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - int halfbits = 4 << vece; - - tcg_gen_shri_vec(vece, n, n, shr); - tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits)); - tcg_gen_umin_vec(vece, n, n, t); - tcg_gen_shli_vec(vece, n, n, halfbits); - tcg_gen_bitsel_vec(vece, d, t, d, n); - tcg_temp_free_vec(t); -} - -static const TCGOpcode uqshrnt_vec_list[] = { - INDEX_op_shli_vec, INDEX_op_shri_vec, INDEX_op_umin_vec, 0 -}; -static const GVecGen2i uqshrnt_ops[3] = { - { .fniv = gen_uqshrnt_vec, - .opt_opc = uqshrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_uqshrnt_h, - .vece = MO_16 }, - { .fniv = gen_uqshrnt_vec, - .opt_opc = 
uqshrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_uqshrnt_s, - .vece = MO_32 }, - { .fniv = gen_uqshrnt_vec, - .opt_opc = uqshrnt_vec_list, - .load_dest = true, - .fno = gen_helper_sve2_uqshrnt_d, - .vece = MO_64 }, -}; -TRANS_FEAT(UQSHRNT, aa64_sve2, do_shr_narrow, a, uqshrnt_ops) - -static const GVecGen2i uqrshrnb_ops[3] = { - { .fno = gen_helper_sve2_uqrshrnb_h }, - { .fno = gen_helper_sve2_uqrshrnb_s }, - { .fno = gen_helper_sve2_uqrshrnb_d }, -}; -TRANS_FEAT(UQRSHRNB, aa64_sve2, do_shr_narrow, a, uqrshrnb_ops) - -static const GVecGen2i uqrshrnt_ops[3] = { - { .fno = gen_helper_sve2_uqrshrnt_h }, - { .fno = gen_helper_sve2_uqrshrnt_s }, - { .fno = gen_helper_sve2_uqrshrnt_d }, -}; -TRANS_FEAT(UQRSHRNT, aa64_sve2, do_shr_narrow, a, uqrshrnt_ops) - -#define DO_SVE2_ZZZ_NARROW(NAME, name) \ - static gen_helper_gvec_3 * const name##_fns[4] = { \ - NULL, gen_helper_sve2_##name##_h, \ - gen_helper_sve2_##name##_s, gen_helper_sve2_##name##_d, \ - }; \ - TRANS_FEAT(NAME, aa64_sve2, gen_gvec_ool_arg_zzz, \ - name##_fns[a->esz], a, 0) - -DO_SVE2_ZZZ_NARROW(ADDHNB, addhnb) -DO_SVE2_ZZZ_NARROW(ADDHNT, addhnt) -DO_SVE2_ZZZ_NARROW(RADDHNB, raddhnb) -DO_SVE2_ZZZ_NARROW(RADDHNT, raddhnt) - -DO_SVE2_ZZZ_NARROW(SUBHNB, subhnb) -DO_SVE2_ZZZ_NARROW(SUBHNT, subhnt) -DO_SVE2_ZZZ_NARROW(RSUBHNB, rsubhnb) -DO_SVE2_ZZZ_NARROW(RSUBHNT, rsubhnt) - -static gen_helper_gvec_flags_4 * const match_fns[4] = { - gen_helper_sve2_match_ppzz_b, gen_helper_sve2_match_ppzz_h, NULL, NULL -}; -TRANS_FEAT_NONSTREAMING(MATCH, aa64_sve2, do_ppzz_flags, a, match_fns[a->esz]) - -static gen_helper_gvec_flags_4 * const nmatch_fns[4] = { - gen_helper_sve2_nmatch_ppzz_b, gen_helper_sve2_nmatch_ppzz_h, NULL, NULL -}; -TRANS_FEAT_NONSTREAMING(NMATCH, aa64_sve2, do_ppzz_flags, a, nmatch_fns[a->esz]) - -static gen_helper_gvec_4 * const histcnt_fns[4] = { - NULL, NULL, gen_helper_sve2_histcnt_s, gen_helper_sve2_histcnt_d -}; -TRANS_FEAT_NONSTREAMING(HISTCNT, aa64_sve2, gen_gvec_ool_arg_zpzz, - histcnt_fns[a->esz], a, 0) - -TRANS_FEAT_NONSTREAMING(HISTSEG, aa64_sve2, gen_gvec_ool_arg_zzz, - a->esz == 0 ? 
gen_helper_sve2_histseg : NULL, a, 0) - -DO_ZPZZ_FP(FADDP, aa64_sve2, sve2_faddp_zpzz) -DO_ZPZZ_FP(FMAXNMP, aa64_sve2, sve2_fmaxnmp_zpzz) -DO_ZPZZ_FP(FMINNMP, aa64_sve2, sve2_fminnmp_zpzz) -DO_ZPZZ_FP(FMAXP, aa64_sve2, sve2_fmaxp_zpzz) -DO_ZPZZ_FP(FMINP, aa64_sve2, sve2_fminp_zpzz) - -/* - * SVE Integer Multiply-Add (unpredicated) - */ - -TRANS_FEAT_NONSTREAMING(FMMLA_s, aa64_sve_f32mm, gen_gvec_fpst_zzzz, - gen_helper_fmmla_s, a->rd, a->rn, a->rm, a->ra, - 0, FPST_FPCR) -TRANS_FEAT_NONSTREAMING(FMMLA_d, aa64_sve_f64mm, gen_gvec_fpst_zzzz, - gen_helper_fmmla_d, a->rd, a->rn, a->rm, a->ra, - 0, FPST_FPCR) - -static gen_helper_gvec_4 * const sqdmlal_zzzw_fns[] = { - NULL, gen_helper_sve2_sqdmlal_zzzw_h, - gen_helper_sve2_sqdmlal_zzzw_s, gen_helper_sve2_sqdmlal_zzzw_d, -}; -TRANS_FEAT(SQDMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqdmlal_zzzw_fns[a->esz], a, 0) -TRANS_FEAT(SQDMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqdmlal_zzzw_fns[a->esz], a, 3) -TRANS_FEAT(SQDMLALBT, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqdmlal_zzzw_fns[a->esz], a, 2) - -static gen_helper_gvec_4 * const sqdmlsl_zzzw_fns[] = { - NULL, gen_helper_sve2_sqdmlsl_zzzw_h, - gen_helper_sve2_sqdmlsl_zzzw_s, gen_helper_sve2_sqdmlsl_zzzw_d, -}; -TRANS_FEAT(SQDMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqdmlsl_zzzw_fns[a->esz], a, 0) -TRANS_FEAT(SQDMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqdmlsl_zzzw_fns[a->esz], a, 3) -TRANS_FEAT(SQDMLSLBT, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqdmlsl_zzzw_fns[a->esz], a, 2) - -static gen_helper_gvec_4 * const sqrdmlah_fns[] = { - gen_helper_sve2_sqrdmlah_b, gen_helper_sve2_sqrdmlah_h, - gen_helper_sve2_sqrdmlah_s, gen_helper_sve2_sqrdmlah_d, -}; -TRANS_FEAT(SQRDMLAH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqrdmlah_fns[a->esz], a, 0) - -static gen_helper_gvec_4 * const sqrdmlsh_fns[] = { - gen_helper_sve2_sqrdmlsh_b, gen_helper_sve2_sqrdmlsh_h, - gen_helper_sve2_sqrdmlsh_s, gen_helper_sve2_sqrdmlsh_d, -}; -TRANS_FEAT(SQRDMLSH_zzzz, aa64_sve2, gen_gvec_ool_arg_zzzz, - sqrdmlsh_fns[a->esz], a, 0) - -static gen_helper_gvec_4 * const smlal_zzzw_fns[] = { - NULL, gen_helper_sve2_smlal_zzzw_h, - gen_helper_sve2_smlal_zzzw_s, gen_helper_sve2_smlal_zzzw_d, -}; -TRANS_FEAT(SMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - smlal_zzzw_fns[a->esz], a, 0) -TRANS_FEAT(SMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - smlal_zzzw_fns[a->esz], a, 1) - -static gen_helper_gvec_4 * const umlal_zzzw_fns[] = { - NULL, gen_helper_sve2_umlal_zzzw_h, - gen_helper_sve2_umlal_zzzw_s, gen_helper_sve2_umlal_zzzw_d, -}; -TRANS_FEAT(UMLALB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - umlal_zzzw_fns[a->esz], a, 0) -TRANS_FEAT(UMLALT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - umlal_zzzw_fns[a->esz], a, 1) - -static gen_helper_gvec_4 * const smlsl_zzzw_fns[] = { - NULL, gen_helper_sve2_smlsl_zzzw_h, - gen_helper_sve2_smlsl_zzzw_s, gen_helper_sve2_smlsl_zzzw_d, -}; -TRANS_FEAT(SMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - smlsl_zzzw_fns[a->esz], a, 0) -TRANS_FEAT(SMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - smlsl_zzzw_fns[a->esz], a, 1) - -static gen_helper_gvec_4 * const umlsl_zzzw_fns[] = { - NULL, gen_helper_sve2_umlsl_zzzw_h, - gen_helper_sve2_umlsl_zzzw_s, gen_helper_sve2_umlsl_zzzw_d, -}; -TRANS_FEAT(UMLSLB_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - umlsl_zzzw_fns[a->esz], a, 0) -TRANS_FEAT(UMLSLT_zzzw, aa64_sve2, gen_gvec_ool_arg_zzzz, - umlsl_zzzw_fns[a->esz], a, 1) - -static gen_helper_gvec_4 * const cmla_fns[] = { - gen_helper_sve2_cmla_zzzz_b, gen_helper_sve2_cmla_zzzz_h, - 
gen_helper_sve2_cmla_zzzz_s, gen_helper_sve2_cmla_zzzz_d, -}; -TRANS_FEAT(CMLA_zzzz, aa64_sve2, gen_gvec_ool_zzzz, - cmla_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) - -static gen_helper_gvec_4 * const cdot_fns[] = { - NULL, NULL, gen_helper_sve2_cdot_zzzz_s, gen_helper_sve2_cdot_zzzz_d -}; -TRANS_FEAT(CDOT_zzzz, aa64_sve2, gen_gvec_ool_zzzz, - cdot_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) - -static gen_helper_gvec_4 * const sqrdcmlah_fns[] = { - gen_helper_sve2_sqrdcmlah_zzzz_b, gen_helper_sve2_sqrdcmlah_zzzz_h, - gen_helper_sve2_sqrdcmlah_zzzz_s, gen_helper_sve2_sqrdcmlah_zzzz_d, -}; -TRANS_FEAT(SQRDCMLAH_zzzz, aa64_sve2, gen_gvec_ool_zzzz, - sqrdcmlah_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->rot) - -TRANS_FEAT(USDOT_zzzz, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - a->esz == 2 ? gen_helper_gvec_usdot_b : NULL, a, 0) - -TRANS_FEAT_NONSTREAMING(AESMC, aa64_sve2_aes, gen_gvec_ool_zz, - gen_helper_crypto_aesmc, a->rd, a->rd, a->decrypt) - -TRANS_FEAT_NONSTREAMING(AESE, aa64_sve2_aes, gen_gvec_ool_arg_zzz, - gen_helper_crypto_aese, a, false) -TRANS_FEAT_NONSTREAMING(AESD, aa64_sve2_aes, gen_gvec_ool_arg_zzz, - gen_helper_crypto_aese, a, true) - -TRANS_FEAT_NONSTREAMING(SM4E, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, - gen_helper_crypto_sm4e, a, 0) -TRANS_FEAT_NONSTREAMING(SM4EKEY, aa64_sve2_sm4, gen_gvec_ool_arg_zzz, - gen_helper_crypto_sm4ekey, a, 0) - -TRANS_FEAT_NONSTREAMING(RAX1, aa64_sve2_sha3, gen_gvec_fn_arg_zzz, - gen_gvec_rax1, a) - -TRANS_FEAT(FCVTNT_sh, aa64_sve2, gen_gvec_fpst_arg_zpz, - gen_helper_sve2_fcvtnt_sh, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTNT_ds, aa64_sve2, gen_gvec_fpst_arg_zpz, - gen_helper_sve2_fcvtnt_ds, a, 0, FPST_FPCR) - -TRANS_FEAT(BFCVTNT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, - gen_helper_sve_bfcvtnt, a, 0, FPST_FPCR) - -TRANS_FEAT(FCVTLT_hs, aa64_sve2, gen_gvec_fpst_arg_zpz, - gen_helper_sve2_fcvtlt_hs, a, 0, FPST_FPCR) -TRANS_FEAT(FCVTLT_sd, aa64_sve2, gen_gvec_fpst_arg_zpz, - gen_helper_sve2_fcvtlt_sd, a, 0, FPST_FPCR) - -TRANS_FEAT(FCVTX_ds, aa64_sve2, do_frint_mode, a, - float_round_to_odd, gen_helper_sve_fcvt_ds) -TRANS_FEAT(FCVTXNT_ds, aa64_sve2, do_frint_mode, a, - float_round_to_odd, gen_helper_sve2_fcvtnt_ds) - -static gen_helper_gvec_3_ptr * const flogb_fns[] = { - NULL, gen_helper_flogb_h, - gen_helper_flogb_s, gen_helper_flogb_d -}; -TRANS_FEAT(FLOGB, aa64_sve2, gen_gvec_fpst_arg_zpz, flogb_fns[a->esz], - a, 0, a->esz == MO_16 ? 
FPST_FPCR_F16 : FPST_FPCR) - -static bool do_FMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sub, bool sel) -{ - return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzzw_s, - a->rd, a->rn, a->rm, a->ra, - (sel << 1) | sub, cpu_env); -} - -TRANS_FEAT(FMLALB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, false) -TRANS_FEAT(FMLALT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, false, true) -TRANS_FEAT(FMLSLB_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, false) -TRANS_FEAT(FMLSLT_zzzw, aa64_sve2, do_FMLAL_zzzw, a, true, true) - -static bool do_FMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sub, bool sel) -{ - return gen_gvec_ptr_zzzz(s, gen_helper_sve2_fmlal_zzxw_s, - a->rd, a->rn, a->rm, a->ra, - (a->index << 2) | (sel << 1) | sub, cpu_env); -} - -TRANS_FEAT(FMLALB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, false) -TRANS_FEAT(FMLALT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, false, true) -TRANS_FEAT(FMLSLB_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, false) -TRANS_FEAT(FMLSLT_zzxw, aa64_sve2, do_FMLAL_zzxw, a, true, true) - -TRANS_FEAT_NONSTREAMING(SMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_smmla_b, a, 0) -TRANS_FEAT_NONSTREAMING(USMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_usmmla_b, a, 0) -TRANS_FEAT_NONSTREAMING(UMMLA, aa64_sve_i8mm, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_ummla_b, a, 0) - -TRANS_FEAT(BFDOT_zzzz, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_bfdot, a, 0) -TRANS_FEAT(BFDOT_zzxz, aa64_sve_bf16, gen_gvec_ool_arg_zzxz, - gen_helper_gvec_bfdot_idx, a) - -TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_ool_arg_zzzz, - gen_helper_gvec_bfmmla, a, 0) - -static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) -{ - return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal, - a->rd, a->rn, a->rm, a->ra, sel, FPST_FPCR); -} - -TRANS_FEAT(BFMLALB_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, false) -TRANS_FEAT(BFMLALT_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, true) - -static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) -{ - return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal_idx, - a->rd, a->rn, a->rm, a->ra, - (a->index << 1) | sel, FPST_FPCR); -} - -TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false) -TRANS_FEAT(BFMLALT_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, true) - -static bool trans_PSEL(DisasContext *s, arg_psel *a) -{ - int vl = vec_full_reg_size(s); - int pl = pred_gvec_reg_size(s); - int elements = vl >> a->esz; - TCGv_i64 tmp, didx, dbit; - TCGv_ptr ptr; - - if (!dc_isar_feature(aa64_sme, s)) { - return false; - } - if (!sve_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i64(); - dbit = tcg_temp_new_i64(); - didx = tcg_temp_new_i64(); - ptr = tcg_temp_new_ptr(); - - /* Compute the predicate element. */ - tcg_gen_addi_i64(tmp, cpu_reg(s, a->rv), a->imm); - if (is_power_of_2(elements)) { - tcg_gen_andi_i64(tmp, tmp, elements - 1); - } else { - tcg_gen_remu_i64(tmp, tmp, tcg_constant_i64(elements)); - } - - /* Extract the predicate byte and bit indices. */ - tcg_gen_shli_i64(tmp, tmp, a->esz); - tcg_gen_andi_i64(dbit, tmp, 7); - tcg_gen_shri_i64(didx, tmp, 3); - if (HOST_BIG_ENDIAN) { - tcg_gen_xori_i64(didx, didx, 7); - } - - /* Load the predicate word. */ - tcg_gen_trunc_i64_ptr(ptr, didx); - tcg_gen_add_ptr(ptr, ptr, cpu_env); - tcg_gen_ld8u_i64(tmp, ptr, pred_full_reg_offset(s, a->pm)); - - /* Extract the predicate bit and replicate to MO_64. */ - tcg_gen_shr_i64(tmp, tmp, dbit); - tcg_gen_andi_i64(tmp, tmp, 1); - tcg_gen_neg_i64(tmp, tmp); - - /* Apply to either copy the source, or write zeros. 
*/ - tcg_gen_gvec_ands(MO_64, pred_full_reg_offset(s, a->pd), - pred_full_reg_offset(s, a->pn), tmp, pl, pl); - - tcg_temp_free_i64(tmp); - tcg_temp_free_i64(dbit); - tcg_temp_free_i64(didx); - tcg_temp_free_ptr(ptr); - return true; -} - -static void gen_sclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) -{ - tcg_gen_smax_i32(d, a, n); - tcg_gen_smin_i32(d, d, m); -} - -static void gen_sclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) -{ - tcg_gen_smax_i64(d, a, n); - tcg_gen_smin_i64(d, d, m); -} - -static void gen_sclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec a) -{ - tcg_gen_smax_vec(vece, d, a, n); - tcg_gen_smin_vec(vece, d, d, m); -} - -static void gen_sclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const TCGOpcode vecop[] = { - INDEX_op_smin_vec, INDEX_op_smax_vec, 0 - }; - static const GVecGen4 ops[4] = { - { .fniv = gen_sclamp_vec, - .fno = gen_helper_gvec_sclamp_b, - .opt_opc = vecop, - .vece = MO_8 }, - { .fniv = gen_sclamp_vec, - .fno = gen_helper_gvec_sclamp_h, - .opt_opc = vecop, - .vece = MO_16 }, - { .fni4 = gen_sclamp_i32, - .fniv = gen_sclamp_vec, - .fno = gen_helper_gvec_sclamp_s, - .opt_opc = vecop, - .vece = MO_32 }, - { .fni8 = gen_sclamp_i64, - .fniv = gen_sclamp_vec, - .fno = gen_helper_gvec_sclamp_d, - .opt_opc = vecop, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64 } - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); -} - -TRANS_FEAT(SCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_sclamp, a) - -static void gen_uclamp_i32(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_i32 a) -{ - tcg_gen_umax_i32(d, a, n); - tcg_gen_umin_i32(d, d, m); -} - -static void gen_uclamp_i64(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_i64 a) -{ - tcg_gen_umax_i64(d, a, n); - tcg_gen_umin_i64(d, d, m); -} - -static void gen_uclamp_vec(unsigned vece, TCGv_vec d, TCGv_vec n, - TCGv_vec m, TCGv_vec a) -{ - tcg_gen_umax_vec(vece, d, a, n); - tcg_gen_umin_vec(vece, d, d, m); -} - -static void gen_uclamp(unsigned vece, uint32_t d, uint32_t n, uint32_t m, - uint32_t a, uint32_t oprsz, uint32_t maxsz) -{ - static const TCGOpcode vecop[] = { - INDEX_op_umin_vec, INDEX_op_umax_vec, 0 - }; - static const GVecGen4 ops[4] = { - { .fniv = gen_uclamp_vec, - .fno = gen_helper_gvec_uclamp_b, - .opt_opc = vecop, - .vece = MO_8 }, - { .fniv = gen_uclamp_vec, - .fno = gen_helper_gvec_uclamp_h, - .opt_opc = vecop, - .vece = MO_16 }, - { .fni4 = gen_uclamp_i32, - .fniv = gen_uclamp_vec, - .fno = gen_helper_gvec_uclamp_s, - .opt_opc = vecop, - .vece = MO_32 }, - { .fni8 = gen_uclamp_i64, - .fniv = gen_uclamp_vec, - .fno = gen_helper_gvec_uclamp_d, - .opt_opc = vecop, - .vece = MO_64, - .prefer_i64 = TCG_TARGET_REG_BITS == 64 } - }; - tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &ops[vece]); -} - -TRANS_FEAT(UCLAMP, aa64_sme, gen_gvec_fn_arg_zzzz, gen_uclamp, a) diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c deleted file mode 100644 index 5c5d58d..0000000 --- a/target/arm/translate-vfp.c +++ /dev/null @@ -1,3619 +0,0 @@ -/* - * ARM translation: AArch32 VFP instructions - * - * Copyright (c) 2003 Fabrice Bellard - * Copyright (c) 2005-2007 CodeSourcery - * Copyright (c) 2007 OpenedHand, Ltd. - * Copyright (c) 2019 Linaro, Ltd. 
- * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "exec/exec-all.h" -#include "exec/gen-icount.h" -#include "translate.h" -#include "translate-a32.h" - -/* Include the generated VFP decoder */ -#include "decode-vfp.c.inc" -#include "decode-vfp-uncond.c.inc" - -static inline void vfp_load_reg64(TCGv_i64 var, int reg) -{ - tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg)); -} - -static inline void vfp_store_reg64(TCGv_i64 var, int reg) -{ - tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg)); -} - -static inline void vfp_load_reg32(TCGv_i32 var, int reg) -{ - tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg)); -} - -static inline void vfp_store_reg32(TCGv_i32 var, int reg) -{ - tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg)); -} - -/* - * The imm8 encodes the sign bit, enough bits to represent an exponent in - * the range 01....1xx to 10....0xx, and the most significant 4 bits of - * the mantissa; see VFPExpandImm() in the v8 ARM ARM. - */ -uint64_t vfp_expand_imm(int size, uint8_t imm8) -{ - uint64_t imm; - - switch (size) { - case MO_64: - imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | - (extract32(imm8, 6, 1) ? 0x3fc0 : 0x4000) | - extract32(imm8, 0, 6); - imm <<= 48; - break; - case MO_32: - imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | - (extract32(imm8, 6, 1) ? 0x3e00 : 0x4000) | - (extract32(imm8, 0, 6) << 3); - imm <<= 16; - break; - case MO_16: - imm = (extract32(imm8, 7, 1) ? 0x8000 : 0) | - (extract32(imm8, 6, 1) ? 0x3000 : 0x4000) | - (extract32(imm8, 0, 6) << 6); - break; - default: - g_assert_not_reached(); - } - return imm; -} - -/* - * Return the offset of a 16-bit half of the specified VFP single-precision - * register. If top is true, returns the top 16 bits; otherwise the bottom - * 16 bits. - */ -static inline long vfp_f16_offset(unsigned reg, bool top) -{ - long offs = vfp_reg_offset(false, reg); -#if HOST_BIG_ENDIAN - if (!top) { - offs += 2; - } -#else - if (top) { - offs += 2; - } -#endif - return offs; -} - -/* - * Generate code for M-profile lazy FP state preservation if needed; - * this corresponds to the pseudocode PreserveFPState() function. - */ -static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update) -{ - if (s->v7m_lspact) { - /* - * Lazy state saving affects external memory and also the NVIC, - * so we must mark it as an IO operation for icount (and cause - * this to be the last insn in the TB). - */ - if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { - s->base.is_jmp = DISAS_UPDATE_EXIT; - gen_io_start(); - } - gen_helper_v7m_preserve_fp_state(cpu_env); - /* - * If the preserve_fp_state helper doesn't throw an exception - * then it will clear LSPACT; we don't need to repeat this for - * any further FP insns in this TB. 
- */ - s->v7m_lspact = false; - /* - * The helper might have zeroed VPR, so we do not know the - * correct value for the MVE_NO_PRED TB flag any more. - * If we're about to create a new fp context then that - * will precisely determine the MVE_NO_PRED value (see - * gen_update_fp_context()). Otherwise, we must: - * - set s->mve_no_pred to false, so this instruction - * is generated to use helper functions - * - end the TB now, without chaining to the next TB - */ - if (skip_context_update || !s->v7m_new_fp_ctxt_needed) { - s->mve_no_pred = false; - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - } - } -} - -/* - * Generate code for M-profile FP context handling: update the - * ownership of the FP context, and create a new context if - * necessary. This corresponds to the parts of the pseudocode - * ExecuteFPCheck() after the inital PreserveFPState() call. - */ -static void gen_update_fp_context(DisasContext *s) -{ - /* Update ownership of FP context: set FPCCR.S to match current state */ - if (s->v8m_fpccr_s_wrong) { - TCGv_i32 tmp; - - tmp = load_cpu_field(v7m.fpccr[M_REG_S]); - if (s->v8m_secure) { - tcg_gen_ori_i32(tmp, tmp, R_V7M_FPCCR_S_MASK); - } else { - tcg_gen_andi_i32(tmp, tmp, ~R_V7M_FPCCR_S_MASK); - } - store_cpu_field(tmp, v7m.fpccr[M_REG_S]); - /* Don't need to do this for any further FP insns in this TB */ - s->v8m_fpccr_s_wrong = false; - } - - if (s->v7m_new_fp_ctxt_needed) { - /* - * Create new FP context by updating CONTROL.FPCA, CONTROL.SFPA, - * the FPSCR, and VPR. - */ - TCGv_i32 control, fpscr; - uint32_t bits = R_V7M_CONTROL_FPCA_MASK; - - fpscr = load_cpu_field(v7m.fpdscr[s->v8m_secure]); - gen_helper_vfp_set_fpscr(cpu_env, fpscr); - tcg_temp_free_i32(fpscr); - if (dc_isar_feature(aa32_mve, s)) { - store_cpu_field(tcg_constant_i32(0), v7m.vpr); - } - /* - * We just updated the FPSCR and VPR. Some of this state is cached - * in the MVE_NO_PRED TB flag. We want to avoid having to end the - * TB here, which means we need the new value of the MVE_NO_PRED - * flag to be exactly known here and the same for all executions. - * Luckily FPDSCR.LTPSIZE is always constant 4 and the VPR is - * always set to 0, so the new MVE_NO_PRED flag is always 1 - * if and only if we have MVE. - * - * (The other FPSCR state cached in TB flags is VECLEN and VECSTRIDE, - * but those do not exist for M-profile, so are not relevant here.) - */ - s->mve_no_pred = dc_isar_feature(aa32_mve, s); - - if (s->v8m_secure) { - bits |= R_V7M_CONTROL_SFPA_MASK; - } - control = load_cpu_field(v7m.control[M_REG_S]); - tcg_gen_ori_i32(control, control, bits); - store_cpu_field(control, v7m.control[M_REG_S]); - /* Don't need to do this for any further FP insns in this TB */ - s->v7m_new_fp_ctxt_needed = false; - } -} - -/* - * Check that VFP access is enabled, A-profile specific version. - * - * If VFP is enabled, return true. If not, emit code to generate an - * appropriate exception and return false. - * The ignore_vfp_enabled argument specifies that we should ignore - * whether VFP is enabled via FPEXC.EN: this should be true for FMXR/FMRX - * accesses to FPSID, FPEXC, MVFR0, MVFR1, MVFR2, and false for all other insns. - */ -static bool vfp_access_check_a(DisasContext *s, bool ignore_vfp_enabled) -{ - if (s->fp_excp_el) { - /* - * The full syndrome is only used for HSR when HCPTR traps: - * For v8, when TA==0, coproc is RES0. 
- * For v7, any use of a Floating-point instruction or access - * to a Floating-point Extension register that is trapped to - * Hyp mode because of a trap configured in the HCPTR sets - * this field to 0xA. - */ - int coproc = arm_dc_feature(s, ARM_FEATURE_V8) ? 0 : 0xa; - uint32_t syn = syn_fp_access_trap(1, 0xe, false, coproc); - - gen_exception_insn_el(s, 0, EXCP_UDEF, syn, s->fp_excp_el); - return false; - } - - /* - * Note that rebuild_hflags_a32 has already accounted for being in EL0 - * and the higher EL in A64 mode, etc. Unlike A64 mode, there do not - * appear to be any insns which touch VFP which are allowed. - */ - if (s->sme_trap_nonstreaming) { - gen_exception_insn(s, 0, EXCP_UDEF, - syn_smetrap(SME_ET_Streaming, - curr_insn_len(s) == 2)); - return false; - } - - if (!s->vfp_enabled && !ignore_vfp_enabled) { - assert(!arm_dc_feature(s, ARM_FEATURE_M)); - unallocated_encoding(s); - return false; - } - return true; -} - -/* - * Check that VFP access is enabled, M-profile specific version. - * - * If VFP is enabled, do the necessary M-profile lazy-FP handling and then - * return true. If not, emit code to generate an appropriate exception and - * return false. - * skip_context_update is true to skip the "update FP context" part of this. - */ -bool vfp_access_check_m(DisasContext *s, bool skip_context_update) -{ - if (s->fp_excp_el) { - /* - * M-profile mostly catches the "FPU disabled" case early, in - * disas_m_nocp(), but a few insns (eg LCTP, WLSTP, DLSTP) - * which do coprocessor-checks are outside the large ranges of - * the encoding space handled by the patterns in m-nocp.decode, - * and for them we may need to raise NOCP here. - */ - gen_exception_insn_el(s, 0, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - return false; - } - - /* Handle M-profile lazy FP state mechanics */ - - /* Trigger lazy-state preservation if necessary */ - gen_preserve_fp_state(s, skip_context_update); - - if (!skip_context_update) { - /* Update ownership of FP context and create new FP context if needed */ - gen_update_fp_context(s); - } - - return true; -} - -/* - * The most usual kind of VFP access check, for everything except - * FMXR/FMRX to the always-available special registers. 
- */ -bool vfp_access_check(DisasContext *s) -{ - if (arm_dc_feature(s, ARM_FEATURE_M)) { - return vfp_access_check_m(s, false); - } else { - return vfp_access_check_a(s, false); - } -} - -static bool trans_VSEL(DisasContext *s, arg_VSEL *a) -{ - uint32_t rd, rn, rm; - int sz = a->sz; - - if (!dc_isar_feature(aa32_vsel, s)) { - return false; - } - - if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && - ((a->vm | a->vn | a->vd) & 0x10)) { - return false; - } - - rd = a->vd; - rn = a->vn; - rm = a->vm; - - if (!vfp_access_check(s)) { - return true; - } - - if (sz == 3) { - TCGv_i64 frn, frm, dest; - TCGv_i64 tmp, zero, zf, nf, vf; - - zero = tcg_constant_i64(0); - - frn = tcg_temp_new_i64(); - frm = tcg_temp_new_i64(); - dest = tcg_temp_new_i64(); - - zf = tcg_temp_new_i64(); - nf = tcg_temp_new_i64(); - vf = tcg_temp_new_i64(); - - tcg_gen_extu_i32_i64(zf, cpu_ZF); - tcg_gen_ext_i32_i64(nf, cpu_NF); - tcg_gen_ext_i32_i64(vf, cpu_VF); - - vfp_load_reg64(frn, rn); - vfp_load_reg64(frm, rm); - switch (a->cc) { - case 0: /* eq: Z */ - tcg_gen_movcond_i64(TCG_COND_EQ, dest, zf, zero, frn, frm); - break; - case 1: /* vs: V */ - tcg_gen_movcond_i64(TCG_COND_LT, dest, vf, zero, frn, frm); - break; - case 2: /* ge: N == V -> N ^ V == 0 */ - tmp = tcg_temp_new_i64(); - tcg_gen_xor_i64(tmp, vf, nf); - tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, frn, frm); - tcg_temp_free_i64(tmp); - break; - case 3: /* gt: !Z && N == V */ - tcg_gen_movcond_i64(TCG_COND_NE, dest, zf, zero, frn, frm); - tmp = tcg_temp_new_i64(); - tcg_gen_xor_i64(tmp, vf, nf); - tcg_gen_movcond_i64(TCG_COND_GE, dest, tmp, zero, dest, frm); - tcg_temp_free_i64(tmp); - break; - } - vfp_store_reg64(dest, rd); - tcg_temp_free_i64(frn); - tcg_temp_free_i64(frm); - tcg_temp_free_i64(dest); - - tcg_temp_free_i64(zf); - tcg_temp_free_i64(nf); - tcg_temp_free_i64(vf); - } else { - TCGv_i32 frn, frm, dest; - TCGv_i32 tmp, zero; - - zero = tcg_constant_i32(0); - - frn = tcg_temp_new_i32(); - frm = tcg_temp_new_i32(); - dest = tcg_temp_new_i32(); - vfp_load_reg32(frn, rn); - vfp_load_reg32(frm, rm); - switch (a->cc) { - case 0: /* eq: Z */ - tcg_gen_movcond_i32(TCG_COND_EQ, dest, cpu_ZF, zero, frn, frm); - break; - case 1: /* vs: V */ - tcg_gen_movcond_i32(TCG_COND_LT, dest, cpu_VF, zero, frn, frm); - break; - case 2: /* ge: N == V -> N ^ V == 0 */ - tmp = tcg_temp_new_i32(); - tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF); - tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, frn, frm); - tcg_temp_free_i32(tmp); - break; - case 3: /* gt: !Z && N == V */ - tcg_gen_movcond_i32(TCG_COND_NE, dest, cpu_ZF, zero, frn, frm); - tmp = tcg_temp_new_i32(); - tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF); - tcg_gen_movcond_i32(TCG_COND_GE, dest, tmp, zero, dest, frm); - tcg_temp_free_i32(tmp); - break; - } - /* For fp16 the top half is always zeroes */ - if (sz == 1) { - tcg_gen_andi_i32(dest, dest, 0xffff); - } - vfp_store_reg32(dest, rd); - tcg_temp_free_i32(frn); - tcg_temp_free_i32(frm); - tcg_temp_free_i32(dest); - } - - return true; -} - -/* - * Table for converting the most common AArch32 encoding of - * rounding mode to arm_fprounding order (which matches the - * common AArch64 order); see ARM ARM pseudocode FPDecodeRM(). 
- */
-static const uint8_t fp_decode_rm[] = {
-    FPROUNDING_TIEAWAY,
-    FPROUNDING_TIEEVEN,
-    FPROUNDING_POSINF,
-    FPROUNDING_NEGINF,
-};
-
-static bool trans_VRINT(DisasContext *s, arg_VRINT *a)
-{
-    uint32_t rd, rm;
-    int sz = a->sz;
-    TCGv_ptr fpst;
-    TCGv_i32 tcg_rmode;
-    int rounding = fp_decode_rm[a->rm];
-
-    if (!dc_isar_feature(aa32_vrint, s)) {
-        return false;
-    }
-
-    if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
-        return false;
-    }
-
-    if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
-        return false;
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist */
-    if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) &&
-        ((a->vm | a->vd) & 0x10)) {
-        return false;
-    }
-
-    rd = a->vd;
-    rm = a->vm;
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    if (sz == 1) {
-        fpst = fpstatus_ptr(FPST_FPCR_F16);
-    } else {
-        fpst = fpstatus_ptr(FPST_FPCR);
-    }
-
-    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
-
-    if (sz == 3) {
-        TCGv_i64 tcg_op;
-        TCGv_i64 tcg_res;
-        tcg_op = tcg_temp_new_i64();
-        tcg_res = tcg_temp_new_i64();
-        vfp_load_reg64(tcg_op, rm);
-        gen_helper_rintd(tcg_res, tcg_op, fpst);
-        vfp_store_reg64(tcg_res, rd);
-        tcg_temp_free_i64(tcg_op);
-        tcg_temp_free_i64(tcg_res);
-    } else {
-        TCGv_i32 tcg_op;
-        TCGv_i32 tcg_res;
-        tcg_op = tcg_temp_new_i32();
-        tcg_res = tcg_temp_new_i32();
-        vfp_load_reg32(tcg_op, rm);
-        if (sz == 1) {
-            gen_helper_rinth(tcg_res, tcg_op, fpst);
-        } else {
-            gen_helper_rints(tcg_res, tcg_op, fpst);
-        }
-        vfp_store_reg32(tcg_res, rd);
-        tcg_temp_free_i32(tcg_op);
-        tcg_temp_free_i32(tcg_res);
-    }
-
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
-    tcg_temp_free_i32(tcg_rmode);
-
-    tcg_temp_free_ptr(fpst);
-    return true;
-}
-
-static bool trans_VCVT(DisasContext *s, arg_VCVT *a)
-{
-    uint32_t rd, rm;
-    int sz = a->sz;
-    TCGv_ptr fpst;
-    TCGv_i32 tcg_rmode, tcg_shift;
-    int rounding = fp_decode_rm[a->rm];
-    bool is_signed = a->op;
-
-    if (!dc_isar_feature(aa32_vcvt_dr, s)) {
-        return false;
-    }
-
-    if (sz == 3 && !dc_isar_feature(aa32_fpdp_v2, s)) {
-        return false;
-    }
-
-    if (sz == 1 && !dc_isar_feature(aa32_fp16_arith, s)) {
-        return false;
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist */
-    if (sz == 3 && !dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) {
-        return false;
-    }
-
-    rd = a->vd;
-    rm = a->vm;
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    if (sz == 1) {
-        fpst = fpstatus_ptr(FPST_FPCR_F16);
-    } else {
-        fpst = fpstatus_ptr(FPST_FPCR);
-    }
-
-    tcg_shift = tcg_constant_i32(0);
-
-    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
-
-    if (sz == 3) {
-        TCGv_i64 tcg_double, tcg_res;
-        TCGv_i32 tcg_tmp;
-        tcg_double = tcg_temp_new_i64();
-        tcg_res = tcg_temp_new_i64();
-        tcg_tmp = tcg_temp_new_i32();
-        vfp_load_reg64(tcg_double, rm);
-        if (is_signed) {
-            gen_helper_vfp_tosld(tcg_res, tcg_double, tcg_shift, fpst);
-        } else {
-            gen_helper_vfp_tould(tcg_res, tcg_double, tcg_shift, fpst);
-        }
-        tcg_gen_extrl_i64_i32(tcg_tmp, tcg_res);
-        vfp_store_reg32(tcg_tmp, rd);
-        tcg_temp_free_i32(tcg_tmp);
-        tcg_temp_free_i64(tcg_res);
-        tcg_temp_free_i64(tcg_double);
-    } else {
-        TCGv_i32 tcg_single, tcg_res;
-        tcg_single = tcg_temp_new_i32();
-        tcg_res = tcg_temp_new_i32();
-        vfp_load_reg32(tcg_single, rm);
-        if (sz == 1) {
-            if (is_signed) {
-                gen_helper_vfp_toslh(tcg_res, tcg_single, tcg_shift, fpst);
-            } else {
-                gen_helper_vfp_toulh(tcg_res, tcg_single, tcg_shift, fpst);
-            }
-        } else {
-            if (is_signed) {
-                gen_helper_vfp_tosls(tcg_res, tcg_single, tcg_shift, fpst);
-            } else {
-                gen_helper_vfp_touls(tcg_res, tcg_single, tcg_shift, fpst);
-            }
-        }
-        vfp_store_reg32(tcg_res, rd);
-        tcg_temp_free_i32(tcg_res);
-        tcg_temp_free_i32(tcg_single);
-    }
-
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
-    tcg_temp_free_i32(tcg_rmode);
-
-    tcg_temp_free_ptr(fpst);
-
-    return true;
-}
-
-bool mve_skip_vmov(DisasContext *s, int vn, int index, int size)
-{
-    /*
-     * In a CPU with MVE, the VMOV (vector lane to general-purpose register)
-     * and VMOV (general-purpose register to vector lane) insns are not
-     * predicated, but they are subject to beatwise execution if they are
-     * not in an IT block.
-     *
-     * Since our implementation always executes all 4 beats in one tick,
-     * this means only that if PSR.ECI says we should not be executing
-     * the beat corresponding to the lane of the vector register being
-     * accessed then we should skip performing the move, and that we need
-     * to do the usual check for bad ECI state and advance of ECI state.
-     *
-     * Note that if PSR.ECI is non-zero then we cannot be in an IT block.
-     *
-     * Return true if this VMOV scalar <-> gpreg should be skipped because
-     * the MVE PSR.ECI state says we skip the beat where the store happens.
-     */
-
-    /* Calculate the byte offset into Qn which we're going to access */
-    int ofs = (index << size) + ((vn & 1) * 8);
-
-    if (!dc_isar_feature(aa32_mve, s)) {
-        return false;
-    }
-
-    switch (s->eci) {
-    case ECI_NONE:
-        return false;
-    case ECI_A0:
-        return ofs < 4;
-    case ECI_A0A1:
-        return ofs < 8;
-    case ECI_A0A1A2:
-    case ECI_A0A1A2B0:
-        return ofs < 12;
-    default:
-        g_assert_not_reached();
-    }
-}
-
-static bool trans_VMOV_to_gp(DisasContext *s, arg_VMOV_to_gp *a)
-{
-    /* VMOV scalar to general purpose register */
-    TCGv_i32 tmp;
-
-    /*
-     * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has
-     * all sizes, whether the CPU has fp or not.
-     */
-    if (!dc_isar_feature(aa32_mve, s)) {
-        if (a->size == MO_32
-            ? !dc_isar_feature(aa32_fpsp_v2, s)
-            : !arm_dc_feature(s, ARM_FEATURE_NEON)) {
-            return false;
-        }
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist */
-    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) {
-        return false;
-    }
-
-    if (dc_isar_feature(aa32_mve, s)) {
-        if (!mve_eci_check(s)) {
-            return true;
-        }
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    if (!mve_skip_vmov(s, a->vn, a->index, a->size)) {
-        tmp = tcg_temp_new_i32();
-        read_neon_element32(tmp, a->vn, a->index,
-                            a->size | (a->u ? 0 : MO_SIGN));
-        store_reg(s, a->rt, tmp);
-    }
-
-    if (dc_isar_feature(aa32_mve, s)) {
-        mve_update_and_store_eci(s);
-    }
-    return true;
-}
-
-static bool trans_VMOV_from_gp(DisasContext *s, arg_VMOV_from_gp *a)
-{
-    /* VMOV general purpose register to scalar */
-    TCGv_i32 tmp;
-
-    /*
-     * SIZE == MO_32 is a VFP instruction; otherwise NEON. MVE has
-     * all sizes, whether the CPU has fp or not.
-     */
-    if (!dc_isar_feature(aa32_mve, s)) {
-        if (a->size == MO_32
-            ? !dc_isar_feature(aa32_fpsp_v2, s)
-            : !arm_dc_feature(s, ARM_FEATURE_NEON)) {
-            return false;
-        }
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist */
-    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) {
-        return false;
-    }
-
-    if (dc_isar_feature(aa32_mve, s)) {
-        if (!mve_eci_check(s)) {
-            return true;
-        }
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    if (!mve_skip_vmov(s, a->vn, a->index, a->size)) {
-        tmp = load_reg(s, a->rt);
-        write_neon_element32(tmp, a->vn, a->index, a->size);
-        tcg_temp_free_i32(tmp);
-    }
-
-    if (dc_isar_feature(aa32_mve, s)) {
-        mve_update_and_store_eci(s);
-    }
-    return true;
-}
-
-static bool trans_VDUP(DisasContext *s, arg_VDUP *a)
-{
-    /* VDUP (general purpose register) */
-    TCGv_i32 tmp;
-    int size, vec_size;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
-        return false;
-    }
-
-    /* UNDEF accesses to D16-D31 if they don't exist */
-    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vn & 0x10)) {
-        return false;
-    }
-
-    if (a->b && a->e) {
-        return false;
-    }
-
-    if (a->q && (a->vn & 1)) {
-        return false;
-    }
-
-    vec_size = a->q ? 16 : 8;
-    if (a->b) {
-        size = 0;
-    } else if (a->e) {
-        size = 1;
-    } else {
-        size = 2;
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    tmp = load_reg(s, a->rt);
-    tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(a->vn),
-                         vec_size, vec_size, tmp);
-    tcg_temp_free_i32(tmp);
-
-    return true;
-}
-
-static bool trans_VMSR_VMRS(DisasContext *s, arg_VMSR_VMRS *a)
-{
-    TCGv_i32 tmp;
-    bool ignore_vfp_enabled = false;
-
-    if (arm_dc_feature(s, ARM_FEATURE_M)) {
-        /* M profile version was already handled in m-nocp.decode */
-        return false;
-    }
-
-    if (!dc_isar_feature(aa32_fpsp_v2, s)) {
-        return false;
-    }
-
-    switch (a->reg) {
-    case ARM_VFP_FPSID:
-        /*
-         * VFPv2 allows access to FPSID from userspace; VFPv3 restricts
-         * all ID registers to privileged access only.
-         */
-        if (IS_USER(s) && dc_isar_feature(aa32_fpsp_v3, s)) {
-            return false;
-        }
-        ignore_vfp_enabled = true;
-        break;
-    case ARM_VFP_MVFR0:
-    case ARM_VFP_MVFR1:
-        if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_MVFR)) {
-            return false;
-        }
-        ignore_vfp_enabled = true;
-        break;
-    case ARM_VFP_MVFR2:
-        if (IS_USER(s) || !arm_dc_feature(s, ARM_FEATURE_V8)) {
-            return false;
-        }
-        ignore_vfp_enabled = true;
-        break;
-    case ARM_VFP_FPSCR:
-        break;
-    case ARM_VFP_FPEXC:
-        if (IS_USER(s)) {
-            return false;
-        }
-        ignore_vfp_enabled = true;
-        break;
-    case ARM_VFP_FPINST:
-    case ARM_VFP_FPINST2:
-        /* Not present in VFPv3 */
-        if (IS_USER(s) || dc_isar_feature(aa32_fpsp_v3, s)) {
-            return false;
-        }
-        break;
-    default:
-        return false;
-    }
-
-    /*
-     * Call vfp_access_check_a() directly, because we need to tell
-     * it to ignore FPEXC.EN for some register accesses.
- */ - if (!vfp_access_check_a(s, ignore_vfp_enabled)) { - return true; - } - - if (a->l) { - /* VMRS, move VFP special register to gp register */ - switch (a->reg) { - case ARM_VFP_MVFR0: - case ARM_VFP_MVFR1: - case ARM_VFP_MVFR2: - case ARM_VFP_FPSID: - if (s->current_el == 1) { - gen_set_condexec(s); - gen_update_pc(s, 0); - gen_helper_check_hcr_el2_trap(cpu_env, - tcg_constant_i32(a->rt), - tcg_constant_i32(a->reg)); - } - /* fall through */ - case ARM_VFP_FPEXC: - case ARM_VFP_FPINST: - case ARM_VFP_FPINST2: - tmp = load_cpu_field(vfp.xregs[a->reg]); - break; - case ARM_VFP_FPSCR: - if (a->rt == 15) { - tmp = load_cpu_field(vfp.xregs[ARM_VFP_FPSCR]); - tcg_gen_andi_i32(tmp, tmp, FPCR_NZCV_MASK); - } else { - tmp = tcg_temp_new_i32(); - gen_helper_vfp_get_fpscr(tmp, cpu_env); - } - break; - default: - g_assert_not_reached(); - } - - if (a->rt == 15) { - /* Set the 4 flag bits in the CPSR. */ - gen_set_nzcv(tmp); - tcg_temp_free_i32(tmp); - } else { - store_reg(s, a->rt, tmp); - } - } else { - /* VMSR, move gp register to VFP special register */ - switch (a->reg) { - case ARM_VFP_FPSID: - case ARM_VFP_MVFR0: - case ARM_VFP_MVFR1: - case ARM_VFP_MVFR2: - /* Writes are ignored. */ - break; - case ARM_VFP_FPSCR: - tmp = load_reg(s, a->rt); - gen_helper_vfp_set_fpscr(cpu_env, tmp); - tcg_temp_free_i32(tmp); - gen_lookup_tb(s); - break; - case ARM_VFP_FPEXC: - /* - * TODO: VFP subarchitecture support. - * For now, keep the EN bit only - */ - tmp = load_reg(s, a->rt); - tcg_gen_andi_i32(tmp, tmp, 1 << 30); - store_cpu_field(tmp, vfp.xregs[a->reg]); - gen_lookup_tb(s); - break; - case ARM_VFP_FPINST: - case ARM_VFP_FPINST2: - tmp = load_reg(s, a->rt); - store_cpu_field(tmp, vfp.xregs[a->reg]); - break; - default: - g_assert_not_reached(); - } - } - - return true; -} - - -static bool trans_VMOV_half(DisasContext *s, arg_VMOV_single *a) -{ - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (a->rt == 15) { - /* UNPREDICTABLE; we choose to UNDEF */ - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (a->l) { - /* VFP to general purpose register */ - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vn); - tcg_gen_andi_i32(tmp, tmp, 0xffff); - store_reg(s, a->rt, tmp); - } else { - /* general purpose register to VFP */ - tmp = load_reg(s, a->rt); - tcg_gen_andi_i32(tmp, tmp, 0xffff); - vfp_store_reg32(tmp, a->vn); - tcg_temp_free_i32(tmp); - } - - return true; -} - -static bool trans_VMOV_single(DisasContext *s, arg_VMOV_single *a) -{ - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (a->l) { - /* VFP to general purpose register */ - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vn); - if (a->rt == 15) { - /* Set the 4 flag bits in the CPSR. 
*/ - gen_set_nzcv(tmp); - tcg_temp_free_i32(tmp); - } else { - store_reg(s, a->rt, tmp); - } - } else { - /* general purpose register to VFP */ - tmp = load_reg(s, a->rt); - vfp_store_reg32(tmp, a->vn); - tcg_temp_free_i32(tmp); - } - - return true; -} - -static bool trans_VMOV_64_sp(DisasContext *s, arg_VMOV_64_sp *a) -{ - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - /* - * VMOV between two general-purpose registers and two single precision - * floating point registers - */ - if (!vfp_access_check(s)) { - return true; - } - - if (a->op) { - /* fpreg to gpreg */ - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - store_reg(s, a->rt, tmp); - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm + 1); - store_reg(s, a->rt2, tmp); - } else { - /* gpreg to fpreg */ - tmp = load_reg(s, a->rt); - vfp_store_reg32(tmp, a->vm); - tcg_temp_free_i32(tmp); - tmp = load_reg(s, a->rt2); - vfp_store_reg32(tmp, a->vm + 1); - tcg_temp_free_i32(tmp); - } - - return true; -} - -static bool trans_VMOV_64_dp(DisasContext *s, arg_VMOV_64_dp *a) -{ - TCGv_i32 tmp; - - /* - * VMOV between two general-purpose registers and one double precision - * floating point register. Note that this does not require support - * for double precision arithmetic. - */ - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (a->op) { - /* fpreg to gpreg */ - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm * 2); - store_reg(s, a->rt, tmp); - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm * 2 + 1); - store_reg(s, a->rt2, tmp); - } else { - /* gpreg to fpreg */ - tmp = load_reg(s, a->rt); - vfp_store_reg32(tmp, a->vm * 2); - tcg_temp_free_i32(tmp); - tmp = load_reg(s, a->rt2); - vfp_store_reg32(tmp, a->vm * 2 + 1); - tcg_temp_free_i32(tmp); - } - - return true; -} - -static bool trans_VLDR_VSTR_hp(DisasContext *s, arg_VLDR_VSTR_sp *a) -{ - uint32_t offset; - TCGv_i32 addr, tmp; - - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* imm8 field is offset/2 for fp16, unlike fp32 and fp64 */ - offset = a->imm << 1; - if (!a->u) { - offset = -offset; - } - - /* For thumb, use of PC is UNPREDICTABLE. */ - addr = add_reg_for_lit(s, a->rn, offset); - tmp = tcg_temp_new_i32(); - if (a->l) { - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN); - vfp_store_reg32(tmp, a->vd); - } else { - vfp_load_reg32(tmp, a->vd); - gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UW | MO_ALIGN); - } - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(addr); - - return true; -} - -static bool trans_VLDR_VSTR_sp(DisasContext *s, arg_VLDR_VSTR_sp *a) -{ - uint32_t offset; - TCGv_i32 addr, tmp; - - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - offset = a->imm << 2; - if (!a->u) { - offset = -offset; - } - - /* For thumb, use of PC is UNPREDICTABLE. 
*/ - addr = add_reg_for_lit(s, a->rn, offset); - tmp = tcg_temp_new_i32(); - if (a->l) { - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); - vfp_store_reg32(tmp, a->vd); - } else { - vfp_load_reg32(tmp, a->vd); - gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); - } - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(addr); - - return true; -} - -static bool trans_VLDR_VSTR_dp(DisasContext *s, arg_VLDR_VSTR_dp *a) -{ - uint32_t offset; - TCGv_i32 addr; - TCGv_i64 tmp; - - /* Note that this does not require support for double arithmetic. */ - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - offset = a->imm << 2; - if (!a->u) { - offset = -offset; - } - - /* For thumb, use of PC is UNPREDICTABLE. */ - addr = add_reg_for_lit(s, a->rn, offset); - tmp = tcg_temp_new_i64(); - if (a->l) { - gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); - vfp_store_reg64(tmp, a->vd); - } else { - vfp_load_reg64(tmp, a->vd); - gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); - } - tcg_temp_free_i64(tmp); - tcg_temp_free_i32(addr); - - return true; -} - -static bool trans_VLDM_VSTM_sp(DisasContext *s, arg_VLDM_VSTM_sp *a) -{ - uint32_t offset; - TCGv_i32 addr, tmp; - int i, n; - - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - n = a->imm; - - if (n == 0 || (a->vd + n) > 32) { - /* - * UNPREDICTABLE cases for bad immediates: we choose to - * UNDEF to avoid generating huge numbers of TCG ops - */ - return false; - } - if (a->rn == 15 && a->w) { - /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */ - return false; - } - - s->eci_handled = true; - - if (!vfp_access_check(s)) { - return true; - } - - /* For thumb, use of PC is UNPREDICTABLE. */ - addr = add_reg_for_lit(s, a->rn, 0); - if (a->p) { - /* pre-decrement */ - tcg_gen_addi_i32(addr, addr, -(a->imm << 2)); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - /* - * Here 'addr' is the lowest address we will store to, - * and is either the old SP (if post-increment) or - * the new SP (if pre-decrement). For post-increment - * where the old value is below the limit and the new - * value is above, it is UNKNOWN whether the limit check - * triggers; we choose to trigger. - */ - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - offset = 4; - tmp = tcg_temp_new_i32(); - for (i = 0; i < n; i++) { - if (a->l) { - /* load */ - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); - vfp_store_reg32(tmp, a->vd + i); - } else { - /* store */ - vfp_load_reg32(tmp, a->vd + i); - gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); - } - tcg_gen_addi_i32(addr, addr, offset); - } - tcg_temp_free_i32(tmp); - if (a->w) { - /* writeback */ - if (a->p) { - offset = -offset * n; - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - - clear_eci_state(s); - return true; -} - -static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a) -{ - uint32_t offset; - TCGv_i32 addr; - TCGv_i64 tmp; - int i, n; - - /* Note that this does not require support for double arithmetic. 
*/ - if (!dc_isar_feature(aa32_fpsp_v2, s) && !dc_isar_feature(aa32_mve, s)) { - return false; - } - - n = a->imm >> 1; - - if (n == 0 || (a->vd + n) > 32 || n > 16) { - /* - * UNPREDICTABLE cases for bad immediates: we choose to - * UNDEF to avoid generating huge numbers of TCG ops - */ - return false; - } - if (a->rn == 15 && a->w) { - /* writeback to PC is UNPREDICTABLE, we choose to UNDEF */ - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd + n) > 16) { - return false; - } - - s->eci_handled = true; - - if (!vfp_access_check(s)) { - return true; - } - - /* For thumb, use of PC is UNPREDICTABLE. */ - addr = add_reg_for_lit(s, a->rn, 0); - if (a->p) { - /* pre-decrement */ - tcg_gen_addi_i32(addr, addr, -(a->imm << 2)); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - /* - * Here 'addr' is the lowest address we will store to, - * and is either the old SP (if post-increment) or - * the new SP (if pre-decrement). For post-increment - * where the old value is below the limit and the new - * value is above, it is UNKNOWN whether the limit check - * triggers; we choose to trigger. - */ - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - offset = 8; - tmp = tcg_temp_new_i64(); - for (i = 0; i < n; i++) { - if (a->l) { - /* load */ - gen_aa32_ld_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); - vfp_store_reg64(tmp, a->vd + i); - } else { - /* store */ - vfp_load_reg64(tmp, a->vd + i); - gen_aa32_st_i64(s, tmp, addr, get_mem_index(s), MO_UQ | MO_ALIGN_4); - } - tcg_gen_addi_i32(addr, addr, offset); - } - tcg_temp_free_i64(tmp); - if (a->w) { - /* writeback */ - if (a->p) { - offset = -offset * n; - } else if (a->imm & 1) { - offset = 4; - } else { - offset = 0; - } - - if (offset != 0) { - tcg_gen_addi_i32(addr, addr, offset); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - - clear_eci_state(s); - return true; -} - -/* - * Types for callbacks for do_vfp_3op_sp() and do_vfp_3op_dp(). - * The callback should emit code to write a value to vd. If - * do_vfp_3op_{sp,dp}() was passed reads_vd then the TCGv vd - * will contain the old value of the relevant VFP register; - * otherwise it must be written to only. - */ -typedef void VFPGen3OpSPFn(TCGv_i32 vd, - TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst); -typedef void VFPGen3OpDPFn(TCGv_i64 vd, - TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst); - -/* - * Types for callbacks for do_vfp_2op_sp() and do_vfp_2op_dp(). - * The callback should emit code to write a value to vd (which - * should be written to only). 
- */ -typedef void VFPGen2OpSPFn(TCGv_i32 vd, TCGv_i32 vm); -typedef void VFPGen2OpDPFn(TCGv_i64 vd, TCGv_i64 vm); - -/* - * Return true if the specified S reg is in a scalar bank - * (ie if it is s0..s7) - */ -static inline bool vfp_sreg_is_scalar(int reg) -{ - return (reg & 0x18) == 0; -} - -/* - * Return true if the specified D reg is in a scalar bank - * (ie if it is d0..d3 or d16..d19) - */ -static inline bool vfp_dreg_is_scalar(int reg) -{ - return (reg & 0xc) == 0; -} - -/* - * Advance the S reg number forwards by delta within its bank - * (ie increment the low 3 bits but leave the rest the same) - */ -static inline int vfp_advance_sreg(int reg, int delta) -{ - return ((reg + delta) & 0x7) | (reg & ~0x7); -} - -/* - * Advance the D reg number forwards by delta within its bank - * (ie increment the low 2 bits but leave the rest the same) - */ -static inline int vfp_advance_dreg(int reg, int delta) -{ - return ((reg + delta) & 0x3) | (reg & ~0x3); -} - -/* - * Perform a 3-operand VFP data processing instruction. fn is the - * callback to do the actual operation; this function deals with the - * code to handle looping around for VFP vector processing. - */ -static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn, - int vd, int vn, int vm, bool reads_vd) -{ - uint32_t delta_m = 0; - uint32_t delta_d = 0; - int veclen = s->vec_len; - TCGv_i32 f0, f1, fd; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fpsp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_fpshvec, s) && - (veclen != 0 || s->vec_stride != 0)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (veclen > 0) { - /* Figure out what type of vector operation this is. */ - if (vfp_sreg_is_scalar(vd)) { - /* scalar */ - veclen = 0; - } else { - delta_d = s->vec_stride + 1; - - if (vfp_sreg_is_scalar(vm)) { - /* mixed scalar/vector */ - delta_m = 0; - } else { - /* vector */ - delta_m = delta_d; - } - } - } - - f0 = tcg_temp_new_i32(); - f1 = tcg_temp_new_i32(); - fd = tcg_temp_new_i32(); - fpst = fpstatus_ptr(FPST_FPCR); - - vfp_load_reg32(f0, vn); - vfp_load_reg32(f1, vm); - - for (;;) { - if (reads_vd) { - vfp_load_reg32(fd, vd); - } - fn(fd, f0, f1, fpst); - vfp_store_reg32(fd, vd); - - if (veclen == 0) { - break; - } - - /* Set up the operands for the next iteration */ - veclen--; - vd = vfp_advance_sreg(vd, delta_d); - vn = vfp_advance_sreg(vn, delta_d); - vfp_load_reg32(f0, vn); - if (delta_m) { - vm = vfp_advance_sreg(vm, delta_m); - vfp_load_reg32(f1, vm); - } - } - - tcg_temp_free_i32(f0); - tcg_temp_free_i32(f1); - tcg_temp_free_i32(fd); - tcg_temp_free_ptr(fpst); - - return true; -} - -static bool do_vfp_3op_hp(DisasContext *s, VFPGen3OpSPFn *fn, - int vd, int vn, int vm, bool reads_vd) -{ - /* - * Do a half-precision operation. 
Functionally this is - * the same as do_vfp_3op_sp(), except: - * - it uses the FPST_FPCR_F16 - * - it doesn't need the VFP vector handling (fp16 is a - * v8 feature, and in v8 VFP vectors don't exist) - * - it does the aa32_fp16_arith feature test - */ - TCGv_i32 f0, f1, fd; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - f0 = tcg_temp_new_i32(); - f1 = tcg_temp_new_i32(); - fd = tcg_temp_new_i32(); - fpst = fpstatus_ptr(FPST_FPCR_F16); - - vfp_load_reg32(f0, vn); - vfp_load_reg32(f1, vm); - - if (reads_vd) { - vfp_load_reg32(fd, vd); - } - fn(fd, f0, f1, fpst); - vfp_store_reg32(fd, vd); - - tcg_temp_free_i32(f0); - tcg_temp_free_i32(f1); - tcg_temp_free_i32(fd); - tcg_temp_free_ptr(fpst); - - return true; -} - -static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn, - int vd, int vn, int vm, bool reads_vd) -{ - uint32_t delta_m = 0; - uint32_t delta_d = 0; - int veclen = s->vec_len; - TCGv_i64 f0, f1, fd; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vn | vm) & 0x10)) { - return false; - } - - if (!dc_isar_feature(aa32_fpshvec, s) && - (veclen != 0 || s->vec_stride != 0)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (veclen > 0) { - /* Figure out what type of vector operation this is. */ - if (vfp_dreg_is_scalar(vd)) { - /* scalar */ - veclen = 0; - } else { - delta_d = (s->vec_stride >> 1) + 1; - - if (vfp_dreg_is_scalar(vm)) { - /* mixed scalar/vector */ - delta_m = 0; - } else { - /* vector */ - delta_m = delta_d; - } - } - } - - f0 = tcg_temp_new_i64(); - f1 = tcg_temp_new_i64(); - fd = tcg_temp_new_i64(); - fpst = fpstatus_ptr(FPST_FPCR); - - vfp_load_reg64(f0, vn); - vfp_load_reg64(f1, vm); - - for (;;) { - if (reads_vd) { - vfp_load_reg64(fd, vd); - } - fn(fd, f0, f1, fpst); - vfp_store_reg64(fd, vd); - - if (veclen == 0) { - break; - } - /* Set up the operands for the next iteration */ - veclen--; - vd = vfp_advance_dreg(vd, delta_d); - vn = vfp_advance_dreg(vn, delta_d); - vfp_load_reg64(f0, vn); - if (delta_m) { - vm = vfp_advance_dreg(vm, delta_m); - vfp_load_reg64(f1, vm); - } - } - - tcg_temp_free_i64(f0); - tcg_temp_free_i64(f1); - tcg_temp_free_i64(fd); - tcg_temp_free_ptr(fpst); - - return true; -} - -static bool do_vfp_2op_sp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm) -{ - uint32_t delta_m = 0; - uint32_t delta_d = 0; - int veclen = s->vec_len; - TCGv_i32 f0, fd; - - /* Note that the caller must check the aa32_fpsp_v2 feature. */ - - if (!dc_isar_feature(aa32_fpshvec, s) && - (veclen != 0 || s->vec_stride != 0)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (veclen > 0) { - /* Figure out what type of vector operation this is. 
*/ - if (vfp_sreg_is_scalar(vd)) { - /* scalar */ - veclen = 0; - } else { - delta_d = s->vec_stride + 1; - - if (vfp_sreg_is_scalar(vm)) { - /* mixed scalar/vector */ - delta_m = 0; - } else { - /* vector */ - delta_m = delta_d; - } - } - } - - f0 = tcg_temp_new_i32(); - fd = tcg_temp_new_i32(); - - vfp_load_reg32(f0, vm); - - for (;;) { - fn(fd, f0); - vfp_store_reg32(fd, vd); - - if (veclen == 0) { - break; - } - - if (delta_m == 0) { - /* single source one-many */ - while (veclen--) { - vd = vfp_advance_sreg(vd, delta_d); - vfp_store_reg32(fd, vd); - } - break; - } - - /* Set up the operands for the next iteration */ - veclen--; - vd = vfp_advance_sreg(vd, delta_d); - vm = vfp_advance_sreg(vm, delta_m); - vfp_load_reg32(f0, vm); - } - - tcg_temp_free_i32(f0); - tcg_temp_free_i32(fd); - - return true; -} - -static bool do_vfp_2op_hp(DisasContext *s, VFPGen2OpSPFn *fn, int vd, int vm) -{ - /* - * Do a half-precision operation. Functionally this is - * the same as do_vfp_2op_sp(), except: - * - it doesn't need the VFP vector handling (fp16 is a - * v8 feature, and in v8 VFP vectors don't exist) - * - it does the aa32_fp16_arith feature test - */ - TCGv_i32 f0; - - /* Note that the caller must check the aa32_fp16_arith feature */ - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - f0 = tcg_temp_new_i32(); - vfp_load_reg32(f0, vm); - fn(f0, f0); - vfp_store_reg32(f0, vd); - tcg_temp_free_i32(f0); - - return true; -} - -static bool do_vfp_2op_dp(DisasContext *s, VFPGen2OpDPFn *fn, int vd, int vm) -{ - uint32_t delta_m = 0; - uint32_t delta_d = 0; - int veclen = s->vec_len; - TCGv_i64 f0, fd; - - /* Note that the caller must check the aa32_fpdp_v2 feature. */ - - /* UNDEF accesses to D16-D31 if they don't exist */ - if (!dc_isar_feature(aa32_simd_r32, s) && ((vd | vm) & 0x10)) { - return false; - } - - if (!dc_isar_feature(aa32_fpshvec, s) && - (veclen != 0 || s->vec_stride != 0)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (veclen > 0) { - /* Figure out what type of vector operation this is. 
*/ - if (vfp_dreg_is_scalar(vd)) { - /* scalar */ - veclen = 0; - } else { - delta_d = (s->vec_stride >> 1) + 1; - - if (vfp_dreg_is_scalar(vm)) { - /* mixed scalar/vector */ - delta_m = 0; - } else { - /* vector */ - delta_m = delta_d; - } - } - } - - f0 = tcg_temp_new_i64(); - fd = tcg_temp_new_i64(); - - vfp_load_reg64(f0, vm); - - for (;;) { - fn(fd, f0); - vfp_store_reg64(fd, vd); - - if (veclen == 0) { - break; - } - - if (delta_m == 0) { - /* single source one-many */ - while (veclen--) { - vd = vfp_advance_dreg(vd, delta_d); - vfp_store_reg64(fd, vd); - } - break; - } - - /* Set up the operands for the next iteration */ - veclen--; - vd = vfp_advance_dreg(vd, delta_d); - vd = vfp_advance_dreg(vm, delta_m); - vfp_load_reg64(f0, vm); - } - - tcg_temp_free_i64(f0); - tcg_temp_free_i64(fd); - - return true; -} - -static void gen_VMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* Note that order of inputs to the add matters for NaNs */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_mulh(tmp, vn, vm, fpst); - gen_helper_vfp_addh(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VMLA_hp(DisasContext *s, arg_VMLA_sp *a) -{ - return do_vfp_3op_hp(s, gen_VMLA_hp, a->vd, a->vn, a->vm, true); -} - -static void gen_VMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* Note that order of inputs to the add matters for NaNs */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_muls(tmp, vn, vm, fpst); - gen_helper_vfp_adds(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VMLA_sp(DisasContext *s, arg_VMLA_sp *a) -{ - return do_vfp_3op_sp(s, gen_VMLA_sp, a->vd, a->vn, a->vm, true); -} - -static void gen_VMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) -{ - /* Note that order of inputs to the add matters for NaNs */ - TCGv_i64 tmp = tcg_temp_new_i64(); - - gen_helper_vfp_muld(tmp, vn, vm, fpst); - gen_helper_vfp_addd(vd, vd, tmp, fpst); - tcg_temp_free_i64(tmp); -} - -static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_dp *a) -{ - return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true); -} - -static void gen_VMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* - * VMLS: vd = vd + -(vn * vm) - * Note that order of inputs to the add matters for NaNs. - */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_mulh(tmp, vn, vm, fpst); - gen_helper_vfp_negh(tmp, tmp); - gen_helper_vfp_addh(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VMLS_hp(DisasContext *s, arg_VMLS_sp *a) -{ - return do_vfp_3op_hp(s, gen_VMLS_hp, a->vd, a->vn, a->vm, true); -} - -static void gen_VMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* - * VMLS: vd = vd + -(vn * vm) - * Note that order of inputs to the add matters for NaNs. - */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_muls(tmp, vn, vm, fpst); - gen_helper_vfp_negs(tmp, tmp); - gen_helper_vfp_adds(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VMLS_sp(DisasContext *s, arg_VMLS_sp *a) -{ - return do_vfp_3op_sp(s, gen_VMLS_sp, a->vd, a->vn, a->vm, true); -} - -static void gen_VMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) -{ - /* - * VMLS: vd = vd + -(vn * vm) - * Note that order of inputs to the add matters for NaNs. 
- */ - TCGv_i64 tmp = tcg_temp_new_i64(); - - gen_helper_vfp_muld(tmp, vn, vm, fpst); - gen_helper_vfp_negd(tmp, tmp); - gen_helper_vfp_addd(vd, vd, tmp, fpst); - tcg_temp_free_i64(tmp); -} - -static bool trans_VMLS_dp(DisasContext *s, arg_VMLS_dp *a) -{ - return do_vfp_3op_dp(s, gen_VMLS_dp, a->vd, a->vn, a->vm, true); -} - -static void gen_VNMLS_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* - * VNMLS: -fd + (fn * fm) - * Note that it isn't valid to replace (-A + B) with (B - A) or similar - * plausible looking simplifications because this will give wrong results - * for NaNs. - */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_mulh(tmp, vn, vm, fpst); - gen_helper_vfp_negh(vd, vd); - gen_helper_vfp_addh(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VNMLS_hp(DisasContext *s, arg_VNMLS_sp *a) -{ - return do_vfp_3op_hp(s, gen_VNMLS_hp, a->vd, a->vn, a->vm, true); -} - -static void gen_VNMLS_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* - * VNMLS: -fd + (fn * fm) - * Note that it isn't valid to replace (-A + B) with (B - A) or similar - * plausible looking simplifications because this will give wrong results - * for NaNs. - */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_muls(tmp, vn, vm, fpst); - gen_helper_vfp_negs(vd, vd); - gen_helper_vfp_adds(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VNMLS_sp(DisasContext *s, arg_VNMLS_sp *a) -{ - return do_vfp_3op_sp(s, gen_VNMLS_sp, a->vd, a->vn, a->vm, true); -} - -static void gen_VNMLS_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) -{ - /* - * VNMLS: -fd + (fn * fm) - * Note that it isn't valid to replace (-A + B) with (B - A) or similar - * plausible looking simplifications because this will give wrong results - * for NaNs. 
- */ - TCGv_i64 tmp = tcg_temp_new_i64(); - - gen_helper_vfp_muld(tmp, vn, vm, fpst); - gen_helper_vfp_negd(vd, vd); - gen_helper_vfp_addd(vd, vd, tmp, fpst); - tcg_temp_free_i64(tmp); -} - -static bool trans_VNMLS_dp(DisasContext *s, arg_VNMLS_dp *a) -{ - return do_vfp_3op_dp(s, gen_VNMLS_dp, a->vd, a->vn, a->vm, true); -} - -static void gen_VNMLA_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* VNMLA: -fd + -(fn * fm) */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_mulh(tmp, vn, vm, fpst); - gen_helper_vfp_negh(tmp, tmp); - gen_helper_vfp_negh(vd, vd); - gen_helper_vfp_addh(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VNMLA_hp(DisasContext *s, arg_VNMLA_sp *a) -{ - return do_vfp_3op_hp(s, gen_VNMLA_hp, a->vd, a->vn, a->vm, true); -} - -static void gen_VNMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* VNMLA: -fd + -(fn * fm) */ - TCGv_i32 tmp = tcg_temp_new_i32(); - - gen_helper_vfp_muls(tmp, vn, vm, fpst); - gen_helper_vfp_negs(tmp, tmp); - gen_helper_vfp_negs(vd, vd); - gen_helper_vfp_adds(vd, vd, tmp, fpst); - tcg_temp_free_i32(tmp); -} - -static bool trans_VNMLA_sp(DisasContext *s, arg_VNMLA_sp *a) -{ - return do_vfp_3op_sp(s, gen_VNMLA_sp, a->vd, a->vn, a->vm, true); -} - -static void gen_VNMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) -{ - /* VNMLA: -fd + (fn * fm) */ - TCGv_i64 tmp = tcg_temp_new_i64(); - - gen_helper_vfp_muld(tmp, vn, vm, fpst); - gen_helper_vfp_negd(tmp, tmp); - gen_helper_vfp_negd(vd, vd); - gen_helper_vfp_addd(vd, vd, tmp, fpst); - tcg_temp_free_i64(tmp); -} - -static bool trans_VNMLA_dp(DisasContext *s, arg_VNMLA_dp *a) -{ - return do_vfp_3op_dp(s, gen_VNMLA_dp, a->vd, a->vn, a->vm, true); -} - -static bool trans_VMUL_hp(DisasContext *s, arg_VMUL_sp *a) -{ - return do_vfp_3op_hp(s, gen_helper_vfp_mulh, a->vd, a->vn, a->vm, false); -} - -static bool trans_VMUL_sp(DisasContext *s, arg_VMUL_sp *a) -{ - return do_vfp_3op_sp(s, gen_helper_vfp_muls, a->vd, a->vn, a->vm, false); -} - -static bool trans_VMUL_dp(DisasContext *s, arg_VMUL_dp *a) -{ - return do_vfp_3op_dp(s, gen_helper_vfp_muld, a->vd, a->vn, a->vm, false); -} - -static void gen_VNMUL_hp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* VNMUL: -(fn * fm) */ - gen_helper_vfp_mulh(vd, vn, vm, fpst); - gen_helper_vfp_negh(vd, vd); -} - -static bool trans_VNMUL_hp(DisasContext *s, arg_VNMUL_sp *a) -{ - return do_vfp_3op_hp(s, gen_VNMUL_hp, a->vd, a->vn, a->vm, false); -} - -static void gen_VNMUL_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst) -{ - /* VNMUL: -(fn * fm) */ - gen_helper_vfp_muls(vd, vn, vm, fpst); - gen_helper_vfp_negs(vd, vd); -} - -static bool trans_VNMUL_sp(DisasContext *s, arg_VNMUL_sp *a) -{ - return do_vfp_3op_sp(s, gen_VNMUL_sp, a->vd, a->vn, a->vm, false); -} - -static void gen_VNMUL_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst) -{ - /* VNMUL: -(fn * fm) */ - gen_helper_vfp_muld(vd, vn, vm, fpst); - gen_helper_vfp_negd(vd, vd); -} - -static bool trans_VNMUL_dp(DisasContext *s, arg_VNMUL_dp *a) -{ - return do_vfp_3op_dp(s, gen_VNMUL_dp, a->vd, a->vn, a->vm, false); -} - -static bool trans_VADD_hp(DisasContext *s, arg_VADD_sp *a) -{ - return do_vfp_3op_hp(s, gen_helper_vfp_addh, a->vd, a->vn, a->vm, false); -} - -static bool trans_VADD_sp(DisasContext *s, arg_VADD_sp *a) -{ - return do_vfp_3op_sp(s, gen_helper_vfp_adds, a->vd, a->vn, a->vm, false); -} - -static bool trans_VADD_dp(DisasContext *s, arg_VADD_dp *a) -{ - return do_vfp_3op_dp(s, gen_helper_vfp_addd, a->vd, 
a->vn, a->vm, false); -} - -static bool trans_VSUB_hp(DisasContext *s, arg_VSUB_sp *a) -{ - return do_vfp_3op_hp(s, gen_helper_vfp_subh, a->vd, a->vn, a->vm, false); -} - -static bool trans_VSUB_sp(DisasContext *s, arg_VSUB_sp *a) -{ - return do_vfp_3op_sp(s, gen_helper_vfp_subs, a->vd, a->vn, a->vm, false); -} - -static bool trans_VSUB_dp(DisasContext *s, arg_VSUB_dp *a) -{ - return do_vfp_3op_dp(s, gen_helper_vfp_subd, a->vd, a->vn, a->vm, false); -} - -static bool trans_VDIV_hp(DisasContext *s, arg_VDIV_sp *a) -{ - return do_vfp_3op_hp(s, gen_helper_vfp_divh, a->vd, a->vn, a->vm, false); -} - -static bool trans_VDIV_sp(DisasContext *s, arg_VDIV_sp *a) -{ - return do_vfp_3op_sp(s, gen_helper_vfp_divs, a->vd, a->vn, a->vm, false); -} - -static bool trans_VDIV_dp(DisasContext *s, arg_VDIV_dp *a) -{ - return do_vfp_3op_dp(s, gen_helper_vfp_divd, a->vd, a->vn, a->vm, false); -} - -static bool trans_VMINNM_hp(DisasContext *s, arg_VMINNM_sp *a) -{ - if (!dc_isar_feature(aa32_vminmaxnm, s)) { - return false; - } - return do_vfp_3op_hp(s, gen_helper_vfp_minnumh, - a->vd, a->vn, a->vm, false); -} - -static bool trans_VMAXNM_hp(DisasContext *s, arg_VMAXNM_sp *a) -{ - if (!dc_isar_feature(aa32_vminmaxnm, s)) { - return false; - } - return do_vfp_3op_hp(s, gen_helper_vfp_maxnumh, - a->vd, a->vn, a->vm, false); -} - -static bool trans_VMINNM_sp(DisasContext *s, arg_VMINNM_sp *a) -{ - if (!dc_isar_feature(aa32_vminmaxnm, s)) { - return false; - } - return do_vfp_3op_sp(s, gen_helper_vfp_minnums, - a->vd, a->vn, a->vm, false); -} - -static bool trans_VMAXNM_sp(DisasContext *s, arg_VMAXNM_sp *a) -{ - if (!dc_isar_feature(aa32_vminmaxnm, s)) { - return false; - } - return do_vfp_3op_sp(s, gen_helper_vfp_maxnums, - a->vd, a->vn, a->vm, false); -} - -static bool trans_VMINNM_dp(DisasContext *s, arg_VMINNM_dp *a) -{ - if (!dc_isar_feature(aa32_vminmaxnm, s)) { - return false; - } - return do_vfp_3op_dp(s, gen_helper_vfp_minnumd, - a->vd, a->vn, a->vm, false); -} - -static bool trans_VMAXNM_dp(DisasContext *s, arg_VMAXNM_dp *a) -{ - if (!dc_isar_feature(aa32_vminmaxnm, s)) { - return false; - } - return do_vfp_3op_dp(s, gen_helper_vfp_maxnumd, - a->vd, a->vn, a->vm, false); -} - -static bool do_vfm_hp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) -{ - /* - * VFNMA : fd = muladd(-fd, fn, fm) - * VFNMS : fd = muladd(-fd, -fn, fm) - * VFMA : fd = muladd( fd, fn, fm) - * VFMS : fd = muladd( fd, -fn, fm) - * - * These are fused multiply-add, and must be done as one floating - * point operation with no rounding between the multiplication and - * addition steps. NB that doing the negations here as separate - * steps is correct : an input NaN should come out with its sign - * bit flipped if it is a negated-input. - */ - TCGv_ptr fpst; - TCGv_i32 vn, vm, vd; - - /* - * Present in VFPv4 only, and only with the FP16 extension. - * Note that we can't rely on the SIMDFMAC check alone, because - * in a Neon-no-VFP core that ID register field will be non-zero. 
- */ - if (!dc_isar_feature(aa32_fp16_arith, s) || - !dc_isar_feature(aa32_simdfmac, s) || - !dc_isar_feature(aa32_fpsp_v2, s)) { - return false; - } - - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vn = tcg_temp_new_i32(); - vm = tcg_temp_new_i32(); - vd = tcg_temp_new_i32(); - - vfp_load_reg32(vn, a->vn); - vfp_load_reg32(vm, a->vm); - if (neg_n) { - /* VFNMS, VFMS */ - gen_helper_vfp_negh(vn, vn); - } - vfp_load_reg32(vd, a->vd); - if (neg_d) { - /* VFNMA, VFNMS */ - gen_helper_vfp_negh(vd, vd); - } - fpst = fpstatus_ptr(FPST_FPCR_F16); - gen_helper_vfp_muladdh(vd, vn, vm, vd, fpst); - vfp_store_reg32(vd, a->vd); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(vn); - tcg_temp_free_i32(vm); - tcg_temp_free_i32(vd); - - return true; -} - -static bool do_vfm_sp(DisasContext *s, arg_VFMA_sp *a, bool neg_n, bool neg_d) -{ - /* - * VFNMA : fd = muladd(-fd, fn, fm) - * VFNMS : fd = muladd(-fd, -fn, fm) - * VFMA : fd = muladd( fd, fn, fm) - * VFMS : fd = muladd( fd, -fn, fm) - * - * These are fused multiply-add, and must be done as one floating - * point operation with no rounding between the multiplication and - * addition steps. NB that doing the negations here as separate - * steps is correct : an input NaN should come out with its sign - * bit flipped if it is a negated-input. - */ - TCGv_ptr fpst; - TCGv_i32 vn, vm, vd; - - /* - * Present in VFPv4 only. - * Note that we can't rely on the SIMDFMAC check alone, because - * in a Neon-no-VFP core that ID register field will be non-zero. - */ - if (!dc_isar_feature(aa32_simdfmac, s) || - !dc_isar_feature(aa32_fpsp_v2, s)) { - return false; - } - /* - * In v7A, UNPREDICTABLE with non-zero vector length/stride; from - * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A. - */ - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vn = tcg_temp_new_i32(); - vm = tcg_temp_new_i32(); - vd = tcg_temp_new_i32(); - - vfp_load_reg32(vn, a->vn); - vfp_load_reg32(vm, a->vm); - if (neg_n) { - /* VFNMS, VFMS */ - gen_helper_vfp_negs(vn, vn); - } - vfp_load_reg32(vd, a->vd); - if (neg_d) { - /* VFNMA, VFNMS */ - gen_helper_vfp_negs(vd, vd); - } - fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_vfp_muladds(vd, vn, vm, vd, fpst); - vfp_store_reg32(vd, a->vd); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(vn); - tcg_temp_free_i32(vm); - tcg_temp_free_i32(vd); - - return true; -} - -static bool do_vfm_dp(DisasContext *s, arg_VFMA_dp *a, bool neg_n, bool neg_d) -{ - /* - * VFNMA : fd = muladd(-fd, fn, fm) - * VFNMS : fd = muladd(-fd, -fn, fm) - * VFMA : fd = muladd( fd, fn, fm) - * VFMS : fd = muladd( fd, -fn, fm) - * - * These are fused multiply-add, and must be done as one floating - * point operation with no rounding between the multiplication and - * addition steps. NB that doing the negations here as separate - * steps is correct : an input NaN should come out with its sign - * bit flipped if it is a negated-input. - */ - TCGv_ptr fpst; - TCGv_i64 vn, vm, vd; - - /* - * Present in VFPv4 only. - * Note that we can't rely on the SIMDFMAC check alone, because - * in a Neon-no-VFP core that ID register field will be non-zero. - */ - if (!dc_isar_feature(aa32_simdfmac, s) || - !dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - /* - * In v7A, UNPREDICTABLE with non-zero vector length/stride; from - * v8A, must UNDEF. We choose to UNDEF for both v7A and v8A. 
- */ - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && - ((a->vd | a->vn | a->vm) & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vn = tcg_temp_new_i64(); - vm = tcg_temp_new_i64(); - vd = tcg_temp_new_i64(); - - vfp_load_reg64(vn, a->vn); - vfp_load_reg64(vm, a->vm); - if (neg_n) { - /* VFNMS, VFMS */ - gen_helper_vfp_negd(vn, vn); - } - vfp_load_reg64(vd, a->vd); - if (neg_d) { - /* VFNMA, VFNMS */ - gen_helper_vfp_negd(vd, vd); - } - fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_vfp_muladdd(vd, vn, vm, vd, fpst); - vfp_store_reg64(vd, a->vd); - - tcg_temp_free_ptr(fpst); - tcg_temp_free_i64(vn); - tcg_temp_free_i64(vm); - tcg_temp_free_i64(vd); - - return true; -} - -#define MAKE_ONE_VFM_TRANS_FN(INSN, PREC, NEGN, NEGD) \ - static bool trans_##INSN##_##PREC(DisasContext *s, \ - arg_##INSN##_##PREC *a) \ - { \ - return do_vfm_##PREC(s, a, NEGN, NEGD); \ - } - -#define MAKE_VFM_TRANS_FNS(PREC) \ - MAKE_ONE_VFM_TRANS_FN(VFMA, PREC, false, false) \ - MAKE_ONE_VFM_TRANS_FN(VFMS, PREC, true, false) \ - MAKE_ONE_VFM_TRANS_FN(VFNMA, PREC, false, true) \ - MAKE_ONE_VFM_TRANS_FN(VFNMS, PREC, true, true) - -MAKE_VFM_TRANS_FNS(hp) -MAKE_VFM_TRANS_FNS(sp) -MAKE_VFM_TRANS_FNS(dp) - -static bool trans_VMOV_imm_hp(DisasContext *s, arg_VMOV_imm_sp *a) -{ - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vfp_store_reg32(tcg_constant_i32(vfp_expand_imm(MO_16, a->imm)), a->vd); - return true; -} - -static bool trans_VMOV_imm_sp(DisasContext *s, arg_VMOV_imm_sp *a) -{ - uint32_t delta_d = 0; - int veclen = s->vec_len; - TCGv_i32 fd; - uint32_t vd; - - vd = a->vd; - - if (!dc_isar_feature(aa32_fpsp_v3, s)) { - return false; - } - - if (!dc_isar_feature(aa32_fpshvec, s) && - (veclen != 0 || s->vec_stride != 0)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (veclen > 0) { - /* Figure out what type of vector operation this is. */ - if (vfp_sreg_is_scalar(vd)) { - /* scalar */ - veclen = 0; - } else { - delta_d = s->vec_stride + 1; - } - } - - fd = tcg_constant_i32(vfp_expand_imm(MO_32, a->imm)); - - for (;;) { - vfp_store_reg32(fd, vd); - - if (veclen == 0) { - break; - } - - /* Set up the operands for the next iteration */ - veclen--; - vd = vfp_advance_sreg(vd, delta_d); - } - - return true; -} - -static bool trans_VMOV_imm_dp(DisasContext *s, arg_VMOV_imm_dp *a) -{ - uint32_t delta_d = 0; - int veclen = s->vec_len; - TCGv_i64 fd; - uint32_t vd; - - vd = a->vd; - - if (!dc_isar_feature(aa32_fpdp_v3, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (vd & 0x10)) { - return false; - } - - if (!dc_isar_feature(aa32_fpshvec, s) && - (veclen != 0 || s->vec_stride != 0)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - if (veclen > 0) { - /* Figure out what type of vector operation this is. 
*/ - if (vfp_dreg_is_scalar(vd)) { - /* scalar */ - veclen = 0; - } else { - delta_d = (s->vec_stride >> 1) + 1; - } - } - - fd = tcg_constant_i64(vfp_expand_imm(MO_64, a->imm)); - - for (;;) { - vfp_store_reg64(fd, vd); - - if (veclen == 0) { - break; - } - - /* Set up the operands for the next iteration */ - veclen--; - vd = vfp_advance_dreg(vd, delta_d); - } - - return true; -} - -#define DO_VFP_2OP(INSN, PREC, FN, CHECK) \ - static bool trans_##INSN##_##PREC(DisasContext *s, \ - arg_##INSN##_##PREC *a) \ - { \ - if (!dc_isar_feature(CHECK, s)) { \ - return false; \ - } \ - return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \ - } - -#define DO_VFP_VMOV(INSN, PREC, FN) \ - static bool trans_##INSN##_##PREC(DisasContext *s, \ - arg_##INSN##_##PREC *a) \ - { \ - if (!dc_isar_feature(aa32_fp##PREC##_v2, s) && \ - !dc_isar_feature(aa32_mve, s)) { \ - return false; \ - } \ - return do_vfp_2op_##PREC(s, FN, a->vd, a->vm); \ - } - -DO_VFP_VMOV(VMOV_reg, sp, tcg_gen_mov_i32) -DO_VFP_VMOV(VMOV_reg, dp, tcg_gen_mov_i64) - -DO_VFP_2OP(VABS, hp, gen_helper_vfp_absh, aa32_fp16_arith) -DO_VFP_2OP(VABS, sp, gen_helper_vfp_abss, aa32_fpsp_v2) -DO_VFP_2OP(VABS, dp, gen_helper_vfp_absd, aa32_fpdp_v2) - -DO_VFP_2OP(VNEG, hp, gen_helper_vfp_negh, aa32_fp16_arith) -DO_VFP_2OP(VNEG, sp, gen_helper_vfp_negs, aa32_fpsp_v2) -DO_VFP_2OP(VNEG, dp, gen_helper_vfp_negd, aa32_fpdp_v2) - -static void gen_VSQRT_hp(TCGv_i32 vd, TCGv_i32 vm) -{ - gen_helper_vfp_sqrth(vd, vm, cpu_env); -} - -static void gen_VSQRT_sp(TCGv_i32 vd, TCGv_i32 vm) -{ - gen_helper_vfp_sqrts(vd, vm, cpu_env); -} - -static void gen_VSQRT_dp(TCGv_i64 vd, TCGv_i64 vm) -{ - gen_helper_vfp_sqrtd(vd, vm, cpu_env); -} - -DO_VFP_2OP(VSQRT, hp, gen_VSQRT_hp, aa32_fp16_arith) -DO_VFP_2OP(VSQRT, sp, gen_VSQRT_sp, aa32_fpsp_v2) -DO_VFP_2OP(VSQRT, dp, gen_VSQRT_dp, aa32_fpdp_v2) - -static bool trans_VCMP_hp(DisasContext *s, arg_VCMP_sp *a) -{ - TCGv_i32 vd, vm; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - /* Vm/M bits must be zero for the Z variant */ - if (a->z && a->vm != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vd = tcg_temp_new_i32(); - vm = tcg_temp_new_i32(); - - vfp_load_reg32(vd, a->vd); - if (a->z) { - tcg_gen_movi_i32(vm, 0); - } else { - vfp_load_reg32(vm, a->vm); - } - - if (a->e) { - gen_helper_vfp_cmpeh(vd, vm, cpu_env); - } else { - gen_helper_vfp_cmph(vd, vm, cpu_env); - } - - tcg_temp_free_i32(vd); - tcg_temp_free_i32(vm); - - return true; -} - -static bool trans_VCMP_sp(DisasContext *s, arg_VCMP_sp *a) -{ - TCGv_i32 vd, vm; - - if (!dc_isar_feature(aa32_fpsp_v2, s)) { - return false; - } - - /* Vm/M bits must be zero for the Z variant */ - if (a->z && a->vm != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vd = tcg_temp_new_i32(); - vm = tcg_temp_new_i32(); - - vfp_load_reg32(vd, a->vd); - if (a->z) { - tcg_gen_movi_i32(vm, 0); - } else { - vfp_load_reg32(vm, a->vm); - } - - if (a->e) { - gen_helper_vfp_cmpes(vd, vm, cpu_env); - } else { - gen_helper_vfp_cmps(vd, vm, cpu_env); - } - - tcg_temp_free_i32(vd); - tcg_temp_free_i32(vm); - - return true; -} - -static bool trans_VCMP_dp(DisasContext *s, arg_VCMP_dp *a) -{ - TCGv_i64 vd, vm; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - /* Vm/M bits must be zero for the Z variant */ - if (a->z && a->vm != 0) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vd = tcg_temp_new_i64(); - vm = tcg_temp_new_i64(); - - vfp_load_reg64(vd, a->vd); - if (a->z) { - tcg_gen_movi_i64(vm, 0); - } else { - vfp_load_reg64(vm, a->vm); - } - - if (a->e) { - gen_helper_vfp_cmped(vd, vm, cpu_env); - } else { - gen_helper_vfp_cmpd(vd, vm, cpu_env); - } - - tcg_temp_free_i64(vd); - tcg_temp_free_i64(vm); - - return true; -} - -static bool trans_VCVT_f32_f16(DisasContext *s, arg_VCVT_f32_f16 *a) -{ - TCGv_ptr fpst; - TCGv_i32 ahp_mode; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fp16_spconv, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - ahp_mode = get_ahp_flag(); - tmp = tcg_temp_new_i32(); - /* The T bit tells us if we want the low or high 16 bits of Vm */ - tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t)); - gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp_mode); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_i32(ahp_mode); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VCVT_f64_f16(DisasContext *s, arg_VCVT_f64_f16 *a) -{ - TCGv_ptr fpst; - TCGv_i32 ahp_mode; - TCGv_i32 tmp; - TCGv_i64 vd; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_fp16_dpconv, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - ahp_mode = get_ahp_flag(); - tmp = tcg_temp_new_i32(); - /* The T bit tells us if we want the low or high 16 bits of Vm */ - tcg_gen_ld16u_i32(tmp, cpu_env, vfp_f16_offset(a->vm, a->t)); - vd = tcg_temp_new_i64(); - gen_helper_vfp_fcvt_f16_to_f64(vd, tmp, fpst, ahp_mode); - vfp_store_reg64(vd, a->vd); - tcg_temp_free_i32(ahp_mode); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - tcg_temp_free_i64(vd); - return true; -} - -static bool trans_VCVT_b16_f32(DisasContext *s, arg_VCVT_b16_f32 *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_bf16, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - tmp = tcg_temp_new_i32(); - - vfp_load_reg32(tmp, a->vm); - gen_helper_bfcvt(tmp, tmp, fpst); - tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VCVT_f16_f32(DisasContext *s, arg_VCVT_f16_f32 *a) -{ - TCGv_ptr fpst; - TCGv_i32 ahp_mode; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fp16_spconv, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - ahp_mode = get_ahp_flag(); - tmp = tcg_temp_new_i32(); - - vfp_load_reg32(tmp, a->vm); - gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp_mode); - tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); - tcg_temp_free_i32(ahp_mode); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VCVT_f16_f64(DisasContext *s, arg_VCVT_f16_f64 *a) -{ - TCGv_ptr fpst; - TCGv_i32 ahp_mode; - TCGv_i32 tmp; - TCGv_i64 vm; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_fp16_dpconv, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - ahp_mode = get_ahp_flag(); - tmp = tcg_temp_new_i32(); - vm = tcg_temp_new_i64(); - - vfp_load_reg64(vm, a->vm); - gen_helper_vfp_fcvt_f64_to_f16(tmp, vm, fpst, ahp_mode); - tcg_temp_free_i64(vm); - tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t)); - tcg_temp_free_i32(ahp_mode); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTR_hp(DisasContext *s, arg_VRINTR_sp *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR_F16); - gen_helper_rinth(tmp, tmp, fpst); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTR_sp(DisasContext *s, arg_VRINTR_sp *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_vrint, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_rints(tmp, tmp, fpst); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTR_dp(DisasContext *s, arg_VRINTR_dp *a) -{ - TCGv_ptr fpst; - TCGv_i64 tmp; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_vrint, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i64(); - vfp_load_reg64(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_rintd(tmp, tmp, fpst); - vfp_store_reg64(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i64(tmp); - return true; -} - -static bool trans_VRINTZ_hp(DisasContext *s, arg_VRINTZ_sp *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - TCGv_i32 tcg_rmode; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR_F16); - tcg_rmode = tcg_const_i32(float_round_to_zero); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - gen_helper_rinth(tmp, tmp, fpst); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tcg_rmode); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTZ_sp(DisasContext *s, arg_VRINTZ_sp *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - TCGv_i32 tcg_rmode; - - if (!dc_isar_feature(aa32_vrint, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - tcg_rmode = tcg_const_i32(float_round_to_zero); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - gen_helper_rints(tmp, tmp, fpst); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tcg_rmode); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTZ_dp(DisasContext *s, arg_VRINTZ_dp *a) -{ - TCGv_ptr fpst; - TCGv_i64 tmp; - TCGv_i32 tcg_rmode; - - if 
(!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_vrint, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i64(); - vfp_load_reg64(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - tcg_rmode = tcg_const_i32(float_round_to_zero); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - gen_helper_rintd(tmp, tmp, fpst); - gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst); - vfp_store_reg64(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i64(tmp); - tcg_temp_free_i32(tcg_rmode); - return true; -} - -static bool trans_VRINTX_hp(DisasContext *s, arg_VRINTX_sp *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR_F16); - gen_helper_rinth_exact(tmp, tmp, fpst); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTX_sp(DisasContext *s, arg_VRINTX_sp *a) -{ - TCGv_ptr fpst; - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_vrint, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i32(); - vfp_load_reg32(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_rints_exact(tmp, tmp, fpst); - vfp_store_reg32(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i32(tmp); - return true; -} - -static bool trans_VRINTX_dp(DisasContext *s, arg_VRINTX_dp *a) -{ - TCGv_ptr fpst; - TCGv_i64 tmp; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_vrint, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && ((a->vd | a->vm) & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - tmp = tcg_temp_new_i64(); - vfp_load_reg64(tmp, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - gen_helper_rintd_exact(tmp, tmp, fpst); - vfp_store_reg64(tmp, a->vd); - tcg_temp_free_ptr(fpst); - tcg_temp_free_i64(tmp); - return true; -} - -static bool trans_VCVT_sp(DisasContext *s, arg_VCVT_sp *a) -{ - TCGv_i64 vd; - TCGv_i32 vm; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vm = tcg_temp_new_i32(); - vd = tcg_temp_new_i64(); - vfp_load_reg32(vm, a->vm); - gen_helper_vfp_fcvtds(vd, vm, cpu_env); - vfp_store_reg64(vd, a->vd); - tcg_temp_free_i32(vm); - tcg_temp_free_i64(vd); - return true; -} - -static bool trans_VCVT_dp(DisasContext *s, arg_VCVT_dp *a) -{ - TCGv_i64 vm; - TCGv_i32 vd; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vd = tcg_temp_new_i32(); - vm = tcg_temp_new_i64(); - vfp_load_reg64(vm, a->vm); - gen_helper_vfp_fcvtsd(vd, vm, cpu_env); - vfp_store_reg32(vd, a->vd); - tcg_temp_free_i32(vd); - tcg_temp_free_i64(vm); - return true; -} - -static bool trans_VCVT_int_hp(DisasContext *s, arg_VCVT_int_sp *a) -{ - TCGv_i32 vm; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vm = tcg_temp_new_i32(); - vfp_load_reg32(vm, a->vm); - fpst = fpstatus_ptr(FPST_FPCR_F16); - if (a->s) { - /* i32 -> f16 */ - gen_helper_vfp_sitoh(vm, vm, fpst); - } else { - /* u32 -> f16 */ - gen_helper_vfp_uitoh(vm, vm, fpst); - } - vfp_store_reg32(vm, a->vd); - tcg_temp_free_i32(vm); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_int_sp(DisasContext *s, arg_VCVT_int_sp *a) -{ - TCGv_i32 vm; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fpsp_v2, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vm = tcg_temp_new_i32(); - vfp_load_reg32(vm, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - if (a->s) { - /* i32 -> f32 */ - gen_helper_vfp_sitos(vm, vm, fpst); - } else { - /* u32 -> f32 */ - gen_helper_vfp_uitos(vm, vm, fpst); - } - vfp_store_reg32(vm, a->vd); - tcg_temp_free_i32(vm); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_int_dp(DisasContext *s, arg_VCVT_int_dp *a) -{ - TCGv_i32 vm; - TCGv_i64 vd; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vm = tcg_temp_new_i32(); - vd = tcg_temp_new_i64(); - vfp_load_reg32(vm, a->vm); - fpst = fpstatus_ptr(FPST_FPCR); - if (a->s) { - /* i32 -> f64 */ - gen_helper_vfp_sitod(vd, vm, fpst); - } else { - /* u32 -> f64 */ - gen_helper_vfp_uitod(vd, vm, fpst); - } - vfp_store_reg64(vd, a->vd); - tcg_temp_free_i32(vm); - tcg_temp_free_i64(vd); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VJCVT(DisasContext *s, arg_VJCVT *a) -{ - TCGv_i32 vd; - TCGv_i64 vm; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - if (!dc_isar_feature(aa32_jscvt, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - vm = tcg_temp_new_i64(); - vd = tcg_temp_new_i32(); - vfp_load_reg64(vm, a->vm); - gen_helper_vjcvt(vd, vm, cpu_env); - vfp_store_reg32(vd, a->vd); - tcg_temp_free_i64(vm); - tcg_temp_free_i32(vd); - return true; -} - -static bool trans_VCVT_fix_hp(DisasContext *s, arg_VCVT_fix_sp *a) -{ - TCGv_i32 vd, shift; - TCGv_ptr fpst; - int frac_bits; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - frac_bits = (a->opc & 1) ? 
(32 - a->imm) : (16 - a->imm); - - vd = tcg_temp_new_i32(); - vfp_load_reg32(vd, a->vd); - - fpst = fpstatus_ptr(FPST_FPCR_F16); - shift = tcg_constant_i32(frac_bits); - - /* Switch on op:U:sx bits */ - switch (a->opc) { - case 0: - gen_helper_vfp_shtoh_round_to_nearest(vd, vd, shift, fpst); - break; - case 1: - gen_helper_vfp_sltoh_round_to_nearest(vd, vd, shift, fpst); - break; - case 2: - gen_helper_vfp_uhtoh_round_to_nearest(vd, vd, shift, fpst); - break; - case 3: - gen_helper_vfp_ultoh_round_to_nearest(vd, vd, shift, fpst); - break; - case 4: - gen_helper_vfp_toshh_round_to_zero(vd, vd, shift, fpst); - break; - case 5: - gen_helper_vfp_toslh_round_to_zero(vd, vd, shift, fpst); - break; - case 6: - gen_helper_vfp_touhh_round_to_zero(vd, vd, shift, fpst); - break; - case 7: - gen_helper_vfp_toulh_round_to_zero(vd, vd, shift, fpst); - break; - default: - g_assert_not_reached(); - } - - vfp_store_reg32(vd, a->vd); - tcg_temp_free_i32(vd); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_fix_sp(DisasContext *s, arg_VCVT_fix_sp *a) -{ - TCGv_i32 vd, shift; - TCGv_ptr fpst; - int frac_bits; - - if (!dc_isar_feature(aa32_fpsp_v3, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - frac_bits = (a->opc & 1) ? (32 - a->imm) : (16 - a->imm); - - vd = tcg_temp_new_i32(); - vfp_load_reg32(vd, a->vd); - - fpst = fpstatus_ptr(FPST_FPCR); - shift = tcg_constant_i32(frac_bits); - - /* Switch on op:U:sx bits */ - switch (a->opc) { - case 0: - gen_helper_vfp_shtos_round_to_nearest(vd, vd, shift, fpst); - break; - case 1: - gen_helper_vfp_sltos_round_to_nearest(vd, vd, shift, fpst); - break; - case 2: - gen_helper_vfp_uhtos_round_to_nearest(vd, vd, shift, fpst); - break; - case 3: - gen_helper_vfp_ultos_round_to_nearest(vd, vd, shift, fpst); - break; - case 4: - gen_helper_vfp_toshs_round_to_zero(vd, vd, shift, fpst); - break; - case 5: - gen_helper_vfp_tosls_round_to_zero(vd, vd, shift, fpst); - break; - case 6: - gen_helper_vfp_touhs_round_to_zero(vd, vd, shift, fpst); - break; - case 7: - gen_helper_vfp_touls_round_to_zero(vd, vd, shift, fpst); - break; - default: - g_assert_not_reached(); - } - - vfp_store_reg32(vd, a->vd); - tcg_temp_free_i32(vd); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_fix_dp(DisasContext *s, arg_VCVT_fix_dp *a) -{ - TCGv_i64 vd; - TCGv_i32 shift; - TCGv_ptr fpst; - int frac_bits; - - if (!dc_isar_feature(aa32_fpdp_v3, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. */ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - frac_bits = (a->opc & 1) ? 
(32 - a->imm) : (16 - a->imm); - - vd = tcg_temp_new_i64(); - vfp_load_reg64(vd, a->vd); - - fpst = fpstatus_ptr(FPST_FPCR); - shift = tcg_constant_i32(frac_bits); - - /* Switch on op:U:sx bits */ - switch (a->opc) { - case 0: - gen_helper_vfp_shtod_round_to_nearest(vd, vd, shift, fpst); - break; - case 1: - gen_helper_vfp_sltod_round_to_nearest(vd, vd, shift, fpst); - break; - case 2: - gen_helper_vfp_uhtod_round_to_nearest(vd, vd, shift, fpst); - break; - case 3: - gen_helper_vfp_ultod_round_to_nearest(vd, vd, shift, fpst); - break; - case 4: - gen_helper_vfp_toshd_round_to_zero(vd, vd, shift, fpst); - break; - case 5: - gen_helper_vfp_tosld_round_to_zero(vd, vd, shift, fpst); - break; - case 6: - gen_helper_vfp_touhd_round_to_zero(vd, vd, shift, fpst); - break; - case 7: - gen_helper_vfp_tould_round_to_zero(vd, vd, shift, fpst); - break; - default: - g_assert_not_reached(); - } - - vfp_store_reg64(vd, a->vd); - tcg_temp_free_i64(vd); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_hp_int(DisasContext *s, arg_VCVT_sp_int *a) -{ - TCGv_i32 vm; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR_F16); - vm = tcg_temp_new_i32(); - vfp_load_reg32(vm, a->vm); - - if (a->s) { - if (a->rz) { - gen_helper_vfp_tosizh(vm, vm, fpst); - } else { - gen_helper_vfp_tosih(vm, vm, fpst); - } - } else { - if (a->rz) { - gen_helper_vfp_touizh(vm, vm, fpst); - } else { - gen_helper_vfp_touih(vm, vm, fpst); - } - } - vfp_store_reg32(vm, a->vd); - tcg_temp_free_i32(vm); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_sp_int(DisasContext *s, arg_VCVT_sp_int *a) -{ - TCGv_i32 vm; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fpsp_v2, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - vm = tcg_temp_new_i32(); - vfp_load_reg32(vm, a->vm); - - if (a->s) { - if (a->rz) { - gen_helper_vfp_tosizs(vm, vm, fpst); - } else { - gen_helper_vfp_tosis(vm, vm, fpst); - } - } else { - if (a->rz) { - gen_helper_vfp_touizs(vm, vm, fpst); - } else { - gen_helper_vfp_touis(vm, vm, fpst); - } - } - vfp_store_reg32(vm, a->vd); - tcg_temp_free_i32(vm); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a) -{ - TCGv_i32 vd; - TCGv_i64 vm; - TCGv_ptr fpst; - - if (!dc_isar_feature(aa32_fpdp_v2, s)) { - return false; - } - - /* UNDEF accesses to D16-D31 if they don't exist. 
*/ - if (!dc_isar_feature(aa32_simd_r32, s) && (a->vm & 0x10)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - fpst = fpstatus_ptr(FPST_FPCR); - vm = tcg_temp_new_i64(); - vd = tcg_temp_new_i32(); - vfp_load_reg64(vm, a->vm); - - if (a->s) { - if (a->rz) { - gen_helper_vfp_tosizd(vd, vm, fpst); - } else { - gen_helper_vfp_tosid(vd, vm, fpst); - } - } else { - if (a->rz) { - gen_helper_vfp_touizd(vd, vm, fpst); - } else { - gen_helper_vfp_touid(vd, vm, fpst); - } - } - vfp_store_reg32(vd, a->vd); - tcg_temp_free_i32(vd); - tcg_temp_free_i64(vm); - tcg_temp_free_ptr(fpst); - return true; -} - -static bool trans_VINS(DisasContext *s, arg_VINS *a) -{ - TCGv_i32 rd, rm; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* Insert low half of Vm into high half of Vd */ - rm = tcg_temp_new_i32(); - rd = tcg_temp_new_i32(); - vfp_load_reg32(rm, a->vm); - vfp_load_reg32(rd, a->vd); - tcg_gen_deposit_i32(rd, rd, rm, 16, 16); - vfp_store_reg32(rd, a->vd); - tcg_temp_free_i32(rm); - tcg_temp_free_i32(rd); - return true; -} - -static bool trans_VMOVX(DisasContext *s, arg_VINS *a) -{ - TCGv_i32 rm; - - if (!dc_isar_feature(aa32_fp16_arith, s)) { - return false; - } - - if (s->vec_len != 0 || s->vec_stride != 0) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - /* Set Vd to high half of Vm */ - rm = tcg_temp_new_i32(); - vfp_load_reg32(rm, a->vm); - tcg_gen_shri_i32(rm, rm, 16); - vfp_store_reg32(rm, a->vd); - tcg_temp_free_i32(rm); - return true; -} diff --git a/target/arm/translate.c b/target/arm/translate.c deleted file mode 100644 index c23a346..0000000 --- a/target/arm/translate.c +++ /dev/null @@ -1,9990 +0,0 @@ -/* - * ARM translation - * - * Copyright (c) 2003 Fabrice Bellard - * Copyright (c) 2005-2007 CodeSourcery - * Copyright (c) 2007 OpenedHand, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */ -#include "qemu/osdep.h" - -#include "cpu.h" -#include "internals.h" -#include "disas/disas.h" -#include "exec/exec-all.h" -#include "tcg/tcg-op.h" -#include "tcg/tcg-op-gvec.h" -#include "qemu/log.h" -#include "qemu/bitops.h" -#include "arm_ldst.h" -#include "semihosting/semihost.h" -#include "exec/helper-proto.h" -#include "exec/helper-gen.h" -#include "exec/log.h" -#include "cpregs.h" - - -#define ENABLE_ARCH_4T arm_dc_feature(s, ARM_FEATURE_V4T) -#define ENABLE_ARCH_5 arm_dc_feature(s, ARM_FEATURE_V5) -/* currently all emulated v5 cores are also v5TE, so don't bother */ -#define ENABLE_ARCH_5TE arm_dc_feature(s, ARM_FEATURE_V5) -#define ENABLE_ARCH_5J dc_isar_feature(aa32_jazelle, s) -#define ENABLE_ARCH_6 arm_dc_feature(s, ARM_FEATURE_V6) -#define ENABLE_ARCH_6K arm_dc_feature(s, ARM_FEATURE_V6K) -#define ENABLE_ARCH_6T2 arm_dc_feature(s, ARM_FEATURE_THUMB2) -#define ENABLE_ARCH_7 arm_dc_feature(s, ARM_FEATURE_V7) -#define ENABLE_ARCH_8 arm_dc_feature(s, ARM_FEATURE_V8) - -#include "translate.h" -#include "translate-a32.h" - -/* These are TCG temporaries used only by the legacy iwMMXt decoder */ -static TCGv_i64 cpu_V0, cpu_V1, cpu_M0; -/* These are TCG globals which alias CPUARMState fields */ -static TCGv_i32 cpu_R[16]; -TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF; -TCGv_i64 cpu_exclusive_addr; -TCGv_i64 cpu_exclusive_val; - -#include "exec/gen-icount.h" - -static const char * const regnames[] = - { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" }; - - -/* initialize TCG globals. */ -void arm_translate_init(void) -{ - int i; - - for (i = 0; i < 16; i++) { - cpu_R[i] = tcg_global_mem_new_i32(cpu_env, - offsetof(CPUARMState, regs[i]), - regnames[i]); - } - cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF"); - cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF"); - cpu_VF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, VF), "VF"); - cpu_ZF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, ZF), "ZF"); - - cpu_exclusive_addr = tcg_global_mem_new_i64(cpu_env, - offsetof(CPUARMState, exclusive_addr), "exclusive_addr"); - cpu_exclusive_val = tcg_global_mem_new_i64(cpu_env, - offsetof(CPUARMState, exclusive_val), "exclusive_val"); - - a64_translate_init(); -} - -uint64_t asimd_imm_const(uint32_t imm, int cmode, int op) -{ - /* Expand the encoded constant as per AdvSIMDExpandImm pseudocode */ - switch (cmode) { - case 0: case 1: - /* no-op */ - break; - case 2: case 3: - imm <<= 8; - break; - case 4: case 5: - imm <<= 16; - break; - case 6: case 7: - imm <<= 24; - break; - case 8: case 9: - imm |= imm << 16; - break; - case 10: case 11: - imm = (imm << 8) | (imm << 24); - break; - case 12: - imm = (imm << 8) | 0xff; - break; - case 13: - imm = (imm << 16) | 0xffff; - break; - case 14: - if (op) { - /* - * This and cmode == 15 op == 1 are the only cases where - * the top and bottom 32 bits of the encoded constant differ. 
- */ - uint64_t imm64 = 0; - int n; - - for (n = 0; n < 8; n++) { - if (imm & (1 << n)) { - imm64 |= (0xffULL << (n * 8)); - } - } - return imm64; - } - imm |= (imm << 8) | (imm << 16) | (imm << 24); - break; - case 15: - if (op) { - /* Reserved encoding for AArch32; valid for AArch64 */ - uint64_t imm64 = (uint64_t)(imm & 0x3f) << 48; - if (imm & 0x80) { - imm64 |= 0x8000000000000000ULL; - } - if (imm & 0x40) { - imm64 |= 0x3fc0000000000000ULL; - } else { - imm64 |= 0x4000000000000000ULL; - } - return imm64; - } - imm = ((imm & 0x80) << 24) | ((imm & 0x3f) << 19) - | ((imm & 0x40) ? (0x1f << 25) : (1 << 30)); - break; - } - if (op) { - imm = ~imm; - } - return dup_const(MO_32, imm); -} - -/* Generate a label used for skipping this instruction */ -void arm_gen_condlabel(DisasContext *s) -{ - if (!s->condjmp) { - s->condlabel = gen_disas_label(s); - s->condjmp = 1; - } -} - -/* Flags for the disas_set_da_iss info argument: - * lower bits hold the Rt register number, higher bits are flags. - */ -typedef enum ISSInfo { - ISSNone = 0, - ISSRegMask = 0x1f, - ISSInvalid = (1 << 5), - ISSIsAcqRel = (1 << 6), - ISSIsWrite = (1 << 7), - ISSIs16Bit = (1 << 8), -} ISSInfo; - -/* - * Store var into env + offset to a member with size bytes. - * Free var after use. - */ -void store_cpu_offset(TCGv_i32 var, int offset, int size) -{ - switch (size) { - case 1: - tcg_gen_st8_i32(var, cpu_env, offset); - break; - case 4: - tcg_gen_st_i32(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } - tcg_temp_free_i32(var); -} - -/* Save the syndrome information for a Data Abort */ -static void disas_set_da_iss(DisasContext *s, MemOp memop, ISSInfo issinfo) -{ - uint32_t syn; - int sas = memop & MO_SIZE; - bool sse = memop & MO_SIGN; - bool is_acqrel = issinfo & ISSIsAcqRel; - bool is_write = issinfo & ISSIsWrite; - bool is_16bit = issinfo & ISSIs16Bit; - int srt = issinfo & ISSRegMask; - - if (issinfo & ISSInvalid) { - /* Some callsites want to conditionally provide ISS info, - * eg "only if this was not a writeback" - */ - return; - } - - if (srt == 15) { - /* For AArch32, insns where the src/dest is R15 never generate - * ISS information. Catching that here saves checking at all - * the call sites. - */ - return; - } - - syn = syn_data_abort_with_iss(0, sas, sse, srt, 0, is_acqrel, - 0, 0, 0, is_write, 0, is_16bit); - disas_set_insn_syndrome(s, syn); -} - -static inline int get_a32_user_mem_index(DisasContext *s) -{ - /* Return the core mmu_idx to use for A32/T32 "unprivileged load/store" - * insns: - * if PL2, UNPREDICTABLE (we choose to implement as if PL0) - * otherwise, access as if at PL0. - */ - switch (s->mmu_idx) { - case ARMMMUIdx_E3: - case ARMMMUIdx_E2: /* this one is UNPREDICTABLE */ - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - return arm_to_core_mmu_idx(ARMMMUIdx_E10_0); - case ARMMMUIdx_MUser: - case ARMMMUIdx_MPriv: - return arm_to_core_mmu_idx(ARMMMUIdx_MUser); - case ARMMMUIdx_MUserNegPri: - case ARMMMUIdx_MPrivNegPri: - return arm_to_core_mmu_idx(ARMMMUIdx_MUserNegPri); - case ARMMMUIdx_MSUser: - case ARMMMUIdx_MSPriv: - return arm_to_core_mmu_idx(ARMMMUIdx_MSUser); - case ARMMMUIdx_MSUserNegPri: - case ARMMMUIdx_MSPrivNegPri: - return arm_to_core_mmu_idx(ARMMMUIdx_MSUserNegPri); - default: - g_assert_not_reached(); - } -} - -/* The pc_curr difference for an architectural jump. */ -static target_long jmp_diff(DisasContext *s, target_long diff) -{ - return diff + (s->thumb ? 
4 : 8); -} - -static void gen_pc_plus_diff(DisasContext *s, TCGv_i32 var, target_long diff) -{ - assert(s->pc_save != -1); - if (TARGET_TB_PCREL) { - tcg_gen_addi_i32(var, cpu_R[15], (s->pc_curr - s->pc_save) + diff); - } else { - tcg_gen_movi_i32(var, s->pc_curr + diff); - } -} - -/* Set a variable to the value of a CPU register. */ -void load_reg_var(DisasContext *s, TCGv_i32 var, int reg) -{ - if (reg == 15) { - gen_pc_plus_diff(s, var, jmp_diff(s, 0)); - } else { - tcg_gen_mov_i32(var, cpu_R[reg]); - } -} - -/* - * Create a new temp, REG + OFS, except PC is ALIGN(PC, 4). - * This is used for load/store for which use of PC implies (literal), - * or ADD that implies ADR. - */ -TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - - if (reg == 15) { - /* - * This address is computed from an aligned PC: - * subtract off the low bits. - */ - gen_pc_plus_diff(s, tmp, jmp_diff(s, ofs - (s->pc_curr & 3))); - } else { - tcg_gen_addi_i32(tmp, cpu_R[reg], ofs); - } - return tmp; -} - -/* Set a CPU register. The source must be a temporary and will be - marked as dead. */ -void store_reg(DisasContext *s, int reg, TCGv_i32 var) -{ - if (reg == 15) { - /* In Thumb mode, we must ignore bit 0. - * In ARM mode, for ARMv4 and ARMv5, it is UNPREDICTABLE if bits [1:0] - * are not 0b00, but for ARMv6 and above, we must ignore bits [1:0]. - * We choose to ignore [1:0] in ARM mode for all architecture versions. - */ - tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3); - s->base.is_jmp = DISAS_JUMP; - s->pc_save = -1; - } else if (reg == 13 && arm_dc_feature(s, ARM_FEATURE_M)) { - /* For M-profile SP bits [1:0] are always zero */ - tcg_gen_andi_i32(var, var, ~3); - } - tcg_gen_mov_i32(cpu_R[reg], var); - tcg_temp_free_i32(var); -} - -/* - * Variant of store_reg which applies v8M stack-limit checks before updating - * SP. If the check fails this will result in an exception being taken. - * We disable the stack checks for CONFIG_USER_ONLY because we have - * no idea what the stack limits should be in that case. - * If stack checking is not being done this just acts like store_reg(). - */ -static void store_sp_checked(DisasContext *s, TCGv_i32 var) -{ -#ifndef CONFIG_USER_ONLY - if (s->v8m_stackcheck) { - gen_helper_v8m_stackcheck(cpu_env, var); - } -#endif - store_reg(s, 13, var); -} - -/* Value extensions. */ -#define gen_uxtb(var) tcg_gen_ext8u_i32(var, var) -#define gen_uxth(var) tcg_gen_ext16u_i32(var, var) -#define gen_sxtb(var) tcg_gen_ext8s_i32(var, var) -#define gen_sxth(var) tcg_gen_ext16s_i32(var, var) - -#define gen_sxtb16(var) gen_helper_sxtb16(var, var) -#define gen_uxtb16(var) gen_helper_uxtb16(var, var) - -void gen_set_cpsr(TCGv_i32 var, uint32_t mask) -{ - gen_helper_cpsr_write(cpu_env, var, tcg_constant_i32(mask)); -} - -static void gen_rebuild_hflags(DisasContext *s, bool new_el) -{ - bool m_profile = arm_dc_feature(s, ARM_FEATURE_M); - - if (new_el) { - if (m_profile) { - gen_helper_rebuild_hflags_m32_newel(cpu_env); - } else { - gen_helper_rebuild_hflags_a32_newel(cpu_env); - } - } else { - TCGv_i32 tcg_el = tcg_constant_i32(s->current_el); - if (m_profile) { - gen_helper_rebuild_hflags_m32(cpu_env, tcg_el); - } else { - gen_helper_rebuild_hflags_a32(cpu_env, tcg_el); - } - } -} - -static void gen_exception_internal(int excp) -{ - assert(excp_is_internal(excp)); - gen_helper_exception_internal(cpu_env, tcg_constant_i32(excp)); -} - -static void gen_singlestep_exception(DisasContext *s) -{ - /* We just completed step of an insn. 
Move from Active-not-pending - * to Active-pending, and then also take the swstep exception. - * This corresponds to making the (IMPDEF) choice to prioritize - * swstep exceptions over asynchronous exceptions taken to an exception - * level where debug is disabled. This choice has the advantage that - * we do not need to maintain internal state corresponding to the - * ISV/EX syndrome bits between completion of the step and generation - * of the exception, and our syndrome information is always correct. - */ - gen_ss_advance(s); - gen_swstep_exception(s, 1, s->is_ldex); - s->base.is_jmp = DISAS_NORETURN; -} - -void clear_eci_state(DisasContext *s) -{ - /* - * Clear any ECI/ICI state: used when a load multiple/store - * multiple insn executes. - */ - if (s->eci) { - store_cpu_field_constant(0, condexec_bits); - s->eci = 0; - } -} - -static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b) -{ - TCGv_i32 tmp1 = tcg_temp_new_i32(); - TCGv_i32 tmp2 = tcg_temp_new_i32(); - tcg_gen_ext16s_i32(tmp1, a); - tcg_gen_ext16s_i32(tmp2, b); - tcg_gen_mul_i32(tmp1, tmp1, tmp2); - tcg_temp_free_i32(tmp2); - tcg_gen_sari_i32(a, a, 16); - tcg_gen_sari_i32(b, b, 16); - tcg_gen_mul_i32(b, b, a); - tcg_gen_mov_i32(a, tmp1); - tcg_temp_free_i32(tmp1); -} - -/* Byteswap each halfword. */ -void gen_rev16(TCGv_i32 dest, TCGv_i32 var) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - TCGv_i32 mask = tcg_constant_i32(0x00ff00ff); - tcg_gen_shri_i32(tmp, var, 8); - tcg_gen_and_i32(tmp, tmp, mask); - tcg_gen_and_i32(var, var, mask); - tcg_gen_shli_i32(var, var, 8); - tcg_gen_or_i32(dest, var, tmp); - tcg_temp_free_i32(tmp); -} - -/* Byteswap low halfword and sign extend. */ -static void gen_revsh(TCGv_i32 dest, TCGv_i32 var) -{ - tcg_gen_bswap16_i32(var, var, TCG_BSWAP_OS); -} - -/* Dual 16-bit add. Result placed in t0 and t1 is marked as dead. - tmp = (t0 ^ t1) & 0x8000; - t0 &= ~0x8000; - t1 &= ~0x8000; - t0 = (t0 + t1) ^ tmp; - */ - -static void gen_add16(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_xor_i32(tmp, t0, t1); - tcg_gen_andi_i32(tmp, tmp, 0x8000); - tcg_gen_andi_i32(t0, t0, ~0x8000); - tcg_gen_andi_i32(t1, t1, ~0x8000); - tcg_gen_add_i32(t0, t0, t1); - tcg_gen_xor_i32(dest, t0, tmp); - tcg_temp_free_i32(tmp); -} - -/* Set N and Z flags from var. */ -static inline void gen_logic_CC(TCGv_i32 var) -{ - tcg_gen_mov_i32(cpu_NF, var); - tcg_gen_mov_i32(cpu_ZF, var); -} - -/* dest = T0 + T1 + CF. */ -static void gen_add_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - tcg_gen_add_i32(dest, t0, t1); - tcg_gen_add_i32(dest, dest, cpu_CF); -} - -/* dest = T0 - T1 + CF - 1. */ -static void gen_sub_carry(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - tcg_gen_sub_i32(dest, t0, t1); - tcg_gen_add_i32(dest, dest, cpu_CF); - tcg_gen_subi_i32(dest, dest, 1); -} - -/* dest = T0 + T1. Compute C, N, V and Z flags */ -static void gen_add_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_movi_i32(tmp, 0); - tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, t1, tmp); - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); - tcg_gen_xor_i32(tmp, t0, t1); - tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); - tcg_temp_free_i32(tmp); - tcg_gen_mov_i32(dest, cpu_NF); -} - -/* dest = T0 + T1 + CF. 
Compute C, N, V and Z flags */ -static void gen_adc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - if (TCG_TARGET_HAS_add2_i32) { - tcg_gen_movi_i32(tmp, 0); - tcg_gen_add2_i32(cpu_NF, cpu_CF, t0, tmp, cpu_CF, tmp); - tcg_gen_add2_i32(cpu_NF, cpu_CF, cpu_NF, cpu_CF, t1, tmp); - } else { - TCGv_i64 q0 = tcg_temp_new_i64(); - TCGv_i64 q1 = tcg_temp_new_i64(); - tcg_gen_extu_i32_i64(q0, t0); - tcg_gen_extu_i32_i64(q1, t1); - tcg_gen_add_i64(q0, q0, q1); - tcg_gen_extu_i32_i64(q1, cpu_CF); - tcg_gen_add_i64(q0, q0, q1); - tcg_gen_extr_i64_i32(cpu_NF, cpu_CF, q0); - tcg_temp_free_i64(q0); - tcg_temp_free_i64(q1); - } - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); - tcg_gen_xor_i32(tmp, t0, t1); - tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp); - tcg_temp_free_i32(tmp); - tcg_gen_mov_i32(dest, cpu_NF); -} - -/* dest = T0 - T1. Compute C, N, V and Z flags */ -static void gen_sub_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 tmp; - tcg_gen_sub_i32(cpu_NF, t0, t1); - tcg_gen_mov_i32(cpu_ZF, cpu_NF); - tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0, t1); - tcg_gen_xor_i32(cpu_VF, cpu_NF, t0); - tmp = tcg_temp_new_i32(); - tcg_gen_xor_i32(tmp, t0, t1); - tcg_gen_and_i32(cpu_VF, cpu_VF, tmp); - tcg_temp_free_i32(tmp); - tcg_gen_mov_i32(dest, cpu_NF); -} - -/* dest = T0 + ~T1 + CF. Compute C, N, V and Z flags */ -static void gen_sbc_CC(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_not_i32(tmp, t1); - gen_adc_CC(dest, t0, tmp); - tcg_temp_free_i32(tmp); -} - -#define GEN_SHIFT(name) \ -static void gen_##name(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) \ -{ \ - TCGv_i32 tmpd = tcg_temp_new_i32(); \ - TCGv_i32 tmp1 = tcg_temp_new_i32(); \ - TCGv_i32 zero = tcg_constant_i32(0); \ - tcg_gen_andi_i32(tmp1, t1, 0x1f); \ - tcg_gen_##name##_i32(tmpd, t0, tmp1); \ - tcg_gen_andi_i32(tmp1, t1, 0xe0); \ - tcg_gen_movcond_i32(TCG_COND_NE, dest, tmp1, zero, zero, tmpd); \ - tcg_temp_free_i32(tmpd); \ - tcg_temp_free_i32(tmp1); \ -} -GEN_SHIFT(shl) -GEN_SHIFT(shr) -#undef GEN_SHIFT - -static void gen_sar(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 tmp1 = tcg_temp_new_i32(); - - tcg_gen_andi_i32(tmp1, t1, 0xff); - tcg_gen_umin_i32(tmp1, tmp1, tcg_constant_i32(31)); - tcg_gen_sar_i32(dest, t0, tmp1); - tcg_temp_free_i32(tmp1); -} - -static void shifter_out_im(TCGv_i32 var, int shift) -{ - tcg_gen_extract_i32(cpu_CF, var, shift, 1); -} - -/* Shift by immediate. Includes special handling for shift == 0. 
*/ -static inline void gen_arm_shift_im(TCGv_i32 var, int shiftop, - int shift, int flags) -{ - switch (shiftop) { - case 0: /* LSL */ - if (shift != 0) { - if (flags) - shifter_out_im(var, 32 - shift); - tcg_gen_shli_i32(var, var, shift); - } - break; - case 1: /* LSR */ - if (shift == 0) { - if (flags) { - tcg_gen_shri_i32(cpu_CF, var, 31); - } - tcg_gen_movi_i32(var, 0); - } else { - if (flags) - shifter_out_im(var, shift - 1); - tcg_gen_shri_i32(var, var, shift); - } - break; - case 2: /* ASR */ - if (shift == 0) - shift = 32; - if (flags) - shifter_out_im(var, shift - 1); - if (shift == 32) - shift = 31; - tcg_gen_sari_i32(var, var, shift); - break; - case 3: /* ROR/RRX */ - if (shift != 0) { - if (flags) - shifter_out_im(var, shift - 1); - tcg_gen_rotri_i32(var, var, shift); break; - } else { - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_shli_i32(tmp, cpu_CF, 31); - if (flags) - shifter_out_im(var, 0); - tcg_gen_shri_i32(var, var, 1); - tcg_gen_or_i32(var, var, tmp); - tcg_temp_free_i32(tmp); - } - } -}; - -static inline void gen_arm_shift_reg(TCGv_i32 var, int shiftop, - TCGv_i32 shift, int flags) -{ - if (flags) { - switch (shiftop) { - case 0: gen_helper_shl_cc(var, cpu_env, var, shift); break; - case 1: gen_helper_shr_cc(var, cpu_env, var, shift); break; - case 2: gen_helper_sar_cc(var, cpu_env, var, shift); break; - case 3: gen_helper_ror_cc(var, cpu_env, var, shift); break; - } - } else { - switch (shiftop) { - case 0: - gen_shl(var, var, shift); - break; - case 1: - gen_shr(var, var, shift); - break; - case 2: - gen_sar(var, var, shift); - break; - case 3: tcg_gen_andi_i32(shift, shift, 0x1f); - tcg_gen_rotr_i32(var, var, shift); break; - } - } - tcg_temp_free_i32(shift); -} - -/* - * Generate a conditional based on ARM condition code cc. - * This is common between ARM and Aarch64 targets. - */ -void arm_test_cc(DisasCompare *cmp, int cc) -{ - TCGv_i32 value; - TCGCond cond; - bool global = true; - - switch (cc) { - case 0: /* eq: Z */ - case 1: /* ne: !Z */ - cond = TCG_COND_EQ; - value = cpu_ZF; - break; - - case 2: /* cs: C */ - case 3: /* cc: !C */ - cond = TCG_COND_NE; - value = cpu_CF; - break; - - case 4: /* mi: N */ - case 5: /* pl: !N */ - cond = TCG_COND_LT; - value = cpu_NF; - break; - - case 6: /* vs: V */ - case 7: /* vc: !V */ - cond = TCG_COND_LT; - value = cpu_VF; - break; - - case 8: /* hi: C && !Z */ - case 9: /* ls: !C || Z -> !(C && !Z) */ - cond = TCG_COND_NE; - value = tcg_temp_new_i32(); - global = false; - /* CF is 1 for C, so -CF is an all-bits-set mask for C; - ZF is non-zero for !Z; so AND the two subexpressions. */ - tcg_gen_neg_i32(value, cpu_CF); - tcg_gen_and_i32(value, value, cpu_ZF); - break; - - case 10: /* ge: N == V -> N ^ V == 0 */ - case 11: /* lt: N != V -> N ^ V != 0 */ - /* Since we're only interested in the sign bit, == 0 is >= 0. */ - cond = TCG_COND_GE; - value = tcg_temp_new_i32(); - global = false; - tcg_gen_xor_i32(value, cpu_VF, cpu_NF); - break; - - case 12: /* gt: !Z && N == V */ - case 13: /* le: Z || N != V */ - cond = TCG_COND_NE; - value = tcg_temp_new_i32(); - global = false; - /* (N == V) is equal to the sign bit of ~(NF ^ VF). Propagate - * the sign bit then AND with ZF to yield the result. */ - tcg_gen_xor_i32(value, cpu_VF, cpu_NF); - tcg_gen_sari_i32(value, value, 31); - tcg_gen_andc_i32(value, cpu_ZF, value); - break; - - case 14: /* always */ - case 15: /* always */ - /* Use the ALWAYS condition, which will fold early. - * It doesn't matter what we use for the value. 
*/ - cond = TCG_COND_ALWAYS; - value = cpu_ZF; - goto no_invert; - - default: - fprintf(stderr, "Bad condition code 0x%x\n", cc); - abort(); - } - - if (cc & 1) { - cond = tcg_invert_cond(cond); - } - - no_invert: - cmp->cond = cond; - cmp->value = value; - cmp->value_global = global; -} - -void arm_free_cc(DisasCompare *cmp) -{ - if (!cmp->value_global) { - tcg_temp_free_i32(cmp->value); - } -} - -void arm_jump_cc(DisasCompare *cmp, TCGLabel *label) -{ - tcg_gen_brcondi_i32(cmp->cond, cmp->value, 0, label); -} - -void arm_gen_test_cc(int cc, TCGLabel *label) -{ - DisasCompare cmp; - arm_test_cc(&cmp, cc); - arm_jump_cc(&cmp, label); - arm_free_cc(&cmp); -} - -void gen_set_condexec(DisasContext *s) -{ - if (s->condexec_mask) { - uint32_t val = (s->condexec_cond << 4) | (s->condexec_mask >> 1); - - store_cpu_field_constant(val, condexec_bits); - } -} - -void gen_update_pc(DisasContext *s, target_long diff) -{ - gen_pc_plus_diff(s, cpu_R[15], diff); - s->pc_save = s->pc_curr + diff; -} - -/* Set PC and Thumb state from var. var is marked as dead. */ -static inline void gen_bx(DisasContext *s, TCGv_i32 var) -{ - s->base.is_jmp = DISAS_JUMP; - tcg_gen_andi_i32(cpu_R[15], var, ~1); - tcg_gen_andi_i32(var, var, 1); - store_cpu_field(var, thumb); - s->pc_save = -1; -} - -/* - * Set PC and Thumb state from var. var is marked as dead. - * For M-profile CPUs, include logic to detect exception-return - * branches and handle them. This is needed for Thumb POP/LDM to PC, LDR to PC, - * and BX reg, and no others, and happens only for code in Handler mode. - * The Security Extension also requires us to check for the FNC_RETURN - * which signals a function return from non-secure state; this can happen - * in both Handler and Thread mode. - * To avoid having to do multiple comparisons in inline generated code, - * we make the check we do here loose, so it will match for EXC_RETURN - * in Thread mode. For system emulation do_v7m_exception_exit() checks - * for these spurious cases and returns without doing anything (giving - * the same behaviour as for a branch to a non-magic address). - * - * In linux-user mode it is unclear what the right behaviour for an - * attempted FNC_RETURN should be, because in real hardware this will go - * directly to Secure code (ie not the Linux kernel) which will then treat - * the error in any way it chooses. For QEMU we opt to make the FNC_RETURN - * attempt behave the way it would on a CPU without the security extension, - * which is to say "like a normal branch". That means we can simply treat - * all branches as normal with no magic address behaviour. - */ -static inline void gen_bx_excret(DisasContext *s, TCGv_i32 var) -{ - /* Generate the same code here as for a simple bx, but flag via - * s->base.is_jmp that we need to do the rest of the work later. - */ - gen_bx(s, var); -#ifndef CONFIG_USER_ONLY - if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY) || - (s->v7m_handler_mode && arm_dc_feature(s, ARM_FEATURE_M))) { - s->base.is_jmp = DISAS_BX_EXCRET; - } -#endif -} - -static inline void gen_bx_excret_final_code(DisasContext *s) -{ - /* Generate the code to finish possible exception return and end the TB */ - DisasLabel excret_label = gen_disas_label(s); - uint32_t min_magic; - - if (arm_dc_feature(s, ARM_FEATURE_M_SECURITY)) { - /* Covers FNC_RETURN and EXC_RETURN magic */ - min_magic = FNC_RETURN_MIN_MAGIC; - } else { - /* EXC_RETURN magic only */ - min_magic = EXC_RETURN_MIN_MAGIC; - } - - /* Is the new PC value in the magic range indicating exception return? 
*/ - tcg_gen_brcondi_i32(TCG_COND_GEU, cpu_R[15], min_magic, excret_label.label); - /* No: end the TB as we would for a DISAS_JMP */ - if (s->ss_active) { - gen_singlestep_exception(s); - } else { - tcg_gen_exit_tb(NULL, 0); - } - set_disas_label(s, excret_label); - /* Yes: this is an exception return. - * At this point in runtime env->regs[15] and env->thumb will hold - * the exception-return magic number, which do_v7m_exception_exit() - * will read. Nothing else will be able to see those values because - * the cpu-exec main loop guarantees that we will always go straight - * from raising the exception to the exception-handling code. - * - * gen_ss_advance(s) does nothing on M profile currently but - * calling it is conceptually the right thing as we have executed - * this instruction (compare SWI, HVC, SMC handling). - */ - gen_ss_advance(s); - gen_exception_internal(EXCP_EXCEPTION_EXIT); -} - -static inline void gen_bxns(DisasContext *s, int rm) -{ - TCGv_i32 var = load_reg(s, rm); - - /* The bxns helper may raise an EXCEPTION_EXIT exception, so in theory - * we need to sync state before calling it, but: - * - we don't need to do gen_update_pc() because the bxns helper will - * always set the PC itself - * - we don't need to do gen_set_condexec() because BXNS is UNPREDICTABLE - * unless it's outside an IT block or the last insn in an IT block, - * so we know that condexec == 0 (already set at the top of the TB) - * is correct in the non-UNPREDICTABLE cases, and we can choose - * "zeroes the IT bits" as our UNPREDICTABLE behaviour otherwise. - */ - gen_helper_v7m_bxns(cpu_env, var); - tcg_temp_free_i32(var); - s->base.is_jmp = DISAS_EXIT; -} - -static inline void gen_blxns(DisasContext *s, int rm) -{ - TCGv_i32 var = load_reg(s, rm); - - /* We don't need to sync condexec state, for the same reason as bxns. - * We do however need to set the PC, because the blxns helper reads it. - * The blxns helper may throw an exception. - */ - gen_update_pc(s, curr_insn_len(s)); - gen_helper_v7m_blxns(cpu_env, var); - tcg_temp_free_i32(var); - s->base.is_jmp = DISAS_EXIT; -} - -/* Variant of store_reg which uses branch&exchange logic when storing - to r15 in ARM architecture v7 and above. The source must be a temporary - and will be marked as dead. */ -static inline void store_reg_bx(DisasContext *s, int reg, TCGv_i32 var) -{ - if (reg == 15 && ENABLE_ARCH_7) { - gen_bx(s, var); - } else { - store_reg(s, reg, var); - } -} - -/* Variant of store_reg which uses branch&exchange logic when storing - * to r15 in ARM architecture v5T and above. This is used for storing - * the results of a LDR/LDM/POP into r15, and corresponds to the cases - * in the ARM ARM which use the LoadWritePC() pseudocode function. */ -static inline void store_reg_from_load(DisasContext *s, int reg, TCGv_i32 var) -{ - if (reg == 15 && ENABLE_ARCH_5) { - gen_bx_excret(s, var); - } else { - store_reg(s, reg, var); - } -} - -#ifdef CONFIG_USER_ONLY -#define IS_USER_ONLY 1 -#else -#define IS_USER_ONLY 0 -#endif - -MemOp pow2_align(unsigned i) -{ - static const MemOp mop_align[] = { - 0, MO_ALIGN_2, MO_ALIGN_4, MO_ALIGN_8, MO_ALIGN_16, - /* - * FIXME: TARGET_PAGE_BITS_MIN affects TLB_FLAGS_MASK such - * that 256-bit alignment (MO_ALIGN_32) cannot be supported: - * see get_alignment_bits(). Enforce only 128-bit alignment for now. 
- */ - MO_ALIGN_16 - }; - g_assert(i < ARRAY_SIZE(mop_align)); - return mop_align[i]; -} - -/* - * Abstractions of "generate code to do a guest load/store for - * AArch32", where a vaddr is always 32 bits (and is zero - * extended if we're a 64 bit core) and data is also - * 32 bits unless specifically doing a 64 bit access. - * These functions work like tcg_gen_qemu_{ld,st}* except - * that the address argument is TCGv_i32 rather than TCGv. - */ - -static TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, MemOp op) -{ - TCGv addr = tcg_temp_new(); - tcg_gen_extu_i32_tl(addr, a32); - - /* Not needed for user-mode BE32, where we use MO_BE instead. */ - if (!IS_USER_ONLY && s->sctlr_b && (op & MO_SIZE) < MO_32) { - tcg_gen_xori_tl(addr, addr, 4 - (1 << (op & MO_SIZE))); - } - return addr; -} - -/* - * Internal routines are used for NEON cases where the endianness - * and/or alignment has already been taken into account and manipulated. - */ -void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val, - TCGv_i32 a32, int index, MemOp opc) -{ - TCGv addr = gen_aa32_addr(s, a32, opc); - tcg_gen_qemu_ld_i32(val, addr, index, opc); - tcg_temp_free(addr); -} - -void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val, - TCGv_i32 a32, int index, MemOp opc) -{ - TCGv addr = gen_aa32_addr(s, a32, opc); - tcg_gen_qemu_st_i32(val, addr, index, opc); - tcg_temp_free(addr); -} - -void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val, - TCGv_i32 a32, int index, MemOp opc) -{ - TCGv addr = gen_aa32_addr(s, a32, opc); - - tcg_gen_qemu_ld_i64(val, addr, index, opc); - - /* Not needed for user-mode BE32, where we use MO_BE instead. */ - if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) { - tcg_gen_rotri_i64(val, val, 32); - } - tcg_temp_free(addr); -} - -void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val, - TCGv_i32 a32, int index, MemOp opc) -{ - TCGv addr = gen_aa32_addr(s, a32, opc); - - /* Not needed for user-mode BE32, where we use MO_BE instead. 
*/ - if (!IS_USER_ONLY && s->sctlr_b && (opc & MO_SIZE) == MO_64) { - TCGv_i64 tmp = tcg_temp_new_i64(); - tcg_gen_rotri_i64(tmp, val, 32); - tcg_gen_qemu_st_i64(tmp, addr, index, opc); - tcg_temp_free_i64(tmp); - } else { - tcg_gen_qemu_st_i64(val, addr, index, opc); - } - tcg_temp_free(addr); -} - -void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, - int index, MemOp opc) -{ - gen_aa32_ld_internal_i32(s, val, a32, index, finalize_memop(s, opc)); -} - -void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32, - int index, MemOp opc) -{ - gen_aa32_st_internal_i32(s, val, a32, index, finalize_memop(s, opc)); -} - -void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, - int index, MemOp opc) -{ - gen_aa32_ld_internal_i64(s, val, a32, index, finalize_memop(s, opc)); -} - -void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32, - int index, MemOp opc) -{ - gen_aa32_st_internal_i64(s, val, a32, index, finalize_memop(s, opc)); -} - -#define DO_GEN_LD(SUFF, OPC) \ - static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \ - TCGv_i32 a32, int index) \ - { \ - gen_aa32_ld_i32(s, val, a32, index, OPC); \ - } - -#define DO_GEN_ST(SUFF, OPC) \ - static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \ - TCGv_i32 a32, int index) \ - { \ - gen_aa32_st_i32(s, val, a32, index, OPC); \ - } - -static inline void gen_hvc(DisasContext *s, int imm16) -{ - /* The pre HVC helper handles cases when HVC gets trapped - * as an undefined insn by runtime configuration (ie before - * the insn really executes). - */ - gen_update_pc(s, 0); - gen_helper_pre_hvc(cpu_env); - /* Otherwise we will treat this as a real exception which - * happens after execution of the insn. (The distinction matters - * for the PC value reported to the exception handler and also - * for single stepping.) - */ - s->svc_imm = imm16; - gen_update_pc(s, curr_insn_len(s)); - s->base.is_jmp = DISAS_HVC; -} - -static inline void gen_smc(DisasContext *s) -{ - /* As with HVC, we may take an exception either before or after - * the insn executes. 
- */ - gen_update_pc(s, 0); - gen_helper_pre_smc(cpu_env, tcg_constant_i32(syn_aa32_smc())); - gen_update_pc(s, curr_insn_len(s)); - s->base.is_jmp = DISAS_SMC; -} - -static void gen_exception_internal_insn(DisasContext *s, int excp) -{ - gen_set_condexec(s); - gen_update_pc(s, 0); - gen_exception_internal(excp); - s->base.is_jmp = DISAS_NORETURN; -} - -static void gen_exception_el_v(int excp, uint32_t syndrome, TCGv_i32 tcg_el) -{ - gen_helper_exception_with_syndrome_el(cpu_env, tcg_constant_i32(excp), - tcg_constant_i32(syndrome), tcg_el); -} - -static void gen_exception_el(int excp, uint32_t syndrome, uint32_t target_el) -{ - gen_exception_el_v(excp, syndrome, tcg_constant_i32(target_el)); -} - -static void gen_exception(int excp, uint32_t syndrome) -{ - gen_helper_exception_with_syndrome(cpu_env, tcg_constant_i32(excp), - tcg_constant_i32(syndrome)); -} - -static void gen_exception_insn_el_v(DisasContext *s, target_long pc_diff, - int excp, uint32_t syn, TCGv_i32 tcg_el) -{ - if (s->aarch64) { - gen_a64_update_pc(s, pc_diff); - } else { - gen_set_condexec(s); - gen_update_pc(s, pc_diff); - } - gen_exception_el_v(excp, syn, tcg_el); - s->base.is_jmp = DISAS_NORETURN; -} - -void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, - uint32_t syn, uint32_t target_el) -{ - gen_exception_insn_el_v(s, pc_diff, excp, syn, - tcg_constant_i32(target_el)); -} - -void gen_exception_insn(DisasContext *s, target_long pc_diff, - int excp, uint32_t syn) -{ - if (s->aarch64) { - gen_a64_update_pc(s, pc_diff); - } else { - gen_set_condexec(s); - gen_update_pc(s, pc_diff); - } - gen_exception(excp, syn); - s->base.is_jmp = DISAS_NORETURN; -} - -static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syn) -{ - gen_set_condexec(s); - gen_update_pc(s, 0); - gen_helper_exception_bkpt_insn(cpu_env, tcg_constant_i32(syn)); - s->base.is_jmp = DISAS_NORETURN; -} - -void unallocated_encoding(DisasContext *s) -{ - /* Unallocated and reserved encodings are uncategorized */ - gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized()); -} - -/* Force a TB lookup after an instruction that changes the CPU state. */ -void gen_lookup_tb(DisasContext *s) -{ - gen_pc_plus_diff(s, cpu_R[15], curr_insn_len(s)); - s->base.is_jmp = DISAS_EXIT; -} - -static inline void gen_hlt(DisasContext *s, int imm) -{ - /* HLT. This has two purposes. - * Architecturally, it is an external halting debug instruction. - * Since QEMU doesn't implement external debug, we treat this as - * it is required for halting debug disabled: it will UNDEF. - * Secondly, "HLT 0x3C" is a T32 semihosting trap instruction, - * and "HLT 0xF000" is an A32 semihosting syscall. These traps - * must trigger semihosting even for ARMv7 and earlier, where - * HLT was an undefined encoding. - * In system mode, we don't allow userspace access to - * semihosting, to provide some semblance of security - * (and for consistency with our 32-bit semihosting). - */ - if (semihosting_enabled(s->current_el == 0) && - (imm == (s->thumb ? 0x3c : 0xf000))) { - gen_exception_internal_insn(s, EXCP_SEMIHOST); - return; - } - - unallocated_encoding(s); -} - -/* - * Return the offset of a "full" NEON Dreg. - */ -long neon_full_reg_offset(unsigned reg) -{ - return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]); -} - -/* - * Return the offset of a 2**SIZE piece of a NEON register, at index ELE, - * where 0 is the least significant end of the register. 
- */ -long neon_element_offset(int reg, int element, MemOp memop) -{ - int element_size = 1 << (memop & MO_SIZE); - int ofs = element * element_size; -#if HOST_BIG_ENDIAN - /* - * Calculate the offset assuming fully little-endian, - * then XOR to account for the order of the 8-byte units. - */ - if (element_size < 8) { - ofs ^= 8 - element_size; - } -#endif - return neon_full_reg_offset(reg) + ofs; -} - -/* Return the offset of a VFP Dreg (dp = true) or VFP Sreg (dp = false). */ -long vfp_reg_offset(bool dp, unsigned reg) -{ - if (dp) { - return neon_element_offset(reg, 0, MO_64); - } else { - return neon_element_offset(reg >> 1, reg & 1, MO_32); - } -} - -void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop) -{ - long off = neon_element_offset(reg, ele, memop); - - switch (memop) { - case MO_SB: - tcg_gen_ld8s_i32(dest, cpu_env, off); - break; - case MO_UB: - tcg_gen_ld8u_i32(dest, cpu_env, off); - break; - case MO_SW: - tcg_gen_ld16s_i32(dest, cpu_env, off); - break; - case MO_UW: - tcg_gen_ld16u_i32(dest, cpu_env, off); - break; - case MO_UL: - case MO_SL: - tcg_gen_ld_i32(dest, cpu_env, off); - break; - default: - g_assert_not_reached(); - } -} - -void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop) -{ - long off = neon_element_offset(reg, ele, memop); - - switch (memop) { - case MO_SL: - tcg_gen_ld32s_i64(dest, cpu_env, off); - break; - case MO_UL: - tcg_gen_ld32u_i64(dest, cpu_env, off); - break; - case MO_UQ: - tcg_gen_ld_i64(dest, cpu_env, off); - break; - default: - g_assert_not_reached(); - } -} - -void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop) -{ - long off = neon_element_offset(reg, ele, memop); - - switch (memop) { - case MO_8: - tcg_gen_st8_i32(src, cpu_env, off); - break; - case MO_16: - tcg_gen_st16_i32(src, cpu_env, off); - break; - case MO_32: - tcg_gen_st_i32(src, cpu_env, off); - break; - default: - g_assert_not_reached(); - } -} - -void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop) -{ - long off = neon_element_offset(reg, ele, memop); - - switch (memop) { - case MO_32: - tcg_gen_st32_i64(src, cpu_env, off); - break; - case MO_64: - tcg_gen_st_i64(src, cpu_env, off); - break; - default: - g_assert_not_reached(); - } -} - -#define ARM_CP_RW_BIT (1 << 20) - -static inline void iwmmxt_load_reg(TCGv_i64 var, int reg) -{ - tcg_gen_ld_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg])); -} - -static inline void iwmmxt_store_reg(TCGv_i64 var, int reg) -{ - tcg_gen_st_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg])); -} - -static inline TCGv_i32 iwmmxt_load_creg(int reg) -{ - TCGv_i32 var = tcg_temp_new_i32(); - tcg_gen_ld_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg])); - return var; -} - -static inline void iwmmxt_store_creg(int reg, TCGv_i32 var) -{ - tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, iwmmxt.cregs[reg])); - tcg_temp_free_i32(var); -} - -static inline void gen_op_iwmmxt_movq_wRn_M0(int rn) -{ - iwmmxt_store_reg(cpu_M0, rn); -} - -static inline void gen_op_iwmmxt_movq_M0_wRn(int rn) -{ - iwmmxt_load_reg(cpu_M0, rn); -} - -static inline void gen_op_iwmmxt_orq_M0_wRn(int rn) -{ - iwmmxt_load_reg(cpu_V1, rn); - tcg_gen_or_i64(cpu_M0, cpu_M0, cpu_V1); -} - -static inline void gen_op_iwmmxt_andq_M0_wRn(int rn) -{ - iwmmxt_load_reg(cpu_V1, rn); - tcg_gen_and_i64(cpu_M0, cpu_M0, cpu_V1); -} - -static inline void gen_op_iwmmxt_xorq_M0_wRn(int rn) -{ - iwmmxt_load_reg(cpu_V1, rn); - tcg_gen_xor_i64(cpu_M0, cpu_M0, cpu_V1); -} - -#define IWMMXT_OP(name) \ -static 
inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \ -{ \ - iwmmxt_load_reg(cpu_V1, rn); \ - gen_helper_iwmmxt_##name(cpu_M0, cpu_M0, cpu_V1); \ -} - -#define IWMMXT_OP_ENV(name) \ -static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \ -{ \ - iwmmxt_load_reg(cpu_V1, rn); \ - gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0, cpu_V1); \ -} - -#define IWMMXT_OP_ENV_SIZE(name) \ -IWMMXT_OP_ENV(name##b) \ -IWMMXT_OP_ENV(name##w) \ -IWMMXT_OP_ENV(name##l) - -#define IWMMXT_OP_ENV1(name) \ -static inline void gen_op_iwmmxt_##name##_M0(void) \ -{ \ - gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0); \ -} - -IWMMXT_OP(maddsq) -IWMMXT_OP(madduq) -IWMMXT_OP(sadb) -IWMMXT_OP(sadw) -IWMMXT_OP(mulslw) -IWMMXT_OP(mulshw) -IWMMXT_OP(mululw) -IWMMXT_OP(muluhw) -IWMMXT_OP(macsw) -IWMMXT_OP(macuw) - -IWMMXT_OP_ENV_SIZE(unpackl) -IWMMXT_OP_ENV_SIZE(unpackh) - -IWMMXT_OP_ENV1(unpacklub) -IWMMXT_OP_ENV1(unpackluw) -IWMMXT_OP_ENV1(unpacklul) -IWMMXT_OP_ENV1(unpackhub) -IWMMXT_OP_ENV1(unpackhuw) -IWMMXT_OP_ENV1(unpackhul) -IWMMXT_OP_ENV1(unpacklsb) -IWMMXT_OP_ENV1(unpacklsw) -IWMMXT_OP_ENV1(unpacklsl) -IWMMXT_OP_ENV1(unpackhsb) -IWMMXT_OP_ENV1(unpackhsw) -IWMMXT_OP_ENV1(unpackhsl) - -IWMMXT_OP_ENV_SIZE(cmpeq) -IWMMXT_OP_ENV_SIZE(cmpgtu) -IWMMXT_OP_ENV_SIZE(cmpgts) - -IWMMXT_OP_ENV_SIZE(mins) -IWMMXT_OP_ENV_SIZE(minu) -IWMMXT_OP_ENV_SIZE(maxs) -IWMMXT_OP_ENV_SIZE(maxu) - -IWMMXT_OP_ENV_SIZE(subn) -IWMMXT_OP_ENV_SIZE(addn) -IWMMXT_OP_ENV_SIZE(subu) -IWMMXT_OP_ENV_SIZE(addu) -IWMMXT_OP_ENV_SIZE(subs) -IWMMXT_OP_ENV_SIZE(adds) - -IWMMXT_OP_ENV(avgb0) -IWMMXT_OP_ENV(avgb1) -IWMMXT_OP_ENV(avgw0) -IWMMXT_OP_ENV(avgw1) - -IWMMXT_OP_ENV(packuw) -IWMMXT_OP_ENV(packul) -IWMMXT_OP_ENV(packuq) -IWMMXT_OP_ENV(packsw) -IWMMXT_OP_ENV(packsl) -IWMMXT_OP_ENV(packsq) - -static void gen_op_iwmmxt_set_mup(void) -{ - TCGv_i32 tmp; - tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]); - tcg_gen_ori_i32(tmp, tmp, 2); - store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]); -} - -static void gen_op_iwmmxt_set_cup(void) -{ - TCGv_i32 tmp; - tmp = load_cpu_field(iwmmxt.cregs[ARM_IWMMXT_wCon]); - tcg_gen_ori_i32(tmp, tmp, 1); - store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCon]); -} - -static void gen_op_iwmmxt_setpsr_nz(void) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - gen_helper_iwmmxt_setpsr_nz(tmp, cpu_M0); - store_cpu_field(tmp, iwmmxt.cregs[ARM_IWMMXT_wCASF]); -} - -static inline void gen_op_iwmmxt_addl_M0_wRn(int rn) -{ - iwmmxt_load_reg(cpu_V1, rn); - tcg_gen_ext32u_i64(cpu_V1, cpu_V1); - tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1); -} - -static inline int gen_iwmmxt_address(DisasContext *s, uint32_t insn, - TCGv_i32 dest) -{ - int rd; - uint32_t offset; - TCGv_i32 tmp; - - rd = (insn >> 16) & 0xf; - tmp = load_reg(s, rd); - - offset = (insn & 0xff) << ((insn >> 7) & 2); - if (insn & (1 << 24)) { - /* Pre indexed */ - if (insn & (1 << 23)) - tcg_gen_addi_i32(tmp, tmp, offset); - else - tcg_gen_addi_i32(tmp, tmp, -offset); - tcg_gen_mov_i32(dest, tmp); - if (insn & (1 << 21)) - store_reg(s, rd, tmp); - else - tcg_temp_free_i32(tmp); - } else if (insn & (1 << 21)) { - /* Post indexed */ - tcg_gen_mov_i32(dest, tmp); - if (insn & (1 << 23)) - tcg_gen_addi_i32(tmp, tmp, offset); - else - tcg_gen_addi_i32(tmp, tmp, -offset); - store_reg(s, rd, tmp); - } else if (!(insn & (1 << 23))) - return 1; - return 0; -} - -static inline int gen_iwmmxt_shift(uint32_t insn, uint32_t mask, TCGv_i32 dest) -{ - int rd = (insn >> 0) & 0xf; - TCGv_i32 tmp; - - if (insn & (1 << 8)) { - if (rd < ARM_IWMMXT_wCGR0 || rd > ARM_IWMMXT_wCGR3) { - return 1; - } 
else { - tmp = iwmmxt_load_creg(rd); - } - } else { - tmp = tcg_temp_new_i32(); - iwmmxt_load_reg(cpu_V0, rd); - tcg_gen_extrl_i64_i32(tmp, cpu_V0); - } - tcg_gen_andi_i32(tmp, tmp, mask); - tcg_gen_mov_i32(dest, tmp); - tcg_temp_free_i32(tmp); - return 0; -} - -/* Disassemble an iwMMXt instruction. Returns nonzero if an error occurred - (ie. an undefined instruction). */ -static int disas_iwmmxt_insn(DisasContext *s, uint32_t insn) -{ - int rd, wrd; - int rdhi, rdlo, rd0, rd1, i; - TCGv_i32 addr; - TCGv_i32 tmp, tmp2, tmp3; - - if ((insn & 0x0e000e00) == 0x0c000000) { - if ((insn & 0x0fe00ff0) == 0x0c400000) { - wrd = insn & 0xf; - rdlo = (insn >> 12) & 0xf; - rdhi = (insn >> 16) & 0xf; - if (insn & ARM_CP_RW_BIT) { /* TMRRC */ - iwmmxt_load_reg(cpu_V0, wrd); - tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0); - tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0); - } else { /* TMCRR */ - tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]); - iwmmxt_store_reg(cpu_V0, wrd); - gen_op_iwmmxt_set_mup(); - } - return 0; - } - - wrd = (insn >> 12) & 0xf; - addr = tcg_temp_new_i32(); - if (gen_iwmmxt_address(s, insn, addr)) { - tcg_temp_free_i32(addr); - return 1; - } - if (insn & ARM_CP_RW_BIT) { - if ((insn >> 28) == 0xf) { /* WLDRW wCx */ - tmp = tcg_temp_new_i32(); - gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); - iwmmxt_store_creg(wrd, tmp); - } else { - i = 1; - if (insn & (1 << 8)) { - if (insn & (1 << 22)) { /* WLDRD */ - gen_aa32_ld64(s, cpu_M0, addr, get_mem_index(s)); - i = 0; - } else { /* WLDRW wRd */ - tmp = tcg_temp_new_i32(); - gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); - } - } else { - tmp = tcg_temp_new_i32(); - if (insn & (1 << 22)) { /* WLDRH */ - gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); - } else { /* WLDRB */ - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); - } - } - if (i) { - tcg_gen_extu_i32_i64(cpu_M0, tmp); - tcg_temp_free_i32(tmp); - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - } - } else { - if ((insn >> 28) == 0xf) { /* WSTRW wCx */ - tmp = iwmmxt_load_creg(wrd); - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - } else { - gen_op_iwmmxt_movq_M0_wRn(wrd); - tmp = tcg_temp_new_i32(); - if (insn & (1 << 8)) { - if (insn & (1 << 22)) { /* WSTRD */ - gen_aa32_st64(s, cpu_M0, addr, get_mem_index(s)); - } else { /* WSTRW wRd */ - tcg_gen_extrl_i64_i32(tmp, cpu_M0); - gen_aa32_st32(s, tmp, addr, get_mem_index(s)); - } - } else { - if (insn & (1 << 22)) { /* WSTRH */ - tcg_gen_extrl_i64_i32(tmp, cpu_M0); - gen_aa32_st16(s, tmp, addr, get_mem_index(s)); - } else { /* WSTRB */ - tcg_gen_extrl_i64_i32(tmp, cpu_M0); - gen_aa32_st8(s, tmp, addr, get_mem_index(s)); - } - } - } - tcg_temp_free_i32(tmp); - } - tcg_temp_free_i32(addr); - return 0; - } - - if ((insn & 0x0f000000) != 0x0e000000) - return 1; - - switch (((insn >> 12) & 0xf00) | ((insn >> 4) & 0xff)) { - case 0x000: /* WOR */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 0) & 0xf; - rd1 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - gen_op_iwmmxt_orq_M0_wRn(rd1); - gen_op_iwmmxt_setpsr_nz(); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x011: /* TMCR */ - if (insn & 0xf) - return 1; - rd = (insn >> 12) & 0xf; - wrd = (insn >> 16) & 0xf; - switch (wrd) { - case ARM_IWMMXT_wCID: - case ARM_IWMMXT_wCASF: - break; - case ARM_IWMMXT_wCon: - gen_op_iwmmxt_set_cup(); - /* Fall through. 
*/ - case ARM_IWMMXT_wCSSF: - tmp = iwmmxt_load_creg(wrd); - tmp2 = load_reg(s, rd); - tcg_gen_andc_i32(tmp, tmp, tmp2); - tcg_temp_free_i32(tmp2); - iwmmxt_store_creg(wrd, tmp); - break; - case ARM_IWMMXT_wCGR0: - case ARM_IWMMXT_wCGR1: - case ARM_IWMMXT_wCGR2: - case ARM_IWMMXT_wCGR3: - gen_op_iwmmxt_set_cup(); - tmp = load_reg(s, rd); - iwmmxt_store_creg(wrd, tmp); - break; - default: - return 1; - } - break; - case 0x100: /* WXOR */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 0) & 0xf; - rd1 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - gen_op_iwmmxt_xorq_M0_wRn(rd1); - gen_op_iwmmxt_setpsr_nz(); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x111: /* TMRC */ - if (insn & 0xf) - return 1; - rd = (insn >> 12) & 0xf; - wrd = (insn >> 16) & 0xf; - tmp = iwmmxt_load_creg(wrd); - store_reg(s, rd, tmp); - break; - case 0x300: /* WANDN */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 0) & 0xf; - rd1 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tcg_gen_neg_i64(cpu_M0, cpu_M0); - gen_op_iwmmxt_andq_M0_wRn(rd1); - gen_op_iwmmxt_setpsr_nz(); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x200: /* WAND */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 0) & 0xf; - rd1 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - gen_op_iwmmxt_andq_M0_wRn(rd1); - gen_op_iwmmxt_setpsr_nz(); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x810: case 0xa10: /* WMADD */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 0) & 0xf; - rd1 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - if (insn & (1 << 21)) - gen_op_iwmmxt_maddsq_M0_wRn(rd1); - else - gen_op_iwmmxt_madduq_M0_wRn(rd1); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x10e: case 0x50e: case 0x90e: case 0xd0e: /* WUNPCKIL */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - gen_op_iwmmxt_unpacklb_M0_wRn(rd1); - break; - case 1: - gen_op_iwmmxt_unpacklw_M0_wRn(rd1); - break; - case 2: - gen_op_iwmmxt_unpackll_M0_wRn(rd1); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x10c: case 0x50c: case 0x90c: case 0xd0c: /* WUNPCKIH */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - gen_op_iwmmxt_unpackhb_M0_wRn(rd1); - break; - case 1: - gen_op_iwmmxt_unpackhw_M0_wRn(rd1); - break; - case 2: - gen_op_iwmmxt_unpackhl_M0_wRn(rd1); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x012: case 0x112: case 0x412: case 0x512: /* WSAD */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - if (insn & (1 << 22)) - gen_op_iwmmxt_sadw_M0_wRn(rd1); - else - gen_op_iwmmxt_sadb_M0_wRn(rd1); - if (!(insn & (1 << 20))) - gen_op_iwmmxt_addl_M0_wRn(wrd); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x010: case 0x110: case 0x210: case 0x310: /* WMUL */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - if (insn & (1 << 21)) { - if (insn & (1 << 20)) - gen_op_iwmmxt_mulshw_M0_wRn(rd1); - else - 
gen_op_iwmmxt_mulslw_M0_wRn(rd1); - } else { - if (insn & (1 << 20)) - gen_op_iwmmxt_muluhw_M0_wRn(rd1); - else - gen_op_iwmmxt_mululw_M0_wRn(rd1); - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x410: case 0x510: case 0x610: case 0x710: /* WMAC */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - if (insn & (1 << 21)) - gen_op_iwmmxt_macsw_M0_wRn(rd1); - else - gen_op_iwmmxt_macuw_M0_wRn(rd1); - if (!(insn & (1 << 20))) { - iwmmxt_load_reg(cpu_V1, wrd); - tcg_gen_add_i64(cpu_M0, cpu_M0, cpu_V1); - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x006: case 0x406: case 0x806: case 0xc06: /* WCMPEQ */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - gen_op_iwmmxt_cmpeqb_M0_wRn(rd1); - break; - case 1: - gen_op_iwmmxt_cmpeqw_M0_wRn(rd1); - break; - case 2: - gen_op_iwmmxt_cmpeql_M0_wRn(rd1); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x800: case 0x900: case 0xc00: case 0xd00: /* WAVG2 */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - if (insn & (1 << 22)) { - if (insn & (1 << 20)) - gen_op_iwmmxt_avgw1_M0_wRn(rd1); - else - gen_op_iwmmxt_avgw0_M0_wRn(rd1); - } else { - if (insn & (1 << 20)) - gen_op_iwmmxt_avgb1_M0_wRn(rd1); - else - gen_op_iwmmxt_avgb0_M0_wRn(rd1); - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x802: case 0x902: case 0xa02: case 0xb02: /* WALIGNR */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = iwmmxt_load_creg(ARM_IWMMXT_wCGR0 + ((insn >> 20) & 3)); - tcg_gen_andi_i32(tmp, tmp, 7); - iwmmxt_load_reg(cpu_V1, rd1); - gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, tmp); - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x601: case 0x605: case 0x609: case 0x60d: /* TINSR */ - if (((insn >> 6) & 3) == 3) - return 1; - rd = (insn >> 12) & 0xf; - wrd = (insn >> 16) & 0xf; - tmp = load_reg(s, rd); - gen_op_iwmmxt_movq_M0_wRn(wrd); - switch ((insn >> 6) & 3) { - case 0: - tmp2 = tcg_constant_i32(0xff); - tmp3 = tcg_constant_i32((insn & 7) << 3); - break; - case 1: - tmp2 = tcg_constant_i32(0xffff); - tmp3 = tcg_constant_i32((insn & 3) << 4); - break; - case 2: - tmp2 = tcg_constant_i32(0xffffffff); - tmp3 = tcg_constant_i32((insn & 1) << 5); - break; - default: - g_assert_not_reached(); - } - gen_helper_iwmmxt_insr(cpu_M0, cpu_M0, tmp, tmp2, tmp3); - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x107: case 0x507: case 0x907: case 0xd07: /* TEXTRM */ - rd = (insn >> 12) & 0xf; - wrd = (insn >> 16) & 0xf; - if (rd == 15 || ((insn >> 22) & 3) == 3) - return 1; - gen_op_iwmmxt_movq_M0_wRn(wrd); - tmp = tcg_temp_new_i32(); - switch ((insn >> 22) & 3) { - case 0: - tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 7) << 3); - tcg_gen_extrl_i64_i32(tmp, cpu_M0); - if (insn & 8) { - tcg_gen_ext8s_i32(tmp, tmp); - } else { - tcg_gen_andi_i32(tmp, tmp, 0xff); - } - break; - case 1: - tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 3) << 4); - tcg_gen_extrl_i64_i32(tmp, cpu_M0); - if (insn & 8) { - tcg_gen_ext16s_i32(tmp, tmp); - } else { - 
tcg_gen_andi_i32(tmp, tmp, 0xffff); - } - break; - case 2: - tcg_gen_shri_i64(cpu_M0, cpu_M0, (insn & 1) << 5); - tcg_gen_extrl_i64_i32(tmp, cpu_M0); - break; - } - store_reg(s, rd, tmp); - break; - case 0x117: case 0x517: case 0x917: case 0xd17: /* TEXTRC */ - if ((insn & 0x000ff008) != 0x0003f000 || ((insn >> 22) & 3) == 3) - return 1; - tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); - switch ((insn >> 22) & 3) { - case 0: - tcg_gen_shri_i32(tmp, tmp, ((insn & 7) << 2) + 0); - break; - case 1: - tcg_gen_shri_i32(tmp, tmp, ((insn & 3) << 3) + 4); - break; - case 2: - tcg_gen_shri_i32(tmp, tmp, ((insn & 1) << 4) + 12); - break; - } - tcg_gen_shli_i32(tmp, tmp, 28); - gen_set_nzcv(tmp); - tcg_temp_free_i32(tmp); - break; - case 0x401: case 0x405: case 0x409: case 0x40d: /* TBCST */ - if (((insn >> 6) & 3) == 3) - return 1; - rd = (insn >> 12) & 0xf; - wrd = (insn >> 16) & 0xf; - tmp = load_reg(s, rd); - switch ((insn >> 6) & 3) { - case 0: - gen_helper_iwmmxt_bcstb(cpu_M0, tmp); - break; - case 1: - gen_helper_iwmmxt_bcstw(cpu_M0, tmp); - break; - case 2: - gen_helper_iwmmxt_bcstl(cpu_M0, tmp); - break; - } - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x113: case 0x513: case 0x913: case 0xd13: /* TANDC */ - if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3) - return 1; - tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); - tmp2 = tcg_temp_new_i32(); - tcg_gen_mov_i32(tmp2, tmp); - switch ((insn >> 22) & 3) { - case 0: - for (i = 0; i < 7; i ++) { - tcg_gen_shli_i32(tmp2, tmp2, 4); - tcg_gen_and_i32(tmp, tmp, tmp2); - } - break; - case 1: - for (i = 0; i < 3; i ++) { - tcg_gen_shli_i32(tmp2, tmp2, 8); - tcg_gen_and_i32(tmp, tmp, tmp2); - } - break; - case 2: - tcg_gen_shli_i32(tmp2, tmp2, 16); - tcg_gen_and_i32(tmp, tmp, tmp2); - break; - } - gen_set_nzcv(tmp); - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(tmp); - break; - case 0x01c: case 0x41c: case 0x81c: case 0xc1c: /* WACC */ - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - gen_helper_iwmmxt_addcb(cpu_M0, cpu_M0); - break; - case 1: - gen_helper_iwmmxt_addcw(cpu_M0, cpu_M0); - break; - case 2: - gen_helper_iwmmxt_addcl(cpu_M0, cpu_M0); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x115: case 0x515: case 0x915: case 0xd15: /* TORC */ - if ((insn & 0x000ff00f) != 0x0003f000 || ((insn >> 22) & 3) == 3) - return 1; - tmp = iwmmxt_load_creg(ARM_IWMMXT_wCASF); - tmp2 = tcg_temp_new_i32(); - tcg_gen_mov_i32(tmp2, tmp); - switch ((insn >> 22) & 3) { - case 0: - for (i = 0; i < 7; i ++) { - tcg_gen_shli_i32(tmp2, tmp2, 4); - tcg_gen_or_i32(tmp, tmp, tmp2); - } - break; - case 1: - for (i = 0; i < 3; i ++) { - tcg_gen_shli_i32(tmp2, tmp2, 8); - tcg_gen_or_i32(tmp, tmp, tmp2); - } - break; - case 2: - tcg_gen_shli_i32(tmp2, tmp2, 16); - tcg_gen_or_i32(tmp, tmp, tmp2); - break; - } - gen_set_nzcv(tmp); - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(tmp); - break; - case 0x103: case 0x503: case 0x903: case 0xd03: /* TMOVMSK */ - rd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - if ((insn & 0xf) != 0 || ((insn >> 22) & 3) == 3) - return 1; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = tcg_temp_new_i32(); - switch ((insn >> 22) & 3) { - case 0: - gen_helper_iwmmxt_msbb(tmp, cpu_M0); - break; - case 1: - gen_helper_iwmmxt_msbw(tmp, cpu_M0); - break; - case 2: - gen_helper_iwmmxt_msbl(tmp, cpu_M0); - break; - } - store_reg(s, rd, tmp); - break; - 
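/*
 * For reference, the data-processing cases above and below keep
 * re-extracting the same instruction fields; a minimal summary, taken
 * only from the surrounding code (which of the two source fields plays
 * rd0 vs rd1 varies by form):
 *
 *     (insn >> 12) & 0xf    destination wRn (or ARM register, per form)
 *     (insn >> 16) & 0xf    one source operand
 *      insn        & 0xf    the other source operand
 *     (insn >> 22) & 3      element size; many forms treat 3 as reserved,
 *                           while the shift forms use it for 64-bit data
 *     (insn >> 21) & 1      signed vs unsigned variant, where applicable
 */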
case 0x106: case 0x306: case 0x506: case 0x706: /* WCMPGT */ - case 0x906: case 0xb06: case 0xd06: case 0xf06: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - if (insn & (1 << 21)) - gen_op_iwmmxt_cmpgtsb_M0_wRn(rd1); - else - gen_op_iwmmxt_cmpgtub_M0_wRn(rd1); - break; - case 1: - if (insn & (1 << 21)) - gen_op_iwmmxt_cmpgtsw_M0_wRn(rd1); - else - gen_op_iwmmxt_cmpgtuw_M0_wRn(rd1); - break; - case 2: - if (insn & (1 << 21)) - gen_op_iwmmxt_cmpgtsl_M0_wRn(rd1); - else - gen_op_iwmmxt_cmpgtul_M0_wRn(rd1); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x00e: case 0x20e: case 0x40e: case 0x60e: /* WUNPCKEL */ - case 0x80e: case 0xa0e: case 0xc0e: case 0xe0e: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - if (insn & (1 << 21)) - gen_op_iwmmxt_unpacklsb_M0(); - else - gen_op_iwmmxt_unpacklub_M0(); - break; - case 1: - if (insn & (1 << 21)) - gen_op_iwmmxt_unpacklsw_M0(); - else - gen_op_iwmmxt_unpackluw_M0(); - break; - case 2: - if (insn & (1 << 21)) - gen_op_iwmmxt_unpacklsl_M0(); - else - gen_op_iwmmxt_unpacklul_M0(); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x00c: case 0x20c: case 0x40c: case 0x60c: /* WUNPCKEH */ - case 0x80c: case 0xa0c: case 0xc0c: case 0xe0c: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - if (insn & (1 << 21)) - gen_op_iwmmxt_unpackhsb_M0(); - else - gen_op_iwmmxt_unpackhub_M0(); - break; - case 1: - if (insn & (1 << 21)) - gen_op_iwmmxt_unpackhsw_M0(); - else - gen_op_iwmmxt_unpackhuw_M0(); - break; - case 2: - if (insn & (1 << 21)) - gen_op_iwmmxt_unpackhsl_M0(); - else - gen_op_iwmmxt_unpackhul_M0(); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x204: case 0x604: case 0xa04: case 0xe04: /* WSRL */ - case 0x214: case 0x614: case 0xa14: case 0xe14: - if (((insn >> 22) & 3) == 0) - return 1; - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = tcg_temp_new_i32(); - if (gen_iwmmxt_shift(insn, 0xff, tmp)) { - tcg_temp_free_i32(tmp); - return 1; - } - switch ((insn >> 22) & 3) { - case 1: - gen_helper_iwmmxt_srlw(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 2: - gen_helper_iwmmxt_srll(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 3: - gen_helper_iwmmxt_srlq(cpu_M0, cpu_env, cpu_M0, tmp); - break; - } - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x004: case 0x404: case 0x804: case 0xc04: /* WSRA */ - case 0x014: case 0x414: case 0x814: case 0xc14: - if (((insn >> 22) & 3) == 0) - return 1; - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = tcg_temp_new_i32(); - if (gen_iwmmxt_shift(insn, 0xff, tmp)) { - tcg_temp_free_i32(tmp); - return 1; - } - switch ((insn >> 22) & 3) { - case 1: - gen_helper_iwmmxt_sraw(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 2: - gen_helper_iwmmxt_sral(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 3: - gen_helper_iwmmxt_sraq(cpu_M0, cpu_env, cpu_M0, tmp); - break; - } - tcg_temp_free_i32(tmp); - 
gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x104: case 0x504: case 0x904: case 0xd04: /* WSLL */ - case 0x114: case 0x514: case 0x914: case 0xd14: - if (((insn >> 22) & 3) == 0) - return 1; - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = tcg_temp_new_i32(); - if (gen_iwmmxt_shift(insn, 0xff, tmp)) { - tcg_temp_free_i32(tmp); - return 1; - } - switch ((insn >> 22) & 3) { - case 1: - gen_helper_iwmmxt_sllw(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 2: - gen_helper_iwmmxt_slll(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 3: - gen_helper_iwmmxt_sllq(cpu_M0, cpu_env, cpu_M0, tmp); - break; - } - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x304: case 0x704: case 0xb04: case 0xf04: /* WROR */ - case 0x314: case 0x714: case 0xb14: case 0xf14: - if (((insn >> 22) & 3) == 0) - return 1; - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = tcg_temp_new_i32(); - switch ((insn >> 22) & 3) { - case 1: - if (gen_iwmmxt_shift(insn, 0xf, tmp)) { - tcg_temp_free_i32(tmp); - return 1; - } - gen_helper_iwmmxt_rorw(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 2: - if (gen_iwmmxt_shift(insn, 0x1f, tmp)) { - tcg_temp_free_i32(tmp); - return 1; - } - gen_helper_iwmmxt_rorl(cpu_M0, cpu_env, cpu_M0, tmp); - break; - case 3: - if (gen_iwmmxt_shift(insn, 0x3f, tmp)) { - tcg_temp_free_i32(tmp); - return 1; - } - gen_helper_iwmmxt_rorq(cpu_M0, cpu_env, cpu_M0, tmp); - break; - } - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x116: case 0x316: case 0x516: case 0x716: /* WMIN */ - case 0x916: case 0xb16: case 0xd16: case 0xf16: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - if (insn & (1 << 21)) - gen_op_iwmmxt_minsb_M0_wRn(rd1); - else - gen_op_iwmmxt_minub_M0_wRn(rd1); - break; - case 1: - if (insn & (1 << 21)) - gen_op_iwmmxt_minsw_M0_wRn(rd1); - else - gen_op_iwmmxt_minuw_M0_wRn(rd1); - break; - case 2: - if (insn & (1 << 21)) - gen_op_iwmmxt_minsl_M0_wRn(rd1); - else - gen_op_iwmmxt_minul_M0_wRn(rd1); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x016: case 0x216: case 0x416: case 0x616: /* WMAX */ - case 0x816: case 0xa16: case 0xc16: case 0xe16: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 0: - if (insn & (1 << 21)) - gen_op_iwmmxt_maxsb_M0_wRn(rd1); - else - gen_op_iwmmxt_maxub_M0_wRn(rd1); - break; - case 1: - if (insn & (1 << 21)) - gen_op_iwmmxt_maxsw_M0_wRn(rd1); - else - gen_op_iwmmxt_maxuw_M0_wRn(rd1); - break; - case 2: - if (insn & (1 << 21)) - gen_op_iwmmxt_maxsl_M0_wRn(rd1); - else - gen_op_iwmmxt_maxul_M0_wRn(rd1); - break; - case 3: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x002: case 0x102: case 0x202: case 0x302: /* WALIGNI */ - case 0x402: case 0x502: case 0x602: case 0x702: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - iwmmxt_load_reg(cpu_V1, rd1); - gen_helper_iwmmxt_align(cpu_M0, cpu_M0, cpu_V1, - tcg_constant_i32((insn >> 20) & 3)); - 
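/* Note the contrast with WALIGNR earlier: there the byte offset is read
 * from a wCGR control register and masked to 0..7 at runtime, while this
 * WALIGNI form encodes it directly as the immediate in insn[21:20]. */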
gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - case 0x01a: case 0x11a: case 0x21a: case 0x31a: /* WSUB */ - case 0x41a: case 0x51a: case 0x61a: case 0x71a: - case 0x81a: case 0x91a: case 0xa1a: case 0xb1a: - case 0xc1a: case 0xd1a: case 0xe1a: case 0xf1a: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 20) & 0xf) { - case 0x0: - gen_op_iwmmxt_subnb_M0_wRn(rd1); - break; - case 0x1: - gen_op_iwmmxt_subub_M0_wRn(rd1); - break; - case 0x3: - gen_op_iwmmxt_subsb_M0_wRn(rd1); - break; - case 0x4: - gen_op_iwmmxt_subnw_M0_wRn(rd1); - break; - case 0x5: - gen_op_iwmmxt_subuw_M0_wRn(rd1); - break; - case 0x7: - gen_op_iwmmxt_subsw_M0_wRn(rd1); - break; - case 0x8: - gen_op_iwmmxt_subnl_M0_wRn(rd1); - break; - case 0x9: - gen_op_iwmmxt_subul_M0_wRn(rd1); - break; - case 0xb: - gen_op_iwmmxt_subsl_M0_wRn(rd1); - break; - default: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x01e: case 0x11e: case 0x21e: case 0x31e: /* WSHUFH */ - case 0x41e: case 0x51e: case 0x61e: case 0x71e: - case 0x81e: case 0x91e: case 0xa1e: case 0xb1e: - case 0xc1e: case 0xd1e: case 0xe1e: case 0xf1e: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - tmp = tcg_constant_i32(((insn >> 16) & 0xf0) | (insn & 0x0f)); - gen_helper_iwmmxt_shufh(cpu_M0, cpu_env, cpu_M0, tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x018: case 0x118: case 0x218: case 0x318: /* WADD */ - case 0x418: case 0x518: case 0x618: case 0x718: - case 0x818: case 0x918: case 0xa18: case 0xb18: - case 0xc18: case 0xd18: case 0xe18: case 0xf18: - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 20) & 0xf) { - case 0x0: - gen_op_iwmmxt_addnb_M0_wRn(rd1); - break; - case 0x1: - gen_op_iwmmxt_addub_M0_wRn(rd1); - break; - case 0x3: - gen_op_iwmmxt_addsb_M0_wRn(rd1); - break; - case 0x4: - gen_op_iwmmxt_addnw_M0_wRn(rd1); - break; - case 0x5: - gen_op_iwmmxt_adduw_M0_wRn(rd1); - break; - case 0x7: - gen_op_iwmmxt_addsw_M0_wRn(rd1); - break; - case 0x8: - gen_op_iwmmxt_addnl_M0_wRn(rd1); - break; - case 0x9: - gen_op_iwmmxt_addul_M0_wRn(rd1); - break; - case 0xb: - gen_op_iwmmxt_addsl_M0_wRn(rd1); - break; - default: - return 1; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x008: case 0x108: case 0x208: case 0x308: /* WPACK */ - case 0x408: case 0x508: case 0x608: case 0x708: - case 0x808: case 0x908: case 0xa08: case 0xb08: - case 0xc08: case 0xd08: case 0xe08: case 0xf08: - if (!(insn & (1 << 20)) || ((insn >> 22) & 3) == 0) - return 1; - wrd = (insn >> 12) & 0xf; - rd0 = (insn >> 16) & 0xf; - rd1 = (insn >> 0) & 0xf; - gen_op_iwmmxt_movq_M0_wRn(rd0); - switch ((insn >> 22) & 3) { - case 1: - if (insn & (1 << 21)) - gen_op_iwmmxt_packsw_M0_wRn(rd1); - else - gen_op_iwmmxt_packuw_M0_wRn(rd1); - break; - case 2: - if (insn & (1 << 21)) - gen_op_iwmmxt_packsl_M0_wRn(rd1); - else - gen_op_iwmmxt_packul_M0_wRn(rd1); - break; - case 3: - if (insn & (1 << 21)) - gen_op_iwmmxt_packsq_M0_wRn(rd1); - else - gen_op_iwmmxt_packuq_M0_wRn(rd1); - break; - } - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - gen_op_iwmmxt_set_cup(); - break; - case 0x201: case 0x203: case 0x205: case 0x207: - case 0x209: case 0x20b: case 0x20d: case 
0x20f: - case 0x211: case 0x213: case 0x215: case 0x217: - case 0x219: case 0x21b: case 0x21d: case 0x21f: - wrd = (insn >> 5) & 0xf; - rd0 = (insn >> 12) & 0xf; - rd1 = (insn >> 0) & 0xf; - if (rd0 == 0xf || rd1 == 0xf) - return 1; - gen_op_iwmmxt_movq_M0_wRn(wrd); - tmp = load_reg(s, rd0); - tmp2 = load_reg(s, rd1); - switch ((insn >> 16) & 0xf) { - case 0x0: /* TMIA */ - gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2); - break; - case 0x8: /* TMIAPH */ - gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2); - break; - case 0xc: case 0xd: case 0xe: case 0xf: /* TMIAxy */ - if (insn & (1 << 16)) - tcg_gen_shri_i32(tmp, tmp, 16); - if (insn & (1 << 17)) - tcg_gen_shri_i32(tmp2, tmp2, 16); - gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2); - break; - default: - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(tmp); - return 1; - } - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(tmp); - gen_op_iwmmxt_movq_wRn_M0(wrd); - gen_op_iwmmxt_set_mup(); - break; - default: - return 1; - } - - return 0; -} - -/* Disassemble an XScale DSP instruction. Returns nonzero if an error occurred - (ie. an undefined instruction). */ -static int disas_dsp_insn(DisasContext *s, uint32_t insn) -{ - int acc, rd0, rd1, rdhi, rdlo; - TCGv_i32 tmp, tmp2; - - if ((insn & 0x0ff00f10) == 0x0e200010) { - /* Multiply with Internal Accumulate Format */ - rd0 = (insn >> 12) & 0xf; - rd1 = insn & 0xf; - acc = (insn >> 5) & 7; - - if (acc != 0) - return 1; - - tmp = load_reg(s, rd0); - tmp2 = load_reg(s, rd1); - switch ((insn >> 16) & 0xf) { - case 0x0: /* MIA */ - gen_helper_iwmmxt_muladdsl(cpu_M0, cpu_M0, tmp, tmp2); - break; - case 0x8: /* MIAPH */ - gen_helper_iwmmxt_muladdsw(cpu_M0, cpu_M0, tmp, tmp2); - break; - case 0xc: /* MIABB */ - case 0xd: /* MIABT */ - case 0xe: /* MIATB */ - case 0xf: /* MIATT */ - if (insn & (1 << 16)) - tcg_gen_shri_i32(tmp, tmp, 16); - if (insn & (1 << 17)) - tcg_gen_shri_i32(tmp2, tmp2, 16); - gen_helper_iwmmxt_muladdswl(cpu_M0, cpu_M0, tmp, tmp2); - break; - default: - return 1; - } - tcg_temp_free_i32(tmp2); - tcg_temp_free_i32(tmp); - - gen_op_iwmmxt_movq_wRn_M0(acc); - return 0; - } - - if ((insn & 0x0fe00ff8) == 0x0c400000) { - /* Internal Accumulator Access Format */ - rdhi = (insn >> 16) & 0xf; - rdlo = (insn >> 12) & 0xf; - acc = insn & 7; - - if (acc != 0) - return 1; - - if (insn & ARM_CP_RW_BIT) { /* MRA */ - iwmmxt_load_reg(cpu_V0, acc); - tcg_gen_extrl_i64_i32(cpu_R[rdlo], cpu_V0); - tcg_gen_extrh_i64_i32(cpu_R[rdhi], cpu_V0); - tcg_gen_andi_i32(cpu_R[rdhi], cpu_R[rdhi], (1 << (40 - 32)) - 1); - } else { /* MAR */ - tcg_gen_concat_i32_i64(cpu_V0, cpu_R[rdlo], cpu_R[rdhi]); - iwmmxt_store_reg(cpu_V0, acc); - } - return 0; - } - - return 1; -} - -static void gen_goto_ptr(void) -{ - tcg_gen_lookup_and_goto_ptr(); -} - -/* This will end the TB but doesn't guarantee we'll return to - * cpu_loop_exec. Any live exit_requests will be processed as we - * enter the next TB. - */ -static void gen_goto_tb(DisasContext *s, int n, target_long diff) -{ - if (translator_use_goto_tb(&s->base, s->pc_curr + diff)) { - /* - * For pcrel, the pc must always be up-to-date on entry to - * the linked TB, so that it can use simple additions for all - * further adjustments. For !pcrel, the linked TB is compiled - * to know its full virtual address, so we can delay the - * update to pc to the unlinked path. A long chain of links - * can thus avoid many updates to the PC. 
- */ - if (TARGET_TB_PCREL) { - gen_update_pc(s, diff); - tcg_gen_goto_tb(n); - } else { - tcg_gen_goto_tb(n); - gen_update_pc(s, diff); - } - tcg_gen_exit_tb(s->base.tb, n); - } else { - gen_update_pc(s, diff); - gen_goto_ptr(); - } - s->base.is_jmp = DISAS_NORETURN; -} - -/* Jump, specifying which TB number to use if we gen_goto_tb() */ -static void gen_jmp_tb(DisasContext *s, target_long diff, int tbno) -{ - if (unlikely(s->ss_active)) { - /* An indirect jump so that we still trigger the debug exception. */ - gen_update_pc(s, diff); - s->base.is_jmp = DISAS_JUMP; - return; - } - switch (s->base.is_jmp) { - case DISAS_NEXT: - case DISAS_TOO_MANY: - case DISAS_NORETURN: - /* - * The normal case: just go to the destination TB. - * NB: NORETURN happens if we generate code like - * gen_brcondi(l); - * gen_jmp(); - * gen_set_label(l); - * gen_jmp(); - * on the second call to gen_jmp(). - */ - gen_goto_tb(s, tbno, diff); - break; - case DISAS_UPDATE_NOCHAIN: - case DISAS_UPDATE_EXIT: - /* - * We already decided we're leaving the TB for some other reason. - * Avoid using goto_tb so we really do exit back to the main loop - * and don't chain to another TB. - */ - gen_update_pc(s, diff); - gen_goto_ptr(); - s->base.is_jmp = DISAS_NORETURN; - break; - default: - /* - * We shouldn't be emitting code for a jump and also have - * is_jmp set to one of the special cases like DISAS_SWI. - */ - g_assert_not_reached(); - } -} - -static inline void gen_jmp(DisasContext *s, target_long diff) -{ - gen_jmp_tb(s, diff, 0); -} - -static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y) -{ - if (x) - tcg_gen_sari_i32(t0, t0, 16); - else - gen_sxth(t0); - if (y) - tcg_gen_sari_i32(t1, t1, 16); - else - gen_sxth(t1); - tcg_gen_mul_i32(t0, t0, t1); -} - -/* Return the mask of PSR bits set by a MSR instruction. */ -static uint32_t msr_mask(DisasContext *s, int flags, int spsr) -{ - uint32_t mask = 0; - - if (flags & (1 << 0)) { - mask |= 0xff; - } - if (flags & (1 << 1)) { - mask |= 0xff00; - } - if (flags & (1 << 2)) { - mask |= 0xff0000; - } - if (flags & (1 << 3)) { - mask |= 0xff000000; - } - - /* Mask out undefined and reserved bits. */ - mask &= aarch32_cpsr_valid_mask(s->features, s->isar); - - /* Mask out execution state. */ - if (!spsr) { - mask &= ~CPSR_EXEC; - } - - /* Mask out privileged bits. */ - if (IS_USER(s)) { - mask &= CPSR_USER; - } - return mask; -} - -/* Returns nonzero if access to the PSR is not permitted. Marks t0 as dead. */ -static int gen_set_psr(DisasContext *s, uint32_t mask, int spsr, TCGv_i32 t0) -{ - TCGv_i32 tmp; - if (spsr) { - /* ??? This is also undefined in system mode. */ - if (IS_USER(s)) - return 1; - - tmp = load_cpu_field(spsr); - tcg_gen_andi_i32(tmp, tmp, ~mask); - tcg_gen_andi_i32(t0, t0, mask); - tcg_gen_or_i32(tmp, tmp, t0); - store_cpu_field(tmp, spsr); - } else { - gen_set_cpsr(t0, mask); - } - tcg_temp_free_i32(t0); - gen_lookup_tb(s); - return 0; -} - -/* Returns nonzero if access to the PSR is not permitted. */ -static int gen_set_psr_im(DisasContext *s, uint32_t mask, int spsr, uint32_t val) -{ - TCGv_i32 tmp; - tmp = tcg_temp_new_i32(); - tcg_gen_movi_i32(tmp, val); - return gen_set_psr(s, mask, spsr, tmp); -} - -static bool msr_banked_access_decode(DisasContext *s, int r, int sysm, int rn, - int *tgtmode, int *regno) -{ - /* Decode the r and sysm fields of MSR/MRS banked accesses into - * the target mode and register number, and identify the various - * unpredictable cases. 
- * MSR (banked) and MRS (banked) are CONSTRAINED UNPREDICTABLE if: - * + executed in user mode - * + using R15 as the src/dest register - * + accessing an unimplemented register - * + accessing a register that's inaccessible at current PL/security state* - * + accessing a register that you could access with a different insn - * We choose to UNDEF in all these cases. - * Since we don't know which of the various AArch32 modes we are in - * we have to defer some checks to runtime. - * Accesses to Monitor mode registers from Secure EL1 (which implies - * that EL3 is AArch64) must trap to EL3. - * - * If the access checks fail this function will emit code to take - * an exception and return false. Otherwise it will return true, - * and set *tgtmode and *regno appropriately. - */ - /* These instructions are present only in ARMv8, or in ARMv7 with the - * Virtualization Extensions. - */ - if (!arm_dc_feature(s, ARM_FEATURE_V8) && - !arm_dc_feature(s, ARM_FEATURE_EL2)) { - goto undef; - } - - if (IS_USER(s) || rn == 15) { - goto undef; - } - - /* The table in the v8 ARM ARM section F5.2.3 describes the encoding - * of registers into (r, sysm). - */ - if (r) { - /* SPSRs for other modes */ - switch (sysm) { - case 0xe: /* SPSR_fiq */ - *tgtmode = ARM_CPU_MODE_FIQ; - break; - case 0x10: /* SPSR_irq */ - *tgtmode = ARM_CPU_MODE_IRQ; - break; - case 0x12: /* SPSR_svc */ - *tgtmode = ARM_CPU_MODE_SVC; - break; - case 0x14: /* SPSR_abt */ - *tgtmode = ARM_CPU_MODE_ABT; - break; - case 0x16: /* SPSR_und */ - *tgtmode = ARM_CPU_MODE_UND; - break; - case 0x1c: /* SPSR_mon */ - *tgtmode = ARM_CPU_MODE_MON; - break; - case 0x1e: /* SPSR_hyp */ - *tgtmode = ARM_CPU_MODE_HYP; - break; - default: /* unallocated */ - goto undef; - } - /* We arbitrarily assign SPSR a register number of 16. */ - *regno = 16; - } else { - /* general purpose registers for other modes */ - switch (sysm) { - case 0x0 ... 0x6: /* 0b00xxx : r8_usr ... r14_usr */ - *tgtmode = ARM_CPU_MODE_USR; - *regno = sysm + 8; - break; - case 0x8 ... 0xe: /* 0b01xxx : r8_fiq ... r14_fiq */ - *tgtmode = ARM_CPU_MODE_FIQ; - *regno = sysm; - break; - case 0x10 ... 0x11: /* 0b1000x : r14_irq, r13_irq */ - *tgtmode = ARM_CPU_MODE_IRQ; - *regno = sysm & 1 ? 13 : 14; - break; - case 0x12 ... 0x13: /* 0b1001x : r14_svc, r13_svc */ - *tgtmode = ARM_CPU_MODE_SVC; - *regno = sysm & 1 ? 13 : 14; - break; - case 0x14 ... 0x15: /* 0b1010x : r14_abt, r13_abt */ - *tgtmode = ARM_CPU_MODE_ABT; - *regno = sysm & 1 ? 13 : 14; - break; - case 0x16 ... 0x17: /* 0b1011x : r14_und, r13_und */ - *tgtmode = ARM_CPU_MODE_UND; - *regno = sysm & 1 ? 13 : 14; - break; - case 0x1c ... 0x1d: /* 0b1110x : r14_mon, r13_mon */ - *tgtmode = ARM_CPU_MODE_MON; - *regno = sysm & 1 ? 13 : 14; - break; - case 0x1e ... 0x1f: /* 0b1111x : elr_hyp, r13_hyp */ - *tgtmode = ARM_CPU_MODE_HYP; - /* Arbitrarily pick 17 for ELR_Hyp (which is not a banked LR!) */ - *regno = sysm & 1 ? 13 : 17; - break; - default: /* unallocated */ - goto undef; - } - } - - /* Catch the 'accessing inaccessible register' cases we can detect - * at translate time. - */ - switch (*tgtmode) { - case ARM_CPU_MODE_MON: - if (!arm_dc_feature(s, ARM_FEATURE_EL3) || s->ns) { - goto undef; - } - if (s->current_el == 1) { - /* If we're in Secure EL1 (which implies that EL3 is AArch64) - * then accesses to Mon registers trap to Secure EL2, if it exists, - * otherwise EL3. 
- */ - TCGv_i32 tcg_el; - - if (arm_dc_feature(s, ARM_FEATURE_AARCH64) && - dc_isar_feature(aa64_sel2, s)) { - /* Target EL is EL<3 minus SCR_EL3.EEL2> */ - tcg_el = load_cpu_field(cp15.scr_el3); - tcg_gen_sextract_i32(tcg_el, tcg_el, ctz32(SCR_EEL2), 1); - tcg_gen_addi_i32(tcg_el, tcg_el, 3); - } else { - tcg_el = tcg_constant_i32(3); - } - - gen_exception_insn_el_v(s, 0, EXCP_UDEF, - syn_uncategorized(), tcg_el); - tcg_temp_free_i32(tcg_el); - return false; - } - break; - case ARM_CPU_MODE_HYP: - /* - * SPSR_hyp and r13_hyp can only be accessed from Monitor mode - * (and so we can forbid accesses from EL2 or below). elr_hyp - * can be accessed also from Hyp mode, so forbid accesses from - * EL0 or EL1. - */ - if (!arm_dc_feature(s, ARM_FEATURE_EL2) || s->current_el < 2 || - (s->current_el < 3 && *regno != 17)) { - goto undef; - } - break; - default: - break; - } - - return true; - -undef: - /* If we get here then some access check did not pass */ - gen_exception_insn(s, 0, EXCP_UDEF, syn_uncategorized()); - return false; -} - -static void gen_msr_banked(DisasContext *s, int r, int sysm, int rn) -{ - TCGv_i32 tcg_reg; - int tgtmode = 0, regno = 0; - - if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, ®no)) { - return; - } - - /* Sync state because msr_banked() can raise exceptions */ - gen_set_condexec(s); - gen_update_pc(s, 0); - tcg_reg = load_reg(s, rn); - gen_helper_msr_banked(cpu_env, tcg_reg, - tcg_constant_i32(tgtmode), - tcg_constant_i32(regno)); - tcg_temp_free_i32(tcg_reg); - s->base.is_jmp = DISAS_UPDATE_EXIT; -} - -static void gen_mrs_banked(DisasContext *s, int r, int sysm, int rn) -{ - TCGv_i32 tcg_reg; - int tgtmode = 0, regno = 0; - - if (!msr_banked_access_decode(s, r, sysm, rn, &tgtmode, ®no)) { - return; - } - - /* Sync state because mrs_banked() can raise exceptions */ - gen_set_condexec(s); - gen_update_pc(s, 0); - tcg_reg = tcg_temp_new_i32(); - gen_helper_mrs_banked(tcg_reg, cpu_env, - tcg_constant_i32(tgtmode), - tcg_constant_i32(regno)); - store_reg(s, rn, tcg_reg); - s->base.is_jmp = DISAS_UPDATE_EXIT; -} - -/* Store value to PC as for an exception return (ie don't - * mask bits). The subsequent call to gen_helper_cpsr_write_eret() - * will do the masking based on the new value of the Thumb bit. - */ -static void store_pc_exc_ret(DisasContext *s, TCGv_i32 pc) -{ - tcg_gen_mov_i32(cpu_R[15], pc); - tcg_temp_free_i32(pc); -} - -/* Generate a v6 exception return. Marks both values as dead. */ -static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr) -{ - store_pc_exc_ret(s, pc); - /* The cpsr_write_eret helper will mask the low bits of PC - * appropriately depending on the new Thumb bit, so it must - * be called after storing the new PC. - */ - if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { - gen_io_start(); - } - gen_helper_cpsr_write_eret(cpu_env, cpsr); - tcg_temp_free_i32(cpsr); - /* Must exit loop to check un-masked IRQs */ - s->base.is_jmp = DISAS_EXIT; -} - -/* Generate an old-style exception return. Marks pc as dead. 
*/ -static void gen_exception_return(DisasContext *s, TCGv_i32 pc) -{ - gen_rfe(s, pc, load_cpu_field(spsr)); -} - -static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs, - uint32_t opr_sz, uint32_t max_sz, - gen_helper_gvec_3_ptr *fn) -{ - TCGv_ptr qc_ptr = tcg_temp_new_ptr(); - - tcg_gen_addi_ptr(qc_ptr, cpu_env, offsetof(CPUARMState, vfp.qc)); - tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr, - opr_sz, max_sz, 0, fn); - tcg_temp_free_ptr(qc_ptr); -} - -void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static gen_helper_gvec_3_ptr * const fns[2] = { - gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32 - }; - tcg_debug_assert(vece >= 1 && vece <= 2); - gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); -} - -void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static gen_helper_gvec_3_ptr * const fns[2] = { - gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32 - }; - tcg_debug_assert(vece >= 1 && vece <= 2); - gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); -} - -#define GEN_CMP0(NAME, COND) \ - static void gen_##NAME##0_i32(TCGv_i32 d, TCGv_i32 a) \ - { \ - tcg_gen_setcondi_i32(COND, d, a, 0); \ - tcg_gen_neg_i32(d, d); \ - } \ - static void gen_##NAME##0_i64(TCGv_i64 d, TCGv_i64 a) \ - { \ - tcg_gen_setcondi_i64(COND, d, a, 0); \ - tcg_gen_neg_i64(d, d); \ - } \ - static void gen_##NAME##0_vec(unsigned vece, TCGv_vec d, TCGv_vec a) \ - { \ - TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0); \ - tcg_gen_cmp_vec(COND, vece, d, a, zero); \ - } \ - void gen_gvec_##NAME##0(unsigned vece, uint32_t d, uint32_t m, \ - uint32_t opr_sz, uint32_t max_sz) \ - { \ - const GVecGen2 op[4] = { \ - { .fno = gen_helper_gvec_##NAME##0_b, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .vece = MO_8 }, \ - { .fno = gen_helper_gvec_##NAME##0_h, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .vece = MO_16 }, \ - { .fni4 = gen_##NAME##0_i32, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .vece = MO_32 }, \ - { .fni8 = gen_##NAME##0_i64, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .prefer_i64 = TCG_TARGET_REG_BITS == 64, \ - .vece = MO_64 }, \ - }; \ - tcg_gen_gvec_2(d, m, opr_sz, max_sz, &op[vece]); \ - } - -static const TCGOpcode vecop_list_cmp[] = { - INDEX_op_cmp_vec, 0 -}; - -GEN_CMP0(ceq, TCG_COND_EQ) -GEN_CMP0(cle, TCG_COND_LE) -GEN_CMP0(cge, TCG_COND_GE) -GEN_CMP0(clt, TCG_COND_LT) -GEN_CMP0(cgt, TCG_COND_GT) - -#undef GEN_CMP0 - -static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_vec_sar8i_i64(a, a, shift); - tcg_gen_vec_add8_i64(d, d, a); -} - -static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_vec_sar16i_i64(a, a, shift); - tcg_gen_vec_add16_i64(d, d, a); -} - -static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) -{ - tcg_gen_sari_i32(a, a, shift); - tcg_gen_add_i32(d, d, a); -} - -static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_sari_i64(a, a, shift); - tcg_gen_add_i64(d, d, a); -} - -static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - tcg_gen_sari_vec(vece, a, a, sh); - tcg_gen_add_vec(vece, d, d, a); -} - -void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode 
vecop_list[] = { - INDEX_op_sari_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen2i ops[4] = { - { .fni8 = gen_ssra8_i64, - .fniv = gen_ssra_vec, - .fno = gen_helper_gvec_ssra_b, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni8 = gen_ssra16_i64, - .fniv = gen_ssra_vec, - .fno = gen_helper_gvec_ssra_h, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_ssra32_i32, - .fniv = gen_ssra_vec, - .fno = gen_helper_gvec_ssra_s, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_ssra64_i64, - .fniv = gen_ssra_vec, - .fno = gen_helper_gvec_ssra_b, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [1..esize]. */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - /* - * Shifts larger than the element size are architecturally valid. - * Signed results in all sign bits. - */ - shift = MIN(shift, (8 << vece) - 1); - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); -} - -static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_vec_shr8i_i64(a, a, shift); - tcg_gen_vec_add8_i64(d, d, a); -} - -static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_vec_shr16i_i64(a, a, shift); - tcg_gen_vec_add16_i64(d, d, a); -} - -static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) -{ - tcg_gen_shri_i32(a, a, shift); - tcg_gen_add_i32(d, d, a); -} - -static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_shri_i64(a, a, shift); - tcg_gen_add_i64(d, d, a); -} - -static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - tcg_gen_shri_vec(vece, a, a, sh); - tcg_gen_add_vec(vece, d, d, a); -} - -void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_shri_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen2i ops[4] = { - { .fni8 = gen_usra8_i64, - .fniv = gen_usra_vec, - .fno = gen_helper_gvec_usra_b, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_8, }, - { .fni8 = gen_usra16_i64, - .fniv = gen_usra_vec, - .fno = gen_helper_gvec_usra_h, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_16, }, - { .fni4 = gen_usra32_i32, - .fniv = gen_usra_vec, - .fno = gen_helper_gvec_usra_s, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_32, }, - { .fni8 = gen_usra64_i64, - .fniv = gen_usra_vec, - .fno = gen_helper_gvec_usra_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_64, }, - }; - - /* tszimm encoding produces immediates in the range [1..esize]. */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - /* - * Shifts larger than the element size are architecturally valid. - * Unsigned results in all zeros as input to accumulate: nop. - */ - if (shift < (8 << vece)) { - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); - } else { - /* Nop, but we do need to clear the tail. */ - tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); - } -} - -/* - * Shift one less than the requested amount, and the low bit is - * the rounding bit. For the 8 and 16-bit operations, because we - * mask the low bit, we can perform a normal integer shift instead - * of a vector shift. 
- */ -static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shri_i64(t, a, sh - 1); - tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); - tcg_gen_vec_sar8i_i64(d, a, sh); - tcg_gen_vec_add8_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shri_i64(t, a, sh - 1); - tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); - tcg_gen_vec_sar16i_i64(d, a, sh); - tcg_gen_vec_add16_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) -{ - TCGv_i32 t; - - /* Handle shift by the input size for the benefit of trans_SRSHR_ri */ - if (sh == 32) { - tcg_gen_movi_i32(d, 0); - return; - } - t = tcg_temp_new_i32(); - tcg_gen_extract_i32(t, a, sh - 1, 1); - tcg_gen_sari_i32(d, a, sh); - tcg_gen_add_i32(d, d, t); - tcg_temp_free_i32(t); -} - -static void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_extract_i64(t, a, sh - 1, 1); - tcg_gen_sari_i64(d, a, sh); - tcg_gen_add_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - TCGv_vec ones = tcg_temp_new_vec_matching(d); - - tcg_gen_shri_vec(vece, t, a, sh - 1); - tcg_gen_dupi_vec(vece, ones, 1); - tcg_gen_and_vec(vece, t, t, ones); - tcg_gen_sari_vec(vece, d, a, sh); - tcg_gen_add_vec(vece, d, d, t); - - tcg_temp_free_vec(t); - tcg_temp_free_vec(ones); -} - -void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen2i ops[4] = { - { .fni8 = gen_srshr8_i64, - .fniv = gen_srshr_vec, - .fno = gen_helper_gvec_srshr_b, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni8 = gen_srshr16_i64, - .fniv = gen_srshr_vec, - .fno = gen_helper_gvec_srshr_h, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_srshr32_i32, - .fniv = gen_srshr_vec, - .fno = gen_helper_gvec_srshr_s, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_srshr64_i64, - .fniv = gen_srshr_vec, - .fno = gen_helper_gvec_srshr_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [1..esize] */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - if (shift == (8 << vece)) { - /* - * Shifts larger than the element size are architecturally valid. - * Signed results in all sign bits. With rounding, this produces - * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. - * I.e. always zero. 
- */ - tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0); - } else { - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); - } -} - -static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - gen_srshr8_i64(t, a, sh); - tcg_gen_vec_add8_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - gen_srshr16_i64(t, a, sh); - tcg_gen_vec_add16_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) -{ - TCGv_i32 t = tcg_temp_new_i32(); - - gen_srshr32_i32(t, a, sh); - tcg_gen_add_i32(d, d, t); - tcg_temp_free_i32(t); -} - -static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - gen_srshr64_i64(t, a, sh); - tcg_gen_add_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - - gen_srshr_vec(vece, t, a, sh); - tcg_gen_add_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen2i ops[4] = { - { .fni8 = gen_srsra8_i64, - .fniv = gen_srsra_vec, - .fno = gen_helper_gvec_srsra_b, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_8 }, - { .fni8 = gen_srsra16_i64, - .fniv = gen_srsra_vec, - .fno = gen_helper_gvec_srsra_h, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_16 }, - { .fni4 = gen_srsra32_i32, - .fniv = gen_srsra_vec, - .fno = gen_helper_gvec_srsra_s, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_32 }, - { .fni8 = gen_srsra64_i64, - .fniv = gen_srsra_vec, - .fno = gen_helper_gvec_srsra_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [1..esize] */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - /* - * Shifts larger than the element size are architecturally valid. - * Signed results in all sign bits. With rounding, this produces - * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. - * I.e. always zero. With accumulation, this leaves D unchanged. - */ - if (shift == (8 << vece)) { - /* Nop, but we do need to clear the tail. 
*/ - tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); - } else { - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); - } -} - -static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shri_i64(t, a, sh - 1); - tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); - tcg_gen_vec_shr8i_i64(d, a, sh); - tcg_gen_vec_add8_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shri_i64(t, a, sh - 1); - tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); - tcg_gen_vec_shr16i_i64(d, a, sh); - tcg_gen_vec_add16_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) -{ - TCGv_i32 t; - - /* Handle shift by the input size for the benefit of trans_URSHR_ri */ - if (sh == 32) { - tcg_gen_extract_i32(d, a, sh - 1, 1); - return; - } - t = tcg_temp_new_i32(); - tcg_gen_extract_i32(t, a, sh - 1, 1); - tcg_gen_shri_i32(d, a, sh); - tcg_gen_add_i32(d, d, t); - tcg_temp_free_i32(t); -} - -static void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_extract_i64(t, a, sh - 1, 1); - tcg_gen_shri_i64(d, a, sh); - tcg_gen_add_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - TCGv_vec ones = tcg_temp_new_vec_matching(d); - - tcg_gen_shri_vec(vece, t, a, shift - 1); - tcg_gen_dupi_vec(vece, ones, 1); - tcg_gen_and_vec(vece, t, t, ones); - tcg_gen_shri_vec(vece, d, a, shift); - tcg_gen_add_vec(vece, d, d, t); - - tcg_temp_free_vec(t); - tcg_temp_free_vec(ones); -} - -void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_shri_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen2i ops[4] = { - { .fni8 = gen_urshr8_i64, - .fniv = gen_urshr_vec, - .fno = gen_helper_gvec_urshr_b, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni8 = gen_urshr16_i64, - .fniv = gen_urshr_vec, - .fno = gen_helper_gvec_urshr_h, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_urshr32_i32, - .fniv = gen_urshr_vec, - .fno = gen_helper_gvec_urshr_s, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_urshr64_i64, - .fniv = gen_urshr_vec, - .fno = gen_helper_gvec_urshr_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [1..esize] */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - if (shift == (8 << vece)) { - /* - * Shifts larger than the element size are architecturally valid. - * Unsigned results in zero. With rounding, this produces a - * copy of the most significant bit. 
- */ - tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz); - } else { - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); - } -} - -static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - if (sh == 8) { - tcg_gen_vec_shr8i_i64(t, a, 7); - } else { - gen_urshr8_i64(t, a, sh); - } - tcg_gen_vec_add8_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - if (sh == 16) { - tcg_gen_vec_shr16i_i64(t, a, 15); - } else { - gen_urshr16_i64(t, a, sh); - } - tcg_gen_vec_add16_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) -{ - TCGv_i32 t = tcg_temp_new_i32(); - - if (sh == 32) { - tcg_gen_shri_i32(t, a, 31); - } else { - gen_urshr32_i32(t, a, sh); - } - tcg_gen_add_i32(d, d, t); - tcg_temp_free_i32(t); -} - -static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - if (sh == 64) { - tcg_gen_shri_i64(t, a, 63); - } else { - gen_urshr64_i64(t, a, sh); - } - tcg_gen_add_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - - if (sh == (8 << vece)) { - tcg_gen_shri_vec(vece, t, a, sh - 1); - } else { - gen_urshr_vec(vece, t, a, sh); - } - tcg_gen_add_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_shri_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen2i ops[4] = { - { .fni8 = gen_ursra8_i64, - .fniv = gen_ursra_vec, - .fno = gen_helper_gvec_ursra_b, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_8 }, - { .fni8 = gen_ursra16_i64, - .fniv = gen_ursra_vec, - .fno = gen_helper_gvec_ursra_h, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_16 }, - { .fni4 = gen_ursra32_i32, - .fniv = gen_ursra_vec, - .fno = gen_helper_gvec_ursra_s, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_32 }, - { .fni8 = gen_ursra64_i64, - .fniv = gen_ursra_vec, - .fno = gen_helper_gvec_ursra_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [1..esize] */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); -} - -static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - uint64_t mask = dup_const(MO_8, 0xff >> shift); - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shri_i64(t, a, shift); - tcg_gen_andi_i64(t, t, mask); - tcg_gen_andi_i64(d, d, ~mask); - tcg_gen_or_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - uint64_t mask = dup_const(MO_16, 0xffff >> shift); - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shri_i64(t, a, shift); - tcg_gen_andi_i64(t, t, mask); - tcg_gen_andi_i64(d, d, ~mask); - tcg_gen_or_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) -{ - tcg_gen_shri_i32(a, a, shift); - tcg_gen_deposit_i32(d, d, a, 0, 32 - shift); -} - -static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_shri_i64(a, a, shift); - tcg_gen_deposit_i64(d, d, 
a, 0, 64 - shift); -} - -static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - TCGv_vec m = tcg_temp_new_vec_matching(d); - - tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh)); - tcg_gen_shri_vec(vece, t, a, sh); - tcg_gen_and_vec(vece, d, d, m); - tcg_gen_or_vec(vece, d, d, t); - - tcg_temp_free_vec(t); - tcg_temp_free_vec(m); -} - -void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; - const GVecGen2i ops[4] = { - { .fni8 = gen_shr8_ins_i64, - .fniv = gen_shr_ins_vec, - .fno = gen_helper_gvec_sri_b, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni8 = gen_shr16_ins_i64, - .fniv = gen_shr_ins_vec, - .fno = gen_helper_gvec_sri_h, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_shr32_ins_i32, - .fniv = gen_shr_ins_vec, - .fno = gen_helper_gvec_sri_s, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_shr64_ins_i64, - .fniv = gen_shr_ins_vec, - .fno = gen_helper_gvec_sri_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [1..esize]. */ - tcg_debug_assert(shift > 0); - tcg_debug_assert(shift <= (8 << vece)); - - /* Shift of esize leaves destination unchanged. */ - if (shift < (8 << vece)) { - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); - } else { - /* Nop, but we do need to clear the tail. */ - tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); - } -} - -static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - uint64_t mask = dup_const(MO_8, 0xff << shift); - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shli_i64(t, a, shift); - tcg_gen_andi_i64(t, t, mask); - tcg_gen_andi_i64(d, d, ~mask); - tcg_gen_or_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - uint64_t mask = dup_const(MO_16, 0xffff << shift); - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_shli_i64(t, a, shift); - tcg_gen_andi_i64(t, t, mask); - tcg_gen_andi_i64(d, d, ~mask); - tcg_gen_or_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) -{ - tcg_gen_deposit_i32(d, d, a, shift, 32 - shift); -} - -static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) -{ - tcg_gen_deposit_i64(d, d, a, shift, 64 - shift); -} - -static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - TCGv_vec m = tcg_temp_new_vec_matching(d); - - tcg_gen_shli_vec(vece, t, a, sh); - tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh)); - tcg_gen_and_vec(vece, d, d, m); - tcg_gen_or_vec(vece, d, d, t); - - tcg_temp_free_vec(t); - tcg_temp_free_vec(m); -} - -void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; - const GVecGen2i ops[4] = { - { .fni8 = gen_shl8_ins_i64, - .fniv = gen_shl_ins_vec, - .fno = gen_helper_gvec_sli_b, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni8 = gen_shl16_ins_i64, - .fniv = gen_shl_ins_vec, - .fno = gen_helper_gvec_sli_h, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = 
gen_shl32_ins_i32, - .fniv = gen_shl_ins_vec, - .fno = gen_helper_gvec_sli_s, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_shl64_ins_i64, - .fniv = gen_shl_ins_vec, - .fno = gen_helper_gvec_sli_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - - /* tszimm encoding produces immediates in the range [0..esize-1]. */ - tcg_debug_assert(shift >= 0); - tcg_debug_assert(shift < (8 << vece)); - - if (shift == 0) { - tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz); - } else { - tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); - } -} - -static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u8(a, a, b); - gen_helper_neon_add_u8(d, d, a); -} - -static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u8(a, a, b); - gen_helper_neon_sub_u8(d, d, a); -} - -static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u16(a, a, b); - gen_helper_neon_add_u16(d, d, a); -} - -static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u16(a, a, b); - gen_helper_neon_sub_u16(d, d, a); -} - -static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_mul_i32(a, a, b); - tcg_gen_add_i32(d, d, a); -} - -static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_mul_i32(a, a, b); - tcg_gen_sub_i32(d, d, a); -} - -static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - tcg_gen_mul_i64(a, a, b); - tcg_gen_add_i64(d, d, a); -} - -static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - tcg_gen_mul_i64(a, a, b); - tcg_gen_sub_i64(d, d, a); -} - -static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - tcg_gen_mul_vec(vece, a, a, b); - tcg_gen_add_vec(vece, d, d, a); -} - -static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - tcg_gen_mul_vec(vece, a, a, b); - tcg_gen_sub_vec(vece, d, d, a); -} - -/* Note that while NEON does not support VMLA and VMLS as 64-bit ops, - * these tables are shared with AArch64 which does support them. 
- */ -void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_mul_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fni4 = gen_mla8_i32, - .fniv = gen_mla_vec, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni4 = gen_mla16_i32, - .fniv = gen_mla_vec, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_mla32_i32, - .fniv = gen_mla_vec, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_mla64_i64, - .fniv = gen_mla_vec, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_mul_vec, INDEX_op_sub_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fni4 = gen_mls8_i32, - .fniv = gen_mls_vec, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni4 = gen_mls16_i32, - .fniv = gen_mls_vec, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_mls32_i32, - .fniv = gen_mls_vec, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_mls64_i64, - .fniv = gen_mls_vec, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -/* CMTST : test is "if (X & Y != 0)". */ -static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_and_i32(d, a, b); - tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0); - tcg_gen_neg_i32(d, d); -} - -void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - tcg_gen_and_i64(d, a, b); - tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0); - tcg_gen_neg_i64(d, d); -} - -static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - tcg_gen_and_vec(vece, d, a, b); - tcg_gen_dupi_vec(vece, a, 0); - tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a); -} - -void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 }; - static const GVecGen3 ops[4] = { - { .fni4 = gen_helper_neon_tst_u8, - .fniv = gen_cmtst_vec, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fni4 = gen_helper_neon_tst_u16, - .fniv = gen_cmtst_vec, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_cmtst_i32, - .fniv = gen_cmtst_vec, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_cmtst_i64, - .fniv = gen_cmtst_vec, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) -{ - TCGv_i32 lval = tcg_temp_new_i32(); - TCGv_i32 rval = tcg_temp_new_i32(); - TCGv_i32 lsh = tcg_temp_new_i32(); - TCGv_i32 rsh = tcg_temp_new_i32(); - TCGv_i32 zero = tcg_constant_i32(0); - TCGv_i32 max = tcg_constant_i32(32); - - /* - * Rely on the TCG guarantee that out of range shifts produce - * unspecified results, not undefined behaviour (i.e. no trap). - * Discard out-of-range results after the fact. 
- */ - tcg_gen_ext8s_i32(lsh, shift); - tcg_gen_neg_i32(rsh, lsh); - tcg_gen_shl_i32(lval, src, lsh); - tcg_gen_shr_i32(rval, src, rsh); - tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero); - tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst); - - tcg_temp_free_i32(lval); - tcg_temp_free_i32(rval); - tcg_temp_free_i32(lsh); - tcg_temp_free_i32(rsh); -} - -void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) -{ - TCGv_i64 lval = tcg_temp_new_i64(); - TCGv_i64 rval = tcg_temp_new_i64(); - TCGv_i64 lsh = tcg_temp_new_i64(); - TCGv_i64 rsh = tcg_temp_new_i64(); - TCGv_i64 zero = tcg_constant_i64(0); - TCGv_i64 max = tcg_constant_i64(64); - - /* - * Rely on the TCG guarantee that out of range shifts produce - * unspecified results, not undefined behaviour (i.e. no trap). - * Discard out-of-range results after the fact. - */ - tcg_gen_ext8s_i64(lsh, shift); - tcg_gen_neg_i64(rsh, lsh); - tcg_gen_shl_i64(lval, src, lsh); - tcg_gen_shr_i64(rval, src, rsh); - tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero); - tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst); - - tcg_temp_free_i64(lval); - tcg_temp_free_i64(rval); - tcg_temp_free_i64(lsh); - tcg_temp_free_i64(rsh); -} - -static void gen_ushl_vec(unsigned vece, TCGv_vec dst, - TCGv_vec src, TCGv_vec shift) -{ - TCGv_vec lval = tcg_temp_new_vec_matching(dst); - TCGv_vec rval = tcg_temp_new_vec_matching(dst); - TCGv_vec lsh = tcg_temp_new_vec_matching(dst); - TCGv_vec rsh = tcg_temp_new_vec_matching(dst); - TCGv_vec msk, max; - - tcg_gen_neg_vec(vece, rsh, shift); - if (vece == MO_8) { - tcg_gen_mov_vec(lsh, shift); - } else { - msk = tcg_temp_new_vec_matching(dst); - tcg_gen_dupi_vec(vece, msk, 0xff); - tcg_gen_and_vec(vece, lsh, shift, msk); - tcg_gen_and_vec(vece, rsh, rsh, msk); - tcg_temp_free_vec(msk); - } - - /* - * Rely on the TCG guarantee that out of range shifts produce - * unspecified results, not undefined behaviour (i.e. no trap). - * Discard out-of-range results after the fact. - */ - tcg_gen_shlv_vec(vece, lval, src, lsh); - tcg_gen_shrv_vec(vece, rval, src, rsh); - - max = tcg_temp_new_vec_matching(dst); - tcg_gen_dupi_vec(vece, max, 8 << vece); - - /* - * The choice of LT (signed) and GEU (unsigned) are biased toward - * the instructions of the x86_64 host. For MO_8, the whole byte - * is significant so we must use an unsigned compare; otherwise we - * have already masked to a byte and so a signed compare works. - * Other tcg hosts have a full set of comparisons and do not care. 
- */ - if (vece == MO_8) { - tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max); - tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max); - tcg_gen_andc_vec(vece, lval, lval, lsh); - tcg_gen_andc_vec(vece, rval, rval, rsh); - } else { - tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max); - tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max); - tcg_gen_and_vec(vece, lval, lval, lsh); - tcg_gen_and_vec(vece, rval, rval, rsh); - } - tcg_gen_or_vec(vece, dst, lval, rval); - - tcg_temp_free_vec(max); - tcg_temp_free_vec(lval); - tcg_temp_free_vec(rval); - tcg_temp_free_vec(lsh); - tcg_temp_free_vec(rsh); -} - -void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_neg_vec, INDEX_op_shlv_vec, - INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fniv = gen_ushl_vec, - .fno = gen_helper_gvec_ushl_b, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fniv = gen_ushl_vec, - .fno = gen_helper_gvec_ushl_h, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_ushl_i32, - .fniv = gen_ushl_vec, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_ushl_i64, - .fniv = gen_ushl_vec, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) -{ - TCGv_i32 lval = tcg_temp_new_i32(); - TCGv_i32 rval = tcg_temp_new_i32(); - TCGv_i32 lsh = tcg_temp_new_i32(); - TCGv_i32 rsh = tcg_temp_new_i32(); - TCGv_i32 zero = tcg_constant_i32(0); - TCGv_i32 max = tcg_constant_i32(31); - - /* - * Rely on the TCG guarantee that out of range shifts produce - * unspecified results, not undefined behaviour (i.e. no trap). - * Discard out-of-range results after the fact. - */ - tcg_gen_ext8s_i32(lsh, shift); - tcg_gen_neg_i32(rsh, lsh); - tcg_gen_shl_i32(lval, src, lsh); - tcg_gen_umin_i32(rsh, rsh, max); - tcg_gen_sar_i32(rval, src, rsh); - tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero); - tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval); - - tcg_temp_free_i32(lval); - tcg_temp_free_i32(rval); - tcg_temp_free_i32(lsh); - tcg_temp_free_i32(rsh); -} - -void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) -{ - TCGv_i64 lval = tcg_temp_new_i64(); - TCGv_i64 rval = tcg_temp_new_i64(); - TCGv_i64 lsh = tcg_temp_new_i64(); - TCGv_i64 rsh = tcg_temp_new_i64(); - TCGv_i64 zero = tcg_constant_i64(0); - TCGv_i64 max = tcg_constant_i64(63); - - /* - * Rely on the TCG guarantee that out of range shifts produce - * unspecified results, not undefined behaviour (i.e. no trap). - * Discard out-of-range results after the fact. 
- */ - tcg_gen_ext8s_i64(lsh, shift); - tcg_gen_neg_i64(rsh, lsh); - tcg_gen_shl_i64(lval, src, lsh); - tcg_gen_umin_i64(rsh, rsh, max); - tcg_gen_sar_i64(rval, src, rsh); - tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero); - tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval); - - tcg_temp_free_i64(lval); - tcg_temp_free_i64(rval); - tcg_temp_free_i64(lsh); - tcg_temp_free_i64(rsh); -} - -static void gen_sshl_vec(unsigned vece, TCGv_vec dst, - TCGv_vec src, TCGv_vec shift) -{ - TCGv_vec lval = tcg_temp_new_vec_matching(dst); - TCGv_vec rval = tcg_temp_new_vec_matching(dst); - TCGv_vec lsh = tcg_temp_new_vec_matching(dst); - TCGv_vec rsh = tcg_temp_new_vec_matching(dst); - TCGv_vec tmp = tcg_temp_new_vec_matching(dst); - - /* - * Rely on the TCG guarantee that out of range shifts produce - * unspecified results, not undefined behaviour (i.e. no trap). - * Discard out-of-range results after the fact. - */ - tcg_gen_neg_vec(vece, rsh, shift); - if (vece == MO_8) { - tcg_gen_mov_vec(lsh, shift); - } else { - tcg_gen_dupi_vec(vece, tmp, 0xff); - tcg_gen_and_vec(vece, lsh, shift, tmp); - tcg_gen_and_vec(vece, rsh, rsh, tmp); - } - - /* Bound rsh so out of bound right shift gets -1. */ - tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1); - tcg_gen_umin_vec(vece, rsh, rsh, tmp); - tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp); - - tcg_gen_shlv_vec(vece, lval, src, lsh); - tcg_gen_sarv_vec(vece, rval, src, rsh); - - /* Select in-bound left shift. */ - tcg_gen_andc_vec(vece, lval, lval, tmp); - - /* Select between left and right shift. */ - if (vece == MO_8) { - tcg_gen_dupi_vec(vece, tmp, 0); - tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval); - } else { - tcg_gen_dupi_vec(vece, tmp, 0x80); - tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval); - } - - tcg_temp_free_vec(lval); - tcg_temp_free_vec(rval); - tcg_temp_free_vec(lsh); - tcg_temp_free_vec(rsh); - tcg_temp_free_vec(tmp); -} - -void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec, - INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fniv = gen_sshl_vec, - .fno = gen_helper_gvec_sshl_b, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fniv = gen_sshl_vec, - .fno = gen_helper_gvec_sshl_h, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_sshl_i32, - .fniv = gen_sshl_vec, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_sshl_i64, - .fniv = gen_sshl_vec, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, - TCGv_vec a, TCGv_vec b) -{ - TCGv_vec x = tcg_temp_new_vec_matching(t); - tcg_gen_add_vec(vece, x, a, b); - tcg_gen_usadd_vec(vece, t, a, b); - tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); - tcg_gen_or_vec(vece, sat, sat, x); - tcg_temp_free_vec(x); -} - -void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen4 ops[4] = { - { .fniv = gen_uqadd_vec, - .fno = gen_helper_gvec_uqadd_b, - .write_aofs = true, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fniv = gen_uqadd_vec, - .fno = gen_helper_gvec_uqadd_h, - 
.write_aofs = true, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fniv = gen_uqadd_vec, - .fno = gen_helper_gvec_uqadd_s, - .write_aofs = true, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fniv = gen_uqadd_vec, - .fno = gen_helper_gvec_uqadd_d, - .write_aofs = true, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), - rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, - TCGv_vec a, TCGv_vec b) -{ - TCGv_vec x = tcg_temp_new_vec_matching(t); - tcg_gen_add_vec(vece, x, a, b); - tcg_gen_ssadd_vec(vece, t, a, b); - tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); - tcg_gen_or_vec(vece, sat, sat, x); - tcg_temp_free_vec(x); -} - -void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0 - }; - static const GVecGen4 ops[4] = { - { .fniv = gen_sqadd_vec, - .fno = gen_helper_gvec_sqadd_b, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_8 }, - { .fniv = gen_sqadd_vec, - .fno = gen_helper_gvec_sqadd_h, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_16 }, - { .fniv = gen_sqadd_vec, - .fno = gen_helper_gvec_sqadd_s, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_32 }, - { .fniv = gen_sqadd_vec, - .fno = gen_helper_gvec_sqadd_d, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_64 }, - }; - tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), - rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, - TCGv_vec a, TCGv_vec b) -{ - TCGv_vec x = tcg_temp_new_vec_matching(t); - tcg_gen_sub_vec(vece, x, a, b); - tcg_gen_ussub_vec(vece, t, a, b); - tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); - tcg_gen_or_vec(vece, sat, sat, x); - tcg_temp_free_vec(x); -} - -void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0 - }; - static const GVecGen4 ops[4] = { - { .fniv = gen_uqsub_vec, - .fno = gen_helper_gvec_uqsub_b, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_8 }, - { .fniv = gen_uqsub_vec, - .fno = gen_helper_gvec_uqsub_h, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_16 }, - { .fniv = gen_uqsub_vec, - .fno = gen_helper_gvec_uqsub_s, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_32 }, - { .fniv = gen_uqsub_vec, - .fno = gen_helper_gvec_uqsub_d, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_64 }, - }; - tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), - rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat, - TCGv_vec a, TCGv_vec b) -{ - TCGv_vec x = tcg_temp_new_vec_matching(t); - tcg_gen_sub_vec(vece, x, a, b); - tcg_gen_sssub_vec(vece, t, a, b); - tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t); - tcg_gen_or_vec(vece, sat, sat, x); - tcg_temp_free_vec(x); -} - -void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0 - }; - static const GVecGen4 ops[4] = { - { .fniv = gen_sqsub_vec, - .fno = gen_helper_gvec_sqsub_b, - .opt_opc = vecop_list, - 
.write_aofs = true, - .vece = MO_8 }, - { .fniv = gen_sqsub_vec, - .fno = gen_helper_gvec_sqsub_h, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_16 }, - { .fniv = gen_sqsub_vec, - .fno = gen_helper_gvec_sqsub_s, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_32 }, - { .fniv = gen_sqsub_vec, - .fno = gen_helper_gvec_sqsub_d, - .opt_opc = vecop_list, - .write_aofs = true, - .vece = MO_64 }, - }; - tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), - rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - TCGv_i32 t = tcg_temp_new_i32(); - - tcg_gen_sub_i32(t, a, b); - tcg_gen_sub_i32(d, b, a); - tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t); - tcg_temp_free_i32(t); -} - -static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_sub_i64(t, a, b); - tcg_gen_sub_i64(d, b, a); - tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t); - tcg_temp_free_i64(t); -} - -static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - - tcg_gen_smin_vec(vece, t, a, b); - tcg_gen_smax_vec(vece, d, a, b); - tcg_gen_sub_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fniv = gen_sabd_vec, - .fno = gen_helper_gvec_sabd_b, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fniv = gen_sabd_vec, - .fno = gen_helper_gvec_sabd_h, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_sabd_i32, - .fniv = gen_sabd_vec, - .fno = gen_helper_gvec_sabd_s, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_sabd_i64, - .fniv = gen_sabd_vec, - .fno = gen_helper_gvec_sabd_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - TCGv_i32 t = tcg_temp_new_i32(); - - tcg_gen_sub_i32(t, a, b); - tcg_gen_sub_i32(d, b, a); - tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t); - tcg_temp_free_i32(t); -} - -static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - TCGv_i64 t = tcg_temp_new_i64(); - - tcg_gen_sub_i64(t, a, b); - tcg_gen_sub_i64(d, b, a); - tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t); - tcg_temp_free_i64(t); -} - -static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - - tcg_gen_umin_vec(vece, t, a, b); - tcg_gen_umax_vec(vece, d, a, b); - tcg_gen_sub_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fniv = gen_uabd_vec, - .fno = gen_helper_gvec_uabd_b, - .opt_opc = vecop_list, - .vece = MO_8 }, - { .fniv = gen_uabd_vec, - .fno = gen_helper_gvec_uabd_h, - .opt_opc = vecop_list, - .vece = MO_16 }, - { .fni4 = gen_uabd_i32, - .fniv = gen_uabd_vec, - .fno = gen_helper_gvec_uabd_s, - .opt_opc = vecop_list, - .vece = MO_32 }, - { .fni8 = gen_uabd_i64, - .fniv = gen_uabd_vec, - .fno = gen_helper_gvec_uabd_d, - .prefer_i64 = 
TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - TCGv_i32 t = tcg_temp_new_i32(); - gen_sabd_i32(t, a, b); - tcg_gen_add_i32(d, d, t); - tcg_temp_free_i32(t); -} - -static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - TCGv_i64 t = tcg_temp_new_i64(); - gen_sabd_i64(t, a, b); - tcg_gen_add_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - gen_sabd_vec(vece, t, a, b); - tcg_gen_add_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_sub_vec, INDEX_op_add_vec, - INDEX_op_smin_vec, INDEX_op_smax_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fniv = gen_saba_vec, - .fno = gen_helper_gvec_saba_b, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_8 }, - { .fniv = gen_saba_vec, - .fno = gen_helper_gvec_saba_h, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_16 }, - { .fni4 = gen_saba_i32, - .fniv = gen_saba_vec, - .fno = gen_helper_gvec_saba_s, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_32 }, - { .fni8 = gen_saba_i64, - .fniv = gen_saba_vec, - .fno = gen_helper_gvec_saba_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - TCGv_i32 t = tcg_temp_new_i32(); - gen_uabd_i32(t, a, b); - tcg_gen_add_i32(d, d, t); - tcg_temp_free_i32(t); -} - -static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - TCGv_i64 t = tcg_temp_new_i64(); - gen_uabd_i64(t, a, b); - tcg_gen_add_i64(d, d, t); - tcg_temp_free_i64(t); -} - -static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - TCGv_vec t = tcg_temp_new_vec_matching(d); - gen_uabd_vec(vece, t, a, b); - tcg_gen_add_vec(vece, d, d, t); - tcg_temp_free_vec(t); -} - -void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) -{ - static const TCGOpcode vecop_list[] = { - INDEX_op_sub_vec, INDEX_op_add_vec, - INDEX_op_umin_vec, INDEX_op_umax_vec, 0 - }; - static const GVecGen3 ops[4] = { - { .fniv = gen_uaba_vec, - .fno = gen_helper_gvec_uaba_b, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_8 }, - { .fniv = gen_uaba_vec, - .fno = gen_helper_gvec_uaba_h, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_16 }, - { .fni4 = gen_uaba_i32, - .fniv = gen_uaba_vec, - .fno = gen_helper_gvec_uaba_s, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_32 }, - { .fni8 = gen_uaba_i64, - .fniv = gen_uaba_vec, - .fno = gen_helper_gvec_uaba_d, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .opt_opc = vecop_list, - .load_dest = true, - .vece = MO_64 }, - }; - tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); -} - -static void do_coproc_insn(DisasContext *s, int cpnum, int is64, - int opc1, int crn, int crm, int opc2, - bool isread, int rt, int rt2) -{ - uint32_t key = ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2); - const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key); - TCGv_ptr tcg_ri = NULL; - bool need_exit_tb; - 
uint32_t syndrome; - - /* - * Note that since we are an implementation which takes an - * exception on a trapped conditional instruction only if the - * instruction passes its condition code check, we can take - * advantage of the clause in the ARM ARM that allows us to set - * the COND field in the instruction to 0xE in all cases. - * We could fish the actual condition out of the insn (ARM) - * or the condexec bits (Thumb) but it isn't necessary. - */ - switch (cpnum) { - case 14: - if (is64) { - syndrome = syn_cp14_rrt_trap(1, 0xe, opc1, crm, rt, rt2, - isread, false); - } else { - syndrome = syn_cp14_rt_trap(1, 0xe, opc1, opc2, crn, crm, - rt, isread, false); - } - break; - case 15: - if (is64) { - syndrome = syn_cp15_rrt_trap(1, 0xe, opc1, crm, rt, rt2, - isread, false); - } else { - syndrome = syn_cp15_rt_trap(1, 0xe, opc1, opc2, crn, crm, - rt, isread, false); - } - break; - default: - /* - * ARMv8 defines that only coprocessors 14 and 15 exist, - * so this can only happen if this is an ARMv7 or earlier CPU, - * in which case the syndrome information won't actually be - * guest visible. - */ - assert(!arm_dc_feature(s, ARM_FEATURE_V8)); - syndrome = syn_uncategorized(); - break; - } - - if (s->hstr_active && cpnum == 15 && s->current_el == 1) { - /* - * At EL1, check for a HSTR_EL2 trap, which must take precedence - * over the UNDEF for "no such register" or the UNDEF for "access - * permissions forbid this EL1 access". HSTR_EL2 traps from EL0 - * only happen if the cpreg doesn't UNDEF at EL0, so we do those in - * access_check_cp_reg(), after the checks for whether the access - * configurably trapped to EL1. - */ - uint32_t maskbit = is64 ? crm : crn; - - if (maskbit != 4 && maskbit != 14) { - /* T4 and T14 are RES0 so never cause traps */ - TCGv_i32 t; - DisasLabel over = gen_disas_label(s); - - t = load_cpu_offset(offsetoflow32(CPUARMState, cp15.hstr_el2)); - tcg_gen_andi_i32(t, t, 1u << maskbit); - tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); - tcg_temp_free_i32(t); - - gen_exception_insn(s, 0, EXCP_UDEF, syndrome); - set_disas_label(s, over); - } - } - - if (!ri) { - /* - * Unknown register; this might be a guest error or a QEMU - * unimplemented feature. - */ - if (is64) { - qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 " - "64 bit system register cp:%d opc1: %d crm:%d " - "(%s)\n", - isread ? "read" : "write", cpnum, opc1, crm, - s->ns ? "non-secure" : "secure"); - } else { - qemu_log_mask(LOG_UNIMP, "%s access to unsupported AArch32 " - "system register cp:%d opc1:%d crn:%d crm:%d " - "opc2:%d (%s)\n", - isread ? "read" : "write", cpnum, opc1, crn, - crm, opc2, s->ns ? "non-secure" : "secure"); - } - unallocated_encoding(s); - return; - } - - /* Check access permissions */ - if (!cp_access_ok(s->current_el, ri, isread)) { - unallocated_encoding(s); - return; - } - - if ((s->hstr_active && s->current_el == 0) || ri->accessfn || - (ri->fgt && s->fgt_active) || - (arm_dc_feature(s, ARM_FEATURE_XSCALE) && cpnum < 14)) { - /* - * Emit code to perform further access permissions checks at - * runtime; this may result in an exception. - * Note that on XScale all cp0..c13 registers do an access check - * call in order to handle c15_cpar. 
- */ - gen_set_condexec(s); - gen_update_pc(s, 0); - tcg_ri = tcg_temp_new_ptr(); - gen_helper_access_check_cp_reg(tcg_ri, cpu_env, - tcg_constant_i32(key), - tcg_constant_i32(syndrome), - tcg_constant_i32(isread)); - } else if (ri->type & ARM_CP_RAISES_EXC) { - /* - * The readfn or writefn might raise an exception; - * synchronize the CPU state in case it does. - */ - gen_set_condexec(s); - gen_update_pc(s, 0); - } - - /* Handle special cases first */ - switch (ri->type & ARM_CP_SPECIAL_MASK) { - case 0: - break; - case ARM_CP_NOP: - goto exit; - case ARM_CP_WFI: - if (isread) { - unallocated_encoding(s); - } else { - gen_update_pc(s, curr_insn_len(s)); - s->base.is_jmp = DISAS_WFI; - } - goto exit; - default: - g_assert_not_reached(); - } - - if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { - gen_io_start(); - } - - if (isread) { - /* Read */ - if (is64) { - TCGv_i64 tmp64; - TCGv_i32 tmp; - if (ri->type & ARM_CP_CONST) { - tmp64 = tcg_constant_i64(ri->resetvalue); - } else if (ri->readfn) { - if (!tcg_ri) { - tcg_ri = gen_lookup_cp_reg(key); - } - tmp64 = tcg_temp_new_i64(); - gen_helper_get_cp_reg64(tmp64, cpu_env, tcg_ri); - } else { - tmp64 = tcg_temp_new_i64(); - tcg_gen_ld_i64(tmp64, cpu_env, ri->fieldoffset); - } - tmp = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(tmp, tmp64); - store_reg(s, rt, tmp); - tmp = tcg_temp_new_i32(); - tcg_gen_extrh_i64_i32(tmp, tmp64); - tcg_temp_free_i64(tmp64); - store_reg(s, rt2, tmp); - } else { - TCGv_i32 tmp; - if (ri->type & ARM_CP_CONST) { - tmp = tcg_constant_i32(ri->resetvalue); - } else if (ri->readfn) { - if (!tcg_ri) { - tcg_ri = gen_lookup_cp_reg(key); - } - tmp = tcg_temp_new_i32(); - gen_helper_get_cp_reg(tmp, cpu_env, tcg_ri); - } else { - tmp = load_cpu_offset(ri->fieldoffset); - } - if (rt == 15) { - /* Destination register of r15 for 32 bit loads sets - * the condition codes from the high 4 bits of the value - */ - gen_set_nzcv(tmp); - tcg_temp_free_i32(tmp); - } else { - store_reg(s, rt, tmp); - } - } - } else { - /* Write */ - if (ri->type & ARM_CP_CONST) { - /* If not forbidden by access permissions, treat as WI */ - goto exit; - } - - if (is64) { - TCGv_i32 tmplo, tmphi; - TCGv_i64 tmp64 = tcg_temp_new_i64(); - tmplo = load_reg(s, rt); - tmphi = load_reg(s, rt2); - tcg_gen_concat_i32_i64(tmp64, tmplo, tmphi); - tcg_temp_free_i32(tmplo); - tcg_temp_free_i32(tmphi); - if (ri->writefn) { - if (!tcg_ri) { - tcg_ri = gen_lookup_cp_reg(key); - } - gen_helper_set_cp_reg64(cpu_env, tcg_ri, tmp64); - } else { - tcg_gen_st_i64(tmp64, cpu_env, ri->fieldoffset); - } - tcg_temp_free_i64(tmp64); - } else { - TCGv_i32 tmp = load_reg(s, rt); - if (ri->writefn) { - if (!tcg_ri) { - tcg_ri = gen_lookup_cp_reg(key); - } - gen_helper_set_cp_reg(cpu_env, tcg_ri, tmp); - tcg_temp_free_i32(tmp); - } else { - store_cpu_offset(tmp, ri->fieldoffset, 4); - } - } - } - - /* I/O operations must end the TB here (whether read or write) */ - need_exit_tb = ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && - (ri->type & ARM_CP_IO)); - - if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) { - /* - * A write to any coprocessor register that ends a TB - * must rebuild the hflags for the next TB. - */ - gen_rebuild_hflags(s, ri->type & ARM_CP_NEWEL); - /* - * We default to ending the TB on a coprocessor register write, - * but allow this to be suppressed by the register definition - * (usually only necessary to work around guest bugs). 
- */ - need_exit_tb = true; - } - if (need_exit_tb) { - gen_lookup_tb(s); - } - - exit: - if (tcg_ri) { - tcg_temp_free_ptr(tcg_ri); - } -} - -/* Decode XScale DSP or iWMMXt insn (in the copro space, cp=0 or 1) */ -static void disas_xscale_insn(DisasContext *s, uint32_t insn) -{ - int cpnum = (insn >> 8) & 0xf; - - if (extract32(s->c15_cpar, cpnum, 1) == 0) { - unallocated_encoding(s); - } else if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { - if (disas_iwmmxt_insn(s, insn)) { - unallocated_encoding(s); - } - } else if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) { - if (disas_dsp_insn(s, insn)) { - unallocated_encoding(s); - } - } -} - -/* Store a 64-bit value to a register pair. Clobbers val. */ -static void gen_storeq_reg(DisasContext *s, int rlow, int rhigh, TCGv_i64 val) -{ - TCGv_i32 tmp; - tmp = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(tmp, val); - store_reg(s, rlow, tmp); - tmp = tcg_temp_new_i32(); - tcg_gen_extrh_i64_i32(tmp, val); - store_reg(s, rhigh, tmp); -} - -/* load and add a 64-bit value from a register pair. */ -static void gen_addq(DisasContext *s, TCGv_i64 val, int rlow, int rhigh) -{ - TCGv_i64 tmp; - TCGv_i32 tmpl; - TCGv_i32 tmph; - - /* Load 64-bit value rd:rn. */ - tmpl = load_reg(s, rlow); - tmph = load_reg(s, rhigh); - tmp = tcg_temp_new_i64(); - tcg_gen_concat_i32_i64(tmp, tmpl, tmph); - tcg_temp_free_i32(tmpl); - tcg_temp_free_i32(tmph); - tcg_gen_add_i64(val, val, tmp); - tcg_temp_free_i64(tmp); -} - -/* Set N and Z flags from hi|lo. */ -static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi) -{ - tcg_gen_mov_i32(cpu_NF, hi); - tcg_gen_or_i32(cpu_ZF, lo, hi); -} - -/* Load/Store exclusive instructions are implemented by remembering - the value/address loaded, and seeing if these are the same - when the store is performed. This should be sufficient to implement - the architecturally mandated semantics, and avoids having to monitor - regular stores. The compare vs the remembered value is done during - the cmpxchg operation, but we must compare the addresses manually. */ -static void gen_load_exclusive(DisasContext *s, int rt, int rt2, - TCGv_i32 addr, int size) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - MemOp opc = size | MO_ALIGN | s->be_data; - - s->is_ldex = true; - - if (size == 3) { - TCGv_i32 tmp2 = tcg_temp_new_i32(); - TCGv_i64 t64 = tcg_temp_new_i64(); - - /* - * For AArch32, architecturally the 32-bit word at the lowest - * address is always Rt and the one at addr+4 is Rt2, even if - * the CPU is big-endian. That means we don't want to do a - * gen_aa32_ld_i64(), which checks SCTLR_B as if for an - * architecturally 64-bit access, but instead do a 64-bit access - * using MO_BE if appropriate and then split the two halves. 
- */ - TCGv taddr = gen_aa32_addr(s, addr, opc); - - tcg_gen_qemu_ld_i64(t64, taddr, get_mem_index(s), opc); - tcg_temp_free(taddr); - tcg_gen_mov_i64(cpu_exclusive_val, t64); - if (s->be_data == MO_BE) { - tcg_gen_extr_i64_i32(tmp2, tmp, t64); - } else { - tcg_gen_extr_i64_i32(tmp, tmp2, t64); - } - tcg_temp_free_i64(t64); - - store_reg(s, rt2, tmp2); - } else { - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc); - tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp); - } - - store_reg(s, rt, tmp); - tcg_gen_extu_i32_i64(cpu_exclusive_addr, addr); -} - -static void gen_clrex(DisasContext *s) -{ - tcg_gen_movi_i64(cpu_exclusive_addr, -1); -} - -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i32 addr, int size) -{ - TCGv_i32 t0, t1, t2; - TCGv_i64 extaddr; - TCGv taddr; - TCGLabel *done_label; - TCGLabel *fail_label; - MemOp opc = size | MO_ALIGN | s->be_data; - - /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) { - [addr] = {Rt}; - {Rd} = 0; - } else { - {Rd} = 1; - } */ - fail_label = gen_new_label(); - done_label = gen_new_label(); - extaddr = tcg_temp_new_i64(); - tcg_gen_extu_i32_i64(extaddr, addr); - tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label); - tcg_temp_free_i64(extaddr); - - taddr = gen_aa32_addr(s, addr, opc); - t0 = tcg_temp_new_i32(); - t1 = load_reg(s, rt); - if (size == 3) { - TCGv_i64 o64 = tcg_temp_new_i64(); - TCGv_i64 n64 = tcg_temp_new_i64(); - - t2 = load_reg(s, rt2); - - /* - * For AArch32, architecturally the 32-bit word at the lowest - * address is always Rt and the one at addr+4 is Rt2, even if - * the CPU is big-endian. Since we're going to treat this as a - * single 64-bit BE store, we need to put the two halves in the - * opposite order for BE to LE, so that they end up in the right - * places. We don't want gen_aa32_st_i64, because that checks - * SCTLR_B as if for an architectural 64-bit access. - */ - if (s->be_data == MO_BE) { - tcg_gen_concat_i32_i64(n64, t2, t1); - } else { - tcg_gen_concat_i32_i64(n64, t1, t2); - } - tcg_temp_free_i32(t2); - - tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64, - get_mem_index(s), opc); - tcg_temp_free_i64(n64); - - tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val); - tcg_gen_extrl_i64_i32(t0, o64); - - tcg_temp_free_i64(o64); - } else { - t2 = tcg_temp_new_i32(); - tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val); - tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc); - tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2); - tcg_temp_free_i32(t2); - } - tcg_temp_free_i32(t1); - tcg_temp_free(taddr); - tcg_gen_mov_i32(cpu_R[rd], t0); - tcg_temp_free_i32(t0); - tcg_gen_br(done_label); - - gen_set_label(fail_label); - tcg_gen_movi_i32(cpu_R[rd], 1); - gen_set_label(done_label); - tcg_gen_movi_i64(cpu_exclusive_addr, -1); -} - -/* gen_srs: - * @env: CPUARMState - * @s: DisasContext - * @mode: mode field from insn (which stack to store to) - * @amode: addressing mode (DA/IA/DB/IB), encoded as per P,U bits in ARM insn - * @writeback: true if writeback bit set - * - * Generate code for the SRS (Store Return State) insn. 
- */ -static void gen_srs(DisasContext *s, - uint32_t mode, uint32_t amode, bool writeback) -{ - int32_t offset; - TCGv_i32 addr, tmp; - bool undef = false; - - /* SRS is: - * - trapped to EL3 if EL3 is AArch64 and we are at Secure EL1 - * and specified mode is monitor mode - * - UNDEFINED in Hyp mode - * - UNPREDICTABLE in User or System mode - * - UNPREDICTABLE if the specified mode is: - * -- not implemented - * -- not a valid mode number - * -- a mode that's at a higher exception level - * -- Monitor, if we are Non-secure - * For the UNPREDICTABLE cases we choose to UNDEF. - */ - if (s->current_el == 1 && !s->ns && mode == ARM_CPU_MODE_MON) { - gen_exception_insn_el(s, 0, EXCP_UDEF, syn_uncategorized(), 3); - return; - } - - if (s->current_el == 0 || s->current_el == 2) { - undef = true; - } - - switch (mode) { - case ARM_CPU_MODE_USR: - case ARM_CPU_MODE_FIQ: - case ARM_CPU_MODE_IRQ: - case ARM_CPU_MODE_SVC: - case ARM_CPU_MODE_ABT: - case ARM_CPU_MODE_UND: - case ARM_CPU_MODE_SYS: - break; - case ARM_CPU_MODE_HYP: - if (s->current_el == 1 || !arm_dc_feature(s, ARM_FEATURE_EL2)) { - undef = true; - } - break; - case ARM_CPU_MODE_MON: - /* No need to check specifically for "are we non-secure" because - * we've already made EL0 UNDEF and handled the trap for S-EL1; - * so if this isn't EL3 then we must be non-secure. - */ - if (s->current_el != 3) { - undef = true; - } - break; - default: - undef = true; - } - - if (undef) { - unallocated_encoding(s); - return; - } - - addr = tcg_temp_new_i32(); - /* get_r13_banked() will raise an exception if called from System mode */ - gen_set_condexec(s); - gen_update_pc(s, 0); - gen_helper_get_r13_banked(addr, cpu_env, tcg_constant_i32(mode)); - switch (amode) { - case 0: /* DA */ - offset = -4; - break; - case 1: /* IA */ - offset = 0; - break; - case 2: /* DB */ - offset = -8; - break; - case 3: /* IB */ - offset = 4; - break; - default: - g_assert_not_reached(); - } - tcg_gen_addi_i32(addr, addr, offset); - tmp = load_reg(s, 14); - gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - tmp = load_cpu_field(spsr); - tcg_gen_addi_i32(addr, addr, 4); - gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - if (writeback) { - switch (amode) { - case 0: - offset = -8; - break; - case 1: - offset = 4; - break; - case 2: - offset = -4; - break; - case 3: - offset = 0; - break; - default: - g_assert_not_reached(); - } - tcg_gen_addi_i32(addr, addr, offset); - gen_helper_set_r13_banked(cpu_env, tcg_constant_i32(mode), addr); - } - tcg_temp_free_i32(addr); - s->base.is_jmp = DISAS_UPDATE_EXIT; -} - -/* Skip this instruction if the ARM condition is false */ -static void arm_skip_unless(DisasContext *s, uint32_t cond) -{ - arm_gen_condlabel(s); - arm_gen_test_cc(cond ^ 1, s->condlabel.label); -} - - -/* - * Constant expanders used by T16/T32 decode - */ - -/* Return only the rotation part of T32ExpandImm. */ -static int t32_expandimm_rot(DisasContext *s, int x) -{ - return x & 0xc00 ? extract32(x, 7, 5) : 0; -} - -/* Return the unrotated immediate from T32ExpandImm. */ -static int t32_expandimm_imm(DisasContext *s, int x) -{ - int imm = extract32(x, 0, 8); - - switch (extract32(x, 8, 4)) { - case 0: /* XY */ - /* Nothing to do. */ - break; - case 1: /* 00XY00XY */ - imm *= 0x00010001; - break; - case 2: /* XY00XY00 */ - imm *= 0x01000100; - break; - case 3: /* XYXYXYXY */ - imm *= 0x01010101; - break; - default: - /* Rotated constant. 
*/ - imm |= 0x80; - break; - } - return imm; -} - -static int t32_branch24(DisasContext *s, int x) -{ - /* Convert J1:J2 at x[22:21] to I2:I1, which involves I=J^~S. */ - x ^= !(x < 0) * (3 << 21); - /* Append the final zero. */ - return x << 1; -} - -static int t16_setflags(DisasContext *s) -{ - return s->condexec_mask == 0; -} - -static int t16_push_list(DisasContext *s, int x) -{ - return (x & 0xff) | (x & 0x100) << (14 - 8); -} - -static int t16_pop_list(DisasContext *s, int x) -{ - return (x & 0xff) | (x & 0x100) << (15 - 8); -} - -/* - * Include the generated decoders. - */ - -#include "decode-a32.c.inc" -#include "decode-a32-uncond.c.inc" -#include "decode-t32.c.inc" -#include "decode-t16.c.inc" - -static bool valid_cp(DisasContext *s, int cp) -{ - /* - * Return true if this coprocessor field indicates something - * that's really a possible coprocessor. - * For v7 and earlier, coprocessors 8..15 were reserved for Arm use, - * and of those only cp14 and cp15 were used for registers. - * cp10 and cp11 were used for VFP and Neon, whose decode is - * dealt with elsewhere. With the advent of fp16, cp9 is also - * now part of VFP. - * For v8A and later, the encoding has been tightened so that - * only cp14 and cp15 are valid, and other values aren't considered - * to be in the coprocessor-instruction space at all. v8M still - * permits coprocessors 0..7. - * For XScale, we must not decode the XScale cp0, cp1 space as - * a standard coprocessor insn, because we want to fall through to - * the legacy disas_xscale_insn() decoder after decodetree is done. - */ - if (arm_dc_feature(s, ARM_FEATURE_XSCALE) && (cp == 0 || cp == 1)) { - return false; - } - - if (arm_dc_feature(s, ARM_FEATURE_V8) && - !arm_dc_feature(s, ARM_FEATURE_M)) { - return cp >= 14; - } - return cp < 8 || cp >= 14; -} - -static bool trans_MCR(DisasContext *s, arg_MCR *a) -{ - if (!valid_cp(s, a->cp)) { - return false; - } - do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2, - false, a->rt, 0); - return true; -} - -static bool trans_MRC(DisasContext *s, arg_MRC *a) -{ - if (!valid_cp(s, a->cp)) { - return false; - } - do_coproc_insn(s, a->cp, false, a->opc1, a->crn, a->crm, a->opc2, - true, a->rt, 0); - return true; -} - -static bool trans_MCRR(DisasContext *s, arg_MCRR *a) -{ - if (!valid_cp(s, a->cp)) { - return false; - } - do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0, - false, a->rt, a->rt2); - return true; -} - -static bool trans_MRRC(DisasContext *s, arg_MRRC *a) -{ - if (!valid_cp(s, a->cp)) { - return false; - } - do_coproc_insn(s, a->cp, true, a->opc1, 0, a->crm, 0, - true, a->rt, a->rt2); - return true; -} - -/* Helpers to swap operands for reverse-subtract. */ -static void gen_rsb(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_sub_i32(dst, b, a); -} - -static void gen_rsb_CC(TCGv_i32 dst, TCGv_i32 a, TCGv_i32 b) -{ - gen_sub_CC(dst, b, a); -} - -static void gen_rsc(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b) -{ - gen_sub_carry(dest, b, a); -} - -static void gen_rsc_CC(TCGv_i32 dest, TCGv_i32 a, TCGv_i32 b) -{ - gen_sbc_CC(dest, b, a); -} - -/* - * Helpers for the data processing routines. - * - * After the computation store the results back. - * This may be suppressed altogether (STREG_NONE), require a runtime - * check against the stack limits (STREG_SP_CHECK), or generate an - * exception return. Oh, or store into a register. - * - * Always return true, indicating success for a trans_* function. 
- */ -typedef enum { - STREG_NONE, - STREG_NORMAL, - STREG_SP_CHECK, - STREG_EXC_RET, -} StoreRegKind; - -static bool store_reg_kind(DisasContext *s, int rd, - TCGv_i32 val, StoreRegKind kind) -{ - switch (kind) { - case STREG_NONE: - tcg_temp_free_i32(val); - return true; - case STREG_NORMAL: - /* See ALUWritePC: Interworking only from a32 mode. */ - if (s->thumb) { - store_reg(s, rd, val); - } else { - store_reg_bx(s, rd, val); - } - return true; - case STREG_SP_CHECK: - store_sp_checked(s, val); - return true; - case STREG_EXC_RET: - gen_exception_return(s, val); - return true; - } - g_assert_not_reached(); -} - -/* - * Data Processing (register) - * - * Operate, with set flags, one register source, - * one immediate shifted register source, and a destination. - */ -static bool op_s_rrr_shi(DisasContext *s, arg_s_rrr_shi *a, - void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), - int logic_cc, StoreRegKind kind) -{ - TCGv_i32 tmp1, tmp2; - - tmp2 = load_reg(s, a->rm); - gen_arm_shift_im(tmp2, a->shty, a->shim, logic_cc); - tmp1 = load_reg(s, a->rn); - - gen(tmp1, tmp1, tmp2); - tcg_temp_free_i32(tmp2); - - if (logic_cc) { - gen_logic_CC(tmp1); - } - return store_reg_kind(s, a->rd, tmp1, kind); -} - -static bool op_s_rxr_shi(DisasContext *s, arg_s_rrr_shi *a, - void (*gen)(TCGv_i32, TCGv_i32), - int logic_cc, StoreRegKind kind) -{ - TCGv_i32 tmp; - - tmp = load_reg(s, a->rm); - gen_arm_shift_im(tmp, a->shty, a->shim, logic_cc); - - gen(tmp, tmp); - if (logic_cc) { - gen_logic_CC(tmp); - } - return store_reg_kind(s, a->rd, tmp, kind); -} - -/* - * Data-processing (register-shifted register) - * - * Operate, with set flags, one register source, - * one register shifted register source, and a destination. - */ -static bool op_s_rrr_shr(DisasContext *s, arg_s_rrr_shr *a, - void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), - int logic_cc, StoreRegKind kind) -{ - TCGv_i32 tmp1, tmp2; - - tmp1 = load_reg(s, a->rs); - tmp2 = load_reg(s, a->rm); - gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc); - tmp1 = load_reg(s, a->rn); - - gen(tmp1, tmp1, tmp2); - tcg_temp_free_i32(tmp2); - - if (logic_cc) { - gen_logic_CC(tmp1); - } - return store_reg_kind(s, a->rd, tmp1, kind); -} - -static bool op_s_rxr_shr(DisasContext *s, arg_s_rrr_shr *a, - void (*gen)(TCGv_i32, TCGv_i32), - int logic_cc, StoreRegKind kind) -{ - TCGv_i32 tmp1, tmp2; - - tmp1 = load_reg(s, a->rs); - tmp2 = load_reg(s, a->rm); - gen_arm_shift_reg(tmp2, a->shty, tmp1, logic_cc); - - gen(tmp2, tmp2); - if (logic_cc) { - gen_logic_CC(tmp2); - } - return store_reg_kind(s, a->rd, tmp2, kind); -} - -/* - * Data-processing (immediate) - * - * Operate, with set flags, one register source, - * one rotated immediate, and a destination. - * - * Note that logic_cc && a->rot setting CF based on the msb of the - * immediate is the reason why we must pass in the unrotated form - * of the immediate. 
- */ -static bool op_s_rri_rot(DisasContext *s, arg_s_rri_rot *a, - void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32), - int logic_cc, StoreRegKind kind) -{ - TCGv_i32 tmp1; - uint32_t imm; - - imm = ror32(a->imm, a->rot); - if (logic_cc && a->rot) { - tcg_gen_movi_i32(cpu_CF, imm >> 31); - } - tmp1 = load_reg(s, a->rn); - - gen(tmp1, tmp1, tcg_constant_i32(imm)); - - if (logic_cc) { - gen_logic_CC(tmp1); - } - return store_reg_kind(s, a->rd, tmp1, kind); -} - -static bool op_s_rxi_rot(DisasContext *s, arg_s_rri_rot *a, - void (*gen)(TCGv_i32, TCGv_i32), - int logic_cc, StoreRegKind kind) -{ - TCGv_i32 tmp; - uint32_t imm; - - imm = ror32(a->imm, a->rot); - if (logic_cc && a->rot) { - tcg_gen_movi_i32(cpu_CF, imm >> 31); - } - - tmp = tcg_temp_new_i32(); - gen(tmp, tcg_constant_i32(imm)); - - if (logic_cc) { - gen_logic_CC(tmp); - } - return store_reg_kind(s, a->rd, tmp, kind); -} - -#define DO_ANY3(NAME, OP, L, K) \ - static bool trans_##NAME##_rrri(DisasContext *s, arg_s_rrr_shi *a) \ - { StoreRegKind k = (K); return op_s_rrr_shi(s, a, OP, L, k); } \ - static bool trans_##NAME##_rrrr(DisasContext *s, arg_s_rrr_shr *a) \ - { StoreRegKind k = (K); return op_s_rrr_shr(s, a, OP, L, k); } \ - static bool trans_##NAME##_rri(DisasContext *s, arg_s_rri_rot *a) \ - { StoreRegKind k = (K); return op_s_rri_rot(s, a, OP, L, k); } - -#define DO_ANY2(NAME, OP, L, K) \ - static bool trans_##NAME##_rxri(DisasContext *s, arg_s_rrr_shi *a) \ - { StoreRegKind k = (K); return op_s_rxr_shi(s, a, OP, L, k); } \ - static bool trans_##NAME##_rxrr(DisasContext *s, arg_s_rrr_shr *a) \ - { StoreRegKind k = (K); return op_s_rxr_shr(s, a, OP, L, k); } \ - static bool trans_##NAME##_rxi(DisasContext *s, arg_s_rri_rot *a) \ - { StoreRegKind k = (K); return op_s_rxi_rot(s, a, OP, L, k); } - -#define DO_CMP2(NAME, OP, L) \ - static bool trans_##NAME##_xrri(DisasContext *s, arg_s_rrr_shi *a) \ - { return op_s_rrr_shi(s, a, OP, L, STREG_NONE); } \ - static bool trans_##NAME##_xrrr(DisasContext *s, arg_s_rrr_shr *a) \ - { return op_s_rrr_shr(s, a, OP, L, STREG_NONE); } \ - static bool trans_##NAME##_xri(DisasContext *s, arg_s_rri_rot *a) \ - { return op_s_rri_rot(s, a, OP, L, STREG_NONE); } - -DO_ANY3(AND, tcg_gen_and_i32, a->s, STREG_NORMAL) -DO_ANY3(EOR, tcg_gen_xor_i32, a->s, STREG_NORMAL) -DO_ANY3(ORR, tcg_gen_or_i32, a->s, STREG_NORMAL) -DO_ANY3(BIC, tcg_gen_andc_i32, a->s, STREG_NORMAL) - -DO_ANY3(RSB, a->s ? gen_rsb_CC : gen_rsb, false, STREG_NORMAL) -DO_ANY3(ADC, a->s ? gen_adc_CC : gen_add_carry, false, STREG_NORMAL) -DO_ANY3(SBC, a->s ? gen_sbc_CC : gen_sub_carry, false, STREG_NORMAL) -DO_ANY3(RSC, a->s ? gen_rsc_CC : gen_rsc, false, STREG_NORMAL) - -DO_CMP2(TST, tcg_gen_and_i32, true) -DO_CMP2(TEQ, tcg_gen_xor_i32, true) -DO_CMP2(CMN, gen_add_CC, false) -DO_CMP2(CMP, gen_sub_CC, false) - -DO_ANY3(ADD, a->s ? gen_add_CC : tcg_gen_add_i32, false, - a->rd == 13 && a->rn == 13 ? STREG_SP_CHECK : STREG_NORMAL) - -/* - * Note for the computation of StoreRegKind we return out of the - * middle of the functions that are expanded by DO_ANY3, and that - * we modify a->s via that parameter before it is used by OP. - */ -DO_ANY3(SUB, a->s ? gen_sub_CC : tcg_gen_sub_i32, false, - ({ - StoreRegKind ret = STREG_NORMAL; - if (a->rd == 15 && a->s) { - /* - * See ALUExceptionReturn: - * In User mode, UNPREDICTABLE; we choose UNDEF. - * In Hyp mode, UNDEFINED. - */ - if (IS_USER(s) || s->current_el == 2) { - unallocated_encoding(s); - return true; - } - /* There is no writeback of nzcv to PSTATE. 
*/ - a->s = 0; - ret = STREG_EXC_RET; - } else if (a->rd == 13 && a->rn == 13) { - ret = STREG_SP_CHECK; - } - ret; - })) - -DO_ANY2(MOV, tcg_gen_mov_i32, a->s, - ({ - StoreRegKind ret = STREG_NORMAL; - if (a->rd == 15 && a->s) { - /* - * See ALUExceptionReturn: - * In User mode, UNPREDICTABLE; we choose UNDEF. - * In Hyp mode, UNDEFINED. - */ - if (IS_USER(s) || s->current_el == 2) { - unallocated_encoding(s); - return true; - } - /* There is no writeback of nzcv to PSTATE. */ - a->s = 0; - ret = STREG_EXC_RET; - } else if (a->rd == 13) { - ret = STREG_SP_CHECK; - } - ret; - })) - -DO_ANY2(MVN, tcg_gen_not_i32, a->s, STREG_NORMAL) - -/* - * ORN is only available with T32, so there is no register-shifted-register - * form of the insn. Using the DO_ANY3 macro would create an unused function. - */ -static bool trans_ORN_rrri(DisasContext *s, arg_s_rrr_shi *a) -{ - return op_s_rrr_shi(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL); -} - -static bool trans_ORN_rri(DisasContext *s, arg_s_rri_rot *a) -{ - return op_s_rri_rot(s, a, tcg_gen_orc_i32, a->s, STREG_NORMAL); -} - -#undef DO_ANY3 -#undef DO_ANY2 -#undef DO_CMP2 - -static bool trans_ADR(DisasContext *s, arg_ri *a) -{ - store_reg_bx(s, a->rd, add_reg_for_lit(s, 15, a->imm)); - return true; -} - -static bool trans_MOVW(DisasContext *s, arg_MOVW *a) -{ - if (!ENABLE_ARCH_6T2) { - return false; - } - - store_reg(s, a->rd, tcg_constant_i32(a->imm)); - return true; -} - -static bool trans_MOVT(DisasContext *s, arg_MOVW *a) -{ - TCGv_i32 tmp; - - if (!ENABLE_ARCH_6T2) { - return false; - } - - tmp = load_reg(s, a->rd); - tcg_gen_ext16u_i32(tmp, tmp); - tcg_gen_ori_i32(tmp, tmp, a->imm << 16); - store_reg(s, a->rd, tmp); - return true; -} - -/* - * v8.1M MVE wide-shifts - */ -static bool do_mve_shl_ri(DisasContext *s, arg_mve_shl_ri *a, - WideShiftImmFn *fn) -{ - TCGv_i64 rda; - TCGv_i32 rdalo, rdahi; - - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ - return false; - } - if (a->rdahi == 15) { - /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */ - return false; - } - if (!dc_isar_feature(aa32_mve, s) || - !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || - a->rdahi == 13) { - /* RdaHi == 13 is UNPREDICTABLE; we choose to UNDEF */ - unallocated_encoding(s); - return true; - } - - if (a->shim == 0) { - a->shim = 32; - } - - rda = tcg_temp_new_i64(); - rdalo = load_reg(s, a->rdalo); - rdahi = load_reg(s, a->rdahi); - tcg_gen_concat_i32_i64(rda, rdalo, rdahi); - - fn(rda, rda, a->shim); - - tcg_gen_extrl_i64_i32(rdalo, rda); - tcg_gen_extrh_i64_i32(rdahi, rda); - store_reg(s, a->rdalo, rdalo); - store_reg(s, a->rdahi, rdahi); - tcg_temp_free_i64(rda); - - return true; -} - -static bool trans_ASRL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - return do_mve_shl_ri(s, a, tcg_gen_sari_i64); -} - -static bool trans_LSLL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - return do_mve_shl_ri(s, a, tcg_gen_shli_i64); -} - -static bool trans_LSRL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - return do_mve_shl_ri(s, a, tcg_gen_shri_i64); -} - -static void gen_mve_sqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift) -{ - gen_helper_mve_sqshll(r, cpu_env, n, tcg_constant_i32(shift)); -} - -static bool trans_SQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - return do_mve_shl_ri(s, a, gen_mve_sqshll); -} - -static void gen_mve_uqshll(TCGv_i64 r, TCGv_i64 n, int64_t shift) -{ - gen_helper_mve_uqshll(r, cpu_env, n, tcg_constant_i32(shift)); -} - -static bool trans_UQSHLL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - 
return do_mve_shl_ri(s, a, gen_mve_uqshll); -} - -static bool trans_SRSHRL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - return do_mve_shl_ri(s, a, gen_srshr64_i64); -} - -static bool trans_URSHRL_ri(DisasContext *s, arg_mve_shl_ri *a) -{ - return do_mve_shl_ri(s, a, gen_urshr64_i64); -} - -static bool do_mve_shl_rr(DisasContext *s, arg_mve_shl_rr *a, WideShiftFn *fn) -{ - TCGv_i64 rda; - TCGv_i32 rdalo, rdahi; - - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ - return false; - } - if (a->rdahi == 15) { - /* These are a different encoding (SQSHL/SRSHR/UQSHL/URSHR) */ - return false; - } - if (!dc_isar_feature(aa32_mve, s) || - !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || - a->rdahi == 13 || a->rm == 13 || a->rm == 15 || - a->rm == a->rdahi || a->rm == a->rdalo) { - /* These rdahi/rdalo/rm cases are UNPREDICTABLE; we choose to UNDEF */ - unallocated_encoding(s); - return true; - } - - rda = tcg_temp_new_i64(); - rdalo = load_reg(s, a->rdalo); - rdahi = load_reg(s, a->rdahi); - tcg_gen_concat_i32_i64(rda, rdalo, rdahi); - - /* The helper takes care of the sign-extension of the low 8 bits of Rm */ - fn(rda, cpu_env, rda, cpu_R[a->rm]); - - tcg_gen_extrl_i64_i32(rdalo, rda); - tcg_gen_extrh_i64_i32(rdahi, rda); - store_reg(s, a->rdalo, rdalo); - store_reg(s, a->rdahi, rdahi); - tcg_temp_free_i64(rda); - - return true; -} - -static bool trans_LSLL_rr(DisasContext *s, arg_mve_shl_rr *a) -{ - return do_mve_shl_rr(s, a, gen_helper_mve_ushll); -} - -static bool trans_ASRL_rr(DisasContext *s, arg_mve_shl_rr *a) -{ - return do_mve_shl_rr(s, a, gen_helper_mve_sshrl); -} - -static bool trans_UQRSHLL64_rr(DisasContext *s, arg_mve_shl_rr *a) -{ - return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll); -} - -static bool trans_SQRSHRL64_rr(DisasContext *s, arg_mve_shl_rr *a) -{ - return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl); -} - -static bool trans_UQRSHLL48_rr(DisasContext *s, arg_mve_shl_rr *a) -{ - return do_mve_shl_rr(s, a, gen_helper_mve_uqrshll48); -} - -static bool trans_SQRSHRL48_rr(DisasContext *s, arg_mve_shl_rr *a) -{ - return do_mve_shl_rr(s, a, gen_helper_mve_sqrshrl48); -} - -static bool do_mve_sh_ri(DisasContext *s, arg_mve_sh_ri *a, ShiftImmFn *fn) -{ - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ - return false; - } - if (!dc_isar_feature(aa32_mve, s) || - !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || - a->rda == 13 || a->rda == 15) { - /* These rda cases are UNPREDICTABLE; we choose to UNDEF */ - unallocated_encoding(s); - return true; - } - - if (a->shim == 0) { - a->shim = 32; - } - fn(cpu_R[a->rda], cpu_R[a->rda], a->shim); - - return true; -} - -static bool trans_URSHR_ri(DisasContext *s, arg_mve_sh_ri *a) -{ - return do_mve_sh_ri(s, a, gen_urshr32_i32); -} - -static bool trans_SRSHR_ri(DisasContext *s, arg_mve_sh_ri *a) -{ - return do_mve_sh_ri(s, a, gen_srshr32_i32); -} - -static void gen_mve_sqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift) -{ - gen_helper_mve_sqshl(r, cpu_env, n, tcg_constant_i32(shift)); -} - -static bool trans_SQSHL_ri(DisasContext *s, arg_mve_sh_ri *a) -{ - return do_mve_sh_ri(s, a, gen_mve_sqshl); -} - -static void gen_mve_uqshl(TCGv_i32 r, TCGv_i32 n, int32_t shift) -{ - gen_helper_mve_uqshl(r, cpu_env, n, tcg_constant_i32(shift)); -} - -static bool trans_UQSHL_ri(DisasContext *s, arg_mve_sh_ri *a) -{ - return do_mve_sh_ri(s, a, gen_mve_uqshl); -} - -static bool do_mve_sh_rr(DisasContext *s, arg_mve_sh_rr *a, ShiftFn *fn) -{ - if 
(!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - /* Decode falls through to ORR/MOV UNPREDICTABLE handling */ - return false; - } - if (!dc_isar_feature(aa32_mve, s) || - !arm_dc_feature(s, ARM_FEATURE_M_MAIN) || - a->rda == 13 || a->rda == 15 || a->rm == 13 || a->rm == 15 || - a->rm == a->rda) { - /* These rda/rm cases are UNPREDICTABLE; we choose to UNDEF */ - unallocated_encoding(s); - return true; - } - - /* The helper takes care of the sign-extension of the low 8 bits of Rm */ - fn(cpu_R[a->rda], cpu_env, cpu_R[a->rda], cpu_R[a->rm]); - return true; -} - -static bool trans_SQRSHR_rr(DisasContext *s, arg_mve_sh_rr *a) -{ - return do_mve_sh_rr(s, a, gen_helper_mve_sqrshr); -} - -static bool trans_UQRSHL_rr(DisasContext *s, arg_mve_sh_rr *a) -{ - return do_mve_sh_rr(s, a, gen_helper_mve_uqrshl); -} - -/* - * Multiply and multiply accumulate - */ - -static bool op_mla(DisasContext *s, arg_s_rrrr *a, bool add) -{ - TCGv_i32 t1, t2; - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - tcg_gen_mul_i32(t1, t1, t2); - tcg_temp_free_i32(t2); - if (add) { - t2 = load_reg(s, a->ra); - tcg_gen_add_i32(t1, t1, t2); - tcg_temp_free_i32(t2); - } - if (a->s) { - gen_logic_CC(t1); - } - store_reg(s, a->rd, t1); - return true; -} - -static bool trans_MUL(DisasContext *s, arg_MUL *a) -{ - return op_mla(s, a, false); -} - -static bool trans_MLA(DisasContext *s, arg_MLA *a) -{ - return op_mla(s, a, true); -} - -static bool trans_MLS(DisasContext *s, arg_MLS *a) -{ - TCGv_i32 t1, t2; - - if (!ENABLE_ARCH_6T2) { - return false; - } - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - tcg_gen_mul_i32(t1, t1, t2); - tcg_temp_free_i32(t2); - t2 = load_reg(s, a->ra); - tcg_gen_sub_i32(t1, t2, t1); - tcg_temp_free_i32(t2); - store_reg(s, a->rd, t1); - return true; -} - -static bool op_mlal(DisasContext *s, arg_s_rrrr *a, bool uns, bool add) -{ - TCGv_i32 t0, t1, t2, t3; - - t0 = load_reg(s, a->rm); - t1 = load_reg(s, a->rn); - if (uns) { - tcg_gen_mulu2_i32(t0, t1, t0, t1); - } else { - tcg_gen_muls2_i32(t0, t1, t0, t1); - } - if (add) { - t2 = load_reg(s, a->ra); - t3 = load_reg(s, a->rd); - tcg_gen_add2_i32(t0, t1, t0, t1, t2, t3); - tcg_temp_free_i32(t2); - tcg_temp_free_i32(t3); - } - if (a->s) { - gen_logicq_cc(t0, t1); - } - store_reg(s, a->ra, t0); - store_reg(s, a->rd, t1); - return true; -} - -static bool trans_UMULL(DisasContext *s, arg_UMULL *a) -{ - return op_mlal(s, a, true, false); -} - -static bool trans_SMULL(DisasContext *s, arg_SMULL *a) -{ - return op_mlal(s, a, false, false); -} - -static bool trans_UMLAL(DisasContext *s, arg_UMLAL *a) -{ - return op_mlal(s, a, true, true); -} - -static bool trans_SMLAL(DisasContext *s, arg_SMLAL *a) -{ - return op_mlal(s, a, false, true); -} - -static bool trans_UMAAL(DisasContext *s, arg_UMAAL *a) -{ - TCGv_i32 t0, t1, t2, zero; - - if (s->thumb - ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_6) { - return false; - } - - t0 = load_reg(s, a->rm); - t1 = load_reg(s, a->rn); - tcg_gen_mulu2_i32(t0, t1, t0, t1); - zero = tcg_constant_i32(0); - t2 = load_reg(s, a->ra); - tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero); - tcg_temp_free_i32(t2); - t2 = load_reg(s, a->rd); - tcg_gen_add2_i32(t0, t1, t0, t1, t2, zero); - tcg_temp_free_i32(t2); - store_reg(s, a->ra, t0); - store_reg(s, a->rd, t1); - return true; -} - -/* - * Saturating addition and subtraction - */ - -static bool op_qaddsub(DisasContext *s, arg_rrr *a, bool add, bool doub) -{ - TCGv_i32 t0, t1; - - if (s->thumb - ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_5TE) { - return false; - } - - t0 = load_reg(s, a->rm); - t1 = load_reg(s, a->rn); - if (doub) { - gen_helper_add_saturate(t1, cpu_env, t1, t1); - } - if (add) { - gen_helper_add_saturate(t0, cpu_env, t0, t1); - } else { - gen_helper_sub_saturate(t0, cpu_env, t0, t1); - } - tcg_temp_free_i32(t1); - store_reg(s, a->rd, t0); - return true; -} - -#define DO_QADDSUB(NAME, ADD, DOUB) \ -static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ -{ \ - return op_qaddsub(s, a, ADD, DOUB); \ -} - -DO_QADDSUB(QADD, true, false) -DO_QADDSUB(QSUB, false, false) -DO_QADDSUB(QDADD, true, true) -DO_QADDSUB(QDSUB, false, true) - -#undef DO_QADDSUB - -/* - * Halfword multiply and multiply accumulate - */ - -static bool op_smlaxxx(DisasContext *s, arg_rrrr *a, - int add_long, bool nt, bool mt) -{ - TCGv_i32 t0, t1, tl, th; - - if (s->thumb - ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_5TE) { - return false; - } - - t0 = load_reg(s, a->rn); - t1 = load_reg(s, a->rm); - gen_mulxy(t0, t1, nt, mt); - tcg_temp_free_i32(t1); - - switch (add_long) { - case 0: - store_reg(s, a->rd, t0); - break; - case 1: - t1 = load_reg(s, a->ra); - gen_helper_add_setq(t0, cpu_env, t0, t1); - tcg_temp_free_i32(t1); - store_reg(s, a->rd, t0); - break; - case 2: - tl = load_reg(s, a->ra); - th = load_reg(s, a->rd); - /* Sign-extend the 32-bit product to 64 bits. */ - t1 = tcg_temp_new_i32(); - tcg_gen_sari_i32(t1, t0, 31); - tcg_gen_add2_i32(tl, th, tl, th, t0, t1); - tcg_temp_free_i32(t0); - tcg_temp_free_i32(t1); - store_reg(s, a->ra, tl); - store_reg(s, a->rd, th); - break; - default: - g_assert_not_reached(); - } - return true; -} - -#define DO_SMLAX(NAME, add, nt, mt) \ -static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \ -{ \ - return op_smlaxxx(s, a, add, nt, mt); \ -} - -DO_SMLAX(SMULBB, 0, 0, 0) -DO_SMLAX(SMULBT, 0, 0, 1) -DO_SMLAX(SMULTB, 0, 1, 0) -DO_SMLAX(SMULTT, 0, 1, 1) - -DO_SMLAX(SMLABB, 1, 0, 0) -DO_SMLAX(SMLABT, 1, 0, 1) -DO_SMLAX(SMLATB, 1, 1, 0) -DO_SMLAX(SMLATT, 1, 1, 1) - -DO_SMLAX(SMLALBB, 2, 0, 0) -DO_SMLAX(SMLALBT, 2, 0, 1) -DO_SMLAX(SMLALTB, 2, 1, 0) -DO_SMLAX(SMLALTT, 2, 1, 1) - -#undef DO_SMLAX - -static bool op_smlawx(DisasContext *s, arg_rrrr *a, bool add, bool mt) -{ - TCGv_i32 t0, t1; - - if (!ENABLE_ARCH_5TE) { - return false; - } - - t0 = load_reg(s, a->rn); - t1 = load_reg(s, a->rm); - /* - * Since the nominal result is product<47:16>, shift the 16-bit - * input up by 16 bits, so that the result is at product<63:32>. - */ - if (mt) { - tcg_gen_andi_i32(t1, t1, 0xffff0000); - } else { - tcg_gen_shli_i32(t1, t1, 16); - } - tcg_gen_muls2_i32(t0, t1, t0, t1); - tcg_temp_free_i32(t0); - if (add) { - t0 = load_reg(s, a->ra); - gen_helper_add_setq(t1, cpu_env, t1, t0); - tcg_temp_free_i32(t0); - } - store_reg(s, a->rd, t1); - return true; -} - -#define DO_SMLAWX(NAME, add, mt) \ -static bool trans_##NAME(DisasContext *s, arg_rrrr *a) \ -{ \ - return op_smlawx(s, a, add, mt); \ -} - -DO_SMLAWX(SMULWB, 0, 0) -DO_SMLAWX(SMULWT, 0, 1) -DO_SMLAWX(SMLAWB, 1, 0) -DO_SMLAWX(SMLAWT, 1, 1) - -#undef DO_SMLAWX - -/* - * MSR (immediate) and hints - */ - -static bool trans_YIELD(DisasContext *s, arg_YIELD *a) -{ - /* - * When running single-threaded TCG code, use the helper to ensure that - * the next round-robin scheduled vCPU gets a crack. When running in - * MTTCG we don't generate jumps to the helper as it won't affect the - * scheduling of other vCPUs. 
- */
-    if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
-        gen_update_pc(s, curr_insn_len(s));
-        s->base.is_jmp = DISAS_YIELD;
-    }
-    return true;
-}
-
-static bool trans_WFE(DisasContext *s, arg_WFE *a)
-{
-    /*
-     * When running single-threaded TCG code, use the helper to ensure that
-     * the next round-robin scheduled vCPU gets a crack. In MTTCG mode we
-     * just skip this instruction. Currently the SEV/SEVL instructions,
-     * which are *one* of many ways to wake the CPU from WFE, are not
-     * implemented so we can't sleep like WFI does.
-     */
-    if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
-        gen_update_pc(s, curr_insn_len(s));
-        s->base.is_jmp = DISAS_WFE;
-    }
-    return true;
-}
-
-static bool trans_WFI(DisasContext *s, arg_WFI *a)
-{
-    /* For WFI, halt the vCPU until an IRQ. */
-    gen_update_pc(s, curr_insn_len(s));
-    s->base.is_jmp = DISAS_WFI;
-    return true;
-}
-
-static bool trans_ESB(DisasContext *s, arg_ESB *a)
-{
-    /*
-     * For M-profile, minimal-RAS ESB can be a NOP.
-     * Without RAS, we must implement this as NOP.
-     */
-    if (!arm_dc_feature(s, ARM_FEATURE_M) && dc_isar_feature(aa32_ras, s)) {
-        /*
-         * QEMU does not have a source of physical SErrors,
-         * so we are only concerned with virtual SErrors.
-         * The pseudocode in the ARM for this case is
-         *   if PSTATE.EL IN {EL0, EL1} && EL2Enabled() then
-         *      AArch32.vESBOperation();
-         * Most of the condition can be evaluated at translation time.
-         * Test for EL2 present, and defer test for SEL2 to runtime.
-         */
-        if (s->current_el <= 1 && arm_dc_feature(s, ARM_FEATURE_EL2)) {
-            gen_helper_vesb(cpu_env);
-        }
-    }
-    return true;
-}
-
-static bool trans_NOP(DisasContext *s, arg_NOP *a)
-{
-    return true;
-}
-
-static bool trans_MSR_imm(DisasContext *s, arg_MSR_imm *a)
-{
-    uint32_t val = ror32(a->imm, a->rot * 2);
-    uint32_t mask = msr_mask(s, a->mask, a->r);
-
-    if (gen_set_psr_im(s, mask, a->r, val)) {
-        unallocated_encoding(s);
-    }
-    return true;
-}
-
-/*
- * Cyclic Redundancy Check
- */
-
-static bool op_crc32(DisasContext *s, arg_rrr *a, bool c, MemOp sz)
-{
-    TCGv_i32 t1, t2, t3;
-
-    if (!dc_isar_feature(aa32_crc32, s)) {
-        return false;
-    }
-
-    t1 = load_reg(s, a->rn);
-    t2 = load_reg(s, a->rm);
-    switch (sz) {
-    case MO_8:
-        gen_uxtb(t2);
-        break;
-    case MO_16:
-        gen_uxth(t2);
-        break;
-    case MO_32:
-        break;
-    default:
-        g_assert_not_reached();
-    }
-    t3 = tcg_constant_i32(1 << sz);
-    if (c) {
-        gen_helper_crc32c(t1, t1, t2, t3);
-    } else {
-        gen_helper_crc32(t1, t1, t2, t3);
-    }
-    tcg_temp_free_i32(t2);
-    store_reg(s, a->rd, t1);
-    return true;
-}
-
-#define DO_CRC32(NAME, c, sz) \
-static bool trans_##NAME(DisasContext *s, arg_rrr *a) \
-    { return op_crc32(s, a, c, sz); }
-
-DO_CRC32(CRC32B, false, MO_8)
-DO_CRC32(CRC32H, false, MO_16)
-DO_CRC32(CRC32W, false, MO_32)
-DO_CRC32(CRC32CB, true, MO_8)
-DO_CRC32(CRC32CH, true, MO_16)
-DO_CRC32(CRC32CW, true, MO_32)
-
-#undef DO_CRC32
-
-/*
- * Miscellaneous instructions
- */
-
-static bool trans_MRS_bank(DisasContext *s, arg_MRS_bank *a)
-{
-    if (arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    gen_mrs_banked(s, a->r, a->sysm, a->rd);
-    return true;
-}
-
-static bool trans_MSR_bank(DisasContext *s, arg_MSR_bank *a)
-{
-    if (arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    gen_msr_banked(s, a->r, a->sysm, a->rn);
-    return true;
-}
-
-static bool trans_MRS_reg(DisasContext *s, arg_MRS_reg *a)
-{
-    TCGv_i32 tmp;
-
-    if (arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    if (a->r) {
-        if (IS_USER(s)) {
-            unallocated_encoding(s);
-            return true;
-        }
-        tmp = load_cpu_field(spsr);
-    } else {
-        tmp = tcg_temp_new_i32();
-        gen_helper_cpsr_read(tmp, cpu_env);
-    }
-    store_reg(s, a->rd, tmp);
-    return true;
-}
-
-static bool trans_MSR_reg(DisasContext *s, arg_MSR_reg *a)
-{
-    TCGv_i32 tmp;
-    uint32_t mask = msr_mask(s, a->mask, a->r);
-
-    if (arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    tmp = load_reg(s, a->rn);
-    if (gen_set_psr(s, mask, a->r, tmp)) {
-        unallocated_encoding(s);
-    }
-    return true;
-}
-
-static bool trans_MRS_v7m(DisasContext *s, arg_MRS_v7m *a)
-{
-    TCGv_i32 tmp;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    tmp = tcg_temp_new_i32();
-    gen_helper_v7m_mrs(tmp, cpu_env, tcg_constant_i32(a->sysm));
-    store_reg(s, a->rd, tmp);
-    return true;
-}
-
-static bool trans_MSR_v7m(DisasContext *s, arg_MSR_v7m *a)
-{
-    TCGv_i32 addr, reg;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    addr = tcg_constant_i32((a->mask << 10) | a->sysm);
-    reg = load_reg(s, a->rn);
-    gen_helper_v7m_msr(cpu_env, addr, reg);
-    tcg_temp_free_i32(reg);
-    /* If we wrote to CONTROL, the EL might have changed */
-    gen_rebuild_hflags(s, true);
-    gen_lookup_tb(s);
-    return true;
-}
-
-static bool trans_BX(DisasContext *s, arg_BX *a)
-{
-    if (!ENABLE_ARCH_4T) {
-        return false;
-    }
-    gen_bx_excret(s, load_reg(s, a->rm));
-    return true;
-}
-
-static bool trans_BXJ(DisasContext *s, arg_BXJ *a)
-{
-    if (!ENABLE_ARCH_5J || arm_dc_feature(s, ARM_FEATURE_M)) {
-        return false;
-    }
-    /*
-     * v7A allows BXJ to be trapped via HSTR.TJDBX. We don't waste a
-     * TBFLAGS bit on a basically-never-happens case, so call a helper
-     * function to check for the trap and raise the exception if needed
-     * (passing it the register number for the syndrome value).
-     * v8A doesn't have this HSTR bit.
-     */
-    if (!arm_dc_feature(s, ARM_FEATURE_V8) &&
-        arm_dc_feature(s, ARM_FEATURE_EL2) &&
-        s->current_el < 2 && s->ns) {
-        gen_helper_check_bxj_trap(cpu_env, tcg_constant_i32(a->rm));
-    }
-    /* Trivial implementation equivalent to bx. */
-    gen_bx(s, load_reg(s, a->rm));
-    return true;
-}
-
-static bool trans_BLX_r(DisasContext *s, arg_BLX_r *a)
-{
-    TCGv_i32 tmp;
-
-    if (!ENABLE_ARCH_5) {
-        return false;
-    }
-    tmp = load_reg(s, a->rm);
-    gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb);
-    gen_bx(s, tmp);
-    return true;
-}
-
-/*
- * BXNS/BLXNS: only exist for v8M with the security extensions,
- * and always UNDEF if NonSecure. We don't implement these in
- * the user-only mode either (in theory you can use them from
- * Secure User mode but they are too tied in to system emulation).
- */ -static bool trans_BXNS(DisasContext *s, arg_BXNS *a) -{ - if (!s->v8m_secure || IS_USER_ONLY) { - unallocated_encoding(s); - } else { - gen_bxns(s, a->rm); - } - return true; -} - -static bool trans_BLXNS(DisasContext *s, arg_BLXNS *a) -{ - if (!s->v8m_secure || IS_USER_ONLY) { - unallocated_encoding(s); - } else { - gen_blxns(s, a->rm); - } - return true; -} - -static bool trans_CLZ(DisasContext *s, arg_CLZ *a) -{ - TCGv_i32 tmp; - - if (!ENABLE_ARCH_5) { - return false; - } - tmp = load_reg(s, a->rm); - tcg_gen_clzi_i32(tmp, tmp, 32); - store_reg(s, a->rd, tmp); - return true; -} - -static bool trans_ERET(DisasContext *s, arg_ERET *a) -{ - TCGv_i32 tmp; - - if (!arm_dc_feature(s, ARM_FEATURE_V7VE)) { - return false; - } - if (IS_USER(s)) { - unallocated_encoding(s); - return true; - } - if (s->current_el == 2) { - /* ERET from Hyp uses ELR_Hyp, not LR */ - tmp = load_cpu_field(elr_el[2]); - } else { - tmp = load_reg(s, 14); - } - gen_exception_return(s, tmp); - return true; -} - -static bool trans_HLT(DisasContext *s, arg_HLT *a) -{ - gen_hlt(s, a->imm); - return true; -} - -static bool trans_BKPT(DisasContext *s, arg_BKPT *a) -{ - if (!ENABLE_ARCH_5) { - return false; - } - /* BKPT is OK with ECI set and leaves it untouched */ - s->eci_handled = true; - if (arm_dc_feature(s, ARM_FEATURE_M) && - semihosting_enabled(s->current_el == 0) && - (a->imm == 0xab)) { - gen_exception_internal_insn(s, EXCP_SEMIHOST); - } else { - gen_exception_bkpt_insn(s, syn_aa32_bkpt(a->imm, false)); - } - return true; -} - -static bool trans_HVC(DisasContext *s, arg_HVC *a) -{ - if (!ENABLE_ARCH_7 || arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - if (IS_USER(s)) { - unallocated_encoding(s); - } else { - gen_hvc(s, a->imm); - } - return true; -} - -static bool trans_SMC(DisasContext *s, arg_SMC *a) -{ - if (!ENABLE_ARCH_6K || arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - if (IS_USER(s)) { - unallocated_encoding(s); - } else { - gen_smc(s); - } - return true; -} - -static bool trans_SG(DisasContext *s, arg_SG *a) -{ - if (!arm_dc_feature(s, ARM_FEATURE_M) || - !arm_dc_feature(s, ARM_FEATURE_V8)) { - return false; - } - /* - * SG (v8M only) - * The bulk of the behaviour for this instruction is implemented - * in v7m_handle_execute_nsc(), which deals with the insn when - * it is executed by a CPU in non-secure state from memory - * which is Secure & NonSecure-Callable. - * Here we only need to handle the remaining cases: - * * in NS memory (including the "security extension not - * implemented" case) : NOP - * * in S memory but CPU already secure (clear IT bits) - * We know that the attribute for the memory this insn is - * in must match the current CPU state, because otherwise - * get_phys_addr_pmsav8 would have generated an exception. - */ - if (s->v8m_secure) { - /* Like the IT insn, we don't need to generate any code */ - s->condexec_cond = 0; - s->condexec_mask = 0; - } - return true; -} - -static bool trans_TT(DisasContext *s, arg_TT *a) -{ - TCGv_i32 addr, tmp; - - if (!arm_dc_feature(s, ARM_FEATURE_M) || - !arm_dc_feature(s, ARM_FEATURE_V8)) { - return false; - } - if (a->rd == 13 || a->rd == 15 || a->rn == 15) { - /* We UNDEF for these UNPREDICTABLE cases */ - unallocated_encoding(s); - return true; - } - if (a->A && !s->v8m_secure) { - /* This case is UNDEFINED. 
*/ - unallocated_encoding(s); - return true; - } - - addr = load_reg(s, a->rn); - tmp = tcg_temp_new_i32(); - gen_helper_v7m_tt(tmp, cpu_env, addr, tcg_constant_i32((a->A << 1) | a->T)); - tcg_temp_free_i32(addr); - store_reg(s, a->rd, tmp); - return true; -} - -/* - * Load/store register index - */ - -static ISSInfo make_issinfo(DisasContext *s, int rd, bool p, bool w) -{ - ISSInfo ret; - - /* ISS not valid if writeback */ - if (p && !w) { - ret = rd; - if (curr_insn_len(s) == 2) { - ret |= ISSIs16Bit; - } - } else { - ret = ISSInvalid; - } - return ret; -} - -static TCGv_i32 op_addr_rr_pre(DisasContext *s, arg_ldst_rr *a) -{ - TCGv_i32 addr = load_reg(s, a->rn); - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - if (a->p) { - TCGv_i32 ofs = load_reg(s, a->rm); - gen_arm_shift_im(ofs, a->shtype, a->shimm, 0); - if (a->u) { - tcg_gen_add_i32(addr, addr, ofs); - } else { - tcg_gen_sub_i32(addr, addr, ofs); - } - tcg_temp_free_i32(ofs); - } - return addr; -} - -static void op_addr_rr_post(DisasContext *s, arg_ldst_rr *a, - TCGv_i32 addr, int address_offset) -{ - if (!a->p) { - TCGv_i32 ofs = load_reg(s, a->rm); - gen_arm_shift_im(ofs, a->shtype, a->shimm, 0); - if (a->u) { - tcg_gen_add_i32(addr, addr, ofs); - } else { - tcg_gen_sub_i32(addr, addr, ofs); - } - tcg_temp_free_i32(ofs); - } else if (!a->w) { - tcg_temp_free_i32(addr); - return; - } - tcg_gen_addi_i32(addr, addr, address_offset); - store_reg(s, a->rn, addr); -} - -static bool op_load_rr(DisasContext *s, arg_ldst_rr *a, - MemOp mop, int mem_idx) -{ - ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w); - TCGv_i32 addr, tmp; - - addr = op_addr_rr_pre(s, a); - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop); - disas_set_da_iss(s, mop, issinfo); - - /* - * Perform base writeback before the loaded value to - * ensure correct behavior with overlapping index registers. - */ - op_addr_rr_post(s, a, addr, 0); - store_reg_from_load(s, a->rt, tmp); - return true; -} - -static bool op_store_rr(DisasContext *s, arg_ldst_rr *a, - MemOp mop, int mem_idx) -{ - ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite; - TCGv_i32 addr, tmp; - - /* - * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it - * is either UNPREDICTABLE or has defined behaviour - */ - if (s->thumb && a->rn == 15) { - return false; - } - - addr = op_addr_rr_pre(s, a); - - tmp = load_reg(s, a->rt); - gen_aa32_st_i32(s, tmp, addr, mem_idx, mop); - disas_set_da_iss(s, mop, issinfo); - tcg_temp_free_i32(tmp); - - op_addr_rr_post(s, a, addr, 0); - return true; -} - -static bool trans_LDRD_rr(DisasContext *s, arg_ldst_rr *a) -{ - int mem_idx = get_mem_index(s); - TCGv_i32 addr, tmp; - - if (!ENABLE_ARCH_5TE) { - return false; - } - if (a->rt & 1) { - unallocated_encoding(s); - return true; - } - addr = op_addr_rr_pre(s, a); - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - store_reg(s, a->rt, tmp); - - tcg_gen_addi_i32(addr, addr, 4); - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - store_reg(s, a->rt + 1, tmp); - - /* LDRD w/ base writeback is undefined if the registers overlap. 
*/ - op_addr_rr_post(s, a, addr, -4); - return true; -} - -static bool trans_STRD_rr(DisasContext *s, arg_ldst_rr *a) -{ - int mem_idx = get_mem_index(s); - TCGv_i32 addr, tmp; - - if (!ENABLE_ARCH_5TE) { - return false; - } - if (a->rt & 1) { - unallocated_encoding(s); - return true; - } - addr = op_addr_rr_pre(s, a); - - tmp = load_reg(s, a->rt); - gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - - tcg_gen_addi_i32(addr, addr, 4); - - tmp = load_reg(s, a->rt + 1); - gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - - op_addr_rr_post(s, a, addr, -4); - return true; -} - -/* - * Load/store immediate index - */ - -static TCGv_i32 op_addr_ri_pre(DisasContext *s, arg_ldst_ri *a) -{ - int ofs = a->imm; - - if (!a->u) { - ofs = -ofs; - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - /* - * Stackcheck. Here we know 'addr' is the current SP; - * U is set if we're moving SP up, else down. It is - * UNKNOWN whether the limit check triggers when SP starts - * below the limit and ends up above it; we chose to do so. - */ - if (!a->u) { - TCGv_i32 newsp = tcg_temp_new_i32(); - tcg_gen_addi_i32(newsp, cpu_R[13], ofs); - gen_helper_v8m_stackcheck(cpu_env, newsp); - tcg_temp_free_i32(newsp); - } else { - gen_helper_v8m_stackcheck(cpu_env, cpu_R[13]); - } - } - - return add_reg_for_lit(s, a->rn, a->p ? ofs : 0); -} - -static void op_addr_ri_post(DisasContext *s, arg_ldst_ri *a, - TCGv_i32 addr, int address_offset) -{ - if (!a->p) { - if (a->u) { - address_offset += a->imm; - } else { - address_offset -= a->imm; - } - } else if (!a->w) { - tcg_temp_free_i32(addr); - return; - } - tcg_gen_addi_i32(addr, addr, address_offset); - store_reg(s, a->rn, addr); -} - -static bool op_load_ri(DisasContext *s, arg_ldst_ri *a, - MemOp mop, int mem_idx) -{ - ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w); - TCGv_i32 addr, tmp; - - addr = op_addr_ri_pre(s, a); - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, mop); - disas_set_da_iss(s, mop, issinfo); - - /* - * Perform base writeback before the loaded value to - * ensure correct behavior with overlapping index registers. - */ - op_addr_ri_post(s, a, addr, 0); - store_reg_from_load(s, a->rt, tmp); - return true; -} - -static bool op_store_ri(DisasContext *s, arg_ldst_ri *a, - MemOp mop, int mem_idx) -{ - ISSInfo issinfo = make_issinfo(s, a->rt, a->p, a->w) | ISSIsWrite; - TCGv_i32 addr, tmp; - - /* - * In Thumb encodings of stores Rn=1111 is UNDEF; for Arm it - * is either UNPREDICTABLE or has defined behaviour - */ - if (s->thumb && a->rn == 15) { - return false; - } - - addr = op_addr_ri_pre(s, a); - - tmp = load_reg(s, a->rt); - gen_aa32_st_i32(s, tmp, addr, mem_idx, mop); - disas_set_da_iss(s, mop, issinfo); - tcg_temp_free_i32(tmp); - - op_addr_ri_post(s, a, addr, 0); - return true; -} - -static bool op_ldrd_ri(DisasContext *s, arg_ldst_ri *a, int rt2) -{ - int mem_idx = get_mem_index(s); - TCGv_i32 addr, tmp; - - addr = op_addr_ri_pre(s, a); - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - store_reg(s, a->rt, tmp); - - tcg_gen_addi_i32(addr, addr, 4); - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - store_reg(s, rt2, tmp); - - /* LDRD w/ base writeback is undefined if the registers overlap. 
*/ - op_addr_ri_post(s, a, addr, -4); - return true; -} - -static bool trans_LDRD_ri_a32(DisasContext *s, arg_ldst_ri *a) -{ - if (!ENABLE_ARCH_5TE || (a->rt & 1)) { - return false; - } - return op_ldrd_ri(s, a, a->rt + 1); -} - -static bool trans_LDRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a) -{ - arg_ldst_ri b = { - .u = a->u, .w = a->w, .p = a->p, - .rn = a->rn, .rt = a->rt, .imm = a->imm - }; - return op_ldrd_ri(s, &b, a->rt2); -} - -static bool op_strd_ri(DisasContext *s, arg_ldst_ri *a, int rt2) -{ - int mem_idx = get_mem_index(s); - TCGv_i32 addr, tmp; - - addr = op_addr_ri_pre(s, a); - - tmp = load_reg(s, a->rt); - gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - - tcg_gen_addi_i32(addr, addr, 4); - - tmp = load_reg(s, rt2); - gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - - op_addr_ri_post(s, a, addr, -4); - return true; -} - -static bool trans_STRD_ri_a32(DisasContext *s, arg_ldst_ri *a) -{ - if (!ENABLE_ARCH_5TE || (a->rt & 1)) { - return false; - } - return op_strd_ri(s, a, a->rt + 1); -} - -static bool trans_STRD_ri_t32(DisasContext *s, arg_ldst_ri2 *a) -{ - arg_ldst_ri b = { - .u = a->u, .w = a->w, .p = a->p, - .rn = a->rn, .rt = a->rt, .imm = a->imm - }; - return op_strd_ri(s, &b, a->rt2); -} - -#define DO_LDST(NAME, WHICH, MEMOP) \ -static bool trans_##NAME##_ri(DisasContext *s, arg_ldst_ri *a) \ -{ \ - return op_##WHICH##_ri(s, a, MEMOP, get_mem_index(s)); \ -} \ -static bool trans_##NAME##T_ri(DisasContext *s, arg_ldst_ri *a) \ -{ \ - return op_##WHICH##_ri(s, a, MEMOP, get_a32_user_mem_index(s)); \ -} \ -static bool trans_##NAME##_rr(DisasContext *s, arg_ldst_rr *a) \ -{ \ - return op_##WHICH##_rr(s, a, MEMOP, get_mem_index(s)); \ -} \ -static bool trans_##NAME##T_rr(DisasContext *s, arg_ldst_rr *a) \ -{ \ - return op_##WHICH##_rr(s, a, MEMOP, get_a32_user_mem_index(s)); \ -} - -DO_LDST(LDR, load, MO_UL) -DO_LDST(LDRB, load, MO_UB) -DO_LDST(LDRH, load, MO_UW) -DO_LDST(LDRSB, load, MO_SB) -DO_LDST(LDRSH, load, MO_SW) - -DO_LDST(STR, store, MO_UL) -DO_LDST(STRB, store, MO_UB) -DO_LDST(STRH, store, MO_UW) - -#undef DO_LDST - -/* - * Synchronization primitives - */ - -static bool op_swp(DisasContext *s, arg_SWP *a, MemOp opc) -{ - TCGv_i32 addr, tmp; - TCGv taddr; - - opc |= s->be_data; - addr = load_reg(s, a->rn); - taddr = gen_aa32_addr(s, addr, opc); - tcg_temp_free_i32(addr); - - tmp = load_reg(s, a->rt2); - tcg_gen_atomic_xchg_i32(tmp, taddr, tmp, get_mem_index(s), opc); - tcg_temp_free(taddr); - - store_reg(s, a->rt, tmp); - return true; -} - -static bool trans_SWP(DisasContext *s, arg_SWP *a) -{ - return op_swp(s, a, MO_UL | MO_ALIGN); -} - -static bool trans_SWPB(DisasContext *s, arg_SWP *a) -{ - return op_swp(s, a, MO_UB); -} - -/* - * Load/Store Exclusive and Load-Acquire/Store-Release - */ - -static bool op_strex(DisasContext *s, arg_STREX *a, MemOp mop, bool rel) -{ - TCGv_i32 addr; - /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */ - bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M); - - /* We UNDEF for these UNPREDICTABLE cases. 
*/ - if (a->rd == 15 || a->rn == 15 || a->rt == 15 - || a->rd == a->rn || a->rd == a->rt - || (!v8a && s->thumb && (a->rd == 13 || a->rt == 13)) - || (mop == MO_64 - && (a->rt2 == 15 - || a->rd == a->rt2 - || (!v8a && s->thumb && a->rt2 == 13)))) { - unallocated_encoding(s); - return true; - } - - if (rel) { - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - } - - addr = tcg_temp_local_new_i32(); - load_reg_var(s, addr, a->rn); - tcg_gen_addi_i32(addr, addr, a->imm); - - gen_store_exclusive(s, a->rd, a->rt, a->rt2, addr, mop); - tcg_temp_free_i32(addr); - return true; -} - -static bool trans_STREX(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_6) { - return false; - } - return op_strex(s, a, MO_32, false); -} - -static bool trans_STREXD_a32(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_6K) { - return false; - } - /* We UNDEF for these UNPREDICTABLE cases. */ - if (a->rt & 1) { - unallocated_encoding(s); - return true; - } - a->rt2 = a->rt + 1; - return op_strex(s, a, MO_64, false); -} - -static bool trans_STREXD_t32(DisasContext *s, arg_STREX *a) -{ - return op_strex(s, a, MO_64, false); -} - -static bool trans_STREXB(DisasContext *s, arg_STREX *a) -{ - if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { - return false; - } - return op_strex(s, a, MO_8, false); -} - -static bool trans_STREXH(DisasContext *s, arg_STREX *a) -{ - if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { - return false; - } - return op_strex(s, a, MO_16, false); -} - -static bool trans_STLEX(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_strex(s, a, MO_32, true); -} - -static bool trans_STLEXD_a32(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - /* We UNDEF for these UNPREDICTABLE cases. */ - if (a->rt & 1) { - unallocated_encoding(s); - return true; - } - a->rt2 = a->rt + 1; - return op_strex(s, a, MO_64, true); -} - -static bool trans_STLEXD_t32(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_strex(s, a, MO_64, true); -} - -static bool trans_STLEXB(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_strex(s, a, MO_8, true); -} - -static bool trans_STLEXH(DisasContext *s, arg_STREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_strex(s, a, MO_16, true); -} - -static bool op_stl(DisasContext *s, arg_STL *a, MemOp mop) -{ - TCGv_i32 addr, tmp; - - if (!ENABLE_ARCH_8) { - return false; - } - /* We UNDEF for these UNPREDICTABLE cases. */ - if (a->rn == 15 || a->rt == 15) { - unallocated_encoding(s); - return true; - } - - addr = load_reg(s, a->rn); - tmp = load_reg(s, a->rt); - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - gen_aa32_st_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN); - disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel | ISSIsWrite); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(addr); - return true; -} - -static bool trans_STL(DisasContext *s, arg_STL *a) -{ - return op_stl(s, a, MO_UL); -} - -static bool trans_STLB(DisasContext *s, arg_STL *a) -{ - return op_stl(s, a, MO_UB); -} - -static bool trans_STLH(DisasContext *s, arg_STL *a) -{ - return op_stl(s, a, MO_UW); -} - -static bool op_ldrex(DisasContext *s, arg_LDREX *a, MemOp mop, bool acq) -{ - TCGv_i32 addr; - /* Some cases stopped being UNPREDICTABLE in v8A (but not v8M) */ - bool v8a = ENABLE_ARCH_8 && !arm_dc_feature(s, ARM_FEATURE_M); - - /* We UNDEF for these UNPREDICTABLE cases. 
*/ - if (a->rn == 15 || a->rt == 15 - || (!v8a && s->thumb && a->rt == 13) - || (mop == MO_64 - && (a->rt2 == 15 || a->rt == a->rt2 - || (!v8a && s->thumb && a->rt2 == 13)))) { - unallocated_encoding(s); - return true; - } - - addr = tcg_temp_local_new_i32(); - load_reg_var(s, addr, a->rn); - tcg_gen_addi_i32(addr, addr, a->imm); - - gen_load_exclusive(s, a->rt, a->rt2, addr, mop); - tcg_temp_free_i32(addr); - - if (acq) { - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); - } - return true; -} - -static bool trans_LDREX(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_6) { - return false; - } - return op_ldrex(s, a, MO_32, false); -} - -static bool trans_LDREXD_a32(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_6K) { - return false; - } - /* We UNDEF for these UNPREDICTABLE cases. */ - if (a->rt & 1) { - unallocated_encoding(s); - return true; - } - a->rt2 = a->rt + 1; - return op_ldrex(s, a, MO_64, false); -} - -static bool trans_LDREXD_t32(DisasContext *s, arg_LDREX *a) -{ - return op_ldrex(s, a, MO_64, false); -} - -static bool trans_LDREXB(DisasContext *s, arg_LDREX *a) -{ - if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { - return false; - } - return op_ldrex(s, a, MO_8, false); -} - -static bool trans_LDREXH(DisasContext *s, arg_LDREX *a) -{ - if (s->thumb ? !ENABLE_ARCH_7 : !ENABLE_ARCH_6K) { - return false; - } - return op_ldrex(s, a, MO_16, false); -} - -static bool trans_LDAEX(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_ldrex(s, a, MO_32, true); -} - -static bool trans_LDAEXD_a32(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - /* We UNDEF for these UNPREDICTABLE cases. */ - if (a->rt & 1) { - unallocated_encoding(s); - return true; - } - a->rt2 = a->rt + 1; - return op_ldrex(s, a, MO_64, true); -} - -static bool trans_LDAEXD_t32(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_ldrex(s, a, MO_64, true); -} - -static bool trans_LDAEXB(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_ldrex(s, a, MO_8, true); -} - -static bool trans_LDAEXH(DisasContext *s, arg_LDREX *a) -{ - if (!ENABLE_ARCH_8) { - return false; - } - return op_ldrex(s, a, MO_16, true); -} - -static bool op_lda(DisasContext *s, arg_LDA *a, MemOp mop) -{ - TCGv_i32 addr, tmp; - - if (!ENABLE_ARCH_8) { - return false; - } - /* We UNDEF for these UNPREDICTABLE cases. 
*/ - if (a->rn == 15 || a->rt == 15) { - unallocated_encoding(s); - return true; - } - - addr = load_reg(s, a->rn); - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop | MO_ALIGN); - disas_set_da_iss(s, mop, a->rt | ISSIsAcqRel); - tcg_temp_free_i32(addr); - - store_reg(s, a->rt, tmp); - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - return true; -} - -static bool trans_LDA(DisasContext *s, arg_LDA *a) -{ - return op_lda(s, a, MO_UL); -} - -static bool trans_LDAB(DisasContext *s, arg_LDA *a) -{ - return op_lda(s, a, MO_UB); -} - -static bool trans_LDAH(DisasContext *s, arg_LDA *a) -{ - return op_lda(s, a, MO_UW); -} - -/* - * Media instructions - */ - -static bool trans_USADA8(DisasContext *s, arg_USADA8 *a) -{ - TCGv_i32 t1, t2; - - if (!ENABLE_ARCH_6) { - return false; - } - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - gen_helper_usad8(t1, t1, t2); - tcg_temp_free_i32(t2); - if (a->ra != 15) { - t2 = load_reg(s, a->ra); - tcg_gen_add_i32(t1, t1, t2); - tcg_temp_free_i32(t2); - } - store_reg(s, a->rd, t1); - return true; -} - -static bool op_bfx(DisasContext *s, arg_UBFX *a, bool u) -{ - TCGv_i32 tmp; - int width = a->widthm1 + 1; - int shift = a->lsb; - - if (!ENABLE_ARCH_6T2) { - return false; - } - if (shift + width > 32) { - /* UNPREDICTABLE; we choose to UNDEF */ - unallocated_encoding(s); - return true; - } - - tmp = load_reg(s, a->rn); - if (u) { - tcg_gen_extract_i32(tmp, tmp, shift, width); - } else { - tcg_gen_sextract_i32(tmp, tmp, shift, width); - } - store_reg(s, a->rd, tmp); - return true; -} - -static bool trans_SBFX(DisasContext *s, arg_SBFX *a) -{ - return op_bfx(s, a, false); -} - -static bool trans_UBFX(DisasContext *s, arg_UBFX *a) -{ - return op_bfx(s, a, true); -} - -static bool trans_BFCI(DisasContext *s, arg_BFCI *a) -{ - TCGv_i32 tmp; - int msb = a->msb, lsb = a->lsb; - int width; - - if (!ENABLE_ARCH_6T2) { - return false; - } - if (msb < lsb) { - /* UNPREDICTABLE; we choose to UNDEF */ - unallocated_encoding(s); - return true; - } - - width = msb + 1 - lsb; - if (a->rn == 15) { - /* BFC */ - tmp = tcg_const_i32(0); - } else { - /* BFI */ - tmp = load_reg(s, a->rn); - } - if (width != 32) { - TCGv_i32 tmp2 = load_reg(s, a->rd); - tcg_gen_deposit_i32(tmp, tmp2, tmp, lsb, width); - tcg_temp_free_i32(tmp2); - } - store_reg(s, a->rd, tmp); - return true; -} - -static bool trans_UDF(DisasContext *s, arg_UDF *a) -{ - unallocated_encoding(s); - return true; -} - -/* - * Parallel addition and subtraction - */ - -static bool op_par_addsub(DisasContext *s, arg_rrr *a, - void (*gen)(TCGv_i32, TCGv_i32, TCGv_i32)) -{ - TCGv_i32 t0, t1; - - if (s->thumb - ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_6) { - return false; - } - - t0 = load_reg(s, a->rn); - t1 = load_reg(s, a->rm); - - gen(t0, t0, t1); - - tcg_temp_free_i32(t1); - store_reg(s, a->rd, t0); - return true; -} - -static bool op_par_addsub_ge(DisasContext *s, arg_rrr *a, - void (*gen)(TCGv_i32, TCGv_i32, - TCGv_i32, TCGv_ptr)) -{ - TCGv_i32 t0, t1; - TCGv_ptr ge; - - if (s->thumb - ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_6) { - return false; - } - - t0 = load_reg(s, a->rn); - t1 = load_reg(s, a->rm); - - ge = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(ge, cpu_env, offsetof(CPUARMState, GE)); - gen(t0, t0, t1, ge); - - tcg_temp_free_ptr(ge); - tcg_temp_free_i32(t1); - store_reg(s, a->rd, t0); - return true; -} - -#define DO_PAR_ADDSUB(NAME, helper) \ -static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ -{ \ - return op_par_addsub(s, a, helper); \ -} - -#define DO_PAR_ADDSUB_GE(NAME, helper) \ -static bool trans_##NAME(DisasContext *s, arg_rrr *a) \ -{ \ - return op_par_addsub_ge(s, a, helper); \ -} - -DO_PAR_ADDSUB_GE(SADD16, gen_helper_sadd16) -DO_PAR_ADDSUB_GE(SASX, gen_helper_saddsubx) -DO_PAR_ADDSUB_GE(SSAX, gen_helper_ssubaddx) -DO_PAR_ADDSUB_GE(SSUB16, gen_helper_ssub16) -DO_PAR_ADDSUB_GE(SADD8, gen_helper_sadd8) -DO_PAR_ADDSUB_GE(SSUB8, gen_helper_ssub8) - -DO_PAR_ADDSUB_GE(UADD16, gen_helper_uadd16) -DO_PAR_ADDSUB_GE(UASX, gen_helper_uaddsubx) -DO_PAR_ADDSUB_GE(USAX, gen_helper_usubaddx) -DO_PAR_ADDSUB_GE(USUB16, gen_helper_usub16) -DO_PAR_ADDSUB_GE(UADD8, gen_helper_uadd8) -DO_PAR_ADDSUB_GE(USUB8, gen_helper_usub8) - -DO_PAR_ADDSUB(QADD16, gen_helper_qadd16) -DO_PAR_ADDSUB(QASX, gen_helper_qaddsubx) -DO_PAR_ADDSUB(QSAX, gen_helper_qsubaddx) -DO_PAR_ADDSUB(QSUB16, gen_helper_qsub16) -DO_PAR_ADDSUB(QADD8, gen_helper_qadd8) -DO_PAR_ADDSUB(QSUB8, gen_helper_qsub8) - -DO_PAR_ADDSUB(UQADD16, gen_helper_uqadd16) -DO_PAR_ADDSUB(UQASX, gen_helper_uqaddsubx) -DO_PAR_ADDSUB(UQSAX, gen_helper_uqsubaddx) -DO_PAR_ADDSUB(UQSUB16, gen_helper_uqsub16) -DO_PAR_ADDSUB(UQADD8, gen_helper_uqadd8) -DO_PAR_ADDSUB(UQSUB8, gen_helper_uqsub8) - -DO_PAR_ADDSUB(SHADD16, gen_helper_shadd16) -DO_PAR_ADDSUB(SHASX, gen_helper_shaddsubx) -DO_PAR_ADDSUB(SHSAX, gen_helper_shsubaddx) -DO_PAR_ADDSUB(SHSUB16, gen_helper_shsub16) -DO_PAR_ADDSUB(SHADD8, gen_helper_shadd8) -DO_PAR_ADDSUB(SHSUB8, gen_helper_shsub8) - -DO_PAR_ADDSUB(UHADD16, gen_helper_uhadd16) -DO_PAR_ADDSUB(UHASX, gen_helper_uhaddsubx) -DO_PAR_ADDSUB(UHSAX, gen_helper_uhsubaddx) -DO_PAR_ADDSUB(UHSUB16, gen_helper_uhsub16) -DO_PAR_ADDSUB(UHADD8, gen_helper_uhadd8) -DO_PAR_ADDSUB(UHSUB8, gen_helper_uhsub8) - -#undef DO_PAR_ADDSUB -#undef DO_PAR_ADDSUB_GE - -/* - * Packing, unpacking, saturation, and reversal - */ - -static bool trans_PKH(DisasContext *s, arg_PKH *a) -{ - TCGv_i32 tn, tm; - int shift = a->imm; - - if (s->thumb - ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_6) { - return false; - } - - tn = load_reg(s, a->rn); - tm = load_reg(s, a->rm); - if (a->tb) { - /* PKHTB */ - if (shift == 0) { - shift = 31; - } - tcg_gen_sari_i32(tm, tm, shift); - tcg_gen_deposit_i32(tn, tn, tm, 0, 16); - } else { - /* PKHBT */ - tcg_gen_shli_i32(tm, tm, shift); - tcg_gen_deposit_i32(tn, tm, tn, 0, 16); - } - tcg_temp_free_i32(tm); - store_reg(s, a->rd, tn); - return true; -} - -static bool op_sat(DisasContext *s, arg_sat *a, - void (*gen)(TCGv_i32, TCGv_env, TCGv_i32, TCGv_i32)) -{ - TCGv_i32 tmp; - int shift = a->imm; - - if (!ENABLE_ARCH_6) { - return false; - } - - tmp = load_reg(s, a->rn); - if (a->sh) { - tcg_gen_sari_i32(tmp, tmp, shift ? 
shift : 31); - } else { - tcg_gen_shli_i32(tmp, tmp, shift); - } - - gen(tmp, cpu_env, tmp, tcg_constant_i32(a->satimm)); - - store_reg(s, a->rd, tmp); - return true; -} - -static bool trans_SSAT(DisasContext *s, arg_sat *a) -{ - return op_sat(s, a, gen_helper_ssat); -} - -static bool trans_USAT(DisasContext *s, arg_sat *a) -{ - return op_sat(s, a, gen_helper_usat); -} - -static bool trans_SSAT16(DisasContext *s, arg_sat *a) -{ - if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { - return false; - } - return op_sat(s, a, gen_helper_ssat16); -} - -static bool trans_USAT16(DisasContext *s, arg_sat *a) -{ - if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { - return false; - } - return op_sat(s, a, gen_helper_usat16); -} - -static bool op_xta(DisasContext *s, arg_rrr_rot *a, - void (*gen_extract)(TCGv_i32, TCGv_i32), - void (*gen_add)(TCGv_i32, TCGv_i32, TCGv_i32)) -{ - TCGv_i32 tmp; - - if (!ENABLE_ARCH_6) { - return false; - } - - tmp = load_reg(s, a->rm); - /* - * TODO: In many cases we could do a shift instead of a rotate. - * Combined with a simple extend, that becomes an extract. - */ - tcg_gen_rotri_i32(tmp, tmp, a->rot * 8); - gen_extract(tmp, tmp); - - if (a->rn != 15) { - TCGv_i32 tmp2 = load_reg(s, a->rn); - gen_add(tmp, tmp, tmp2); - tcg_temp_free_i32(tmp2); - } - store_reg(s, a->rd, tmp); - return true; -} - -static bool trans_SXTAB(DisasContext *s, arg_rrr_rot *a) -{ - return op_xta(s, a, tcg_gen_ext8s_i32, tcg_gen_add_i32); -} - -static bool trans_SXTAH(DisasContext *s, arg_rrr_rot *a) -{ - return op_xta(s, a, tcg_gen_ext16s_i32, tcg_gen_add_i32); -} - -static bool trans_SXTAB16(DisasContext *s, arg_rrr_rot *a) -{ - if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { - return false; - } - return op_xta(s, a, gen_helper_sxtb16, gen_add16); -} - -static bool trans_UXTAB(DisasContext *s, arg_rrr_rot *a) -{ - return op_xta(s, a, tcg_gen_ext8u_i32, tcg_gen_add_i32); -} - -static bool trans_UXTAH(DisasContext *s, arg_rrr_rot *a) -{ - return op_xta(s, a, tcg_gen_ext16u_i32, tcg_gen_add_i32); -} - -static bool trans_UXTAB16(DisasContext *s, arg_rrr_rot *a) -{ - if (s->thumb && !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP)) { - return false; - } - return op_xta(s, a, gen_helper_uxtb16, gen_add16); -} - -static bool trans_SEL(DisasContext *s, arg_rrr *a) -{ - TCGv_i32 t1, t2, t3; - - if (s->thumb - ? 
!arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_6) { - return false; - } - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - t3 = tcg_temp_new_i32(); - tcg_gen_ld_i32(t3, cpu_env, offsetof(CPUARMState, GE)); - gen_helper_sel_flags(t1, t3, t1, t2); - tcg_temp_free_i32(t3); - tcg_temp_free_i32(t2); - store_reg(s, a->rd, t1); - return true; -} - -static bool op_rr(DisasContext *s, arg_rr *a, - void (*gen)(TCGv_i32, TCGv_i32)) -{ - TCGv_i32 tmp; - - tmp = load_reg(s, a->rm); - gen(tmp, tmp); - store_reg(s, a->rd, tmp); - return true; -} - -static bool trans_REV(DisasContext *s, arg_rr *a) -{ - if (!ENABLE_ARCH_6) { - return false; - } - return op_rr(s, a, tcg_gen_bswap32_i32); -} - -static bool trans_REV16(DisasContext *s, arg_rr *a) -{ - if (!ENABLE_ARCH_6) { - return false; - } - return op_rr(s, a, gen_rev16); -} - -static bool trans_REVSH(DisasContext *s, arg_rr *a) -{ - if (!ENABLE_ARCH_6) { - return false; - } - return op_rr(s, a, gen_revsh); -} - -static bool trans_RBIT(DisasContext *s, arg_rr *a) -{ - if (!ENABLE_ARCH_6T2) { - return false; - } - return op_rr(s, a, gen_helper_rbit); -} - -/* - * Signed multiply, signed and unsigned divide - */ - -static bool op_smlad(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) -{ - TCGv_i32 t1, t2; - - if (!ENABLE_ARCH_6) { - return false; - } - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - if (m_swap) { - gen_swap_half(t2, t2); - } - gen_smul_dual(t1, t2); - - if (sub) { - /* - * This subtraction cannot overflow, so we can do a simple - * 32-bit subtraction and then a possible 32-bit saturating - * addition of Ra. - */ - tcg_gen_sub_i32(t1, t1, t2); - tcg_temp_free_i32(t2); - - if (a->ra != 15) { - t2 = load_reg(s, a->ra); - gen_helper_add_setq(t1, cpu_env, t1, t2); - tcg_temp_free_i32(t2); - } - } else if (a->ra == 15) { - /* Single saturation-checking addition */ - gen_helper_add_setq(t1, cpu_env, t1, t2); - tcg_temp_free_i32(t2); - } else { - /* - * We need to add the products and Ra together and then - * determine whether the final result overflowed. Doing - * this as two separate add-and-check-overflow steps incorrectly - * sets Q for cases like (-32768 * -32768) + (-32768 * -32768) + -1. - * Do all the arithmetic at 64-bits and then check for overflow. - */ - TCGv_i64 p64, q64; - TCGv_i32 t3, qf, one; - - p64 = tcg_temp_new_i64(); - q64 = tcg_temp_new_i64(); - tcg_gen_ext_i32_i64(p64, t1); - tcg_gen_ext_i32_i64(q64, t2); - tcg_gen_add_i64(p64, p64, q64); - load_reg_var(s, t2, a->ra); - tcg_gen_ext_i32_i64(q64, t2); - tcg_gen_add_i64(p64, p64, q64); - tcg_temp_free_i64(q64); - - tcg_gen_extr_i64_i32(t1, t2, p64); - tcg_temp_free_i64(p64); - /* - * t1 is the low half of the result which goes into Rd. - * We have overflow and must set Q if the high half (t2) - * is different from the sign-extension of t1. 
- */ - t3 = tcg_temp_new_i32(); - tcg_gen_sari_i32(t3, t1, 31); - qf = load_cpu_field(QF); - one = tcg_constant_i32(1); - tcg_gen_movcond_i32(TCG_COND_NE, qf, t2, t3, one, qf); - store_cpu_field(qf, QF); - tcg_temp_free_i32(t3); - tcg_temp_free_i32(t2); - } - store_reg(s, a->rd, t1); - return true; -} - -static bool trans_SMLAD(DisasContext *s, arg_rrrr *a) -{ - return op_smlad(s, a, false, false); -} - -static bool trans_SMLADX(DisasContext *s, arg_rrrr *a) -{ - return op_smlad(s, a, true, false); -} - -static bool trans_SMLSD(DisasContext *s, arg_rrrr *a) -{ - return op_smlad(s, a, false, true); -} - -static bool trans_SMLSDX(DisasContext *s, arg_rrrr *a) -{ - return op_smlad(s, a, true, true); -} - -static bool op_smlald(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) -{ - TCGv_i32 t1, t2; - TCGv_i64 l1, l2; - - if (!ENABLE_ARCH_6) { - return false; - } - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - if (m_swap) { - gen_swap_half(t2, t2); - } - gen_smul_dual(t1, t2); - - l1 = tcg_temp_new_i64(); - l2 = tcg_temp_new_i64(); - tcg_gen_ext_i32_i64(l1, t1); - tcg_gen_ext_i32_i64(l2, t2); - tcg_temp_free_i32(t1); - tcg_temp_free_i32(t2); - - if (sub) { - tcg_gen_sub_i64(l1, l1, l2); - } else { - tcg_gen_add_i64(l1, l1, l2); - } - tcg_temp_free_i64(l2); - - gen_addq(s, l1, a->ra, a->rd); - gen_storeq_reg(s, a->ra, a->rd, l1); - tcg_temp_free_i64(l1); - return true; -} - -static bool trans_SMLALD(DisasContext *s, arg_rrrr *a) -{ - return op_smlald(s, a, false, false); -} - -static bool trans_SMLALDX(DisasContext *s, arg_rrrr *a) -{ - return op_smlald(s, a, true, false); -} - -static bool trans_SMLSLD(DisasContext *s, arg_rrrr *a) -{ - return op_smlald(s, a, false, true); -} - -static bool trans_SMLSLDX(DisasContext *s, arg_rrrr *a) -{ - return op_smlald(s, a, true, true); -} - -static bool op_smmla(DisasContext *s, arg_rrrr *a, bool round, bool sub) -{ - TCGv_i32 t1, t2; - - if (s->thumb - ? !arm_dc_feature(s, ARM_FEATURE_THUMB_DSP) - : !ENABLE_ARCH_6) { - return false; - } - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - tcg_gen_muls2_i32(t2, t1, t1, t2); - - if (a->ra != 15) { - TCGv_i32 t3 = load_reg(s, a->ra); - if (sub) { - /* - * For SMMLS, we need a 64-bit subtract. Borrow caused by - * a non-zero multiplicand lowpart, and the correct result - * lowpart for rounding. - */ - tcg_gen_sub2_i32(t2, t1, tcg_constant_i32(0), t3, t2, t1); - } else { - tcg_gen_add_i32(t1, t1, t3); - } - tcg_temp_free_i32(t3); - } - if (round) { - /* - * Adding 0x80000000 to the 64-bit quantity means that we have - * carry in to the high word when the low word has the msb set. - */ - tcg_gen_shri_i32(t2, t2, 31); - tcg_gen_add_i32(t1, t1, t2); - } - tcg_temp_free_i32(t2); - store_reg(s, a->rd, t1); - return true; -} - -static bool trans_SMMLA(DisasContext *s, arg_rrrr *a) -{ - return op_smmla(s, a, false, false); -} - -static bool trans_SMMLAR(DisasContext *s, arg_rrrr *a) -{ - return op_smmla(s, a, true, false); -} - -static bool trans_SMMLS(DisasContext *s, arg_rrrr *a) -{ - return op_smmla(s, a, false, true); -} - -static bool trans_SMMLSR(DisasContext *s, arg_rrrr *a) -{ - return op_smmla(s, a, true, true); -} - -static bool op_div(DisasContext *s, arg_rrr *a, bool u) -{ - TCGv_i32 t1, t2; - - if (s->thumb - ? 
!dc_isar_feature(aa32_thumb_div, s) - : !dc_isar_feature(aa32_arm_div, s)) { - return false; - } - - t1 = load_reg(s, a->rn); - t2 = load_reg(s, a->rm); - if (u) { - gen_helper_udiv(t1, cpu_env, t1, t2); - } else { - gen_helper_sdiv(t1, cpu_env, t1, t2); - } - tcg_temp_free_i32(t2); - store_reg(s, a->rd, t1); - return true; -} - -static bool trans_SDIV(DisasContext *s, arg_rrr *a) -{ - return op_div(s, a, false); -} - -static bool trans_UDIV(DisasContext *s, arg_rrr *a) -{ - return op_div(s, a, true); -} - -/* - * Block data transfer - */ - -static TCGv_i32 op_addr_block_pre(DisasContext *s, arg_ldst_block *a, int n) -{ - TCGv_i32 addr = load_reg(s, a->rn); - - if (a->b) { - if (a->i) { - /* pre increment */ - tcg_gen_addi_i32(addr, addr, 4); - } else { - /* pre decrement */ - tcg_gen_addi_i32(addr, addr, -(n * 4)); - } - } else if (!a->i && n != 1) { - /* post decrement */ - tcg_gen_addi_i32(addr, addr, -((n - 1) * 4)); - } - - if (s->v8m_stackcheck && a->rn == 13 && a->w) { - /* - * If the writeback is incrementing SP rather than - * decrementing it, and the initial SP is below the - * stack limit but the final written-back SP would - * be above, then we must not perform any memory - * accesses, but it is IMPDEF whether we generate - * an exception. We choose to do so in this case. - * At this point 'addr' is the lowest address, so - * either the original SP (if incrementing) or our - * final SP (if decrementing), so that's what we check. - */ - gen_helper_v8m_stackcheck(cpu_env, addr); - } - - return addr; -} - -static void op_addr_block_post(DisasContext *s, arg_ldst_block *a, - TCGv_i32 addr, int n) -{ - if (a->w) { - /* write back */ - if (!a->b) { - if (a->i) { - /* post increment */ - tcg_gen_addi_i32(addr, addr, 4); - } else { - /* post decrement */ - tcg_gen_addi_i32(addr, addr, -(n * 4)); - } - } else if (!a->i && n != 1) { - /* pre decrement */ - tcg_gen_addi_i32(addr, addr, -((n - 1) * 4)); - } - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } -} - -static bool op_stm(DisasContext *s, arg_ldst_block *a, int min_n) -{ - int i, j, n, list, mem_idx; - bool user = a->u; - TCGv_i32 addr, tmp; - - if (user) { - /* STM (user) */ - if (IS_USER(s)) { - /* Only usable in supervisor mode. */ - unallocated_encoding(s); - return true; - } - } - - list = a->list; - n = ctpop16(list); - if (n < min_n || a->rn == 15) { - unallocated_encoding(s); - return true; - } - - s->eci_handled = true; - - addr = op_addr_block_pre(s, a, n); - mem_idx = get_mem_index(s); - - for (i = j = 0; i < 16; i++) { - if (!(list & (1 << i))) { - continue; - } - - if (user && i != 15) { - tmp = tcg_temp_new_i32(); - gen_helper_get_user_reg(tmp, cpu_env, tcg_constant_i32(i)); - } else { - tmp = load_reg(s, i); - } - gen_aa32_st_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - tcg_temp_free_i32(tmp); - - /* No need to add after the last transfer. */ - if (++j != n) { - tcg_gen_addi_i32(addr, addr, 4); - } - } - - op_addr_block_post(s, a, addr, n); - clear_eci_state(s); - return true; -} - -static bool trans_STM(DisasContext *s, arg_ldst_block *a) -{ - /* BitCount(list) < 1 is UNPREDICTABLE */ - return op_stm(s, a, 1); -} - -static bool trans_STM_t32(DisasContext *s, arg_ldst_block *a) -{ - /* Writeback register in register list is UNPREDICTABLE for T32. 
*/ - if (a->w && (a->list & (1 << a->rn))) { - unallocated_encoding(s); - return true; - } - /* BitCount(list) < 2 is UNPREDICTABLE */ - return op_stm(s, a, 2); -} - -static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n) -{ - int i, j, n, list, mem_idx; - bool loaded_base; - bool user = a->u; - bool exc_return = false; - TCGv_i32 addr, tmp, loaded_var; - - if (user) { - /* LDM (user), LDM (exception return) */ - if (IS_USER(s)) { - /* Only usable in supervisor mode. */ - unallocated_encoding(s); - return true; - } - if (extract32(a->list, 15, 1)) { - exc_return = true; - user = false; - } else { - /* LDM (user) does not allow writeback. */ - if (a->w) { - unallocated_encoding(s); - return true; - } - } - } - - list = a->list; - n = ctpop16(list); - if (n < min_n || a->rn == 15) { - unallocated_encoding(s); - return true; - } - - s->eci_handled = true; - - addr = op_addr_block_pre(s, a, n); - mem_idx = get_mem_index(s); - loaded_base = false; - loaded_var = NULL; - - for (i = j = 0; i < 16; i++) { - if (!(list & (1 << i))) { - continue; - } - - tmp = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, tmp, addr, mem_idx, MO_UL | MO_ALIGN); - if (user) { - gen_helper_set_user_reg(cpu_env, tcg_constant_i32(i), tmp); - tcg_temp_free_i32(tmp); - } else if (i == a->rn) { - loaded_var = tmp; - loaded_base = true; - } else if (i == 15 && exc_return) { - store_pc_exc_ret(s, tmp); - } else { - store_reg_from_load(s, i, tmp); - } - - /* No need to add after the last transfer. */ - if (++j != n) { - tcg_gen_addi_i32(addr, addr, 4); - } - } - - op_addr_block_post(s, a, addr, n); - - if (loaded_base) { - /* Note that we reject base == pc above. */ - store_reg(s, a->rn, loaded_var); - } - - if (exc_return) { - /* Restore CPSR from SPSR. */ - tmp = load_cpu_field(spsr); - if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { - gen_io_start(); - } - gen_helper_cpsr_write_eret(cpu_env, tmp); - tcg_temp_free_i32(tmp); - /* Must exit loop to check un-masked IRQs */ - s->base.is_jmp = DISAS_EXIT; - } - clear_eci_state(s); - return true; -} - -static bool trans_LDM_a32(DisasContext *s, arg_ldst_block *a) -{ - /* - * Writeback register in register list is UNPREDICTABLE - * for ArchVersion() >= 7. Prior to v7, A32 would write - * an UNKNOWN value to the base register. - */ - if (ENABLE_ARCH_7 && a->w && (a->list & (1 << a->rn))) { - unallocated_encoding(s); - return true; - } - /* BitCount(list) < 1 is UNPREDICTABLE */ - return do_ldm(s, a, 1); -} - -static bool trans_LDM_t32(DisasContext *s, arg_ldst_block *a) -{ - /* Writeback register in register list is UNPREDICTABLE for T32. */ - if (a->w && (a->list & (1 << a->rn))) { - unallocated_encoding(s); - return true; - } - /* BitCount(list) < 2 is UNPREDICTABLE */ - return do_ldm(s, a, 2); -} - -static bool trans_LDM_t16(DisasContext *s, arg_ldst_block *a) -{ - /* Writeback is conditional on the base register not being loaded. 
*/ - a->w = !(a->list & (1 << a->rn)); - /* BitCount(list) < 1 is UNPREDICTABLE */ - return do_ldm(s, a, 1); -} - -static bool trans_CLRM(DisasContext *s, arg_CLRM *a) -{ - int i; - TCGv_i32 zero; - - if (!dc_isar_feature(aa32_m_sec_state, s)) { - return false; - } - - if (extract32(a->list, 13, 1)) { - return false; - } - - if (!a->list) { - /* UNPREDICTABLE; we choose to UNDEF */ - return false; - } - - s->eci_handled = true; - - zero = tcg_constant_i32(0); - for (i = 0; i < 15; i++) { - if (extract32(a->list, i, 1)) { - /* Clear R[i] */ - tcg_gen_mov_i32(cpu_R[i], zero); - } - } - if (extract32(a->list, 15, 1)) { - /* - * Clear APSR (by calling the MSR helper with the same argument - * as for "MSR APSR_nzcvqg, Rn": mask = 0b1100, SYSM=0) - */ - gen_helper_v7m_msr(cpu_env, tcg_constant_i32(0xc00), zero); - } - clear_eci_state(s); - return true; -} - -/* - * Branch, branch with link - */ - -static bool trans_B(DisasContext *s, arg_i *a) -{ - gen_jmp(s, jmp_diff(s, a->imm)); - return true; -} - -static bool trans_B_cond_thumb(DisasContext *s, arg_ci *a) -{ - /* This has cond from encoding, required to be outside IT block. */ - if (a->cond >= 0xe) { - return false; - } - if (s->condexec_mask) { - unallocated_encoding(s); - return true; - } - arm_skip_unless(s, a->cond); - gen_jmp(s, jmp_diff(s, a->imm)); - return true; -} - -static bool trans_BL(DisasContext *s, arg_i *a) -{ - gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); - gen_jmp(s, jmp_diff(s, a->imm)); - return true; -} - -static bool trans_BLX_i(DisasContext *s, arg_BLX_i *a) -{ - /* - * BLX would be useless on M-profile; the encoding space - * is used for other insns from v8.1M onward, and UNDEFs before that. - */ - if (arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - - /* For A32, ARM_FEATURE_V5 is checked near the start of the uncond block. */ - if (s->thumb && (a->imm & 2)) { - return false; - } - gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | s->thumb); - store_cpu_field_constant(!s->thumb, thumb); - /* This jump is computed from an aligned PC: subtract off the low bits. */ - gen_jmp(s, jmp_diff(s, a->imm - (s->pc_curr & 3))); - return true; -} - -static bool trans_BL_BLX_prefix(DisasContext *s, arg_BL_BLX_prefix *a) -{ - assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); - gen_pc_plus_diff(s, cpu_R[14], jmp_diff(s, a->imm << 12)); - return true; -} - -static bool trans_BL_suffix(DisasContext *s, arg_BL_suffix *a) -{ - TCGv_i32 tmp = tcg_temp_new_i32(); - - assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); - tcg_gen_addi_i32(tmp, cpu_R[14], (a->imm << 1) | 1); - gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1); - gen_bx(s, tmp); - return true; -} - -static bool trans_BLX_suffix(DisasContext *s, arg_BLX_suffix *a) -{ - TCGv_i32 tmp; - - assert(!arm_dc_feature(s, ARM_FEATURE_THUMB2)); - if (!ENABLE_ARCH_5) { - return false; - } - tmp = tcg_temp_new_i32(); - tcg_gen_addi_i32(tmp, cpu_R[14], a->imm << 1); - tcg_gen_andi_i32(tmp, tmp, 0xfffffffc); - gen_pc_plus_diff(s, cpu_R[14], curr_insn_len(s) | 1); - gen_bx(s, tmp); - return true; -} - -static bool trans_BF(DisasContext *s, arg_BF *a) -{ - /* - * M-profile branch future insns. The architecture permits an - * implementation to implement these as NOPs (equivalent to - * discarding the LO_BRANCH_INFO cache immediately), and we - * take that IMPDEF option because for QEMU a "real" implementation - * would be complicated and wouldn't execute any faster. 
- */ - if (!dc_isar_feature(aa32_lob, s)) { - return false; - } - if (a->boff == 0) { - /* SEE "Related encodings" (loop insns) */ - return false; - } - /* Handle as NOP */ - return true; -} - -static bool trans_DLS(DisasContext *s, arg_DLS *a) -{ - /* M-profile low-overhead loop start */ - TCGv_i32 tmp; - - if (!dc_isar_feature(aa32_lob, s)) { - return false; - } - if (a->rn == 13 || a->rn == 15) { - /* - * For DLSTP rn == 15 is a related encoding (LCTP); the - * other cases caught by this condition are all - * CONSTRAINED UNPREDICTABLE: we choose to UNDEF - */ - return false; - } - - if (a->size != 4) { - /* DLSTP */ - if (!dc_isar_feature(aa32_mve, s)) { - return false; - } - if (!vfp_access_check(s)) { - return true; - } - } - - /* Not a while loop: set LR to the count, and set LTPSIZE for DLSTP */ - tmp = load_reg(s, a->rn); - store_reg(s, 14, tmp); - if (a->size != 4) { - /* DLSTP: set FPSCR.LTPSIZE */ - store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize); - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - } - return true; -} - -static bool trans_WLS(DisasContext *s, arg_WLS *a) -{ - /* M-profile low-overhead while-loop start */ - TCGv_i32 tmp; - DisasLabel nextlabel; - - if (!dc_isar_feature(aa32_lob, s)) { - return false; - } - if (a->rn == 13 || a->rn == 15) { - /* - * For WLSTP rn == 15 is a related encoding (LE); the - * other cases caught by this condition are all - * CONSTRAINED UNPREDICTABLE: we choose to UNDEF - */ - return false; - } - if (s->condexec_mask) { - /* - * WLS in an IT block is CONSTRAINED UNPREDICTABLE; - * we choose to UNDEF, because otherwise our use of - * gen_goto_tb(1) would clash with the use of TB exit 1 - * in the dc->condjmp condition-failed codepath in - * arm_tr_tb_stop() and we'd get an assertion. - */ - return false; - } - if (a->size != 4) { - /* WLSTP */ - if (!dc_isar_feature(aa32_mve, s)) { - return false; - } - /* - * We need to check that the FPU is enabled here, but mustn't - * call vfp_access_check() to do that because we don't want to - * do the lazy state preservation in the "loop count is zero" case. - * Do the check-and-raise-exception by hand. - */ - if (s->fp_excp_el) { - gen_exception_insn_el(s, 0, EXCP_NOCP, - syn_uncategorized(), s->fp_excp_el); - return true; - } - } - - nextlabel = gen_disas_label(s); - tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, nextlabel.label); - tmp = load_reg(s, a->rn); - store_reg(s, 14, tmp); - if (a->size != 4) { - /* - * WLSTP: set FPSCR.LTPSIZE. This requires that we do the - * lazy state preservation, new FP context creation, etc, - * that vfp_access_check() does. We know that the actual - * access check will succeed (ie it won't generate code that - * throws an exception) because we did that check by hand earlier. - */ - bool ok = vfp_access_check(s); - assert(ok); - store_cpu_field(tcg_constant_i32(a->size), v7m.ltpsize); - /* - * LTPSIZE updated, but MVE_NO_PRED will always be the same thing (0) - * when we take this upcoming exit from this TB, so gen_jmp_tb() is OK. - */ - } - gen_jmp_tb(s, curr_insn_len(s), 1); - - set_disas_label(s, nextlabel); - gen_jmp(s, jmp_diff(s, a->imm)); - return true; -} - -static bool trans_LE(DisasContext *s, arg_LE *a) -{ - /* - * M-profile low-overhead loop end. 
The architecture permits an - * implementation to discard the LO_BRANCH_INFO cache at any time, - * and we take the IMPDEF option to never set it in the first place - * (equivalent to always discarding it immediately), because for QEMU - * a "real" implementation would be complicated and wouldn't execute - * any faster. - */ - TCGv_i32 tmp; - DisasLabel loopend; - bool fpu_active; - - if (!dc_isar_feature(aa32_lob, s)) { - return false; - } - if (a->f && a->tp) { - return false; - } - if (s->condexec_mask) { - /* - * LE in an IT block is CONSTRAINED UNPREDICTABLE; - * we choose to UNDEF, because otherwise our use of - * gen_goto_tb(1) would clash with the use of TB exit 1 - * in the dc->condjmp condition-failed codepath in - * arm_tr_tb_stop() and we'd get an assertion. - */ - return false; - } - if (a->tp) { - /* LETP */ - if (!dc_isar_feature(aa32_mve, s)) { - return false; - } - if (!vfp_access_check(s)) { - s->eci_handled = true; - return true; - } - } - - /* LE/LETP is OK with ECI set and leaves it untouched */ - s->eci_handled = true; - - /* - * With MVE, LTPSIZE might not be 4, and we must emit an INVSTATE - * UsageFault exception for the LE insn in that case. Note that we - * are not directly checking FPSCR.LTPSIZE but instead check the - * pseudocode LTPSIZE() function, which returns 4 if the FPU is - * not currently active (ie ActiveFPState() returns false). We - * can identify not-active purely from our TB state flags, as the - * FPU is active only if: - * the FPU is enabled - * AND lazy state preservation is not active - * AND we do not need a new fp context (this is the ASPEN/FPCA check) - * - * Usually we don't need to care about this distinction between - * LTPSIZE and FPSCR.LTPSIZE, because the code in vfp_access_check() - * will either take an exception or clear the conditions that make - * the FPU not active. But LE is an unusual case of a non-FP insn - * that looks at LTPSIZE. - */ - fpu_active = !s->fp_excp_el && !s->v7m_lspact && !s->v7m_new_fp_ctxt_needed; - - if (!a->tp && dc_isar_feature(aa32_mve, s) && fpu_active) { - /* Need to do a runtime check for LTPSIZE != 4 */ - DisasLabel skipexc = gen_disas_label(s); - tmp = load_cpu_field(v7m.ltpsize); - tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 4, skipexc.label); - tcg_temp_free_i32(tmp); - gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); - set_disas_label(s, skipexc); - } - - if (a->f) { - /* Loop-forever: just jump back to the loop start */ - gen_jmp(s, jmp_diff(s, -a->imm)); - return true; - } - - /* - * Not loop-forever. If LR <= loop-decrement-value this is the last loop. - * For LE, we know at this point that LTPSIZE must be 4 and the - * loop decrement value is 1. For LETP we need to calculate the decrement - * value from LTPSIZE. - */ - loopend = gen_disas_label(s); - if (!a->tp) { - tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, loopend.label); - tcg_gen_addi_i32(cpu_R[14], cpu_R[14], -1); - } else { - /* - * Decrement by 1 << (4 - LTPSIZE). We need to use a TCG local - * so that decr stays live after the brcondi. 
- */ - TCGv_i32 decr = tcg_temp_local_new_i32(); - TCGv_i32 ltpsize = load_cpu_field(v7m.ltpsize); - tcg_gen_sub_i32(decr, tcg_constant_i32(4), ltpsize); - tcg_gen_shl_i32(decr, tcg_constant_i32(1), decr); - tcg_temp_free_i32(ltpsize); - - tcg_gen_brcond_i32(TCG_COND_LEU, cpu_R[14], decr, loopend.label); - - tcg_gen_sub_i32(cpu_R[14], cpu_R[14], decr); - tcg_temp_free_i32(decr); - } - /* Jump back to the loop start */ - gen_jmp(s, jmp_diff(s, -a->imm)); - - set_disas_label(s, loopend); - if (a->tp) { - /* Exits from tail-pred loops must reset LTPSIZE to 4 */ - store_cpu_field(tcg_constant_i32(4), v7m.ltpsize); - } - /* End TB, continuing to following insn */ - gen_jmp_tb(s, curr_insn_len(s), 1); - return true; -} - -static bool trans_LCTP(DisasContext *s, arg_LCTP *a) -{ - /* - * M-profile Loop Clear with Tail Predication. Since our implementation - * doesn't cache branch information, all we need to do is reset - * FPSCR.LTPSIZE to 4. - */ - - if (!dc_isar_feature(aa32_lob, s) || - !dc_isar_feature(aa32_mve, s)) { - return false; - } - - if (!vfp_access_check(s)) { - return true; - } - - store_cpu_field_constant(4, v7m.ltpsize); - return true; -} - -static bool trans_VCTP(DisasContext *s, arg_VCTP *a) -{ - /* - * M-profile Create Vector Tail Predicate. This insn is itself - * predicated and is subject to beatwise execution. - */ - TCGv_i32 rn_shifted, masklen; - - if (!dc_isar_feature(aa32_mve, s) || a->rn == 13 || a->rn == 15) { - return false; - } - - if (!mve_eci_check(s) || !vfp_access_check(s)) { - return true; - } - - /* - * We pre-calculate the mask length here to avoid having - * to have multiple helpers specialized for size. - * We pass the helper "rn <= (1 << (4 - size)) ? (rn << size) : 16". - */ - rn_shifted = tcg_temp_new_i32(); - masklen = load_reg(s, a->rn); - tcg_gen_shli_i32(rn_shifted, masklen, a->size); - tcg_gen_movcond_i32(TCG_COND_LEU, masklen, - masklen, tcg_constant_i32(1 << (4 - a->size)), - rn_shifted, tcg_constant_i32(16)); - gen_helper_mve_vctp(cpu_env, masklen); - tcg_temp_free_i32(masklen); - tcg_temp_free_i32(rn_shifted); - /* This insn updates predication bits */ - s->base.is_jmp = DISAS_UPDATE_NOCHAIN; - mve_update_eci(s); - return true; -} - -static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half) -{ - TCGv_i32 addr, tmp; - - tmp = load_reg(s, a->rm); - if (half) { - tcg_gen_add_i32(tmp, tmp, tmp); - } - addr = load_reg(s, a->rn); - tcg_gen_add_i32(addr, addr, tmp); - - gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), half ? MO_UW : MO_UB); - - tcg_gen_add_i32(tmp, tmp, tmp); - gen_pc_plus_diff(s, addr, jmp_diff(s, 0)); - tcg_gen_add_i32(tmp, tmp, addr); - tcg_temp_free_i32(addr); - store_reg(s, 15, tmp); - return true; -} - -static bool trans_TBB(DisasContext *s, arg_tbranch *a) -{ - return op_tbranch(s, a, false); -} - -static bool trans_TBH(DisasContext *s, arg_tbranch *a) -{ - return op_tbranch(s, a, true); -} - -static bool trans_CBZ(DisasContext *s, arg_CBZ *a) -{ - TCGv_i32 tmp = load_reg(s, a->rn); - - arm_gen_condlabel(s); - tcg_gen_brcondi_i32(a->nz ? TCG_COND_EQ : TCG_COND_NE, - tmp, 0, s->condlabel.label); - tcg_temp_free_i32(tmp); - gen_jmp(s, jmp_diff(s, a->imm)); - return true; -} - -/* - * Supervisor call - both T32 & A32 come here so we need to check - * which mode we are in when checking for semihosting. - */ - -static bool trans_SVC(DisasContext *s, arg_SVC *a) -{ - const uint32_t semihost_imm = s->thumb ? 
0xab : 0x123456; - - if (!arm_dc_feature(s, ARM_FEATURE_M) && - semihosting_enabled(s->current_el == 0) && - (a->imm == semihost_imm)) { - gen_exception_internal_insn(s, EXCP_SEMIHOST); - } else { - if (s->fgt_svc) { - uint32_t syndrome = syn_aa32_svc(a->imm, s->thumb); - gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); - } else { - gen_update_pc(s, curr_insn_len(s)); - s->svc_imm = a->imm; - s->base.is_jmp = DISAS_SWI; - } - } - return true; -} - -/* - * Unconditional system instructions - */ - -static bool trans_RFE(DisasContext *s, arg_RFE *a) -{ - static const int8_t pre_offset[4] = { - /* DA */ -4, /* IA */ 0, /* DB */ -8, /* IB */ 4 - }; - static const int8_t post_offset[4] = { - /* DA */ -8, /* IA */ 4, /* DB */ -4, /* IB */ 0 - }; - TCGv_i32 addr, t1, t2; - - if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - if (IS_USER(s)) { - unallocated_encoding(s); - return true; - } - - addr = load_reg(s, a->rn); - tcg_gen_addi_i32(addr, addr, pre_offset[a->pu]); - - /* Load PC into tmp and CPSR into tmp2. */ - t1 = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, t1, addr, get_mem_index(s), MO_UL | MO_ALIGN); - tcg_gen_addi_i32(addr, addr, 4); - t2 = tcg_temp_new_i32(); - gen_aa32_ld_i32(s, t2, addr, get_mem_index(s), MO_UL | MO_ALIGN); - - if (a->w) { - /* Base writeback. */ - tcg_gen_addi_i32(addr, addr, post_offset[a->pu]); - store_reg(s, a->rn, addr); - } else { - tcg_temp_free_i32(addr); - } - gen_rfe(s, t1, t2); - return true; -} - -static bool trans_SRS(DisasContext *s, arg_SRS *a) -{ - if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - gen_srs(s, a->mode, a->pu, a->w); - return true; -} - -static bool trans_CPS(DisasContext *s, arg_CPS *a) -{ - uint32_t mask, val; - - if (!ENABLE_ARCH_6 || arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - if (IS_USER(s)) { - /* Implemented as NOP in user mode. */ - return true; - } - /* TODO: There are quite a lot of UNPREDICTABLE argument combinations. */ - - mask = val = 0; - if (a->imod & 2) { - if (a->A) { - mask |= CPSR_A; - } - if (a->I) { - mask |= CPSR_I; - } - if (a->F) { - mask |= CPSR_F; - } - if (a->imod & 1) { - val |= mask; - } - } - if (a->M) { - mask |= CPSR_M; - val |= a->mode; - } - if (mask) { - gen_set_psr_im(s, mask, 0, val); - } - return true; -} - -static bool trans_CPS_v7m(DisasContext *s, arg_CPS_v7m *a) -{ - TCGv_i32 tmp, addr; - - if (!arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - if (IS_USER(s)) { - /* Implemented as NOP in user mode. */ - return true; - } - - tmp = tcg_constant_i32(a->im); - /* FAULTMASK */ - if (a->F) { - addr = tcg_constant_i32(19); - gen_helper_v7m_msr(cpu_env, addr, tmp); - } - /* PRIMASK */ - if (a->I) { - addr = tcg_constant_i32(16); - gen_helper_v7m_msr(cpu_env, addr, tmp); - } - gen_rebuild_hflags(s, false); - gen_lookup_tb(s); - return true; -} - -/* - * Clear-Exclusive, Barriers - */ - -static bool trans_CLREX(DisasContext *s, arg_CLREX *a) -{ - if (s->thumb - ? 
!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M) - : !ENABLE_ARCH_6K) { - return false; - } - gen_clrex(s); - return true; -} - -static bool trans_DSB(DisasContext *s, arg_DSB *a) -{ - if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); - return true; -} - -static bool trans_DMB(DisasContext *s, arg_DMB *a) -{ - return trans_DSB(s, NULL); -} - -static bool trans_ISB(DisasContext *s, arg_ISB *a) -{ - if (!ENABLE_ARCH_7 && !arm_dc_feature(s, ARM_FEATURE_M)) { - return false; - } - /* - * We need to break the TB after this insn to execute - * self-modifying code correctly and also to take - * any pending interrupts immediately. - */ - s->base.is_jmp = DISAS_TOO_MANY; - return true; -} - -static bool trans_SB(DisasContext *s, arg_SB *a) -{ - if (!dc_isar_feature(aa32_sb, s)) { - return false; - } - /* - * TODO: There is no speculation barrier opcode - * for TCG; MB and end the TB instead. - */ - tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC); - s->base.is_jmp = DISAS_TOO_MANY; - return true; -} - -static bool trans_SETEND(DisasContext *s, arg_SETEND *a) -{ - if (!ENABLE_ARCH_6) { - return false; - } - if (a->E != (s->be_data == MO_BE)) { - gen_helper_setend(cpu_env); - s->base.is_jmp = DISAS_UPDATE_EXIT; - } - return true; -} - -/* - * Preload instructions - * All are nops, contingent on the appropriate arch level. - */ - -static bool trans_PLD(DisasContext *s, arg_PLD *a) -{ - return ENABLE_ARCH_5TE; -} - -static bool trans_PLDW(DisasContext *s, arg_PLD *a) -{ - return arm_dc_feature(s, ARM_FEATURE_V7MP); -} - -static bool trans_PLI(DisasContext *s, arg_PLD *a) -{ - return ENABLE_ARCH_7; -} - -/* - * If-then - */ - -static bool trans_IT(DisasContext *s, arg_IT *a) -{ - int cond_mask = a->cond_mask; - - /* - * No actual code generated for this insn, just setup state. - * - * Combinations of firstcond and mask which set up an 0b1111 - * condition are UNPREDICTABLE; we take the CONSTRAINED - * UNPREDICTABLE choice to treat 0b1111 the same as 0b1110, - * i.e. both meaning "execute always". - */ - s->condexec_cond = (cond_mask >> 4) & 0xe; - s->condexec_mask = cond_mask & 0x1f; - return true; -} - -/* v8.1M CSEL/CSINC/CSNEG/CSINV */ -static bool trans_CSEL(DisasContext *s, arg_CSEL *a) -{ - TCGv_i32 rn, rm, zero; - DisasCompare c; - - if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) { - return false; - } - - if (a->rm == 13) { - /* SEE "Related encodings" (MVE shifts) */ - return false; - } - - if (a->rd == 13 || a->rd == 15 || a->rn == 13 || a->fcond >= 14) { - /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */ - return false; - } - - /* In this insn input reg fields of 0b1111 mean "zero", not "PC" */ - zero = tcg_constant_i32(0); - if (a->rn == 15) { - rn = zero; - } else { - rn = load_reg(s, a->rn); - } - if (a->rm == 15) { - rm = zero; - } else { - rm = load_reg(s, a->rm); - } - - switch (a->op) { - case 0: /* CSEL */ - break; - case 1: /* CSINC */ - tcg_gen_addi_i32(rm, rm, 1); - break; - case 2: /* CSINV */ - tcg_gen_not_i32(rm, rm); - break; - case 3: /* CSNEG */ - tcg_gen_neg_i32(rm, rm); - break; - default: - g_assert_not_reached(); - } - - arm_test_cc(&c, a->fcond); - tcg_gen_movcond_i32(c.cond, rn, c.value, zero, rn, rm); - arm_free_cc(&c); - - store_reg(s, a->rd, rn); - tcg_temp_free_i32(rm); - - return true; -} - -/* - * Legacy decoder. 
- */ - -static void disas_arm_insn(DisasContext *s, unsigned int insn) -{ - unsigned int cond = insn >> 28; - - /* M variants do not implement ARM mode; this must raise the INVSTATE - * UsageFault exception. - */ - if (arm_dc_feature(s, ARM_FEATURE_M)) { - gen_exception_insn(s, 0, EXCP_INVSTATE, syn_uncategorized()); - return; - } - - if (s->pstate_il) { - /* - * Illegal execution state. This has priority over BTI - * exceptions, but comes after instruction abort exceptions. - */ - gen_exception_insn(s, 0, EXCP_UDEF, syn_illegalstate()); - return; - } - - if (cond == 0xf) { - /* In ARMv3 and v4 the NV condition is UNPREDICTABLE; we - * choose to UNDEF. In ARMv5 and above the space is used - * for miscellaneous unconditional instructions. - */ - if (!arm_dc_feature(s, ARM_FEATURE_V5)) { - unallocated_encoding(s); - return; - } - - /* Unconditional instructions. */ - /* TODO: Perhaps merge these into one decodetree output file. */ - if (disas_a32_uncond(s, insn) || - disas_vfp_uncond(s, insn) || - disas_neon_dp(s, insn) || - disas_neon_ls(s, insn) || - disas_neon_shared(s, insn)) { - return; - } - /* fall back to legacy decoder */ - - if ((insn & 0x0e000f00) == 0x0c000100) { - if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { - /* iWMMXt register transfer. */ - if (extract32(s->c15_cpar, 1, 1)) { - if (!disas_iwmmxt_insn(s, insn)) { - return; - } - } - } - } - goto illegal_op; - } - if (cond != 0xe) { - /* if not always execute, we generate a conditional jump to - next instruction */ - arm_skip_unless(s, cond); - } - - /* TODO: Perhaps merge these into one decodetree output file. */ - if (disas_a32(s, insn) || - disas_vfp(s, insn)) { - return; - } - /* fall back to legacy decoder */ - /* TODO: convert xscale/iwmmxt decoder to decodetree ?? */ - if (arm_dc_feature(s, ARM_FEATURE_XSCALE)) { - if (((insn & 0x0c000e00) == 0x0c000000) - && ((insn & 0x03000000) != 0x03000000)) { - /* Coprocessor insn, coprocessor 0 or 1 */ - disas_xscale_insn(s, insn); - return; - } - } - -illegal_op: - unallocated_encoding(s); -} - -static bool thumb_insn_is_16bit(DisasContext *s, uint32_t pc, uint32_t insn) -{ - /* - * Return true if this is a 16 bit instruction. We must be precise - * about this (matching the decode). - */ - if ((insn >> 11) < 0x1d) { - /* Definitely a 16-bit instruction */ - return true; - } - - /* Top five bits 0b11101 / 0b11110 / 0b11111 : this is the - * first half of a 32-bit Thumb insn. Thumb-1 cores might - * end up actually treating this as two 16-bit insns, though, - * if it's half of a bl/blx pair that might span a page boundary. - */ - if (arm_dc_feature(s, ARM_FEATURE_THUMB2) || - arm_dc_feature(s, ARM_FEATURE_M)) { - /* Thumb2 cores (including all M profile ones) always treat - * 32-bit insns as 32-bit. - */ - return false; - } - - if ((insn >> 11) == 0x1e && pc - s->page_start < TARGET_PAGE_SIZE - 3) { - /* 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix, and the suffix - * is not on the next page; we merge this into a 32-bit - * insn. - */ - return false; - } - /* 0b1110_1xxx_xxxx_xxxx : BLX suffix (or UNDEF); - * 0b1111_1xxx_xxxx_xxxx : BL suffix; - * 0b1111_0xxx_xxxx_xxxx : BL/BLX prefix on the end of a page - * -- handle as single 16 bit insn - */ - return true; -} - -/* Translate a 32-bit thumb instruction. */ -static void disas_thumb2_insn(DisasContext *s, uint32_t insn) -{ - /* - * ARMv6-M supports a limited subset of Thumb2 instructions. - * Other Thumb1 architectures allow only 32-bit - * combined BL/BLX prefix and suffix. 
- */ - if (arm_dc_feature(s, ARM_FEATURE_M) && - !arm_dc_feature(s, ARM_FEATURE_V7)) { - int i; - bool found = false; - static const uint32_t armv6m_insn[] = {0xf3808000 /* msr */, - 0xf3b08040 /* dsb */, - 0xf3b08050 /* dmb */, - 0xf3b08060 /* isb */, - 0xf3e08000 /* mrs */, - 0xf000d000 /* bl */}; - static const uint32_t armv6m_mask[] = {0xffe0d000, - 0xfff0d0f0, - 0xfff0d0f0, - 0xfff0d0f0, - 0xffe0d000, - 0xf800d000}; - - for (i = 0; i < ARRAY_SIZE(armv6m_insn); i++) { - if ((insn & armv6m_mask[i]) == armv6m_insn[i]) { - found = true; - break; - } - } - if (!found) { - goto illegal_op; - } - } else if ((insn & 0xf800e800) != 0xf000e800) { - if (!arm_dc_feature(s, ARM_FEATURE_THUMB2)) { - unallocated_encoding(s); - return; - } - } - - if (arm_dc_feature(s, ARM_FEATURE_M)) { - /* - * NOCP takes precedence over any UNDEF for (almost) the - * entire wide range of coprocessor-space encodings, so check - * for it first before proceeding to actually decode eg VFP - * insns. This decode also handles the few insns which are - * in copro space but do not have NOCP checks (eg VLLDM, VLSTM). - */ - if (disas_m_nocp(s, insn)) { - return; - } - } - - if ((insn & 0xef000000) == 0xef000000) { - /* - * T32 encodings 0b111p_1111_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq - * transform into - * A32 encodings 0b1111_001p_qqqq_qqqq_qqqq_qqqq_qqqq_qqqq - */ - uint32_t a32_insn = (insn & 0xe2ffffff) | - ((insn & (1 << 28)) >> 4) | (1 << 28); - - if (disas_neon_dp(s, a32_insn)) { - return; - } - } - - if ((insn & 0xff100000) == 0xf9000000) { - /* - * T32 encodings 0b1111_1001_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq - * transform into - * A32 encodings 0b1111_0100_ppp0_qqqq_qqqq_qqqq_qqqq_qqqq - */ - uint32_t a32_insn = (insn & 0x00ffffff) | 0xf4000000; - - if (disas_neon_ls(s, a32_insn)) { - return; - } - } - - /* - * TODO: Perhaps merge these into one decodetree output file. - * Note disas_vfp is written for a32 with cond field in the - * top nibble. The t32 encoding requires 0xe in the top nibble. - */ - if (disas_t32(s, insn) || - disas_vfp_uncond(s, insn) || - disas_neon_shared(s, insn) || - disas_mve(s, insn) || - ((insn >> 28) == 0xe && disas_vfp(s, insn))) { - return; - } - -illegal_op: - unallocated_encoding(s); -} - -static void disas_thumb_insn(DisasContext *s, uint32_t insn) -{ - if (!disas_t16(s, insn)) { - unallocated_encoding(s); - } -} - -static bool insn_crosses_page(CPUARMState *env, DisasContext *s) -{ - /* Return true if the insn at dc->base.pc_next might cross a page boundary. - * (False positives are OK, false negatives are not.) - * We know this is a Thumb insn, and our caller ensures we are - * only called if dc->base.pc_next is less than 4 bytes from the page - * boundary, so we cross the page if the first 16 bits indicate - * that this is a 32 bit insn. - */ - uint16_t insn = arm_lduw_code(env, &s->base, s->base.pc_next, s->sctlr_b); - - return !thumb_insn_is_16bit(s, s->base.pc_next, insn); -} - -static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - CPUARMState *env = cs->env_ptr; - ARMCPU *cpu = env_archcpu(env); - CPUARMTBFlags tb_flags = arm_tbflags_from_tb(dc->base.tb); - uint32_t condexec, core_mmu_idx; - - dc->isar = &cpu->isar; - dc->condjmp = 0; - dc->pc_save = dc->base.pc_first; - dc->aarch64 = false; - dc->thumb = EX_TBFLAG_AM32(tb_flags, THUMB); - dc->be_data = EX_TBFLAG_ANY(tb_flags, BE_DATA) ? 
MO_BE : MO_LE; - condexec = EX_TBFLAG_AM32(tb_flags, CONDEXEC); - /* - * the CONDEXEC TB flags are CPSR bits [15:10][26:25]. On A-profile this - * is always the IT bits. On M-profile, some of the reserved encodings - * of IT are used instead to indicate either ICI or ECI, which - * indicate partial progress of a restartable insn that was interrupted - * partway through by an exception: - * * if CONDEXEC[3:0] != 0b0000 : CONDEXEC is IT bits - * * if CONDEXEC[3:0] == 0b0000 : CONDEXEC is ICI or ECI bits - * In all cases CONDEXEC == 0 means "not in IT block or restartable - * insn, behave normally". - */ - dc->eci = dc->condexec_mask = dc->condexec_cond = 0; - dc->eci_handled = false; - if (condexec & 0xf) { - dc->condexec_mask = (condexec & 0xf) << 1; - dc->condexec_cond = condexec >> 4; - } else { - if (arm_feature(env, ARM_FEATURE_M)) { - dc->eci = condexec >> 4; - } - } - - core_mmu_idx = EX_TBFLAG_ANY(tb_flags, MMUIDX); - dc->mmu_idx = core_to_arm_mmu_idx(env, core_mmu_idx); - dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); -#if !defined(CONFIG_USER_ONLY) - dc->user = (dc->current_el == 0); -#endif - dc->fp_excp_el = EX_TBFLAG_ANY(tb_flags, FPEXC_EL); - dc->align_mem = EX_TBFLAG_ANY(tb_flags, ALIGN_MEM); - dc->pstate_il = EX_TBFLAG_ANY(tb_flags, PSTATE__IL); - dc->fgt_active = EX_TBFLAG_ANY(tb_flags, FGT_ACTIVE); - dc->fgt_svc = EX_TBFLAG_ANY(tb_flags, FGT_SVC); - - if (arm_feature(env, ARM_FEATURE_M)) { - dc->vfp_enabled = 1; - dc->be_data = MO_TE; - dc->v7m_handler_mode = EX_TBFLAG_M32(tb_flags, HANDLER); - dc->v8m_secure = EX_TBFLAG_M32(tb_flags, SECURE); - dc->v8m_stackcheck = EX_TBFLAG_M32(tb_flags, STACKCHECK); - dc->v8m_fpccr_s_wrong = EX_TBFLAG_M32(tb_flags, FPCCR_S_WRONG); - dc->v7m_new_fp_ctxt_needed = - EX_TBFLAG_M32(tb_flags, NEW_FP_CTXT_NEEDED); - dc->v7m_lspact = EX_TBFLAG_M32(tb_flags, LSPACT); - dc->mve_no_pred = EX_TBFLAG_M32(tb_flags, MVE_NO_PRED); - } else { - dc->sctlr_b = EX_TBFLAG_A32(tb_flags, SCTLR__B); - dc->hstr_active = EX_TBFLAG_A32(tb_flags, HSTR_ACTIVE); - dc->ns = EX_TBFLAG_A32(tb_flags, NS); - dc->vfp_enabled = EX_TBFLAG_A32(tb_flags, VFPEN); - if (arm_feature(env, ARM_FEATURE_XSCALE)) { - dc->c15_cpar = EX_TBFLAG_A32(tb_flags, XSCALE_CPAR); - } else { - dc->vec_len = EX_TBFLAG_A32(tb_flags, VECLEN); - dc->vec_stride = EX_TBFLAG_A32(tb_flags, VECSTRIDE); - } - dc->sme_trap_nonstreaming = - EX_TBFLAG_A32(tb_flags, SME_TRAP_NONSTREAMING); - } - dc->cp_regs = cpu->cp_regs; - dc->features = env->features; - - /* Single step state. The code-generation logic here is: - * SS_ACTIVE == 0: - * generate code with no special handling for single-stepping (except - * that anything that can make us go to SS_ACTIVE == 1 must end the TB; - * this happens anyway because those changes are all system register or - * PSTATE writes). - * SS_ACTIVE == 1, PSTATE.SS == 1: (active-not-pending) - * emit code for one insn - * emit code to clear PSTATE.SS - * emit code to generate software step exception for completed step - * end TB (as usual for having generated an exception) - * SS_ACTIVE == 1, PSTATE.SS == 0: (active-pending) - * emit code to generate a software step exception - * end the TB - */ - dc->ss_active = EX_TBFLAG_ANY(tb_flags, SS_ACTIVE); - dc->pstate_ss = EX_TBFLAG_ANY(tb_flags, PSTATE__SS); - dc->is_ldex = false; - - dc->page_start = dc->base.pc_first & TARGET_PAGE_MASK; - - /* If architectural single step active, limit to 1. */ - if (dc->ss_active) { - dc->base.max_insns = 1; - } - - /* ARM is a fixed-length ISA. 
Bound the number of insns to execute - to those left on the page. */ - if (!dc->thumb) { - int bound = -(dc->base.pc_first | TARGET_PAGE_MASK) / 4; - dc->base.max_insns = MIN(dc->base.max_insns, bound); - } - - cpu_V0 = tcg_temp_new_i64(); - cpu_V1 = tcg_temp_new_i64(); - cpu_M0 = tcg_temp_new_i64(); -} - -static void arm_tr_tb_start(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - - /* A note on handling of the condexec (IT) bits: - * - * We want to avoid the overhead of having to write the updated condexec - * bits back to the CPUARMState for every instruction in an IT block. So: - * (1) if the condexec bits are not already zero then we write - * zero back into the CPUARMState now. This avoids complications trying - * to do it at the end of the block. (For example if we don't do this - * it's hard to identify whether we can safely skip writing condexec - * at the end of the TB, which we definitely want to do for the case - * where a TB doesn't do anything with the IT state at all.) - * (2) if we are going to leave the TB then we call gen_set_condexec() - * which will write the correct value into CPUARMState if zero is wrong. - * This is done both for leaving the TB at the end, and for leaving - * it because of an exception we know will happen, which is done in - * gen_exception_insn(). The latter is necessary because we need to - * leave the TB with the PC/IT state just prior to execution of the - * instruction which caused the exception. - * (3) if we leave the TB unexpectedly (eg a data abort on a load) - * then the CPUARMState will be wrong and we need to reset it. - * This is handled in the same way as restoration of the - * PC in these situations; we save the value of the condexec bits - * for each PC via tcg_gen_insn_start(), and restore_state_to_opc() - * then uses this to restore them after an exception. - * - * Note that there are no instructions which can read the condexec - * bits, and none which can write non-static values to them, so - * we don't need to care about whether CPUARMState is correct in the - * middle of a TB. - */ - - /* Reset the conditional execution bits immediately. This avoids - complications trying to do it at the end of the block. */ - if (dc->condexec_mask || dc->condexec_cond) { - store_cpu_field_constant(0, condexec_bits); - } -} - -static void arm_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - /* - * The ECI/ICI bits share PSR bits with the IT bits, so we - * need to reconstitute the bits from the split-out DisasContext - * fields here. - */ - uint32_t condexec_bits; - target_ulong pc_arg = dc->base.pc_next; - - if (TARGET_TB_PCREL) { - pc_arg &= ~TARGET_PAGE_MASK; - } - if (dc->eci) { - condexec_bits = dc->eci << 4; - } else { - condexec_bits = (dc->condexec_cond << 4) | (dc->condexec_mask >> 1); - } - tcg_gen_insn_start(pc_arg, condexec_bits, 0); - dc->insn_start = tcg_last_op(); -} - -static bool arm_check_kernelpage(DisasContext *dc) -{ -#ifdef CONFIG_USER_ONLY - /* Intercept jump to the magic kernel page. */ - if (dc->base.pc_next >= 0xffff0000) { - /* We always get here via a jump, so know we are not in a - conditional execution block. */ - gen_exception_internal(EXCP_KERNEL_TRAP); - dc->base.is_jmp = DISAS_NORETURN; - return true; - } -#endif - return false; -} - -static bool arm_check_ss_active(DisasContext *dc) -{ - if (dc->ss_active && !dc->pstate_ss) { - /* Singlestep state is Active-pending. 
- * If we're in this state at the start of a TB then either - * a) we just took an exception to an EL which is being debugged - * and this is the first insn in the exception handler - * b) debug exceptions were masked and we just unmasked them - * without changing EL (eg by clearing PSTATE.D) - * In either case we're going to take a swstep exception in the - * "did not step an insn" case, and so the syndrome ISV and EX - * bits should be zero. - */ - assert(dc->base.num_insns == 1); - gen_swstep_exception(dc, 0, 0); - dc->base.is_jmp = DISAS_NORETURN; - return true; - } - - return false; -} - -static void arm_post_translate_insn(DisasContext *dc) -{ - if (dc->condjmp && dc->base.is_jmp == DISAS_NEXT) { - if (dc->pc_save != dc->condlabel.pc_save) { - gen_update_pc(dc, dc->condlabel.pc_save - dc->pc_save); - } - gen_set_label(dc->condlabel.label); - dc->condjmp = 0; - } - translator_loop_temp_check(&dc->base); -} - -static void arm_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - CPUARMState *env = cpu->env_ptr; - uint32_t pc = dc->base.pc_next; - unsigned int insn; - - /* Singlestep exceptions have the highest priority. */ - if (arm_check_ss_active(dc)) { - dc->base.pc_next = pc + 4; - return; - } - - if (pc & 3) { - /* - * PC alignment fault. This has priority over the instruction abort - * that we would receive from a translation fault via arm_ldl_code - * (or the execution of the kernelpage entrypoint). This should only - * be possible after an indirect branch, at the start of the TB. - */ - assert(dc->base.num_insns == 1); - gen_helper_exception_pc_alignment(cpu_env, tcg_constant_tl(pc)); - dc->base.is_jmp = DISAS_NORETURN; - dc->base.pc_next = QEMU_ALIGN_UP(pc, 4); - return; - } - - if (arm_check_kernelpage(dc)) { - dc->base.pc_next = pc + 4; - return; - } - - dc->pc_curr = pc; - insn = arm_ldl_code(env, &dc->base, pc, dc->sctlr_b); - dc->insn = insn; - dc->base.pc_next = pc + 4; - disas_arm_insn(dc, insn); - - arm_post_translate_insn(dc); - - /* ARM is a fixed-length ISA. We performed the cross-page check - in init_disas_context by adjusting max_insns. */ -} - -static bool thumb_insn_is_unconditional(DisasContext *s, uint32_t insn) -{ - /* Return true if this Thumb insn is always unconditional, - * even inside an IT block. This is true of only a very few - * instructions: BKPT, HLT, and SG. - * - * A larger class of instructions are UNPREDICTABLE if used - * inside an IT block; we do not need to detect those here, because - * what we do by default (perform the cc check and update the IT - * bits state machine) is a permitted CONSTRAINED UNPREDICTABLE - * choice for those situations. - * - * insn is either a 16-bit or a 32-bit instruction; the two are - * distinguishable because for the 16-bit case the top 16 bits - * are zeroes, and that isn't a valid 32-bit encoding. - */ - if ((insn & 0xffffff00) == 0xbe00) { - /* BKPT */ - return true; - } - - if ((insn & 0xffffffc0) == 0xba80 && arm_dc_feature(s, ARM_FEATURE_V8) && - !arm_dc_feature(s, ARM_FEATURE_M)) { - /* HLT: v8A only. This is unconditional even when it is going to - * UNDEF; see the v8A ARM ARM DDI0487B.a H3.3. - * For v7 cores this was a plain old undefined encoding and so - * honours its cc check. 
(We might be using the encoding as - * a semihosting trap, but we don't change the cc check behaviour - * on that account, because a debugger connected to a real v7A - * core and emulating semihosting traps by catching the UNDEF - * exception would also only see cases where the cc check passed. - * No guest code should be trying to do a HLT semihosting trap - * in an IT block anyway. - */ - return true; - } - - if (insn == 0xe97fe97f && arm_dc_feature(s, ARM_FEATURE_V8) && - arm_dc_feature(s, ARM_FEATURE_M)) { - /* SG: v8M only */ - return true; - } - - return false; -} - -static void thumb_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - CPUARMState *env = cpu->env_ptr; - uint32_t pc = dc->base.pc_next; - uint32_t insn; - bool is_16bit; - /* TCG op to rewind to if this turns out to be an invalid ECI state */ - TCGOp *insn_eci_rewind = NULL; - target_ulong insn_eci_pc_save = -1; - - /* Misaligned thumb PC is architecturally impossible. */ - assert((dc->base.pc_next & 1) == 0); - - if (arm_check_ss_active(dc) || arm_check_kernelpage(dc)) { - dc->base.pc_next = pc + 2; - return; - } - - dc->pc_curr = pc; - insn = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b); - is_16bit = thumb_insn_is_16bit(dc, dc->base.pc_next, insn); - pc += 2; - if (!is_16bit) { - uint32_t insn2 = arm_lduw_code(env, &dc->base, pc, dc->sctlr_b); - insn = insn << 16 | insn2; - pc += 2; - } - dc->base.pc_next = pc; - dc->insn = insn; - - if (dc->pstate_il) { - /* - * Illegal execution state. This has priority over BTI - * exceptions, but comes after instruction abort exceptions. - */ - gen_exception_insn(dc, 0, EXCP_UDEF, syn_illegalstate()); - return; - } - - if (dc->eci) { - /* - * For M-profile continuable instructions, ECI/ICI handling - * falls into these cases: - * - interrupt-continuable instructions - * These are the various load/store multiple insns (both - * integer and fp). The ICI bits indicate the register - * where the load/store can resume. We make the IMPDEF - * choice to always do "instruction restart", ie ignore - * the ICI value and always execute the ldm/stm from the - * start. So all we need to do is zero PSR.ICI if the - * insn executes. - * - MVE instructions subject to beat-wise execution - * Here the ECI bits indicate which beats have already been - * executed, and we must honour this. Each insn of this - * type will handle it correctly. We will update PSR.ECI - * in the helper function for the insn (some ECI values - * mean that the following insn also has been partially - * executed). - * - Special cases which don't advance ECI - * The insns LE, LETP and BKPT leave the ECI/ICI state - * bits untouched. - * - all other insns (the common case) - * Non-zero ECI/ICI means an INVSTATE UsageFault. - * We place a rewind-marker here. Insns in the previous - * three categories will set a flag in the DisasContext. - * If the flag isn't set after we call disas_thumb_insn() - * or disas_thumb2_insn() then we know we have a "some other - * insn" case. We will rewind to the marker (ie throwing away - * all the generated code) and instead emit "take exception". - */ - insn_eci_rewind = tcg_last_op(); - insn_eci_pc_save = dc->pc_save; - } - - if (dc->condexec_mask && !thumb_insn_is_unconditional(dc, insn)) { - uint32_t cond = dc->condexec_cond; - - /* - * Conditionally skip the insn. Note that both 0xe and 0xf mean - * "always"; 0xf is not "never". 
- */ - if (cond < 0x0e) { - arm_skip_unless(dc, cond); - } - } - - if (is_16bit) { - disas_thumb_insn(dc, insn); - } else { - disas_thumb2_insn(dc, insn); - } - - /* Advance the Thumb condexec condition. */ - if (dc->condexec_mask) { - dc->condexec_cond = ((dc->condexec_cond & 0xe) | - ((dc->condexec_mask >> 4) & 1)); - dc->condexec_mask = (dc->condexec_mask << 1) & 0x1f; - if (dc->condexec_mask == 0) { - dc->condexec_cond = 0; - } - } - - if (dc->eci && !dc->eci_handled) { - /* - * Insn wasn't valid for ECI/ICI at all: undo what we - * just generated and instead emit an exception - */ - tcg_remove_ops_after(insn_eci_rewind); - dc->pc_save = insn_eci_pc_save; - dc->condjmp = 0; - gen_exception_insn(dc, 0, EXCP_INVSTATE, syn_uncategorized()); - } - - arm_post_translate_insn(dc); - - /* Thumb is a variable-length ISA. Stop translation when the next insn - * will touch a new page. This ensures that prefetch aborts occur at - * the right place. - * - * We want to stop the TB if the next insn starts in a new page, - * or if it spans between this page and the next. This means that - * if we're looking at the last halfword in the page we need to - * see if it's a 16-bit Thumb insn (which will fit in this TB) - * or a 32-bit Thumb insn (which won't). - * This is to avoid generating a silly TB with a single 16-bit insn - * in it at the end of this page (which would execute correctly - * but isn't very efficient). - */ - if (dc->base.is_jmp == DISAS_NEXT - && (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE - || (dc->base.pc_next - dc->page_start >= TARGET_PAGE_SIZE - 3 - && insn_crosses_page(env, dc)))) { - dc->base.is_jmp = DISAS_TOO_MANY; - } -} - -static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - - /* At this stage dc->condjmp will only be set when the skipped - instruction was a conditional branch or trap, and the PC has - already been written. */ - gen_set_condexec(dc); - if (dc->base.is_jmp == DISAS_BX_EXCRET) { - /* Exception return branches need some special case code at the - * end of the TB, which is complex enough that it has to - * handle the single-step vs not and the condition-failed - * insn codepath itself. - */ - gen_bx_excret_final_code(dc); - } else if (unlikely(dc->ss_active)) { - /* Unconditional and "condition passed" instruction codepath. */ - switch (dc->base.is_jmp) { - case DISAS_SWI: - gen_ss_advance(dc); - gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb)); - break; - case DISAS_HVC: - gen_ss_advance(dc); - gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2); - break; - case DISAS_SMC: - gen_ss_advance(dc); - gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3); - break; - case DISAS_NEXT: - case DISAS_TOO_MANY: - case DISAS_UPDATE_EXIT: - case DISAS_UPDATE_NOCHAIN: - gen_update_pc(dc, curr_insn_len(dc)); - /* fall through */ - default: - /* FIXME: Single stepping a WFI insn will not halt the CPU. */ - gen_singlestep_exception(dc); - break; - case DISAS_NORETURN: - break; - } - } else { - /* While branches must always occur at the end of an IT block, - there are a few other things that can cause us to terminate - the TB in the middle of an IT block: - - Exception generating instructions (bkpt, swi, undefined). - - Page boundaries. - - Hardware watchpoints. - Hardware breakpoints have already been handled and skip this code. 
- */ - switch (dc->base.is_jmp) { - case DISAS_NEXT: - case DISAS_TOO_MANY: - gen_goto_tb(dc, 1, curr_insn_len(dc)); - break; - case DISAS_UPDATE_NOCHAIN: - gen_update_pc(dc, curr_insn_len(dc)); - /* fall through */ - case DISAS_JUMP: - gen_goto_ptr(); - break; - case DISAS_UPDATE_EXIT: - gen_update_pc(dc, curr_insn_len(dc)); - /* fall through */ - default: - /* indicate that the hash table must be used to find the next TB */ - tcg_gen_exit_tb(NULL, 0); - break; - case DISAS_NORETURN: - /* nothing more to generate */ - break; - case DISAS_WFI: - gen_helper_wfi(cpu_env, tcg_constant_i32(curr_insn_len(dc))); - /* - * The helper doesn't necessarily throw an exception, but we - * must go back to the main loop to check for interrupts anyway. - */ - tcg_gen_exit_tb(NULL, 0); - break; - case DISAS_WFE: - gen_helper_wfe(cpu_env); - break; - case DISAS_YIELD: - gen_helper_yield(cpu_env); - break; - case DISAS_SWI: - gen_exception(EXCP_SWI, syn_aa32_svc(dc->svc_imm, dc->thumb)); - break; - case DISAS_HVC: - gen_exception_el(EXCP_HVC, syn_aa32_hvc(dc->svc_imm), 2); - break; - case DISAS_SMC: - gen_exception_el(EXCP_SMC, syn_aa32_smc(), 3); - break; - } - } - - if (dc->condjmp) { - /* "Condition failed" instruction codepath for the branch/trap insn */ - set_disas_label(dc, dc->condlabel); - gen_set_condexec(dc); - if (unlikely(dc->ss_active)) { - gen_update_pc(dc, curr_insn_len(dc)); - gen_singlestep_exception(dc); - } else { - gen_goto_tb(dc, 1, curr_insn_len(dc)); - } - } -} - -static void arm_tr_disas_log(const DisasContextBase *dcbase, - CPUState *cpu, FILE *logfile) -{ - DisasContext *dc = container_of(dcbase, DisasContext, base); - - fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first)); - target_disas(logfile, cpu, dc->base.pc_first, dc->base.tb->size); -} - -static const TranslatorOps arm_translator_ops = { - .init_disas_context = arm_tr_init_disas_context, - .tb_start = arm_tr_tb_start, - .insn_start = arm_tr_insn_start, - .translate_insn = arm_tr_translate_insn, - .tb_stop = arm_tr_tb_stop, - .disas_log = arm_tr_disas_log, -}; - -static const TranslatorOps thumb_translator_ops = { - .init_disas_context = arm_tr_init_disas_context, - .tb_start = arm_tr_tb_start, - .insn_start = arm_tr_insn_start, - .translate_insn = thumb_tr_translate_insn, - .tb_stop = arm_tr_tb_stop, - .disas_log = arm_tr_disas_log, -}; - -/* generate intermediate code for basic block 'tb'. */ -void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int max_insns, - target_ulong pc, void *host_pc) -{ - DisasContext dc = { }; - const TranslatorOps *ops = &arm_translator_ops; - CPUARMTBFlags tb_flags = arm_tbflags_from_tb(tb); - - if (EX_TBFLAG_AM32(tb_flags, THUMB)) { - ops = &thumb_translator_ops; - } -#ifdef TARGET_AARCH64 - if (EX_TBFLAG_ANY(tb_flags, AARCH64_STATE)) { - ops = &aarch64_translator_ops; - } -#endif - - translator_loop(cpu, tb, max_insns, pc, host_pc, ops, &dc.base); -} diff --git a/target/arm/translate.h b/target/arm/translate.h deleted file mode 100644 index 3717824..0000000 --- a/target/arm/translate.h +++ /dev/null @@ -1,644 +0,0 @@ -#ifndef TARGET_ARM_TRANSLATE_H -#define TARGET_ARM_TRANSLATE_H - -#include "exec/translator.h" -#include "internals.h" - - -/* internal defines */ - -/* - * Save pc_save across a branch, so that we may restore the value from - * before the branch at the point the label is emitted. 
- */ -typedef struct DisasLabel { - TCGLabel *label; - target_ulong pc_save; -} DisasLabel; - -typedef struct DisasContext { - DisasContextBase base; - const ARMISARegisters *isar; - - /* The address of the current instruction being translated. */ - target_ulong pc_curr; - /* - * For TARGET_TB_PCREL, the full value of cpu_pc is not known - * (although the page offset is known). For convenience, the - * translation loop uses the full virtual address that triggered - * the translation, from base.pc_start through pc_curr. - * For efficiency, we do not update cpu_pc for every instruction. - * Instead, pc_save has the value of pc_curr at the time of the - * last update to cpu_pc, which allows us to compute the addend - * needed to bring cpu_pc current: pc_curr - pc_save. - * If cpu_pc now contains the destination of an indirect branch, - * pc_save contains -1 to indicate that relative updates are no - * longer possible. - */ - target_ulong pc_save; - target_ulong page_start; - uint32_t insn; - /* Nonzero if this instruction has been conditionally skipped. */ - int condjmp; - /* The label that will be jumped to when the instruction is skipped. */ - DisasLabel condlabel; - /* Thumb-2 conditional execution bits. */ - int condexec_mask; - int condexec_cond; - /* M-profile ECI/ICI exception-continuable instruction state */ - int eci; - /* - * trans_ functions for insns which are continuable should set this true - * after decode (ie after any UNDEF checks) - */ - bool eci_handled; - int sctlr_b; - MemOp be_data; -#if !defined(CONFIG_USER_ONLY) - int user; -#endif - ARMMMUIdx mmu_idx; /* MMU index to use for normal loads/stores */ - uint8_t tbii; /* TBI1|TBI0 for insns */ - uint8_t tbid; /* TBI1|TBI0 for data */ - uint8_t tcma; /* TCMA1|TCMA0 for MTE */ - bool ns; /* Use non-secure CPREG bank on access */ - int fp_excp_el; /* FP exception EL or 0 if enabled */ - int sve_excp_el; /* SVE exception EL or 0 if enabled */ - int sme_excp_el; /* SME exception EL or 0 if enabled */ - int vl; /* current vector length in bytes */ - int svl; /* current streaming vector length in bytes */ - bool vfp_enabled; /* FP enabled via FPSCR.EN */ - int vec_len; - int vec_stride; - bool v7m_handler_mode; - bool v8m_secure; /* true if v8M and we're in Secure mode */ - bool v8m_stackcheck; /* true if we need to perform v8M stack limit checks */ - bool v8m_fpccr_s_wrong; /* true if v8M FPCCR.S != v8m_secure */ - bool v7m_new_fp_ctxt_needed; /* ASPEN set but no active FP context */ - bool v7m_lspact; /* FPCCR.LSPACT set */ - /* Immediate value in AArch32 SVC insn; must be set if is_jmp == DISAS_SWI - * so that top level loop can generate correct syndrome information. - */ - uint32_t svc_imm; - int current_el; - GHashTable *cp_regs; - uint64_t features; /* CPU features bits */ - bool aarch64; - bool thumb; - /* Because unallocated encodings generate different exception syndrome - * information from traps due to FP being disabled, we can't do a single - * "is fp access disabled" check at a high level in the decode tree. - * To help in catching bugs where the access check was forgotten in some - * code path, we set this flag when the access check is done, and assert - * that it is set at the point where we actually touch the FP regs. - */ - bool fp_access_checked; - bool sve_access_checked; - /* ARMv8 single-step state (this is distinct from the QEMU gdbstub - * single-step support). 
- */ - bool ss_active; - bool pstate_ss; - /* True if the insn just emitted was a load-exclusive instruction - * (necessary for syndrome information for single step exceptions), - * ie A64 LDX*, LDAX*, A32/T32 LDREX*, LDAEX*. - */ - bool is_ldex; - /* True if AccType_UNPRIV should be used for LDTR et al */ - bool unpriv; - /* True if v8.3-PAuth is active. */ - bool pauth_active; - /* True if v8.5-MTE access to tags is enabled. */ - bool ata; - /* True if v8.5-MTE tag checks affect the PE; index with is_unpriv. */ - bool mte_active[2]; - /* True with v8.5-BTI and SCTLR_ELx.BT* set. */ - bool bt; - /* True if any CP15 access is trapped by HSTR_EL2 */ - bool hstr_active; - /* True if memory operations require alignment */ - bool align_mem; - /* True if PSTATE.IL is set */ - bool pstate_il; - /* True if PSTATE.SM is set. */ - bool pstate_sm; - /* True if PSTATE.ZA is set. */ - bool pstate_za; - /* True if non-streaming insns should raise an SME Streaming exception. */ - bool sme_trap_nonstreaming; - /* True if the current instruction is non-streaming. */ - bool is_nonstreaming; - /* True if MVE insns are definitely not predicated by VPR or LTPSIZE */ - bool mve_no_pred; - /* True if fine-grained traps are active */ - bool fgt_active; - /* True if fine-grained trap on ERET is enabled */ - bool fgt_eret; - /* True if fine-grained trap on SVC is enabled */ - bool fgt_svc; - /* - * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI. - * < 0, set by the current instruction. - */ - int8_t btype; - /* A copy of cpu->dcz_blocksize. */ - uint8_t dcz_blocksize; - /* True if this page is guarded. */ - bool guarded_page; - /* Bottom two bits of XScale c15_cpar coprocessor access control reg */ - int c15_cpar; - /* TCG op of the current insn_start. */ - TCGOp *insn_start; -#define TMP_A64_MAX 16 - int tmp_a64_count; - TCGv_i64 tmp_a64[TMP_A64_MAX]; -} DisasContext; - -typedef struct DisasCompare { - TCGCond cond; - TCGv_i32 value; - bool value_global; -} DisasCompare; - -/* Share the TCG temporaries common between 32 and 64 bit modes. */ -extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF; -extern TCGv_i64 cpu_exclusive_addr; -extern TCGv_i64 cpu_exclusive_val; - -/* - * Constant expanders for the decoders. 
- */ - -static inline int negate(DisasContext *s, int x) -{ - return -x; -} - -static inline int plus_1(DisasContext *s, int x) -{ - return x + 1; -} - -static inline int plus_2(DisasContext *s, int x) -{ - return x + 2; -} - -static inline int plus_12(DisasContext *s, int x) -{ - return x + 12; -} - -static inline int times_2(DisasContext *s, int x) -{ - return x * 2; -} - -static inline int times_4(DisasContext *s, int x) -{ - return x * 4; -} - -static inline int times_2_plus_1(DisasContext *s, int x) -{ - return x * 2 + 1; -} - -static inline int rsub_64(DisasContext *s, int x) -{ - return 64 - x; -} - -static inline int rsub_32(DisasContext *s, int x) -{ - return 32 - x; -} - -static inline int rsub_16(DisasContext *s, int x) -{ - return 16 - x; -} - -static inline int rsub_8(DisasContext *s, int x) -{ - return 8 - x; -} - -static inline int neon_3same_fp_size(DisasContext *s, int x) -{ - /* Convert 0==fp32, 1==fp16 into a MO_* value */ - return MO_32 - x; -} - -static inline int arm_dc_feature(DisasContext *dc, int feature) -{ - return (dc->features & (1ULL << feature)) != 0; -} - -static inline int get_mem_index(DisasContext *s) -{ - return arm_to_core_mmu_idx(s->mmu_idx); -} - -static inline void disas_set_insn_syndrome(DisasContext *s, uint32_t syn) -{ - /* We don't need to save all of the syndrome so we mask and shift - * out unneeded bits to help the sleb128 encoder do a better job. - */ - syn &= ARM_INSN_START_WORD2_MASK; - syn >>= ARM_INSN_START_WORD2_SHIFT; - - /* We check and clear insn_start_idx to catch multiple updates. */ - assert(s->insn_start != NULL); - tcg_set_insn_start_param(s->insn_start, 2, syn); - s->insn_start = NULL; -} - -static inline int curr_insn_len(DisasContext *s) -{ - return s->base.pc_next - s->pc_curr; -} - -/* is_jmp field values */ -#define DISAS_JUMP DISAS_TARGET_0 /* only pc was modified dynamically */ -/* CPU state was modified dynamically; exit to main loop for interrupts. */ -#define DISAS_UPDATE_EXIT DISAS_TARGET_1 -/* These instructions trap after executing, so the A32/T32 decoder must - * defer them until after the conditional execution state has been updated. - * WFI also needs special handling when single-stepping. - */ -#define DISAS_WFI DISAS_TARGET_2 -#define DISAS_SWI DISAS_TARGET_3 -/* WFE */ -#define DISAS_WFE DISAS_TARGET_4 -#define DISAS_HVC DISAS_TARGET_5 -#define DISAS_SMC DISAS_TARGET_6 -#define DISAS_YIELD DISAS_TARGET_7 -/* M profile branch which might be an exception return (and so needs - * custom end-of-TB code) - */ -#define DISAS_BX_EXCRET DISAS_TARGET_8 -/* - * For instructions which want an immediate exit to the main loop, as opposed - * to attempting to use lookup_and_goto_ptr. Unlike DISAS_UPDATE_EXIT, this - * doesn't write the PC on exiting the translation loop so you need to ensure - * something (gen_a64_update_pc or runtime helper) has done so before we reach - * return from cpu_tb_exec. - */ -#define DISAS_EXIT DISAS_TARGET_9 -/* CPU state was modified dynamically; no need to exit, but do not chain. 
*/ -#define DISAS_UPDATE_NOCHAIN DISAS_TARGET_10 - -#ifdef TARGET_AARCH64 -void a64_translate_init(void); -void gen_a64_update_pc(DisasContext *s, target_long diff); -extern const TranslatorOps aarch64_translator_ops; -#else -static inline void a64_translate_init(void) -{ -} - -static inline void gen_a64_update_pc(DisasContext *s, target_long diff) -{ -} -#endif - -void arm_test_cc(DisasCompare *cmp, int cc); -void arm_free_cc(DisasCompare *cmp); -void arm_jump_cc(DisasCompare *cmp, TCGLabel *label); -void arm_gen_test_cc(int cc, TCGLabel *label); -MemOp pow2_align(unsigned i); -void unallocated_encoding(DisasContext *s); -void gen_exception_insn_el(DisasContext *s, target_long pc_diff, int excp, - uint32_t syn, uint32_t target_el); -void gen_exception_insn(DisasContext *s, target_long pc_diff, - int excp, uint32_t syn); - -/* Return state of Alternate Half-precision flag, caller frees result */ -static inline TCGv_i32 get_ahp_flag(void) -{ - TCGv_i32 ret = tcg_temp_new_i32(); - - tcg_gen_ld_i32(ret, cpu_env, - offsetof(CPUARMState, vfp.xregs[ARM_VFP_FPSCR])); - tcg_gen_extract_i32(ret, ret, 26, 1); - - return ret; -} - -/* Set bits within PSTATE. */ -static inline void set_pstate_bits(uint32_t bits) -{ - TCGv_i32 p = tcg_temp_new_i32(); - - tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); - - tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate)); - tcg_gen_ori_i32(p, p, bits); - tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate)); - tcg_temp_free_i32(p); -} - -/* Clear bits within PSTATE. */ -static inline void clear_pstate_bits(uint32_t bits) -{ - TCGv_i32 p = tcg_temp_new_i32(); - - tcg_debug_assert(!(bits & CACHED_PSTATE_BITS)); - - tcg_gen_ld_i32(p, cpu_env, offsetof(CPUARMState, pstate)); - tcg_gen_andi_i32(p, p, ~bits); - tcg_gen_st_i32(p, cpu_env, offsetof(CPUARMState, pstate)); - tcg_temp_free_i32(p); -} - -/* If the singlestep state is Active-not-pending, advance to Active-pending. */ -static inline void gen_ss_advance(DisasContext *s) -{ - if (s->ss_active) { - s->pstate_ss = 0; - clear_pstate_bits(PSTATE_SS); - } -} - -/* Generate an architectural singlestep exception */ -static inline void gen_swstep_exception(DisasContext *s, int isv, int ex) -{ - /* Fill in the same_el field of the syndrome in the helper. */ - uint32_t syn = syn_swstep(false, isv, ex); - gen_helper_exception_swstep(cpu_env, tcg_constant_i32(syn)); -} - -/* - * Given a VFP floating point constant encoded into an 8 bit immediate in an - * instruction, expand it to the actual constant value of the specified - * size, as per the VFPExpandImm() pseudocode in the Arm ARM. - */ -uint64_t vfp_expand_imm(int size, uint8_t imm8); - -/* Vector operations shared between ARM and AArch64. 
*/ -void gen_gvec_ceq0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_clt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_cgt0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_cle0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_cge0(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); - -void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); -void gen_ushl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b); -void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b); -void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); -void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); - -void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, - int64_t shift, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); -void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); - -void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); 
-void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, - uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz); - -/* - * Forward to the isar_feature_* tests given a DisasContext pointer. - */ -#define dc_isar_feature(name, ctx) \ - ({ DisasContext *ctx_ = (ctx); isar_feature_##name(ctx_->isar); }) - -/* Note that the gvec expanders operate on offsets + sizes. */ -typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t); -typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t, - uint32_t, uint32_t); -typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t, - uint32_t, uint32_t, uint32_t); -typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t, - uint32_t, uint32_t, uint32_t); - -/* Function prototype for gen_ functions for calling Neon helpers */ -typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32); -typedef void NeonGenOneOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32); -typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32); -typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); -typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32, - TCGv_i32, TCGv_i32); -typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64); -typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64); -typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64); -typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64); -typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32); -typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32); -typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr); -typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); -typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); -typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64); -typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr); -typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32); -typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); -typedef void AtomicThreeOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGArg, MemOp); -typedef void WideShiftImmFn(TCGv_i64, TCGv_i64, int64_t shift); -typedef void WideShiftFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i32); -typedef void ShiftImmFn(TCGv_i32, TCGv_i32, int32_t shift); -typedef void ShiftFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); - -/** - * arm_tbflags_from_tb: - * @tb: the TranslationBlock - * - * Extract the flag values from @tb. - */ -static inline CPUARMTBFlags arm_tbflags_from_tb(const TranslationBlock *tb) -{ - return (CPUARMTBFlags){ tb->flags, tb->cs_base }; -} - -/* - * Enum for argument to fpstatus_ptr(). - */ -typedef enum ARMFPStatusFlavour { - FPST_FPCR, - FPST_FPCR_F16, - FPST_STD, - FPST_STD_F16, -} ARMFPStatusFlavour; - -/** - * fpstatus_ptr: return TCGv_ptr to the specified fp_status field - * - * We have multiple softfloat float_status fields in the Arm CPU state struct - * (see the comment in cpu.h for details). Return a TCGv_ptr which has - * been set up to point to the requested field in the CPU state struct. 
- * The options are: - * - * FPST_FPCR - * for non-FP16 operations controlled by the FPCR - * FPST_FPCR_F16 - * for operations controlled by the FPCR where FPCR.FZ16 is to be used - * FPST_STD - * for A32/T32 Neon operations using the "standard FPSCR value" - * FPST_STD_F16 - * as FPST_STD, but where FPCR.FZ16 is to be used - */ -static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) -{ - TCGv_ptr statusptr = tcg_temp_new_ptr(); - int offset; - - switch (flavour) { - case FPST_FPCR: - offset = offsetof(CPUARMState, vfp.fp_status); - break; - case FPST_FPCR_F16: - offset = offsetof(CPUARMState, vfp.fp_status_f16); - break; - case FPST_STD: - offset = offsetof(CPUARMState, vfp.standard_fp_status); - break; - case FPST_STD_F16: - offset = offsetof(CPUARMState, vfp.standard_fp_status_f16); - break; - default: - g_assert_not_reached(); - } - tcg_gen_addi_ptr(statusptr, cpu_env, offset); - return statusptr; -} - -/** - * finalize_memop: - * @s: DisasContext - * @opc: size+sign+align of the memory operation - * - * Build the complete MemOp for a memory operation, including alignment - * and endianness. - * - * If (op & MO_AMASK) then the operation already contains the required - * alignment, e.g. for AccType_ATOMIC. Otherwise, this an optionally - * unaligned operation, e.g. for AccType_NORMAL. - * - * In the latter case, there are configuration bits that require alignment, - * and this is applied here. Note that there is no way to indicate that - * no alignment should ever be enforced; this must be handled manually. - */ -static inline MemOp finalize_memop(DisasContext *s, MemOp opc) -{ - if (s->align_mem && !(opc & MO_AMASK)) { - opc |= MO_ALIGN; - } - return opc | s->be_data; -} - -/** - * asimd_imm_const: Expand an encoded SIMD constant value - * - * Expand a SIMD constant value. This is essentially the pseudocode - * AdvSIMDExpandImm, except that we also perform the boolean NOT needed for - * VMVN and VBIC (when cmode < 14 && op == 1). - * - * The combination cmode == 15 op == 1 is a reserved encoding for AArch32; - * callers must catch this; we return the 64-bit constant value defined - * for AArch64. - * - * cmode = 2,3,4,5,6,7,10,11,12,13 imm=0 was UNPREDICTABLE in v7A but - * is either not unpredictable or merely CONSTRAINED UNPREDICTABLE in v8A; - * we produce an immediate constant value of 0 in these cases. - */ -uint64_t asimd_imm_const(uint32_t imm, int cmode, int op); - -/* - * gen_disas_label: - * Create a label and cache a copy of pc_save. - */ -static inline DisasLabel gen_disas_label(DisasContext *s) -{ - return (DisasLabel){ - .label = gen_new_label(), - .pc_save = s->pc_save, - }; -} - -/* - * set_disas_label: - * Emit a label and restore the cached copy of pc_save. - */ -static inline void set_disas_label(DisasContext *s, DisasLabel l) -{ - gen_set_label(l.label); - s->pc_save = l.pc_save; -} - -static inline TCGv_ptr gen_lookup_cp_reg(uint32_t key) -{ - TCGv_ptr ret = tcg_temp_new_ptr(); - gen_helper_lookup_cp_reg(ret, cpu_env, tcg_constant_i32(key)); - return ret; -} - -/* - * Helpers for implementing sets of trans_* functions. - * Defer the implementation of NAME to FUNC, with optional extra arguments. - */ -#define TRANS(NAME, FUNC, ...) \ - static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ - { return FUNC(s, __VA_ARGS__); } -#define TRANS_FEAT(NAME, FEAT, FUNC, ...) \ - static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ - { return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); } - -#define TRANS_FEAT_NONSTREAMING(NAME, FEAT, FUNC, ...) 
\ - static bool trans_##NAME(DisasContext *s, arg_##NAME *a) \ - { \ - s->is_nonstreaming = true; \ - return dc_isar_feature(FEAT, s) && FUNC(s, __VA_ARGS__); \ - } - -#endif /* TARGET_ARM_TRANSLATE_H */ diff --git a/target/arm/vfp-uncond.decode b/target/arm/vfp-uncond.decode deleted file mode 100644 index 5c50447..0000000 --- a/target/arm/vfp-uncond.decode +++ /dev/null @@ -1,82 +0,0 @@ -# AArch32 VFP instruction descriptions (unconditional insns) -# -# Copyright (c) 2019 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# -# Encodings for the unconditional VFP instructions are here: -# generally anything matching A32 -# 1111 1110 .... .... .... 101. ...0 .... -# and T32 -# 1111 110. .... .... .... 101. .... .... -# 1111 1110 .... .... .... 101. .... .... -# (but those patterns might also cover some Neon instructions, -# which do not live in this file.) - -# VFP registers have an odd encoding with a four-bit field -# and a one-bit field which are assembled in different orders -# depending on whether the register is double or single precision. -# Each individual instruction function must do the checks for -# "double register selected but CPU does not have double support" -# and "double register number has bit 4 set but CPU does not -# support D16-D31" (which should UNDEF). -%vm_dp 5:1 0:4 -%vm_sp 0:4 5:1 -%vn_dp 7:1 16:4 -%vn_sp 16:4 7:1 -%vd_dp 22:1 12:4 -%vd_sp 12:4 22:1 - -@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp -@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp - -VSEL 1111 1110 0. cc:2 .... .... 1001 .0.0 .... \ - vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=1 -VSEL 1111 1110 0. cc:2 .... .... 1010 .0.0 .... \ - vm=%vm_sp vn=%vn_sp vd=%vd_sp sz=2 -VSEL 1111 1110 0. cc:2 .... .... 1011 .0.0 .... \ - vm=%vm_dp vn=%vn_dp vd=%vd_dp sz=3 - -VMAXNM_hp 1111 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s -VMINNM_hp 1111 1110 1.00 .... .... 1001 .1.0 .... @vfp_dnm_s - -VMAXNM_sp 1111 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s -VMINNM_sp 1111 1110 1.00 .... .... 1010 .1.0 .... @vfp_dnm_s - -VMAXNM_dp 1111 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d -VMINNM_dp 1111 1110 1.00 .... .... 1011 .1.0 .... @vfp_dnm_d - -VRINT 1111 1110 1.11 10 rm:2 .... 1001 01.0 .... \ - vm=%vm_sp vd=%vd_sp sz=1 -VRINT 1111 1110 1.11 10 rm:2 .... 1010 01.0 .... \ - vm=%vm_sp vd=%vd_sp sz=2 -VRINT 1111 1110 1.11 10 rm:2 .... 1011 01.0 .... \ - vm=%vm_dp vd=%vd_dp sz=3 - -# VCVT float to int with specified rounding mode; Vd is always single-precision -VCVT 1111 1110 1.11 11 rm:2 .... 1001 op:1 1.0 .... \ - vm=%vm_sp vd=%vd_sp sz=1 -VCVT 1111 1110 1.11 11 rm:2 .... 1010 op:1 1.0 .... \ - vm=%vm_sp vd=%vd_sp sz=2 -VCVT 1111 1110 1.11 11 rm:2 .... 1011 op:1 1.0 .... \ - vm=%vm_dp vd=%vd_sp sz=3 - -VMOVX 1111 1110 1.11 0000 .... 1010 01 . 0 .... \ - vd=%vd_sp vm=%vm_sp - -VINS 1111 1110 1.11 0000 .... 1010 11 . 0 .... 
\ - vd=%vd_sp vm=%vm_sp diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode deleted file mode 100644 index 5405e80..0000000 --- a/target/arm/vfp.decode +++ /dev/null @@ -1,247 +0,0 @@ -# AArch32 VFP instruction descriptions (conditional insns) -# -# Copyright (c) 2019 Linaro, Ltd -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, see . - -# -# This file is processed by scripts/decodetree.py -# -# Encodings for the conditional VFP instructions are here: -# generally anything matching A32 -# cccc 11.. .... .... .... 101. .... .... -# and T32 -# 1110 110. .... .... .... 101. .... .... -# 1110 1110 .... .... .... 101. .... .... -# (but those patterns might also cover some Neon instructions, -# which do not live in this file.) - -# VFP registers have an odd encoding with a four-bit field -# and a one-bit field which are assembled in different orders -# depending on whether the register is double or single precision. -# Each individual instruction function must do the checks for -# "double register selected but CPU does not have double support" -# and "double register number has bit 4 set but CPU does not -# support D16-D31" (which should UNDEF). -%vm_dp 5:1 0:4 -%vm_sp 0:4 5:1 -%vn_dp 7:1 16:4 -%vn_sp 16:4 7:1 -%vd_dp 22:1 12:4 -%vd_sp 12:4 22:1 - -%vmov_idx_b 21:1 5:2 -%vmov_idx_h 21:1 6:1 - -%vmov_imm 16:4 0:4 - -@vfp_dnm_s ................................ vm=%vm_sp vn=%vn_sp vd=%vd_sp -@vfp_dnm_d ................................ vm=%vm_dp vn=%vn_dp vd=%vd_dp - -@vfp_dm_ss ................................ vm=%vm_sp vd=%vd_sp -@vfp_dm_dd ................................ vm=%vm_dp vd=%vd_dp -@vfp_dm_ds ................................ vm=%vm_sp vd=%vd_dp -@vfp_dm_sd ................................ vm=%vm_dp vd=%vd_sp - -# VMOV scalar to general-purpose register; note that this does -# include some Neon cases. -VMOV_to_gp ---- 1110 u:1 1. 1 .... rt:4 1011 ... 1 0000 \ - vn=%vn_dp size=0 index=%vmov_idx_b -VMOV_to_gp ---- 1110 u:1 0. 1 .... rt:4 1011 ..1 1 0000 \ - vn=%vn_dp size=1 index=%vmov_idx_h -VMOV_to_gp ---- 1110 0 0 index:1 1 .... rt:4 1011 .00 1 0000 \ - vn=%vn_dp size=2 u=0 - -VMOV_from_gp ---- 1110 0 1. 0 .... rt:4 1011 ... 1 0000 \ - vn=%vn_dp size=0 index=%vmov_idx_b -VMOV_from_gp ---- 1110 0 0. 0 .... rt:4 1011 ..1 1 0000 \ - vn=%vn_dp size=1 index=%vmov_idx_h -VMOV_from_gp ---- 1110 0 0 index:1 0 .... rt:4 1011 .00 1 0000 \ - vn=%vn_dp size=2 - -VDUP ---- 1110 1 b:1 q:1 0 .... rt:4 1011 . 0 e:1 1 0000 \ - vn=%vn_dp - -VMSR_VMRS ---- 1110 111 l:1 reg:4 rt:4 1010 0001 0000 -VMOV_half ---- 1110 000 l:1 .... rt:4 1001 . 001 0000 vn=%vn_sp -VMOV_single ---- 1110 000 l:1 .... rt:4 1010 . 001 0000 vn=%vn_sp - -VMOV_64_sp ---- 1100 010 op:1 rt2:4 rt:4 1010 00.1 .... vm=%vm_sp -VMOV_64_dp ---- 1100 010 op:1 rt2:4 rt:4 1011 00.1 .... vm=%vm_dp - -VLDR_VSTR_hp ---- 1101 u:1 .0 l:1 rn:4 .... 1001 imm:8 vd=%vd_sp -VLDR_VSTR_sp ---- 1101 u:1 .0 l:1 rn:4 .... 
1010 imm:8 vd=%vd_sp -VLDR_VSTR_dp ---- 1101 u:1 .0 l:1 rn:4 .... 1011 imm:8 vd=%vd_dp - -# We split the load/store multiple up into two patterns to avoid -# overlap with other insns in the "Advanced SIMD load/store and 64-bit move" -# grouping: -# P=0 U=0 W=0 is 64-bit VMOV -# P=1 W=0 is VLDR/VSTR -# P=U W=1 is UNDEF -# leaving P=0 U=1 W=x and P=1 U=0 W=1 for load/store multiple. -# These include FSTM/FLDM. -VLDM_VSTM_sp ---- 1100 1 . w:1 l:1 rn:4 .... 1010 imm:8 \ - vd=%vd_sp p=0 u=1 -VLDM_VSTM_dp ---- 1100 1 . w:1 l:1 rn:4 .... 1011 imm:8 \ - vd=%vd_dp p=0 u=1 - -VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \ - vd=%vd_sp p=1 u=0 w=1 -VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \ - vd=%vd_dp p=1 u=0 w=1 - -# 3-register VFP data-processing; bits [23,21:20,6] identify the operation. -VMLA_hp ---- 1110 0.00 .... .... 1001 .0.0 .... @vfp_dnm_s -VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... @vfp_dnm_s -VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... @vfp_dnm_d - -VMLS_hp ---- 1110 0.00 .... .... 1001 .1.0 .... @vfp_dnm_s -VMLS_sp ---- 1110 0.00 .... .... 1010 .1.0 .... @vfp_dnm_s -VMLS_dp ---- 1110 0.00 .... .... 1011 .1.0 .... @vfp_dnm_d - -VNMLS_hp ---- 1110 0.01 .... .... 1001 .0.0 .... @vfp_dnm_s -VNMLS_sp ---- 1110 0.01 .... .... 1010 .0.0 .... @vfp_dnm_s -VNMLS_dp ---- 1110 0.01 .... .... 1011 .0.0 .... @vfp_dnm_d - -VNMLA_hp ---- 1110 0.01 .... .... 1001 .1.0 .... @vfp_dnm_s -VNMLA_sp ---- 1110 0.01 .... .... 1010 .1.0 .... @vfp_dnm_s -VNMLA_dp ---- 1110 0.01 .... .... 1011 .1.0 .... @vfp_dnm_d - -VMUL_hp ---- 1110 0.10 .... .... 1001 .0.0 .... @vfp_dnm_s -VMUL_sp ---- 1110 0.10 .... .... 1010 .0.0 .... @vfp_dnm_s -VMUL_dp ---- 1110 0.10 .... .... 1011 .0.0 .... @vfp_dnm_d - -VNMUL_hp ---- 1110 0.10 .... .... 1001 .1.0 .... @vfp_dnm_s -VNMUL_sp ---- 1110 0.10 .... .... 1010 .1.0 .... @vfp_dnm_s -VNMUL_dp ---- 1110 0.10 .... .... 1011 .1.0 .... @vfp_dnm_d - -VADD_hp ---- 1110 0.11 .... .... 1001 .0.0 .... @vfp_dnm_s -VADD_sp ---- 1110 0.11 .... .... 1010 .0.0 .... @vfp_dnm_s -VADD_dp ---- 1110 0.11 .... .... 1011 .0.0 .... @vfp_dnm_d - -VSUB_hp ---- 1110 0.11 .... .... 1001 .1.0 .... @vfp_dnm_s -VSUB_sp ---- 1110 0.11 .... .... 1010 .1.0 .... @vfp_dnm_s -VSUB_dp ---- 1110 0.11 .... .... 1011 .1.0 .... @vfp_dnm_d - -VDIV_hp ---- 1110 1.00 .... .... 1001 .0.0 .... @vfp_dnm_s -VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... @vfp_dnm_s -VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... @vfp_dnm_d - -VFMA_hp ---- 1110 1.10 .... .... 1001 .0. 0 .... @vfp_dnm_s -VFMS_hp ---- 1110 1.10 .... .... 1001 .1. 0 .... @vfp_dnm_s -VFNMA_hp ---- 1110 1.01 .... .... 1001 .0. 0 .... @vfp_dnm_s -VFNMS_hp ---- 1110 1.01 .... .... 1001 .1. 0 .... @vfp_dnm_s - -VFMA_sp ---- 1110 1.10 .... .... 1010 .0. 0 .... @vfp_dnm_s -VFMS_sp ---- 1110 1.10 .... .... 1010 .1. 0 .... @vfp_dnm_s -VFNMA_sp ---- 1110 1.01 .... .... 1010 .0. 0 .... @vfp_dnm_s -VFNMS_sp ---- 1110 1.01 .... .... 1010 .1. 0 .... @vfp_dnm_s - -VFMA_dp ---- 1110 1.10 .... .... 1011 .0.0 .... @vfp_dnm_d -VFMS_dp ---- 1110 1.10 .... .... 1011 .1.0 .... @vfp_dnm_d -VFNMA_dp ---- 1110 1.01 .... .... 1011 .0.0 .... @vfp_dnm_d -VFNMS_dp ---- 1110 1.01 .... .... 1011 .1.0 .... @vfp_dnm_d - -VMOV_imm_hp ---- 1110 1.11 .... .... 1001 0000 .... \ - vd=%vd_sp imm=%vmov_imm -VMOV_imm_sp ---- 1110 1.11 .... .... 1010 0000 .... \ - vd=%vd_sp imm=%vmov_imm -VMOV_imm_dp ---- 1110 1.11 .... .... 1011 0000 .... \ - vd=%vd_dp imm=%vmov_imm - -VMOV_reg_sp ---- 1110 1.11 0000 .... 1010 01.0 .... @vfp_dm_ss -VMOV_reg_dp ---- 1110 1.11 0000 .... 
1011 01.0 .... @vfp_dm_dd - -VABS_hp ---- 1110 1.11 0000 .... 1001 11.0 .... @vfp_dm_ss -VABS_sp ---- 1110 1.11 0000 .... 1010 11.0 .... @vfp_dm_ss -VABS_dp ---- 1110 1.11 0000 .... 1011 11.0 .... @vfp_dm_dd - -VNEG_hp ---- 1110 1.11 0001 .... 1001 01.0 .... @vfp_dm_ss -VNEG_sp ---- 1110 1.11 0001 .... 1010 01.0 .... @vfp_dm_ss -VNEG_dp ---- 1110 1.11 0001 .... 1011 01.0 .... @vfp_dm_dd - -VSQRT_hp ---- 1110 1.11 0001 .... 1001 11.0 .... @vfp_dm_ss -VSQRT_sp ---- 1110 1.11 0001 .... 1010 11.0 .... @vfp_dm_ss -VSQRT_dp ---- 1110 1.11 0001 .... 1011 11.0 .... @vfp_dm_dd - -VCMP_hp ---- 1110 1.11 010 z:1 .... 1001 e:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCMP_sp ---- 1110 1.11 010 z:1 .... 1010 e:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCMP_dp ---- 1110 1.11 010 z:1 .... 1011 e:1 1.0 .... \ - vd=%vd_dp vm=%vm_dp - -# VCVTT and VCVTB from f16: Vd format depends on size bit; Vm is always vm_sp -VCVT_f32_f16 ---- 1110 1.11 0010 .... 1010 t:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_f64_f16 ---- 1110 1.11 0010 .... 1011 t:1 1.0 .... \ - vd=%vd_dp vm=%vm_sp - -# VCVTB and VCVTT to f16: Vd format is always vd_sp; -# Vm format depends on size bit -VCVT_b16_f32 ---- 1110 1.11 0011 .... 1001 t:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \ - vd=%vd_sp vm=%vm_dp - -VRINTR_hp ---- 1110 1.11 0110 .... 1001 01.0 .... @vfp_dm_ss -VRINTR_sp ---- 1110 1.11 0110 .... 1010 01.0 .... @vfp_dm_ss -VRINTR_dp ---- 1110 1.11 0110 .... 1011 01.0 .... @vfp_dm_dd - -VRINTZ_hp ---- 1110 1.11 0110 .... 1001 11.0 .... @vfp_dm_ss -VRINTZ_sp ---- 1110 1.11 0110 .... 1010 11.0 .... @vfp_dm_ss -VRINTZ_dp ---- 1110 1.11 0110 .... 1011 11.0 .... @vfp_dm_dd - -VRINTX_hp ---- 1110 1.11 0111 .... 1001 01.0 .... @vfp_dm_ss -VRINTX_sp ---- 1110 1.11 0111 .... 1010 01.0 .... @vfp_dm_ss -VRINTX_dp ---- 1110 1.11 0111 .... 1011 01.0 .... @vfp_dm_dd - -# VCVT between single and double: -# Vm precision depends on size; Vd is its reverse -VCVT_sp ---- 1110 1.11 0111 .... 1010 11.0 .... @vfp_dm_ds -VCVT_dp ---- 1110 1.11 0111 .... 1011 11.0 .... @vfp_dm_sd - -# VCVT from integer to floating point: Vm always single; Vd depends on size -VCVT_int_hp ---- 1110 1.11 1000 .... 1001 s:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_int_sp ---- 1110 1.11 1000 .... 1010 s:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_int_dp ---- 1110 1.11 1000 .... 1011 s:1 1.0 .... \ - vd=%vd_dp vm=%vm_sp - -# VJCVT is always dp to sp -VJCVT ---- 1110 1.11 1001 .... 1011 11.0 .... @vfp_dm_sd - -# VCVT between floating-point and fixed-point. The immediate value -# is in the same format as a Vm single-precision register number. -# We assemble bits 18 (op), 16 (u) and 7 (sx) into a single opc field -# for the convenience of the trans_VCVT_fix functions. -%vcvt_fix_op 18:1 16:1 7:1 -VCVT_fix_hp ---- 1110 1.11 1.1. .... 1001 .1.0 .... \ - vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op -VCVT_fix_sp ---- 1110 1.11 1.1. .... 1010 .1.0 .... \ - vd=%vd_sp imm=%vm_sp opc=%vcvt_fix_op -VCVT_fix_dp ---- 1110 1.11 1.1. .... 1011 .1.0 .... \ - vd=%vd_dp imm=%vm_sp opc=%vcvt_fix_op - -# VCVT float to integer (VCVT and VCVTR): Vd always single; Vd depends on size -VCVT_hp_int ---- 1110 1.11 110 s:1 .... 1001 rz:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_sp_int ---- 1110 1.11 110 s:1 .... 1010 rz:1 1.0 .... \ - vd=%vd_sp vm=%vm_sp -VCVT_dp_int ---- 1110 1.11 110 s:1 .... 1011 rz:1 1.0 .... 
\ - vd=%vd_sp vm=%vm_dp -- cgit v1.1 From a3ef070ea9b2d9af95422b38b022f11d8c302d2e Mon Sep 17 00:00:00 2001 From: Claudio Fontana Date: Fri, 17 Feb 2023 17:11:30 -0300 Subject: target/arm: move helpers to tcg/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Claudio Fontana Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- target/arm/crypto_helper.c | 778 ----- target/arm/helper-a64.c | 954 ----- target/arm/iwmmxt_helper.c | 670 ---- target/arm/m_helper.c | 2902 ---------------- target/arm/meson.build | 15 +- target/arm/mte_helper.c | 908 ----- target/arm/mve_helper.c | 3450 ------------------ target/arm/neon_helper.c | 1740 ---------- target/arm/op_helper.c | 1069 ------ target/arm/pauth_helper.c | 515 --- target/arm/sme_helper.c | 1168 ------- target/arm/sve_helper.c | 7483 ---------------------------------------- target/arm/tcg-stubs.c | 23 + target/arm/tcg/crypto_helper.c | 778 +++++ target/arm/tcg/helper-a64.c | 954 +++++ target/arm/tcg/iwmmxt_helper.c | 670 ++++ target/arm/tcg/m_helper.c | 2902 ++++++++++++++++ target/arm/tcg/meson.build | 13 + target/arm/tcg/mte_helper.c | 908 +++++ target/arm/tcg/mve_helper.c | 3450 ++++++++++++++++++ target/arm/tcg/neon_helper.c | 1740 ++++++++++ target/arm/tcg/op_helper.c | 1069 ++++++ target/arm/tcg/pauth_helper.c | 515 +++ target/arm/tcg/sme_helper.c | 1168 +++++++ target/arm/tcg/sve_helper.c | 7483 ++++++++++++++++++++++++++++++++++++++++ target/arm/tcg/tlb_helper.c | 287 ++ target/arm/tcg/vec_helper.c | 2716 +++++++++++++++ target/arm/tcg/vec_internal.h | 246 ++ target/arm/tlb_helper.c | 287 -- target/arm/vec_helper.c | 2716 --------------- target/arm/vec_internal.h | 246 -- 31 files changed, 24924 insertions(+), 24899 deletions(-) delete mode 100644 target/arm/crypto_helper.c delete mode 100644 target/arm/helper-a64.c delete mode 100644 target/arm/iwmmxt_helper.c delete mode 100644 target/arm/m_helper.c delete mode 100644 target/arm/mte_helper.c delete mode 100644 target/arm/mve_helper.c delete mode 100644 target/arm/neon_helper.c delete mode 100644 target/arm/op_helper.c delete mode 100644 target/arm/pauth_helper.c delete mode 100644 target/arm/sme_helper.c delete mode 100644 target/arm/sve_helper.c create mode 100644 target/arm/tcg-stubs.c create mode 100644 target/arm/tcg/crypto_helper.c create mode 100644 target/arm/tcg/helper-a64.c create mode 100644 target/arm/tcg/iwmmxt_helper.c create mode 100644 target/arm/tcg/m_helper.c create mode 100644 target/arm/tcg/mte_helper.c create mode 100644 target/arm/tcg/mve_helper.c create mode 100644 target/arm/tcg/neon_helper.c create mode 100644 target/arm/tcg/op_helper.c create mode 100644 target/arm/tcg/pauth_helper.c create mode 100644 target/arm/tcg/sme_helper.c create mode 100644 target/arm/tcg/sve_helper.c create mode 100644 target/arm/tcg/tlb_helper.c create mode 100644 target/arm/tcg/vec_helper.c create mode 100644 target/arm/tcg/vec_internal.h delete mode 100644 target/arm/tlb_helper.c delete mode 100644 target/arm/vec_helper.c delete mode 100644 target/arm/vec_internal.h diff --git a/target/arm/crypto_helper.c b/target/arm/crypto_helper.c deleted file mode 100644 index d286903..0000000 --- a/target/arm/crypto_helper.c +++ /dev/null @@ -1,778 +0,0 @@ -/* - * crypto_helper.c - emulate v8 Crypto Extensions instructions - * - * Copyright (C) 2013 - 2018 Linaro Ltd - * - * This library is free software; you can redistribute it 
and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - */ - -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "crypto/aes.h" -#include "crypto/sm4.h" -#include "vec_internal.h" - -union CRYPTO_STATE { - uint8_t bytes[16]; - uint32_t words[4]; - uint64_t l[2]; -}; - -#if HOST_BIG_ENDIAN -#define CR_ST_BYTE(state, i) ((state).bytes[(15 - (i)) ^ 8]) -#define CR_ST_WORD(state, i) ((state).words[(3 - (i)) ^ 2]) -#else -#define CR_ST_BYTE(state, i) ((state).bytes[i]) -#define CR_ST_WORD(state, i) ((state).words[i]) -#endif - -/* - * The caller has not been converted to full gvec, and so only - * modifies the low 16 bytes of the vector register. - */ -static void clear_tail_16(void *vd, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int max_sz = simd_maxsz(desc); - - assert(opr_sz == 16); - clear_tail(vd, opr_sz, max_sz); -} - -static void do_crypto_aese(uint64_t *rd, uint64_t *rn, - uint64_t *rm, bool decrypt) -{ - static uint8_t const * const sbox[2] = { AES_sbox, AES_isbox }; - static uint8_t const * const shift[2] = { AES_shifts, AES_ishifts }; - union CRYPTO_STATE rk = { .l = { rm[0], rm[1] } }; - union CRYPTO_STATE st = { .l = { rn[0], rn[1] } }; - int i; - - /* xor state vector with round key */ - rk.l[0] ^= st.l[0]; - rk.l[1] ^= st.l[1]; - - /* combine ShiftRows operation and sbox substitution */ - for (i = 0; i < 16; i++) { - CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])]; - } - - rd[0] = st.l[0]; - rd[1] = st.l[1]; -} - -void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - bool decrypt = simd_data(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_aese(vd + i, vn + i, vm + i, decrypt); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt) -{ - static uint32_t const mc[][256] = { { - /* MixColumns lookup table */ - 0x00000000, 0x03010102, 0x06020204, 0x05030306, - 0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e, - 0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16, - 0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e, - 0x30101020, 0x33111122, 0x36121224, 0x35131326, - 0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e, - 0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36, - 0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e, - 0x60202040, 0x63212142, 0x66222244, 0x65232346, - 0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e, - 0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56, - 0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e, - 0x50303060, 0x53313162, 0x56323264, 0x55333366, - 0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e, - 0x48383870, 0x4b393972, 0x4e3a3a74, 0x4d3b3b76, - 0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e, - 0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386, - 0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e, - 0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96, - 0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e, - 0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6, - 0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae, - 0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6, - 0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe, - 0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6, - 0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce, - 0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6, - 0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde, - 0x907070e0, 
0x937171e2, 0x967272e4, 0x957373e6, - 0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee, - 0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6, - 0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe, - 0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d, - 0x97848413, 0x94858511, 0x91868617, 0x92878715, - 0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d, - 0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05, - 0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d, - 0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735, - 0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d, - 0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25, - 0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d, - 0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755, - 0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d, - 0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45, - 0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d, - 0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775, - 0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d, - 0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65, - 0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d, - 0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795, - 0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d, - 0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85, - 0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd, - 0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5, - 0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad, - 0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5, - 0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd, - 0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5, - 0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd, - 0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5, - 0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd, - 0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5, - 0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed, - 0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5, - }, { - /* Inverse MixColumns lookup table */ - 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12, - 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a, - 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362, - 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a, - 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2, - 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca, - 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382, - 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba, - 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9, - 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1, - 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9, - 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81, - 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029, - 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411, - 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859, - 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61, - 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf, - 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987, - 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf, - 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7, - 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f, - 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967, - 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f, - 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117, - 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664, - 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c, - 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14, - 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c, - 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684, - 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc, - 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4, - 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc, - 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753, - 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b, - 0xafb2a431, 
0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23, - 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b, - 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3, - 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b, - 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3, - 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb, - 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88, - 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0, - 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8, - 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0, - 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68, - 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850, - 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418, - 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020, - 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe, - 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6, - 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e, - 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6, - 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e, - 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526, - 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e, - 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56, - 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25, - 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d, - 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255, - 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d, - 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5, - 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd, - 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5, - 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d, - } }; - - union CRYPTO_STATE st = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 16; i += 4) { - CR_ST_WORD(st, i >> 2) = - mc[decrypt][CR_ST_BYTE(st, i)] ^ - rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^ - rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^ - rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24); - } - - rd[0] = st.l[0]; - rd[1] = st.l[1]; -} - -void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - bool decrypt = simd_data(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_aesmc(vd + i, vm + i, decrypt); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -/* - * SHA-1 logical functions - */ - -static uint32_t cho(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & (y ^ z)) ^ z; -} - -static uint32_t par(uint32_t x, uint32_t y, uint32_t z) -{ - return x ^ y ^ z; -} - -static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & y) | ((x | y) & z); -} - -void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t d0, d1; - - d0 = d[1] ^ d[0] ^ m[0]; - d1 = n[0] ^ d[1] ^ m[1]; - d[0] = d0; - d[1] = d1; - - clear_tail_16(vd, desc); -} - -static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn, - uint64_t *rm, uint32_t desc, - uint32_t (*fn)(union CRYPTO_STATE *d)) -{ - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 4; i++) { - uint32_t t = fn(&d); - - t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0) - + CR_ST_WORD(m, i); - - CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3); - CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); - CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2); - CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); - CR_ST_WORD(d, 0) = t; - } - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(rd, desc); -} - -static uint32_t do_sha1c(union CRYPTO_STATE *d) -{ - return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); -} - -void HELPER(crypto_sha1c)(void 
*vd, void *vn, void *vm, uint32_t desc) -{ - crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c); -} - -static uint32_t do_sha1p(union CRYPTO_STATE *d) -{ - return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); -} - -void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc) -{ - crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p); -} - -static uint32_t do_sha1m(union CRYPTO_STATE *d) -{ - return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); -} - -void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc) -{ - crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m); -} - -void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rm = vm; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2); - CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0; - - rd[0] = m.l[0]; - rd[1] = m.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1); - CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1); - CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1); - CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -/* - * The SHA-256 logical functions, according to - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - */ - -static uint32_t S0(uint32_t x) -{ - return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); -} - -static uint32_t S1(uint32_t x) -{ - return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); -} - -static uint32_t s0(uint32_t x) -{ - return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); -} - -static uint32_t s1(uint32_t x) -{ - return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); -} - -void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 4; i++) { - uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2)) - + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0)) - + CR_ST_WORD(m, i); - - CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2); - CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1); - CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0); - CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t; - - t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) - + S0(CR_ST_WORD(d, 0)); - - CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); - CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); - CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); - CR_ST_WORD(d, 0) = t; - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 4; i++) { - uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) - + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0)) - + CR_ST_WORD(m, i); - - CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); - CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); - CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); - 
CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t; - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1)); - CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2)); - CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3)); - CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0)); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1); - CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2); - CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3); - CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -/* - * The SHA-512 logical functions (same as above but using 64-bit operands) - */ - -static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z) -{ - return (x & (y ^ z)) ^ z; -} - -static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z) -{ - return (x & y) | ((x | y) & z); -} - -static uint64_t S0_512(uint64_t x) -{ - return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); -} - -static uint64_t S1_512(uint64_t x) -{ - return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); -} - -static uint64_t s0_512(uint64_t x) -{ - return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); -} - -static uint64_t s1_512(uint64_t x) -{ - return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); -} - -void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - uint64_t d0 = rd[0]; - uint64_t d1 = rd[1]; - - d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]); - d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]); - - rd[0] = d0; - rd[1] = d1; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - uint64_t d0 = rd[0]; - uint64_t d1 = rd[1]; - - d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]); - d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]); - - rd[0] = d0; - rd[1] = d1; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t d0 = rd[0]; - uint64_t d1 = rd[1]; - - d0 += s0_512(rd[1]); - d1 += s0_512(rn[0]); - - rd[0] = d0; - rd[1] = d1; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - - rd[0] += s1_512(rn[0]) + rm[0]; - rd[1] += s1_512(rn[1]) + rm[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t t; - - t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17); - CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9); - - t 
= CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17); - CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9); - - t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17); - CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9); - - t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17); - CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25); - - CR_ST_WORD(d, 0) ^= t; - CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25); - CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25); - CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^ - ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -static inline void QEMU_ALWAYS_INLINE -crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm, - uint32_t desc, uint32_t opcode) -{ - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t imm2 = simd_data(desc); - uint32_t t; - - assert(imm2 < 4); - - if (opcode == 0 || opcode == 2) { - /* SM3TT1A, SM3TT2A */ - t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); - } else if (opcode == 1) { - /* SM3TT1B */ - t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); - } else if (opcode == 3) { - /* SM3TT2B */ - t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); - } else { - qemu_build_not_reached(); - } - - t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2); - - CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1); - - if (opcode < 2) { - /* SM3TT1A, SM3TT1B */ - t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20); - - CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23); - } else { - /* SM3TT2A, SM3TT2B */ - t += CR_ST_WORD(n, 3); - t ^= rol32(t, 9) ^ rol32(t, 17); - - CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13); - } - - CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3); - CR_ST_WORD(d, 3) = t; - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(rd, desc); -} - -#define DO_SM3TT(NAME, OPCODE) \ - void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ - { crypto_sm3tt(vd, vn, vm, desc, OPCODE); } - -DO_SM3TT(crypto_sm3tt1a, 0) -DO_SM3TT(crypto_sm3tt1b, 1) -DO_SM3TT(crypto_sm3tt2a, 2) -DO_SM3TT(crypto_sm3tt2b, 3) - -#undef DO_SM3TT - -static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm) -{ - union CRYPTO_STATE d = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE n = { .l = { rm[0], rm[1] } }; - uint32_t t, i; - - for (i = 0; i < 4; i++) { - t = CR_ST_WORD(d, (i + 1) % 4) ^ - CR_ST_WORD(d, (i + 2) % 4) ^ - CR_ST_WORD(d, (i + 3) % 4) ^ - CR_ST_WORD(n, i); - - t = sm4_sbox[t & 0xff] | - sm4_sbox[(t >> 8) & 0xff] << 8 | - sm4_sbox[(t >> 16) & 0xff] << 16 | - sm4_sbox[(t >> 24) & 0xff] << 24; - - CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ - rol32(t, 24); - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; -} - -void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_sm4e(vd + i, vn + i, vm + i); - } - 
clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm) -{ - union CRYPTO_STATE d; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t t, i; - - d = n; - for (i = 0; i < 4; i++) { - t = CR_ST_WORD(d, (i + 1) % 4) ^ - CR_ST_WORD(d, (i + 2) % 4) ^ - CR_ST_WORD(d, (i + 3) % 4) ^ - CR_ST_WORD(m, i); - - t = sm4_sbox[t & 0xff] | - sm4_sbox[(t >> 8) & 0xff] << 8 | - sm4_sbox[(t >> 16) & 0xff] << 16 | - sm4_sbox[(t >> 24) & 0xff] << 24; - - CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23); - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; -} - -void HELPER(crypto_sm4ekey)(void *vd, void *vn, void* vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_sm4ekey(vd + i, vn + i, vm + i); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = n[i] ^ rol64(m[i], 1); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c deleted file mode 100644 index 0972a4b..0000000 --- a/target/arm/helper-a64.c +++ /dev/null @@ -1,954 +0,0 @@ -/* - * AArch64 specific helpers - * - * Copyright (c) 2013 Alexander Graf - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "qemu/units.h" -#include "cpu.h" -#include "exec/gdbstub.h" -#include "exec/helper-proto.h" -#include "qemu/host-utils.h" -#include "qemu/log.h" -#include "qemu/main-loop.h" -#include "qemu/bitops.h" -#include "internals.h" -#include "qemu/crc32c.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" -#include "qemu/int128.h" -#include "qemu/atomic128.h" -#include "fpu/softfloat.h" -#include /* For crc32 */ - -/* C2.4.7 Multiply and divide */ -/* special cases for 0 and LLONG_MIN are mandated by the standard */ -uint64_t HELPER(udiv64)(uint64_t num, uint64_t den) -{ - if (den == 0) { - return 0; - } - return num / den; -} - -int64_t HELPER(sdiv64)(int64_t num, int64_t den) -{ - if (den == 0) { - return 0; - } - if (num == LLONG_MIN && den == -1) { - return LLONG_MIN; - } - return num / den; -} - -uint64_t HELPER(rbit64)(uint64_t x) -{ - return revbit64(x); -} - -void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm) -{ - update_spsel(env, imm); -} - -static void daif_check(CPUARMState *env, uint32_t op, - uint32_t imm, uintptr_t ra) -{ - /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set. 
*/ - if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) { - raise_exception_ra(env, EXCP_UDEF, - syn_aa64_sysregtrap(0, extract32(op, 0, 3), - extract32(op, 3, 3), 4, - imm, 0x1f, 0), - exception_target_el(env), ra); - } -} - -void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm) -{ - daif_check(env, 0x1e, imm, GETPC()); - env->daif |= (imm << 6) & PSTATE_DAIF; - arm_rebuild_hflags(env); -} - -void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm) -{ - daif_check(env, 0x1f, imm, GETPC()); - env->daif &= ~((imm << 6) & PSTATE_DAIF); - arm_rebuild_hflags(env); -} - -/* Convert a softfloat float_relation_ (as returned by - * the float*_compare functions) to the correct ARM - * NZCV flag state. - */ -static inline uint32_t float_rel_to_flags(int res) -{ - uint64_t flags; - switch (res) { - case float_relation_equal: - flags = PSTATE_Z | PSTATE_C; - break; - case float_relation_less: - flags = PSTATE_N; - break; - case float_relation_greater: - flags = PSTATE_C; - break; - case float_relation_unordered: - default: - flags = PSTATE_C | PSTATE_V; - break; - } - return flags; -} - -uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status) -{ - return float_rel_to_flags(float16_compare_quiet(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status) -{ - return float_rel_to_flags(float16_compare(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status) -{ - return float_rel_to_flags(float32_compare_quiet(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status) -{ - return float_rel_to_flags(float32_compare(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status) -{ - return float_rel_to_flags(float64_compare_quiet(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status) -{ - return float_rel_to_flags(float64_compare(x, y, fp_status)); -} - -float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float32_squash_input_denormal(a, fpst); - b = float32_squash_input_denormal(b, fpst); - - if ((float32_is_zero(a) && float32_is_infinity(b)) || - (float32_is_infinity(a) && float32_is_zero(b))) { - /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ - return make_float32((1U << 30) | - ((float32_val(a) ^ float32_val(b)) & (1U << 31))); - } - return float32_mul(a, b, fpst); -} - -float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float64_squash_input_denormal(a, fpst); - b = float64_squash_input_denormal(b, fpst); - - if ((float64_is_zero(a) && float64_is_infinity(b)) || - (float64_is_infinity(a) && float64_is_zero(b))) { - /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ - return make_float64((1ULL << 62) | - ((float64_val(a) ^ float64_val(b)) & (1ULL << 63))); - } - return float64_mul(a, b, fpst); -} - -/* 64bit/double versions of the neon float compare functions */ -uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float64_eq_quiet(a, b, fpst); -} - -uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float64_le(b, a, fpst); -} - -uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float64_lt(b, a, fpst); -} - -/* Reciprocal step and sqrt step. 
Note that unlike the A32/T32 - * versions, these do a fully fused multiply-add or - * multiply-add-and-halve. - */ - -uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float16_squash_input_denormal(a, fpst); - b = float16_squash_input_denormal(b, fpst); - - a = float16_chs(a); - if ((float16_is_infinity(a) && float16_is_zero(b)) || - (float16_is_infinity(b) && float16_is_zero(a))) { - return float16_two; - } - return float16_muladd(a, b, float16_two, 0, fpst); -} - -float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float32_squash_input_denormal(a, fpst); - b = float32_squash_input_denormal(b, fpst); - - a = float32_chs(a); - if ((float32_is_infinity(a) && float32_is_zero(b)) || - (float32_is_infinity(b) && float32_is_zero(a))) { - return float32_two; - } - return float32_muladd(a, b, float32_two, 0, fpst); -} - -float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float64_squash_input_denormal(a, fpst); - b = float64_squash_input_denormal(b, fpst); - - a = float64_chs(a); - if ((float64_is_infinity(a) && float64_is_zero(b)) || - (float64_is_infinity(b) && float64_is_zero(a))) { - return float64_two; - } - return float64_muladd(a, b, float64_two, 0, fpst); -} - -uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float16_squash_input_denormal(a, fpst); - b = float16_squash_input_denormal(b, fpst); - - a = float16_chs(a); - if ((float16_is_infinity(a) && float16_is_zero(b)) || - (float16_is_infinity(b) && float16_is_zero(a))) { - return float16_one_point_five; - } - return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst); -} - -float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float32_squash_input_denormal(a, fpst); - b = float32_squash_input_denormal(b, fpst); - - a = float32_chs(a); - if ((float32_is_infinity(a) && float32_is_zero(b)) || - (float32_is_infinity(b) && float32_is_zero(a))) { - return float32_one_point_five; - } - return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst); -} - -float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float64_squash_input_denormal(a, fpst); - b = float64_squash_input_denormal(b, fpst); - - a = float64_chs(a); - if ((float64_is_infinity(a) && float64_is_zero(b)) || - (float64_is_infinity(b) && float64_is_zero(a))) { - return float64_one_point_five; - } - return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst); -} - -/* Pairwise long add: add pairs of adjacent elements into - * double-width elements in the result (eg _s8 is an 8x8->16 op) - */ -uint64_t HELPER(neon_addlp_s8)(uint64_t a) -{ - uint64_t nsignmask = 0x0080008000800080ULL; - uint64_t wsignmask = 0x8000800080008000ULL; - uint64_t elementmask = 0x00ff00ff00ff00ffULL; - uint64_t tmp1, tmp2; - uint64_t res, signres; - - /* Extract odd elements, sign extend each to a 16 bit field */ - tmp1 = a & elementmask; - tmp1 ^= nsignmask; - tmp1 |= wsignmask; - tmp1 = (tmp1 - nsignmask) ^ wsignmask; - /* Ditto for the even elements */ - tmp2 = (a >> 8) & elementmask; - tmp2 ^= nsignmask; - tmp2 |= wsignmask; - tmp2 = (tmp2 - nsignmask) ^ wsignmask; - - /* calculate the result by summing bits 0..14, 16..22, etc, - * and then adjusting the sign bits 15, 23, etc manually. - * This ensures the addition can't overflow the 16 bit field. 
- */ - signres = (tmp1 ^ tmp2) & wsignmask; - res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask); - res ^= signres; - - return res; -} - -uint64_t HELPER(neon_addlp_u8)(uint64_t a) -{ - uint64_t tmp; - - tmp = a & 0x00ff00ff00ff00ffULL; - tmp += (a >> 8) & 0x00ff00ff00ff00ffULL; - return tmp; -} - -uint64_t HELPER(neon_addlp_s16)(uint64_t a) -{ - int32_t reslo, reshi; - - reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16); - reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48); - - return (uint32_t)reslo | (((uint64_t)reshi) << 32); -} - -uint64_t HELPER(neon_addlp_u16)(uint64_t a) -{ - uint64_t tmp; - - tmp = a & 0x0000ffff0000ffffULL; - tmp += (a >> 16) & 0x0000ffff0000ffffULL; - return tmp; -} - -/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */ -uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp) -{ - float_status *fpst = fpstp; - uint16_t val16, sbit; - int16_t exp; - - if (float16_is_any_nan(a)) { - float16 nan = a; - if (float16_is_signaling_nan(a, fpst)) { - float_raise(float_flag_invalid, fpst); - if (!fpst->default_nan_mode) { - nan = float16_silence_nan(a, fpst); - } - } - if (fpst->default_nan_mode) { - nan = float16_default_nan(fpst); - } - return nan; - } - - a = float16_squash_input_denormal(a, fpst); - - val16 = float16_val(a); - sbit = 0x8000 & val16; - exp = extract32(val16, 10, 5); - - if (exp == 0) { - return make_float16(deposit32(sbit, 10, 5, 0x1e)); - } else { - return make_float16(deposit32(sbit, 10, 5, ~exp)); - } -} - -float32 HELPER(frecpx_f32)(float32 a, void *fpstp) -{ - float_status *fpst = fpstp; - uint32_t val32, sbit; - int32_t exp; - - if (float32_is_any_nan(a)) { - float32 nan = a; - if (float32_is_signaling_nan(a, fpst)) { - float_raise(float_flag_invalid, fpst); - if (!fpst->default_nan_mode) { - nan = float32_silence_nan(a, fpst); - } - } - if (fpst->default_nan_mode) { - nan = float32_default_nan(fpst); - } - return nan; - } - - a = float32_squash_input_denormal(a, fpst); - - val32 = float32_val(a); - sbit = 0x80000000ULL & val32; - exp = extract32(val32, 23, 8); - - if (exp == 0) { - return make_float32(sbit | (0xfe << 23)); - } else { - return make_float32(sbit | (~exp & 0xff) << 23); - } -} - -float64 HELPER(frecpx_f64)(float64 a, void *fpstp) -{ - float_status *fpst = fpstp; - uint64_t val64, sbit; - int64_t exp; - - if (float64_is_any_nan(a)) { - float64 nan = a; - if (float64_is_signaling_nan(a, fpst)) { - float_raise(float_flag_invalid, fpst); - if (!fpst->default_nan_mode) { - nan = float64_silence_nan(a, fpst); - } - } - if (fpst->default_nan_mode) { - nan = float64_default_nan(fpst); - } - return nan; - } - - a = float64_squash_input_denormal(a, fpst); - - val64 = float64_val(a); - sbit = 0x8000000000000000ULL & val64; - exp = extract64(float64_val(a), 52, 11); - - if (exp == 0) { - return make_float64(sbit | (0x7feULL << 52)); - } else { - return make_float64(sbit | (~exp & 0x7ffULL) << 52); - } -} - -float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env) -{ - /* Von Neumann rounding is implemented by using round-to-zero - * and then setting the LSB of the result if Inexact was raised. 
- */ - float32 r; - float_status *fpst = &env->vfp.fp_status; - float_status tstat = *fpst; - int exflags; - - set_float_rounding_mode(float_round_to_zero, &tstat); - set_float_exception_flags(0, &tstat); - r = float64_to_float32(a, &tstat); - exflags = get_float_exception_flags(&tstat); - if (exflags & float_flag_inexact) { - r = make_float32(float32_val(r) | 1); - } - exflags |= get_float_exception_flags(fpst); - set_float_exception_flags(exflags, fpst); - return r; -} - -/* 64-bit versions of the CRC helpers. Note that although the operation - * (and the prototypes of crc32c() and crc32() mean that only the bottom - * 32 bits of the accumulator and result are used, we pass and return - * uint64_t for convenience of the generated code. Unlike the 32-bit - * instruction set versions, val may genuinely have 64 bits of data in it. - * The upper bytes of val (above the number specified by 'bytes') must have - * been zeroed out by the caller. - */ -uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes) -{ - uint8_t buf[8]; - - stq_le_p(buf, val); - - /* zlib crc32 converts the accumulator and output to one's complement. */ - return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff; -} - -uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes) -{ - uint8_t buf[8]; - - stq_le_p(buf, val); - - /* Linux crc32c converts the output to one's complement. */ - return crc32c(acc, buf, bytes) ^ 0xffffffff; -} - -/* - * AdvSIMD half-precision - */ - -#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix)) - -#define ADVSIMD_HALFOP(name) \ -uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \ -{ \ - float_status *fpst = fpstp; \ - return float16_ ## name(a, b, fpst); \ -} - -ADVSIMD_HALFOP(add) -ADVSIMD_HALFOP(sub) -ADVSIMD_HALFOP(mul) -ADVSIMD_HALFOP(div) -ADVSIMD_HALFOP(min) -ADVSIMD_HALFOP(max) -ADVSIMD_HALFOP(minnum) -ADVSIMD_HALFOP(maxnum) - -#define ADVSIMD_TWOHALFOP(name) \ -uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \ -{ \ - float16 a1, a2, b1, b2; \ - uint32_t r1, r2; \ - float_status *fpst = fpstp; \ - a1 = extract32(two_a, 0, 16); \ - a2 = extract32(two_a, 16, 16); \ - b1 = extract32(two_b, 0, 16); \ - b2 = extract32(two_b, 16, 16); \ - r1 = float16_ ## name(a1, b1, fpst); \ - r2 = float16_ ## name(a2, b2, fpst); \ - return deposit32(r1, 16, 16, r2); \ -} - -ADVSIMD_TWOHALFOP(add) -ADVSIMD_TWOHALFOP(sub) -ADVSIMD_TWOHALFOP(mul) -ADVSIMD_TWOHALFOP(div) -ADVSIMD_TWOHALFOP(min) -ADVSIMD_TWOHALFOP(max) -ADVSIMD_TWOHALFOP(minnum) -ADVSIMD_TWOHALFOP(maxnum) - -/* Data processing - scalar floating-point and advanced SIMD */ -static float16 float16_mulx(float16 a, float16 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float16_squash_input_denormal(a, fpst); - b = float16_squash_input_denormal(b, fpst); - - if ((float16_is_zero(a) && float16_is_infinity(b)) || - (float16_is_infinity(a) && float16_is_zero(b))) { - /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ - return make_float16((1U << 14) | - ((float16_val(a) ^ float16_val(b)) & (1U << 15))); - } - return float16_mul(a, b, fpst); -} - -ADVSIMD_HALFOP(mulx) -ADVSIMD_TWOHALFOP(mulx) - -/* fused multiply-accumulate */ -uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c, - void *fpstp) -{ - float_status *fpst = fpstp; - return float16_muladd(a, b, c, 0, fpst); -} - -uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b, - uint32_t two_c, void *fpstp) -{ - float_status *fpst = fpstp; - float16 a1, a2, 
b1, b2, c1, c2; - uint32_t r1, r2; - a1 = extract32(two_a, 0, 16); - a2 = extract32(two_a, 16, 16); - b1 = extract32(two_b, 0, 16); - b2 = extract32(two_b, 16, 16); - c1 = extract32(two_c, 0, 16); - c2 = extract32(two_c, 16, 16); - r1 = float16_muladd(a1, b1, c1, 0, fpst); - r2 = float16_muladd(a2, b2, c2, 0, fpst); - return deposit32(r1, 16, 16, r2); -} - -/* - * Floating point comparisons produce an integer result. Softfloat - * routines return float_relation types which we convert to the 0/-1 - * Neon requires. - */ - -#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0 - -uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - int compare = float16_compare_quiet(a, b, fpst); - return ADVSIMD_CMPRES(compare == float_relation_equal); -} - -uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - int compare = float16_compare(a, b, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater || - compare == float_relation_equal); -} - -uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - int compare = float16_compare(a, b, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater); -} - -uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float16 f0 = float16_abs(a); - float16 f1 = float16_abs(b); - int compare = float16_compare(f0, f1, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater || - compare == float_relation_equal); -} - -uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float16 f0 = float16_abs(a); - float16 f1 = float16_abs(b); - int compare = float16_compare(f0, f1, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater); -} - -/* round to integral */ -uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status) -{ - return float16_round_to_int(x, fp_status); -} - -uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status) -{ - int old_flags = get_float_exception_flags(fp_status), new_flags; - float16 ret; - - ret = float16_round_to_int(x, fp_status); - - /* Suppress any inexact exceptions the conversion produced */ - if (!(old_flags & float_flag_inexact)) { - new_flags = get_float_exception_flags(fp_status); - set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status); - } - - return ret; -} - -/* - * Half-precision floating point conversion functions - * - * There are a multitude of conversion functions with various - * different rounding modes. This is dealt with by the calling code - * setting the mode appropriately before calling the helper. 
- */ - -uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp) -{ - float_status *fpst = fpstp; - - /* Invalid if we are passed a NaN */ - if (float16_is_any_nan(a)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_int16(a, fpst); -} - -uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp) -{ - float_status *fpst = fpstp; - - /* Invalid if we are passed a NaN */ - if (float16_is_any_nan(a)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_uint16(a, fpst); -} - -static int el_from_spsr(uint32_t spsr) -{ - /* Return the exception level that this SPSR is requesting a return to, - * or -1 if it is invalid (an illegal return) - */ - if (spsr & PSTATE_nRW) { - switch (spsr & CPSR_M) { - case ARM_CPU_MODE_USR: - return 0; - case ARM_CPU_MODE_HYP: - return 2; - case ARM_CPU_MODE_FIQ: - case ARM_CPU_MODE_IRQ: - case ARM_CPU_MODE_SVC: - case ARM_CPU_MODE_ABT: - case ARM_CPU_MODE_UND: - case ARM_CPU_MODE_SYS: - return 1; - case ARM_CPU_MODE_MON: - /* Returning to Mon from AArch64 is never possible, - * so this is an illegal return. - */ - default: - return -1; - } - } else { - if (extract32(spsr, 1, 1)) { - /* Return with reserved M[1] bit set */ - return -1; - } - if (extract32(spsr, 0, 4) == 1) { - /* return to EL0 with M[0] bit set */ - return -1; - } - return extract32(spsr, 2, 2); - } -} - -static void cpsr_write_from_spsr_elx(CPUARMState *env, - uint32_t val) -{ - uint32_t mask; - - /* Save SPSR_ELx.SS into PSTATE. */ - env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); - val &= ~PSTATE_SS; - - /* Move DIT to the correct location for CPSR */ - if (val & PSTATE_DIT) { - val &= ~PSTATE_DIT; - val |= CPSR_DIT; - } - - mask = aarch32_cpsr_valid_mask(env->features, \ - &env_archcpu(env)->isar); - cpsr_write(env, val, mask, CPSRWriteRaw); -} - -void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) -{ - int cur_el = arm_current_el(env); - unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el); - uint32_t spsr = env->banked_spsr[spsr_idx]; - int new_el; - bool return_to_aa64 = (spsr & PSTATE_nRW) == 0; - - aarch64_save_sp(env, cur_el); - - arm_clear_exclusive(env); - - /* We must squash the PSTATE.SS bit to zero unless both of the - * following hold: - * 1. debug exceptions are currently disabled - * 2. singlestep will be active in the EL we return to - * We check 1 here and 2 after we've done the pstate/cpsr write() to - * transition to the EL we're going to. - */ - if (arm_generate_debug_exceptions(env)) { - spsr &= ~PSTATE_SS; - } - - new_el = el_from_spsr(spsr); - if (new_el == -1) { - goto illegal_return; - } - if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) { - /* Disallow return to an EL which is unimplemented or higher - * than the current one. - */ - goto illegal_return; - } - - if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) { - /* Return to an EL which is configured for a different register width */ - goto illegal_return; - } - - if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { - goto illegal_return; - } - - qemu_mutex_lock_iothread(); - arm_call_pre_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); - - if (!return_to_aa64) { - env->aarch64 = false; - /* We do a raw CPSR write because aarch64_sync_64_to_32() - * will sort the register banks out for us, and we've already - * caught all the bad-mode cases in el_from_spsr(). 
- */ - cpsr_write_from_spsr_elx(env, spsr); - if (!arm_singlestep_active(env)) { - env->pstate &= ~PSTATE_SS; - } - aarch64_sync_64_to_32(env); - - if (spsr & CPSR_T) { - env->regs[15] = new_pc & ~0x1; - } else { - env->regs[15] = new_pc & ~0x3; - } - helper_rebuild_hflags_a32(env, new_el); - qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " - "AArch32 EL%d PC 0x%" PRIx32 "\n", - cur_el, new_el, env->regs[15]); - } else { - int tbii; - - env->aarch64 = true; - spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar); - pstate_write(env, spsr); - if (!arm_singlestep_active(env)) { - env->pstate &= ~PSTATE_SS; - } - aarch64_restore_sp(env, new_el); - helper_rebuild_hflags_a64(env, new_el); - - /* - * Apply TBI to the exception return address. We had to delay this - * until after we selected the new EL, so that we could select the - * correct TBI+TBID bits. This is made easier by waiting until after - * the hflags rebuild, since we can pull the composite TBII field - * from there. - */ - tbii = EX_TBFLAG_A64(env->hflags, TBII); - if ((tbii >> extract64(new_pc, 55, 1)) & 1) { - /* TBI is enabled. */ - int core_mmu_idx = cpu_mmu_index(env, false); - if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) { - new_pc = sextract64(new_pc, 0, 56); - } else { - new_pc = extract64(new_pc, 0, 56); - } - } - env->pc = new_pc; - - qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " - "AArch64 EL%d PC 0x%" PRIx64 "\n", - cur_el, new_el, env->pc); - } - - /* - * Note that cur_el can never be 0. If new_el is 0, then - * el0_a64 is return_to_aa64, else el0_a64 is ignored. - */ - aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64); - - qemu_mutex_lock_iothread(); - arm_call_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); - - return; - -illegal_return: - /* Illegal return events of various kinds have architecturally - * mandated behaviour: - * restore NZCV and DAIF from SPSR_ELx - * set PSTATE.IL - * restore PC from ELR_ELx - * no change to exception level, execution state or stack pointer - */ - env->pstate |= PSTATE_IL; - env->pc = new_pc; - spsr &= PSTATE_NZCV | PSTATE_DAIF; - spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF); - pstate_write(env, spsr); - if (!arm_singlestep_active(env)) { - env->pstate &= ~PSTATE_SS; - } - helper_rebuild_hflags_a64(env, cur_el); - qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: " - "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc); -} - -/* - * Square Root and Reciprocal square root - */ - -uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp) -{ - float_status *s = fpstp; - - return float16_sqrt(a, s); -} - -void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) -{ - /* - * Implement DC ZVA, which zeroes a fixed-length block of memory. - * Note that we do not implement the (architecturally mandated) - * alignment fault for attempts to use this on Device memory - * (which matches the usual QEMU behaviour of not implementing either - * alignment faults or any memory attribute handling). - */ - int blocklen = 4 << env_archcpu(env)->dcz_blocksize; - uint64_t vaddr = vaddr_in & ~(blocklen - 1); - int mmu_idx = cpu_mmu_index(env, false); - void *mem; - - /* - * Trapless lookup. In addition to actual invalid page, may - * return NULL for I/O, watchpoints, clean pages, etc. - */ - mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx); - -#ifndef CONFIG_USER_ONLY - if (unlikely(!mem)) { - uintptr_t ra = GETPC(); - - /* - * Trap if accessing an invalid page. 
DC_ZVA requires that we supply - * the original pointer for an invalid page. But watchpoints require - * that we probe the actual space. So do both. - */ - (void) probe_write(env, vaddr_in, 1, mmu_idx, ra); - mem = probe_write(env, vaddr, blocklen, mmu_idx, ra); - - if (unlikely(!mem)) { - /* - * The only remaining reason for mem == NULL is I/O. - * Just do a series of byte writes as the architecture demands. - */ - for (int i = 0; i < blocklen; i++) { - cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra); - } - return; - } - } -#endif - - memset(mem, 0, blocklen); -} diff --git a/target/arm/iwmmxt_helper.c b/target/arm/iwmmxt_helper.c deleted file mode 100644 index 610b1b2..0000000 --- a/target/arm/iwmmxt_helper.c +++ /dev/null @@ -1,670 +0,0 @@ -/* - * iwMMXt micro operations for XScale. - * - * Copyright (c) 2007 OpenedHand, Ltd. - * Written by Andrzej Zaborowski - * Copyright (c) 2008 CodeSourcery - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/helper-proto.h" - -/* iwMMXt macros extracted from GNU gdb. */ - -/* Set the SIMD wCASF flags for 8, 16, 32 or 64-bit operations. */ -#define SIMD8_SET(v, n, b) ((v != 0) << ((((b) + 1) * 4) + (n))) -#define SIMD16_SET(v, n, h) ((v != 0) << ((((h) + 1) * 8) + (n))) -#define SIMD32_SET(v, n, w) ((v != 0) << ((((w) + 1) * 16) + (n))) -#define SIMD64_SET(v, n) ((v != 0) << (32 + (n))) -/* Flags to pass as "n" above. */ -#define SIMD_NBIT -1 -#define SIMD_ZBIT -2 -#define SIMD_CBIT -3 -#define SIMD_VBIT -4 -/* Various status bit macros. */ -#define NBIT8(x) ((x) & 0x80) -#define NBIT16(x) ((x) & 0x8000) -#define NBIT32(x) ((x) & 0x80000000) -#define NBIT64(x) ((x) & 0x8000000000000000ULL) -#define ZBIT8(x) (((x) & 0xff) == 0) -#define ZBIT16(x) (((x) & 0xffff) == 0) -#define ZBIT32(x) (((x) & 0xffffffff) == 0) -#define ZBIT64(x) (x == 0) -/* Sign extension macros. */ -#define EXTEND8H(a) ((uint16_t) (int8_t) (a)) -#define EXTEND8(a) ((uint32_t) (int8_t) (a)) -#define EXTEND16(a) ((uint32_t) (int16_t) (a)) -#define EXTEND16S(a) ((int32_t) (int16_t) (a)) -#define EXTEND32(a) ((uint64_t) (int32_t) (a)) - -uint64_t HELPER(iwmmxt_maddsq)(uint64_t a, uint64_t b) -{ - a = (( - EXTEND16S((a >> 0) & 0xffff) * EXTEND16S((b >> 0) & 0xffff) + - EXTEND16S((a >> 16) & 0xffff) * EXTEND16S((b >> 16) & 0xffff) - ) & 0xffffffff) | ((uint64_t) ( - EXTEND16S((a >> 32) & 0xffff) * EXTEND16S((b >> 32) & 0xffff) + - EXTEND16S((a >> 48) & 0xffff) * EXTEND16S((b >> 48) & 0xffff) - ) << 32); - return a; -} - -uint64_t HELPER(iwmmxt_madduq)(uint64_t a, uint64_t b) -{ - a = (( - ((a >> 0) & 0xffff) * ((b >> 0) & 0xffff) + - ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff) - ) & 0xffffffff) | (( - ((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) + - ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff) - ) << 32); - return a; -} - -uint64_t HELPER(iwmmxt_sadb)(uint64_t a, uint64_t b) -{ -#define abs(x) (((x) >= 0) ? 
x : -x) -#define SADB(SHR) abs((int) ((a >> SHR) & 0xff) - (int) ((b >> SHR) & 0xff)) - return - SADB(0) + SADB(8) + SADB(16) + SADB(24) + - SADB(32) + SADB(40) + SADB(48) + SADB(56); -#undef SADB -} - -uint64_t HELPER(iwmmxt_sadw)(uint64_t a, uint64_t b) -{ -#define SADW(SHR) \ - abs((int) ((a >> SHR) & 0xffff) - (int) ((b >> SHR) & 0xffff)) - return SADW(0) + SADW(16) + SADW(32) + SADW(48); -#undef SADW -} - -uint64_t HELPER(iwmmxt_mulslw)(uint64_t a, uint64_t b) -{ -#define MULS(SHR) ((uint64_t) ((( \ - EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ - ) >> 0) & 0xffff) << SHR) - return MULS(0) | MULS(16) | MULS(32) | MULS(48); -#undef MULS -} - -uint64_t HELPER(iwmmxt_mulshw)(uint64_t a, uint64_t b) -{ -#define MULS(SHR) ((uint64_t) ((( \ - EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ - ) >> 16) & 0xffff) << SHR) - return MULS(0) | MULS(16) | MULS(32) | MULS(48); -#undef MULS -} - -uint64_t HELPER(iwmmxt_mululw)(uint64_t a, uint64_t b) -{ -#define MULU(SHR) ((uint64_t) ((( \ - ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ - ) >> 0) & 0xffff) << SHR) - return MULU(0) | MULU(16) | MULU(32) | MULU(48); -#undef MULU -} - -uint64_t HELPER(iwmmxt_muluhw)(uint64_t a, uint64_t b) -{ -#define MULU(SHR) ((uint64_t) ((( \ - ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ - ) >> 16) & 0xffff) << SHR) - return MULU(0) | MULU(16) | MULU(32) | MULU(48); -#undef MULU -} - -uint64_t HELPER(iwmmxt_macsw)(uint64_t a, uint64_t b) -{ -#define MACS(SHR) ( \ - EXTEND16((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff)) - return (int64_t) (MACS(0) + MACS(16) + MACS(32) + MACS(48)); -#undef MACS -} - -uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b) -{ -#define MACU(SHR) ( \ - (uint32_t) ((a >> SHR) & 0xffff) * \ - (uint32_t) ((b >> SHR) & 0xffff)) - return MACU(0) + MACU(16) + MACU(32) + MACU(48); -#undef MACU -} - -#define NZBIT8(x, i) \ - SIMD8_SET(NBIT8((x) & 0xff), SIMD_NBIT, i) | \ - SIMD8_SET(ZBIT8((x) & 0xff), SIMD_ZBIT, i) -#define NZBIT16(x, i) \ - SIMD16_SET(NBIT16((x) & 0xffff), SIMD_NBIT, i) | \ - SIMD16_SET(ZBIT16((x) & 0xffff), SIMD_ZBIT, i) -#define NZBIT32(x, i) \ - SIMD32_SET(NBIT32((x) & 0xffffffff), SIMD_NBIT, i) | \ - SIMD32_SET(ZBIT32((x) & 0xffffffff), SIMD_ZBIT, i) -#define NZBIT64(x) \ - SIMD64_SET(NBIT64(x), SIMD_NBIT) | \ - SIMD64_SET(ZBIT64(x), SIMD_ZBIT) -#define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3) \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) | \ - (((a >> SH1) & 0xff) << 16) | (((b >> SH1) & 0xff) << 24) | \ - (((a >> SH2) & 0xff) << 32) | (((b >> SH2) & 0xff) << 40) | \ - (((a >> SH3) & 0xff) << 48) | (((b >> SH3) & 0xff) << 56); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - (((a >> SH0) & 0xffff) << 0) | \ - (((b >> SH0) & 0xffff) << 16) | \ - (((a >> SH2) & 0xffff) << 32) | \ - (((b >> SH2) & 0xffff) << 48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT8(a >> 0, 0) | NZBIT8(a >> 16, 1) | \ - NZBIT8(a >> 32, 2) | NZBIT8(a >> 48, 3); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - (((a >> SH0) & 0xffffffff) 
<< 0) | \ - (((b >> SH0) & 0xffffffff) << 32); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - (((x >> SH0) & 0xff) << 0) | \ - (((x >> SH1) & 0xff) << 16) | \ - (((x >> SH2) & 0xff) << 32) | \ - (((x >> SH3) & 0xff) << 48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - (((x >> SH0) & 0xffff) << 0) | \ - (((x >> SH2) & 0xffff) << 32); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = (((x >> SH0) & 0xffffffff) << 0); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) | \ - ((uint64_t) EXTEND8H((x >> SH1) & 0xff) << 16) | \ - ((uint64_t) EXTEND8H((x >> SH2) & 0xff) << 32) | \ - ((uint64_t) EXTEND8H((x >> SH3) & 0xff) << 48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) | \ - ((uint64_t) EXTEND16((x >> SH2) & 0xffff) << 32); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = EXTEND32((x >> SH0) & 0xffffffff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ - return x; \ -} -IWMMXT_OP_UNPACK(l, 0, 8, 16, 24) -IWMMXT_OP_UNPACK(h, 32, 40, 48, 56) - -#define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O) \ -uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) | \ - CMP(16, Tb, O, 0xff) | CMP(24, Tb, O, 0xff) | \ - CMP(32, Tb, O, 0xff) | CMP(40, Tb, O, 0xff) | \ - CMP(48, Tb, O, 0xff) | CMP(56, Tb, O, 0xff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) | \ - CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \ - NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = CMP(0, Tl, O, 0xffffffff) | \ - CMP(32, Tl, O, 0xffffffff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ - return a; \ -} -#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ - (TYPE) ((b >> SHR) & MASK)) ? 
(uint64_t) MASK : 0) << SHR) -IWMMXT_OP_CMP(cmpeq, uint8_t, uint16_t, uint32_t, ==) -IWMMXT_OP_CMP(cmpgts, int8_t, int16_t, int32_t, >) -IWMMXT_OP_CMP(cmpgtu, uint8_t, uint16_t, uint32_t, >) -#undef CMP -#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ - (TYPE) ((b >> SHR) & MASK)) ? a : b) & ((uint64_t) MASK << SHR)) -IWMMXT_OP_CMP(mins, int8_t, int16_t, int32_t, <) -IWMMXT_OP_CMP(minu, uint8_t, uint16_t, uint32_t, <) -IWMMXT_OP_CMP(maxs, int8_t, int16_t, int32_t, >) -IWMMXT_OP_CMP(maxu, uint8_t, uint16_t, uint32_t, >) -#undef CMP -#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ - OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) -IWMMXT_OP_CMP(subn, uint8_t, uint16_t, uint32_t, -) -IWMMXT_OP_CMP(addn, uint8_t, uint16_t, uint32_t, +) -#undef CMP -/* TODO Signed- and Unsigned-Saturation */ -#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ - OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) -IWMMXT_OP_CMP(subu, uint8_t, uint16_t, uint32_t, -) -IWMMXT_OP_CMP(addu, uint8_t, uint16_t, uint32_t, +) -IWMMXT_OP_CMP(subs, int8_t, int16_t, int32_t, -) -IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +) -#undef CMP -#undef IWMMXT_OP_CMP - -#define AVGB(SHR) ((( \ - ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR) -#define IWMMXT_OP_AVGB(r) \ -uint64_t HELPER(iwmmxt_avgb##r)(CPUARMState *env, uint64_t a, uint64_t b) \ -{ \ - const int round = r; \ - a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) | \ - AVGB(32) | AVGB(40) | AVGB(48) | AVGB(56); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - SIMD8_SET(ZBIT8((a >> 0) & 0xff), SIMD_ZBIT, 0) | \ - SIMD8_SET(ZBIT8((a >> 8) & 0xff), SIMD_ZBIT, 1) | \ - SIMD8_SET(ZBIT8((a >> 16) & 0xff), SIMD_ZBIT, 2) | \ - SIMD8_SET(ZBIT8((a >> 24) & 0xff), SIMD_ZBIT, 3) | \ - SIMD8_SET(ZBIT8((a >> 32) & 0xff), SIMD_ZBIT, 4) | \ - SIMD8_SET(ZBIT8((a >> 40) & 0xff), SIMD_ZBIT, 5) | \ - SIMD8_SET(ZBIT8((a >> 48) & 0xff), SIMD_ZBIT, 6) | \ - SIMD8_SET(ZBIT8((a >> 56) & 0xff), SIMD_ZBIT, 7); \ - return a; \ -} -IWMMXT_OP_AVGB(0) -IWMMXT_OP_AVGB(1) -#undef IWMMXT_OP_AVGB -#undef AVGB - -#define AVGW(SHR) ((( \ - ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR) -#define IWMMXT_OP_AVGW(r) \ -uint64_t HELPER(iwmmxt_avgw##r)(CPUARMState *env, uint64_t a, uint64_t b) \ -{ \ - const int round = r; \ - a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - SIMD16_SET(ZBIT16((a >> 0) & 0xffff), SIMD_ZBIT, 0) | \ - SIMD16_SET(ZBIT16((a >> 16) & 0xffff), SIMD_ZBIT, 1) | \ - SIMD16_SET(ZBIT16((a >> 32) & 0xffff), SIMD_ZBIT, 2) | \ - SIMD16_SET(ZBIT16((a >> 48) & 0xffff), SIMD_ZBIT, 3); \ - return a; \ -} -IWMMXT_OP_AVGW(0) -IWMMXT_OP_AVGW(1) -#undef IWMMXT_OP_AVGW -#undef AVGW - -uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n) -{ - a >>= n << 3; - a |= b << (64 - (n << 3)); - return a; -} - -uint64_t HELPER(iwmmxt_insr)(uint64_t x, uint32_t a, uint32_t b, uint32_t n) -{ - x &= ~((uint64_t) b << n); - x |= (uint64_t) (a & b) << n; - return x; -} - -uint32_t HELPER(iwmmxt_setpsr_nz)(uint64_t x) -{ - return SIMD64_SET((x == 0), SIMD_ZBIT) | - SIMD64_SET((x & (1ULL << 63)), SIMD_NBIT); -} - -uint64_t HELPER(iwmmxt_bcstb)(uint32_t arg) -{ - arg &= 0xff; - return - ((uint64_t) arg << 0 ) | ((uint64_t) arg << 8 ) | - ((uint64_t) arg << 16) | ((uint64_t) arg << 24) | - ((uint64_t) arg << 32) | ((uint64_t) arg << 40) | - ((uint64_t) arg << 48) | ((uint64_t) arg << 56); -} - -uint64_t HELPER(iwmmxt_bcstw)(uint32_t arg) -{ - arg &= 
0xffff; - return - ((uint64_t) arg << 0 ) | ((uint64_t) arg << 16) | - ((uint64_t) arg << 32) | ((uint64_t) arg << 48); -} - -uint64_t HELPER(iwmmxt_bcstl)(uint32_t arg) -{ - return arg | ((uint64_t) arg << 32); -} - -uint64_t HELPER(iwmmxt_addcb)(uint64_t x) -{ - return - ((x >> 0) & 0xff) + ((x >> 8) & 0xff) + - ((x >> 16) & 0xff) + ((x >> 24) & 0xff) + - ((x >> 32) & 0xff) + ((x >> 40) & 0xff) + - ((x >> 48) & 0xff) + ((x >> 56) & 0xff); -} - -uint64_t HELPER(iwmmxt_addcw)(uint64_t x) -{ - return - ((x >> 0) & 0xffff) + ((x >> 16) & 0xffff) + - ((x >> 32) & 0xffff) + ((x >> 48) & 0xffff); -} - -uint64_t HELPER(iwmmxt_addcl)(uint64_t x) -{ - return (x & 0xffffffff) + (x >> 32); -} - -uint32_t HELPER(iwmmxt_msbb)(uint64_t x) -{ - return - ((x >> 7) & 0x01) | ((x >> 14) & 0x02) | - ((x >> 21) & 0x04) | ((x >> 28) & 0x08) | - ((x >> 35) & 0x10) | ((x >> 42) & 0x20) | - ((x >> 49) & 0x40) | ((x >> 56) & 0x80); -} - -uint32_t HELPER(iwmmxt_msbw)(uint64_t x) -{ - return - ((x >> 15) & 0x01) | ((x >> 30) & 0x02) | - ((x >> 45) & 0x04) | ((x >> 52) & 0x08); -} - -uint32_t HELPER(iwmmxt_msbl)(uint64_t x) -{ - return ((x >> 31) & 0x01) | ((x >> 62) & 0x02); -} - -/* FIXME: Split wCASF setting into a separate op to avoid env use. */ -uint64_t HELPER(iwmmxt_srlw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) | - (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) | - (((x & (0xffffll << 32)) >> n) & (0xffffll << 32)) | - (((x & (0xffffll << 48)) >> n) & (0xffffll << 48)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_srll)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((x & (0xffffffffll << 0)) >> n) | - ((x >> n) & (0xffffffffll << 32)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_srlq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x >>= n; - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_sllw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) | - (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) | - (((x & (0xffffll << 32)) << n) & (0xffffll << 32)) | - (((x & (0xffffll << 48)) << n) & (0xffffll << 48)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_slll)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((x << n) & (0xffffffffll << 0)) | - ((x & (0xffffffffll << 32)) << n); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_sllq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x <<= n; - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_sraw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) | - ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) | - ((uint64_t) ((EXTEND16(x >> 32) >> n) & 0xffff) << 32) | - ((uint64_t) ((EXTEND16(x >> 48) >> n) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_sral)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) | - (((EXTEND32(x >> 
32) >> n) & 0xffffffff) << 32); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_sraq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (int64_t) x >> n; - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_rorw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((((x & (0xffffll << 0)) >> n) | - ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) | - ((((x & (0xffffll << 16)) >> n) | - ((x & (0xffffll << 16)) << (16 - n))) & (0xffffll << 16)) | - ((((x & (0xffffll << 32)) >> n) | - ((x & (0xffffll << 32)) << (16 - n))) & (0xffffll << 32)) | - ((((x & (0xffffll << 48)) >> n) | - ((x & (0xffffll << 48)) << (16 - n))) & (0xffffll << 48)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_rorl)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((x & (0xffffffffll << 0)) >> n) | - ((x >> n) & (0xffffffffll << 32)) | - ((x << (32 - n)) & (0xffffffffll << 0)) | - ((x & (0xffffffffll << 32)) << (32 - n)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_rorq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ror64(x, n); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_shufh)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) | - (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) | - (((x >> ((n << 0) & 0x30)) & 0xffff) << 32) | - (((x >> ((n >> 2) & 0x30)) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -/* TODO: Unsigned-Saturation */ -uint64_t HELPER(iwmmxt_packuw)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | - (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | - (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | - (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); - return a; -} - -uint64_t HELPER(iwmmxt_packul)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | - (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | - NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); - return a; -} - -uint64_t HELPER(iwmmxt_packuq)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); - return a; -} - -/* TODO: Signed-Saturation */ -uint64_t HELPER(iwmmxt_packsw)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | - (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | - (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | - (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 
7); - return a; -} - -uint64_t HELPER(iwmmxt_packsl)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | - (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | - NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); - return a; -} - -uint64_t HELPER(iwmmxt_packsq)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); - return a; -} - -uint64_t HELPER(iwmmxt_muladdsl)(uint64_t c, uint32_t a, uint32_t b) -{ - return c + ((int32_t) EXTEND32(a) * (int32_t) EXTEND32(b)); -} - -uint64_t HELPER(iwmmxt_muladdsw)(uint64_t c, uint32_t a, uint32_t b) -{ - c += EXTEND32(EXTEND16S((a >> 0) & 0xffff) * - EXTEND16S((b >> 0) & 0xffff)); - c += EXTEND32(EXTEND16S((a >> 16) & 0xffff) * - EXTEND16S((b >> 16) & 0xffff)); - return c; -} - -uint64_t HELPER(iwmmxt_muladdswl)(uint64_t c, uint32_t a, uint32_t b) -{ - return c + (EXTEND32(EXTEND16S(a & 0xffff) * - EXTEND16S(b & 0xffff))); -} diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c deleted file mode 100644 index f94e87e..0000000 --- a/target/arm/m_helper.c +++ /dev/null @@ -1,2902 +0,0 @@ -/* - * ARM generic helpers. - * - * This code is licensed under the GNU GPL v2 or later. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/helper-proto.h" -#include "qemu/main-loop.h" -#include "qemu/bitops.h" -#include "qemu/log.h" -#include "exec/exec-all.h" -#ifdef CONFIG_TCG -#include "exec/cpu_ldst.h" -#include "semihosting/common-semi.h" -#endif -#if !defined(CONFIG_USER_ONLY) -#include "hw/intc/armv7m_nvic.h" -#endif - -static void v7m_msr_xpsr(CPUARMState *env, uint32_t mask, - uint32_t reg, uint32_t val) -{ - /* Only APSR is actually writable */ - if (!(reg & 4)) { - uint32_t apsrmask = 0; - - if (mask & 8) { - apsrmask |= XPSR_NZCV | XPSR_Q; - } - if ((mask & 4) && arm_feature(env, ARM_FEATURE_THUMB_DSP)) { - apsrmask |= XPSR_GE; - } - xpsr_write(env, val, apsrmask); - } -} - -static uint32_t v7m_mrs_xpsr(CPUARMState *env, uint32_t reg, unsigned el) -{ - uint32_t mask = 0; - - if ((reg & 1) && el) { - mask |= XPSR_EXCP; /* IPSR (unpriv. reads as zero) */ - } - if (!(reg & 4)) { - mask |= XPSR_NZCV | XPSR_Q; /* APSR */ - if (arm_feature(env, ARM_FEATURE_THUMB_DSP)) { - mask |= XPSR_GE; - } - } - /* EPSR reads as zero */ - return xpsr_read(env) & mask; -} - -static uint32_t v7m_mrs_control(CPUARMState *env, uint32_t secure) -{ - uint32_t value = env->v7m.control[secure]; - - if (!secure) { - /* SFPA is RAZ/WI from NS; FPCA is stored in the M_REG_S bank */ - value |= env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK; - } - return value; -} - -#ifdef CONFIG_USER_ONLY - -void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) -{ - uint32_t mask = extract32(maskreg, 8, 4); - uint32_t reg = extract32(maskreg, 0, 8); - - switch (reg) { - case 0 ... 7: /* xPSR sub-fields */ - v7m_msr_xpsr(env, mask, reg, val); - break; - case 20: /* CONTROL */ - /* There are no sub-fields that are actually writable from EL0. */ - break; - default: - /* Unprivileged writes to other registers are ignored */ - break; - } -} - -uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) -{ - switch (reg) { - case 0 ... 
7: /* xPSR sub-fields */ - return v7m_mrs_xpsr(env, reg, 0); - case 20: /* CONTROL */ - return v7m_mrs_control(env, 0); - default: - /* Unprivileged reads others as zero. */ - return 0; - } -} - -void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_preserve_fp_state)(CPUARMState *env) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) -{ - /* - * The TT instructions can be used by unprivileged code, but in - * user-only emulation we don't have the MPU. - * Luckily since we know we are NonSecure unprivileged (and that in - * turn means that the A flag wasn't specified), all the bits in the - * register must be zero: - * IREGION: 0 because IRVALID is 0 - * IRVALID: 0 because NS - * S: 0 because NS - * NSRW: 0 because NS - * NSR: 0 because NS - * RW: 0 because unpriv and A flag not set - * R: 0 because unpriv and A flag not set - * SRVALID: 0 because NS - * MRVALID: 0 because unpriv and A flag not set - * SREGION: 0 becaus SRVALID is 0 - * MREGION: 0 because MRVALID is 0 - */ - return 0; -} - -ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) -{ - return ARMMMUIdx_MUser; -} - -#else /* !CONFIG_USER_ONLY */ - -static ARMMMUIdx arm_v7m_mmu_idx_all(CPUARMState *env, - bool secstate, bool priv, bool negpri) -{ - ARMMMUIdx mmu_idx = ARM_MMU_IDX_M; - - if (priv) { - mmu_idx |= ARM_MMU_IDX_M_PRIV; - } - - if (negpri) { - mmu_idx |= ARM_MMU_IDX_M_NEGPRI; - } - - if (secstate) { - mmu_idx |= ARM_MMU_IDX_M_S; - } - - return mmu_idx; -} - -static ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env, - bool secstate, bool priv) -{ - bool negpri = armv7m_nvic_neg_prio_requested(env->nvic, secstate); - - return arm_v7m_mmu_idx_all(env, secstate, priv, negpri); -} - -/* Return the MMU index for a v7M CPU in the specified security state */ -ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) -{ - bool priv = arm_v7m_is_handler_mode(env) || - !(env->v7m.control[secstate] & 1); - - return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv); -} - -/* - * What kind of stack write are we doing? This affects how exceptions - * generated during the stacking are treated. 
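For orientation, the arm_v7m_mmu_idx_all() composition shown above boils down to OR-ing three attribute bits onto a base M-profile index. A sketch with invented constant values (QEMU's real ARM_MMU_IDX_* values differ) is:

    /* Illustrative bit values only. */
    enum { IDX_M_BASE = 0x40, IDX_M_PRIV = 0x1, IDX_M_NEGPRI = 0x2, IDX_M_S = 0x4 };

    static int m_profile_mmu_idx(int priv, int negpri, int secure)
    {
        int idx = IDX_M_BASE;

        if (priv) {
            idx |= IDX_M_PRIV;
        }
        if (negpri) {
            idx |= IDX_M_NEGPRI;
        }
        if (secure) {
            idx |= IDX_M_S;
        }
        return idx;
    }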
- */ -typedef enum StackingMode { - STACK_NORMAL, - STACK_IGNFAULTS, - STACK_LAZYFP, -} StackingMode; - -static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value, - ARMMMUIdx mmu_idx, StackingMode mode) -{ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult txres; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - bool secure = mmu_idx & ARM_MMU_IDX_M_S; - int exc; - bool exc_secure; - - if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &res, &fi)) { - /* MPU/SAU lookup failed */ - if (fi.type == ARMFault_QEMU_SFault) { - if (mode == STACK_LAZYFP) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault with SFSR.LSPERR " - "during lazy stacking\n"); - env->v7m.sfsr |= R_V7M_SFSR_LSPERR_MASK; - } else { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault with SFSR.AUVIOL " - "during stacking\n"); - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; - } - env->v7m.sfsr |= R_V7M_SFSR_SFARVALID_MASK; - env->v7m.sfar = addr; - exc = ARMV7M_EXCP_SECURE; - exc_secure = false; - } else { - if (mode == STACK_LAZYFP) { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault with CFSR.MLSPERR\n"); - env->v7m.cfsr[secure] |= R_V7M_CFSR_MLSPERR_MASK; - } else { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault with CFSR.MSTKERR\n"); - env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK; - } - exc = ARMV7M_EXCP_MEM; - exc_secure = secure; - } - goto pend_fault; - } - address_space_stl_le(arm_addressspace(cs, res.f.attrs), res.f.phys_addr, - value, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - /* BusFault trying to write the data */ - if (mode == STACK_LAZYFP) { - qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.LSPERR\n"); - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_LSPERR_MASK; - } else { - qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n"); - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK; - } - exc = ARMV7M_EXCP_BUS; - exc_secure = false; - goto pend_fault; - } - return true; - -pend_fault: - /* - * By pending the exception at this point we are making - * the IMPDEF choice "overridden exceptions pended" (see the - * MergeExcInfo() pseudocode). The other choice would be to not - * pend them now and then make a choice about which to throw away - * later if we have two derived exceptions. - * The only case when we must not pend the exception but instead - * throw it away is if we are doing the push of the callee registers - * and we've already generated a derived exception (this is indicated - * by the caller passing STACK_IGNFAULTS). Even in this case we will - * still update the fault status registers. 
- */ - switch (mode) { - case STACK_NORMAL: - armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure); - break; - case STACK_LAZYFP: - armv7m_nvic_set_pending_lazyfp(env->nvic, exc, exc_secure); - break; - case STACK_IGNFAULTS: - break; - } - return false; -} - -static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr, - ARMMMUIdx mmu_idx) -{ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult txres; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - bool secure = mmu_idx & ARM_MMU_IDX_M_S; - int exc; - bool exc_secure; - uint32_t value; - - if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { - /* MPU/SAU lookup failed */ - if (fi.type == ARMFault_QEMU_SFault) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault with SFSR.AUVIOL during unstack\n"); - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; - env->v7m.sfar = addr; - exc = ARMV7M_EXCP_SECURE; - exc_secure = false; - } else { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault with CFSR.MUNSTKERR\n"); - env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK; - exc = ARMV7M_EXCP_MEM; - exc_secure = secure; - } - goto pend_fault; - } - - value = address_space_ldl(arm_addressspace(cs, res.f.attrs), - res.f.phys_addr, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - /* BusFault trying to read the data */ - qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n"); - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK; - exc = ARMV7M_EXCP_BUS; - exc_secure = false; - goto pend_fault; - } - - *dest = value; - return true; - -pend_fault: - /* - * By pending the exception at this point we are making - * the IMPDEF choice "overridden exceptions pended" (see the - * MergeExcInfo() pseudocode). The other choice would be to not - * pend them now and then make a choice about which to throw away - * later if we have two derived exceptions. - */ - armv7m_nvic_set_pending(env->nvic, exc, exc_secure); - return false; -} - -void HELPER(v7m_preserve_fp_state)(CPUARMState *env) -{ - /* - * Preserve FP state (because LSPACT was set and we are about - * to execute an FP instruction). This corresponds to the - * PreserveFPState() pseudocode. - * We may throw an exception if the stacking fails. - */ - ARMCPU *cpu = env_archcpu(env); - bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; - bool negpri = !(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_HFRDY_MASK); - bool is_priv = !(env->v7m.fpccr[is_secure] & R_V7M_FPCCR_USER_MASK); - bool splimviol = env->v7m.fpccr[is_secure] & R_V7M_FPCCR_SPLIMVIOL_MASK; - uint32_t fpcar = env->v7m.fpcar[is_secure]; - bool stacked_ok = true; - bool ts = is_secure && (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); - bool take_exception; - - /* Take the iothread lock as we are going to touch the NVIC */ - qemu_mutex_lock_iothread(); - - /* Check the background context had access to the FPU */ - if (!v7m_cpacr_pass(env, is_secure, is_priv)) { - armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, is_secure); - env->v7m.cfsr[is_secure] |= R_V7M_CFSR_NOCP_MASK; - stacked_ok = false; - } else if (!is_secure && !extract32(env->v7m.nsacr, 10, 1)) { - armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; - stacked_ok = false; - } - - if (!splimviol && stacked_ok) { - /* We only stack if the stack limit wasn't violated */ - int i; - ARMMMUIdx mmu_idx; - - mmu_idx = arm_v7m_mmu_idx_all(env, is_secure, is_priv, negpri); - for (i = 0; i < (ts ? 
32 : 16); i += 2) { - uint64_t dn = *aa32_vfp_dreg(env, i / 2); - uint32_t faddr = fpcar + 4 * i; - uint32_t slo = extract64(dn, 0, 32); - uint32_t shi = extract64(dn, 32, 32); - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR/VPR */ - } - stacked_ok = stacked_ok && - v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) && - v7m_stack_write(cpu, faddr + 4, shi, mmu_idx, STACK_LAZYFP); - } - - stacked_ok = stacked_ok && - v7m_stack_write(cpu, fpcar + 0x40, - vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP); - if (cpu_isar_feature(aa32_mve, cpu)) { - stacked_ok = stacked_ok && - v7m_stack_write(cpu, fpcar + 0x44, - env->v7m.vpr, mmu_idx, STACK_LAZYFP); - } - } - - /* - * We definitely pended an exception, but it's possible that it - * might not be able to be taken now. If its priority permits us - * to take it now, then we must not update the LSPACT or FP regs, - * but instead jump out to take the exception immediately. - * If it's just pending and won't be taken until the current - * handler exits, then we do update LSPACT and the FP regs. - */ - take_exception = !stacked_ok && - armv7m_nvic_can_take_pending_exception(env->nvic); - - qemu_mutex_unlock_iothread(); - - if (take_exception) { - raise_exception_ra(env, EXCP_LAZYFP, 0, 1, GETPC()); - } - - env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; - - if (ts) { - /* Clear s0 to s31 and the FPSCR and VPR */ - int i; - - for (i = 0; i < 32; i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - /* - * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them - * unchanged. - */ -} - -/* - * Write to v7M CONTROL.SPSEL bit for the specified security bank. - * This may change the current stack pointer between Main and Process - * stack pointers if it is done for the CONTROL register for the current - * security state. - */ -static void write_v7m_control_spsel_for_secstate(CPUARMState *env, - bool new_spsel, - bool secstate) -{ - bool old_is_psp = v7m_using_psp(env); - - env->v7m.control[secstate] = - deposit32(env->v7m.control[secstate], - R_V7M_CONTROL_SPSEL_SHIFT, - R_V7M_CONTROL_SPSEL_LENGTH, new_spsel); - - if (secstate == env->v7m.secure) { - bool new_is_psp = v7m_using_psp(env); - uint32_t tmp; - - if (old_is_psp != new_is_psp) { - tmp = env->v7m.other_sp; - env->v7m.other_sp = env->regs[13]; - env->regs[13] = tmp; - } - } -} - -/* - * Write to v7M CONTROL.SPSEL bit. This may change the current - * stack pointer between Main and Process stack pointers. - */ -static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel) -{ - write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure); -} - -void write_v7m_exception(CPUARMState *env, uint32_t new_exc) -{ - /* - * Write a new value to v7m.exception, thus transitioning into or out - * of Handler mode; this may result in a change of active stack pointer. 
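The stack-pointer rebanking done here (and again in write_v7m_exception just below) always follows the same pattern: the now-inactive pointer is parked in other_sp while the newly active one moves into r13. A minimal sketch of that pattern, with illustrative names:

    #include <stdint.h>

    static void swap_active_sp(uint32_t *r13, uint32_t *other_sp)
    {
        uint32_t tmp = *other_sp;

        *other_sp = *r13;
        *r13 = tmp;
    }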
- */ - bool new_is_psp, old_is_psp = v7m_using_psp(env); - uint32_t tmp; - - env->v7m.exception = new_exc; - - new_is_psp = v7m_using_psp(env); - - if (old_is_psp != new_is_psp) { - tmp = env->v7m.other_sp; - env->v7m.other_sp = env->regs[13]; - env->regs[13] = tmp; - } -} - -/* Switch M profile security state between NS and S */ -static void switch_v7m_security_state(CPUARMState *env, bool new_secstate) -{ - uint32_t new_ss_msp, new_ss_psp; - - if (env->v7m.secure == new_secstate) { - return; - } - - /* - * All the banked state is accessed by looking at env->v7m.secure - * except for the stack pointer; rearrange the SP appropriately. - */ - new_ss_msp = env->v7m.other_ss_msp; - new_ss_psp = env->v7m.other_ss_psp; - - if (v7m_using_psp(env)) { - env->v7m.other_ss_psp = env->regs[13]; - env->v7m.other_ss_msp = env->v7m.other_sp; - } else { - env->v7m.other_ss_msp = env->regs[13]; - env->v7m.other_ss_psp = env->v7m.other_sp; - } - - env->v7m.secure = new_secstate; - - if (v7m_using_psp(env)) { - env->regs[13] = new_ss_psp; - env->v7m.other_sp = new_ss_msp; - } else { - env->regs[13] = new_ss_msp; - env->v7m.other_sp = new_ss_psp; - } -} - -void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) -{ - /* - * Handle v7M BXNS: - * - if the return value is a magic value, do exception return (like BX) - * - otherwise bit 0 of the return value is the target security state - */ - uint32_t min_magic; - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - /* Covers FNC_RETURN and EXC_RETURN magic */ - min_magic = FNC_RETURN_MIN_MAGIC; - } else { - /* EXC_RETURN magic only */ - min_magic = EXC_RETURN_MIN_MAGIC; - } - - if (dest >= min_magic) { - /* - * This is an exception return magic value; put it where - * do_v7m_exception_exit() expects and raise EXCEPTION_EXIT. - * Note that if we ever add gen_ss_advance() singlestep support to - * M profile this should count as an "instruction execution complete" - * event (compare gen_bx_excret_final_code()). - */ - env->regs[15] = dest & ~1; - env->thumb = dest & 1; - HELPER(exception_internal)(env, EXCP_EXCEPTION_EXIT); - /* notreached */ - } - - /* translate.c should have made BXNS UNDEF unless we're secure */ - assert(env->v7m.secure); - - if (!(dest & 1)) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - } - switch_v7m_security_state(env, dest & 1); - env->thumb = true; - env->regs[15] = dest & ~1; - arm_rebuild_hflags(env); -} - -void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) -{ - /* - * Handle v7M BLXNS: - * - bit 0 of the destination address is the target security state - */ - - /* At this point regs[15] is the address just after the BLXNS */ - uint32_t nextinst = env->regs[15] | 1; - uint32_t sp = env->regs[13] - 8; - uint32_t saved_psr; - - /* translate.c will have made BLXNS UNDEF unless we're secure */ - assert(env->v7m.secure); - - if (dest & 1) { - /* - * Target is Secure, so this is just a normal BLX, - * except that the low bit doesn't indicate Thumb/not. 
- */ - env->regs[14] = nextinst; - env->thumb = true; - env->regs[15] = dest & ~1; - return; - } - - /* Target is non-secure: first push a stack frame */ - if (!QEMU_IS_ALIGNED(sp, 8)) { - qemu_log_mask(LOG_GUEST_ERROR, - "BLXNS with misaligned SP is UNPREDICTABLE\n"); - } - - if (sp < v7m_sp_limit(env)) { - raise_exception(env, EXCP_STKOF, 0, 1); - } - - saved_psr = env->v7m.exception; - if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK) { - saved_psr |= XPSR_SFPA; - } - - /* Note that these stores can throw exceptions on MPU faults */ - cpu_stl_data_ra(env, sp, nextinst, GETPC()); - cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC()); - - env->regs[13] = sp; - env->regs[14] = 0xfeffffff; - if (arm_v7m_is_handler_mode(env)) { - /* - * Write a dummy value to IPSR, to avoid leaking the current secure - * exception number to non-secure code. This is guaranteed not - * to cause write_v7m_exception() to actually change stacks. - */ - write_v7m_exception(env, 1); - } - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - switch_v7m_security_state(env, 0); - env->thumb = true; - env->regs[15] = dest; - arm_rebuild_hflags(env); -} - -static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode, - bool spsel) -{ - /* - * Return a pointer to the location where we currently store the - * stack pointer for the requested security state and thread mode. - * This pointer will become invalid if the CPU state is updated - * such that the stack pointers are switched around (eg changing - * the SPSEL control bit). - * Compare the v8M ARM ARM pseudocode LookUpSP_with_security_mode(). - * Unlike that pseudocode, we require the caller to pass us in the - * SPSEL control bit value; this is because we also use this - * function in handling of pushing of the callee-saves registers - * part of the v8M stack frame (pseudocode PushCalleeStack()), - * and in the tailchain codepath the SPSEL bit comes from the exception - * return magic LR value from the previous exception. The pseudocode - * opencodes the stack-selection in PushCalleeStack(), but we prefer - * to make this utility function generic enough to do the job. - */ - bool want_psp = threadmode && spsel; - - if (secure == env->v7m.secure) { - if (want_psp == v7m_using_psp(env)) { - return &env->regs[13]; - } else { - return &env->v7m.other_sp; - } - } else { - if (want_psp) { - return &env->v7m.other_ss_psp; - } else { - return &env->v7m.other_ss_msp; - } - } -} - -static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure, - uint32_t *pvec) -{ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult result; - uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4; - uint32_t vector_entry; - MemTxAttrs attrs = {}; - ARMMMUIdx mmu_idx; - bool exc_secure; - - qemu_log_mask(CPU_LOG_INT, - "...loading from element %d of %s vector table at 0x%x\n", - exc, targets_secure ? "secure" : "non-secure", addr); - - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true); - - /* - * We don't do a get_phys_addr() here because the rules for vector - * loads are special: they always use the default memory map, and - * the default memory map permits reads from all addresses. - * Since there's no easy way to pass through to pmsav8_mpu_lookup() - * that we want this special case which would always say "yes", - * we just do the SAU lookup here followed by a direct physical load. 
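The vector fetch being set up above reads a 4-byte entry from the table selected by the target security state; the address arithmetic reduces to the following sketch, with the SAU/attribute handling left out.

    #include <stdint.h>

    static uint32_t v7m_vector_entry_addr(uint32_t vecbase, int exc_num)
    {
        return vecbase + (uint32_t)exc_num * 4;   /* one word per exception */
    }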
- */ - attrs.secure = targets_secure; - attrs.user = false; - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - V8M_SAttributes sattrs = {}; - - v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, - targets_secure, &sattrs); - if (sattrs.ns) { - attrs.secure = false; - } else if (!targets_secure) { - /* - * NS access to S memory: the underlying exception which we escalate - * to HardFault is SecureFault, which always targets Secure. - */ - exc_secure = true; - goto load_fail; - } - } - - vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr, - attrs, &result); - if (result != MEMTX_OK) { - /* - * Underlying exception is BusFault: its target security state - * depends on BFHFNMINS. - */ - exc_secure = !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK); - goto load_fail; - } - *pvec = vector_entry; - qemu_log_mask(CPU_LOG_INT, "...loaded new PC 0x%x\n", *pvec); - return true; - -load_fail: - /* - * All vector table fetch fails are reported as HardFault, with - * HFSR.VECTTBL and .FORCED set. (FORCED is set because - * technically the underlying exception is a SecureFault or BusFault - * that is escalated to HardFault.) This is a terminal exception, - * so we will either take the HardFault immediately or else enter - * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()). - * The HardFault is Secure if BFHFNMINS is 0 (meaning that all HFs are - * secure); otherwise it targets the same security state as the - * underlying exception. - * In v8.1M HardFaults from vector table fetch fails don't set FORCED. - */ - if (!(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK)) { - exc_secure = true; - } - env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK; - if (!arm_feature(env, ARM_FEATURE_V8_1M)) { - env->v7m.hfsr |= R_V7M_HFSR_FORCED_MASK; - } - armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure); - return false; -} - -static uint32_t v7m_integrity_sig(CPUARMState *env, uint32_t lr) -{ - /* - * Return the integrity signature value for the callee-saves - * stack frame section. @lr is the exception return payload/LR value - * whose FType bit forms bit 0 of the signature if FP is present. - */ - uint32_t sig = 0xfefa125a; - - if (!cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) - || (lr & R_V7M_EXCRET_FTYPE_MASK)) { - sig |= 1; - } - return sig; -} - -static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain, - bool ignore_faults) -{ - /* - * For v8M, push the callee-saves register part of the stack frame. - * Compare the v8M pseudocode PushCalleeStack(). - * In the tailchaining case this may not be the current stack. - */ - CPUARMState *env = &cpu->env; - uint32_t *frame_sp_p; - uint32_t frameptr; - ARMMMUIdx mmu_idx; - bool stacked_ok; - uint32_t limit; - bool want_psp; - uint32_t sig; - StackingMode smode = ignore_faults ? 
STACK_IGNFAULTS : STACK_NORMAL; - - if (dotailchain) { - bool mode = lr & R_V7M_EXCRET_MODE_MASK; - bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) || - !mode; - - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv); - frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode, - lr & R_V7M_EXCRET_SPSEL_MASK); - want_psp = mode && (lr & R_V7M_EXCRET_SPSEL_MASK); - if (want_psp) { - limit = env->v7m.psplim[M_REG_S]; - } else { - limit = env->v7m.msplim[M_REG_S]; - } - } else { - mmu_idx = arm_mmu_idx(env); - frame_sp_p = &env->regs[13]; - limit = v7m_sp_limit(env); - } - - frameptr = *frame_sp_p - 0x28; - if (frameptr < limit) { - /* - * Stack limit failure: set SP to the limit value, and generate - * STKOF UsageFault. Stack pushes below the limit must not be - * performed. It is IMPDEF whether pushes above the limit are - * performed; we choose not to. - */ - qemu_log_mask(CPU_LOG_INT, - "...STKOF during callee-saves register stacking\n"); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - *frame_sp_p = limit; - return true; - } - - /* - * Write as much of the stack frame as we can. A write failure may - * cause us to pend a derived exception. - */ - sig = v7m_integrity_sig(env, lr); - stacked_ok = - v7m_stack_write(cpu, frameptr, sig, mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, smode); - - /* Update SP regardless of whether any of the stack accesses failed. */ - *frame_sp_p = frameptr; - - return !stacked_ok; -} - -static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain, - bool ignore_stackfaults) -{ - /* - * Do the "take the exception" parts of exception entry, - * but not the pushing of state to the stack. This is - * similar to the pseudocode ExceptionTaken() function. - */ - CPUARMState *env = &cpu->env; - uint32_t addr; - bool targets_secure; - int exc; - bool push_failed = false; - - armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure); - qemu_log_mask(CPU_LOG_INT, "...taking pending %s exception %d\n", - targets_secure ? "secure" : "nonsecure", exc); - - if (dotailchain) { - /* Sanitize LR FType and PREFIX bits */ - if (!cpu_isar_feature(aa32_vfp_simd, cpu)) { - lr |= R_V7M_EXCRET_FTYPE_MASK; - } - lr = deposit32(lr, 24, 8, 0xff); - } - - if (arm_feature(env, ARM_FEATURE_V8)) { - if (arm_feature(env, ARM_FEATURE_M_SECURITY) && - (lr & R_V7M_EXCRET_S_MASK)) { - /* - * The background code (the owner of the registers in the - * exception frame) is Secure. This means it may either already - * have or now needs to push callee-saves registers. - */ - if (targets_secure) { - if (dotailchain && !(lr & R_V7M_EXCRET_ES_MASK)) { - /* - * We took an exception from Secure to NonSecure - * (which means the callee-saved registers got stacked) - * and are now tailchaining to a Secure exception. - * Clear DCRS so eventual return from this Secure - * exception unstacks the callee-saved registers. 
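The callee-saves stacking in v7m_push_callee_stack() above writes a fixed 0x28-byte frame below the selected stack pointer. As a reading aid its layout can be pictured as the struct below; this is purely illustrative and not a type used by the patch.

    #include <stdint.h>

    struct v8m_callee_frame {
        uint32_t integrity_sig;   /* +0x00: 0xfefa125a, bit 0 set when there is
                                   *        no FP context in use (no FP, or
                                   *        EXCRET.FType set)                 */
        uint32_t reserved;        /* +0x04: not written by the push above     */
        uint32_t r4_r11[8];       /* +0x08 .. +0x24: callee-saved registers   */
    };                            /* total 0x28 bytes                         */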
- */ - lr &= ~R_V7M_EXCRET_DCRS_MASK; - } - } else { - /* - * We're going to a non-secure exception; push the - * callee-saves registers to the stack now, if they're - * not already saved. - */ - if (lr & R_V7M_EXCRET_DCRS_MASK && - !(dotailchain && !(lr & R_V7M_EXCRET_ES_MASK))) { - push_failed = v7m_push_callee_stack(cpu, lr, dotailchain, - ignore_stackfaults); - } - lr |= R_V7M_EXCRET_DCRS_MASK; - } - } - - lr &= ~R_V7M_EXCRET_ES_MASK; - if (targets_secure) { - lr |= R_V7M_EXCRET_ES_MASK; - } - lr &= ~R_V7M_EXCRET_SPSEL_MASK; - if (env->v7m.control[targets_secure] & R_V7M_CONTROL_SPSEL_MASK) { - lr |= R_V7M_EXCRET_SPSEL_MASK; - } - - /* - * Clear registers if necessary to prevent non-secure exception - * code being able to see register values from secure code. - * Where register values become architecturally UNKNOWN we leave - * them with their previous values. v8.1M is tighter than v8.0M - * here and always zeroes the caller-saved registers regardless - * of the security state the exception is targeting. - */ - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - if (!targets_secure || arm_feature(env, ARM_FEATURE_V8_1M)) { - /* - * Always clear the caller-saved registers (they have been - * pushed to the stack earlier in v7m_push_stack()). - * Clear callee-saved registers if the background code is - * Secure (in which case these regs were saved in - * v7m_push_callee_stack()). - */ - int i; - /* - * r4..r11 are callee-saves, zero only if background - * state was Secure (EXCRET.S == 1) and exception - * targets Non-secure state - */ - bool zero_callee_saves = !targets_secure && - (lr & R_V7M_EXCRET_S_MASK); - - for (i = 0; i < 13; i++) { - if (i < 4 || i > 11 || zero_callee_saves) { - env->regs[i] = 0; - } - } - /* Clear EAPSR */ - xpsr_write(env, 0, XPSR_NZCV | XPSR_Q | XPSR_GE | XPSR_IT); - } - } - } - - if (push_failed && !ignore_stackfaults) { - /* - * Derived exception on callee-saves register stacking: - * we might now want to take a different exception which - * targets a different security state, so try again from the top. - */ - qemu_log_mask(CPU_LOG_INT, - "...derived exception on callee-saves register stacking"); - v7m_exception_taken(cpu, lr, true, true); - return; - } - - if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) { - /* Vector load failed: derived exception */ - qemu_log_mask(CPU_LOG_INT, "...derived exception on vector table load"); - v7m_exception_taken(cpu, lr, true, true); - return; - } - - /* - * Now we've done everything that might cause a derived exception - * we can go ahead and activate whichever exception we're going to - * take (which might now be the derived exception). - */ - armv7m_nvic_acknowledge_irq(env->nvic); - - /* Switch to target security state -- must do this before writing SPSEL */ - switch_v7m_security_state(env, targets_secure); - write_v7m_control_spsel(env, 0); - arm_clear_exclusive(env); - /* Clear SFPA and FPCA (has no effect if no FPU) */ - env->v7m.control[M_REG_S] &= - ~(R_V7M_CONTROL_FPCA_MASK | R_V7M_CONTROL_SFPA_MASK); - /* Clear IT bits */ - env->condexec_bits = 0; - env->regs[14] = lr; - env->regs[15] = addr & 0xfffffffe; - env->thumb = addr & 1; - arm_rebuild_hflags(env); -} - -static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr, - bool apply_splim) -{ - /* - * Like the pseudocode UpdateFPCCR: save state in FPCAR and FPCCR - * that we will need later in order to do lazy FP reg stacking. 
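The FPCCR updates that follow in v7m_update_fpccr() lean heavily on the FIELD_DP32() macro; conceptually that is a read-modify-write of one named bitfield, along the lines of the sketch below. This is the general deposit idiom, not QEMU's exact macro expansion.

    #include <stdint.h>

    /* Replace 'len' bits starting at 'start' with 'val', leaving the
     * rest of 'reg' untouched (len in [1,31] for this sketch). */
    static uint32_t deposit_field(uint32_t reg, int start, int len, uint32_t val)
    {
        uint32_t mask = ((1u << len) - 1u) << start;

        return (reg & ~mask) | ((val << start) & mask);
    }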
- */ - bool is_secure = env->v7m.secure; - NVICState *nvic = env->nvic; - /* - * Some bits are unbanked and live always in fpccr[M_REG_S]; some bits - * are banked and we want to update the bit in the bank for the - * current security state; and in one case we want to specifically - * update the NS banked version of a bit even if we are secure. - */ - uint32_t *fpccr_s = &env->v7m.fpccr[M_REG_S]; - uint32_t *fpccr_ns = &env->v7m.fpccr[M_REG_NS]; - uint32_t *fpccr = &env->v7m.fpccr[is_secure]; - bool hfrdy, bfrdy, mmrdy, ns_ufrdy, s_ufrdy, sfrdy, monrdy; - - env->v7m.fpcar[is_secure] = frameptr & ~0x7; - - if (apply_splim && arm_feature(env, ARM_FEATURE_V8)) { - bool splimviol; - uint32_t splim = v7m_sp_limit(env); - bool ign = armv7m_nvic_neg_prio_requested(nvic, is_secure) && - (env->v7m.ccr[is_secure] & R_V7M_CCR_STKOFHFNMIGN_MASK); - - splimviol = !ign && frameptr < splim; - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, SPLIMVIOL, splimviol); - } - - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, LSPACT, 1); - - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, S, is_secure); - - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, USER, arm_current_el(env) == 0); - - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, THREAD, - !arm_v7m_is_handler_mode(env)); - - hfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_HARD, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, HFRDY, hfrdy); - - bfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_BUS, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, BFRDY, bfrdy); - - mmrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_MEM, is_secure); - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, MMRDY, mmrdy); - - ns_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, false); - *fpccr_ns = FIELD_DP32(*fpccr_ns, V7M_FPCCR, UFRDY, ns_ufrdy); - - monrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_DEBUG, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, MONRDY, monrdy); - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - s_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, true); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, UFRDY, s_ufrdy); - - sfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_SECURE, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, SFRDY, sfrdy); - } -} - -void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) -{ - /* fptr is the value of Rn, the frame pointer we store the FP regs to */ - ARMCPU *cpu = env_archcpu(env); - bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; - bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK; - uintptr_t ra = GETPC(); - - assert(env->v7m.secure); - - if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { - return; - } - - /* Check access to the coprocessor is permitted */ - if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { - raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); - } - - if (lspact) { - /* LSPACT should not be active when there is active FP state */ - raise_exception_ra(env, EXCP_LSERR, 0, 1, GETPC()); - } - - if (fptr & 7) { - raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); - } - - /* - * Note that we do not use v7m_stack_write() here, because the - * accesses should not set the FSR bits for stacking errors if they - * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK - * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions - * and longjmp out. - */ - if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { - bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; - int i; - - for (i = 0; i < (ts ? 
32 : 16); i += 2) { - uint64_t dn = *aa32_vfp_dreg(env, i / 2); - uint32_t faddr = fptr + 4 * i; - uint32_t slo = extract64(dn, 0, 32); - uint32_t shi = extract64(dn, 32, 32); - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR */ - } - cpu_stl_data_ra(env, faddr, slo, ra); - cpu_stl_data_ra(env, faddr + 4, shi, ra); - } - cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra); - if (cpu_isar_feature(aa32_mve, cpu)) { - cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra); - } - - /* - * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to - * leave them unchanged, matching our choice in v7m_preserve_fp_state. - */ - if (ts) { - for (i = 0; i < 32; i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } else { - v7m_update_fpccr(env, fptr, false); - } - - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; -} - -void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) -{ - ARMCPU *cpu = env_archcpu(env); - uintptr_t ra = GETPC(); - - /* fptr is the value of Rn, the frame pointer we load the FP regs from */ - assert(env->v7m.secure); - - if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { - return; - } - - /* Check access to the coprocessor is permitted */ - if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { - raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); - } - - if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { - /* State in FP is still valid */ - env->v7m.fpccr[M_REG_S] &= ~R_V7M_FPCCR_LSPACT_MASK; - } else { - bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; - int i; - uint32_t fpscr; - - if (fptr & 7) { - raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); - } - - for (i = 0; i < (ts ? 32 : 16); i += 2) { - uint32_t slo, shi; - uint64_t dn; - uint32_t faddr = fptr + 4 * i; - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR and VPR */ - } - - slo = cpu_ldl_data_ra(env, faddr, ra); - shi = cpu_ldl_data_ra(env, faddr + 4, ra); - - dn = (uint64_t) shi << 32 | slo; - *aa32_vfp_dreg(env, i / 2) = dn; - } - fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra); - vfp_set_fpscr(env, fpscr); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra); - } - } - - env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK; -} - -static bool v7m_push_stack(ARMCPU *cpu) -{ - /* - * Do the "set up stack frame" part of exception entry, - * similar to pseudocode PushStack(). - * Return true if we generate a derived exception (and so - * should ignore further stack faults trying to process - * that derived exception.) 
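Both the VLSTM store loop and the VLLDM load loop above use the same offset rule for the FP save area: word pairs for d0..d7 come first, then an 8-byte slot for FPSCR (and VPR), then d8..d15. A sketch of just that offset computation, with an invented helper name:

    #include <stdint.h>

    /* 'i' is the index of the low single-precision word of d(i/2),
     * i.e. 0, 2, 4, ... 30, exactly as in the loops above. */
    static uint32_t fp_save_offset(int i)
    {
        uint32_t off = 4u * (uint32_t)i;

        if (i >= 16) {
            off += 8;   /* d8..d15 live above the FPSCR/VPR slot */
        }
        return off;
    }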
- */ - bool stacked_ok = true, limitviol = false; - CPUARMState *env = &cpu->env; - uint32_t xpsr = xpsr_read(env); - uint32_t frameptr = env->regs[13]; - ARMMMUIdx mmu_idx = arm_mmu_idx(env); - uint32_t framesize; - bool nsacr_cp10 = extract32(env->v7m.nsacr, 10, 1); - - if ((env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) && - (env->v7m.secure || nsacr_cp10)) { - if (env->v7m.secure && - env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK) { - framesize = 0xa8; - } else { - framesize = 0x68; - } - } else { - framesize = 0x20; - } - - /* Align stack pointer if the guest wants that */ - if ((frameptr & 4) && - (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) { - frameptr -= 4; - xpsr |= XPSR_SPREALIGN; - } - - xpsr &= ~XPSR_SFPA; - if (env->v7m.secure && - (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { - xpsr |= XPSR_SFPA; - } - - frameptr -= framesize; - - if (arm_feature(env, ARM_FEATURE_V8)) { - uint32_t limit = v7m_sp_limit(env); - - if (frameptr < limit) { - /* - * Stack limit failure: set SP to the limit value, and generate - * STKOF UsageFault. Stack pushes below the limit must not be - * performed. It is IMPDEF whether pushes above the limit are - * performed; we choose not to. - */ - qemu_log_mask(CPU_LOG_INT, - "...STKOF during stacking\n"); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - env->regs[13] = limit; - /* - * We won't try to perform any further memory accesses but - * we must continue through the following code to check for - * permission faults during FPU state preservation, and we - * must update FPCCR if lazy stacking is enabled. - */ - limitviol = true; - stacked_ok = false; - } - } - - /* - * Write as much of the stack frame as we can. If we fail a stack - * write this will result in a derived exception being pended - * (which may be taken in preference to the one we started with - * if it has higher priority). 
- */ - stacked_ok = stacked_ok && - v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 4, env->regs[1], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 8, env->regs[2], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 12, env->regs[3], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 16, env->regs[12], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 20, env->regs[14], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 24, env->regs[15], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, STACK_NORMAL); - - if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) { - /* FPU is active, try to save its registers */ - bool fpccr_s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; - bool lspact = env->v7m.fpccr[fpccr_s] & R_V7M_FPCCR_LSPACT_MASK; - - if (lspact && arm_feature(env, ARM_FEATURE_M_SECURITY)) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault because LSPACT and FPCA both set\n"); - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - } else if (!env->v7m.secure && !nsacr_cp10) { - qemu_log_mask(CPU_LOG_INT, - "...Secure UsageFault with CFSR.NOCP because " - "NSACR.CP10 prevents stacking FP regs\n"); - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; - } else { - if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { - /* Lazy stacking disabled, save registers now */ - int i; - bool cpacr_pass = v7m_cpacr_pass(env, env->v7m.secure, - arm_current_el(env) != 0); - - if (stacked_ok && !cpacr_pass) { - /* - * Take UsageFault if CPACR forbids access. The pseudocode - * here does a full CheckCPEnabled() but we know the NSACR - * check can never fail as we have already handled that. - */ - qemu_log_mask(CPU_LOG_INT, - "...UsageFault with CFSR.NOCP because " - "CPACR.CP10 prevents stacking FP regs\n"); - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_NOCP_MASK; - stacked_ok = false; - } - - for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { - uint64_t dn = *aa32_vfp_dreg(env, i / 2); - uint32_t faddr = frameptr + 0x20 + 4 * i; - uint32_t slo = extract64(dn, 0, 32); - uint32_t shi = extract64(dn, 32, 32); - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR and VPR */ - } - stacked_ok = stacked_ok && - v7m_stack_write(cpu, faddr, slo, - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, faddr + 4, shi, - mmu_idx, STACK_NORMAL); - } - stacked_ok = stacked_ok && - v7m_stack_write(cpu, frameptr + 0x60, - vfp_get_fpscr(env), mmu_idx, STACK_NORMAL); - if (cpu_isar_feature(aa32_mve, cpu)) { - stacked_ok = stacked_ok && - v7m_stack_write(cpu, frameptr + 0x64, - env->v7m.vpr, mmu_idx, STACK_NORMAL); - } - if (cpacr_pass) { - for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } else { - /* Lazy stacking enabled, save necessary info to stack later */ - v7m_update_fpccr(env, frameptr + 0x20, true); - } - } - } - - /* - * If we broke a stack limit then SP was already updated earlier; - * otherwise we update SP regardless of whether any of the stack - * accesses failed or we took some other kind of fault. 
- */ - if (!limitviol) { - env->regs[13] = frameptr; - } - - return !stacked_ok; -} - -static void do_v7m_exception_exit(ARMCPU *cpu) -{ - CPUARMState *env = &cpu->env; - uint32_t excret; - uint32_t xpsr, xpsr_mask; - bool ufault = false; - bool sfault = false; - bool return_to_sp_process; - bool return_to_handler; - bool rettobase = false; - bool exc_secure = false; - bool return_to_secure; - bool ftype; - bool restore_s16_s31 = false; - - /* - * If we're not in Handler mode then jumps to magic exception-exit - * addresses don't have magic behaviour. However for the v8M - * security extensions the magic secure-function-return has to - * work in thread mode too, so to avoid doing an extra check in - * the generated code we allow exception-exit magic to also cause the - * internal exception and bring us here in thread mode. Correct code - * will never try to do this (the following insn fetch will always - * fault) so we the overhead of having taken an unnecessary exception - * doesn't matter. - */ - if (!arm_v7m_is_handler_mode(env)) { - return; - } - - /* - * In the spec pseudocode ExceptionReturn() is called directly - * from BXWritePC() and gets the full target PC value including - * bit zero. In QEMU's implementation we treat it as a normal - * jump-to-register (which is then caught later on), and so split - * the target value up between env->regs[15] and env->thumb in - * gen_bx(). Reconstitute it. - */ - excret = env->regs[15]; - if (env->thumb) { - excret |= 1; - } - - qemu_log_mask(CPU_LOG_INT, "Exception return: magic PC %" PRIx32 - " previous exception %d\n", - excret, env->v7m.exception); - - if ((excret & R_V7M_EXCRET_RES1_MASK) != R_V7M_EXCRET_RES1_MASK) { - qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero high bits in exception " - "exit PC value 0x%" PRIx32 " are UNPREDICTABLE\n", - excret); - } - - ftype = excret & R_V7M_EXCRET_FTYPE_MASK; - - if (!ftype && !cpu_isar_feature(aa32_vfp_simd, cpu)) { - qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero FTYPE in exception " - "exit PC value 0x%" PRIx32 " is UNPREDICTABLE " - "if FPU not present\n", - excret); - ftype = true; - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - /* - * EXC_RETURN.ES validation check (R_SMFL). We must do this before - * we pick which FAULTMASK to clear. - */ - if (!env->v7m.secure && - ((excret & R_V7M_EXCRET_ES_MASK) || - !(excret & R_V7M_EXCRET_DCRS_MASK))) { - sfault = 1; - /* For all other purposes, treat ES as 0 (R_HXSR) */ - excret &= ~R_V7M_EXCRET_ES_MASK; - } - exc_secure = excret & R_V7M_EXCRET_ES_MASK; - } - - if (env->v7m.exception != ARMV7M_EXCP_NMI) { - /* - * Auto-clear FAULTMASK on return from other than NMI. - * If the security extension is implemented then this only - * happens if the raw execution priority is >= 0; the - * value of the ES bit in the exception return value indicates - * which security state's faultmask to clear. (v8M ARM ARM R_KBNF.) - */ - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - if (armv7m_nvic_raw_execution_priority(env->nvic) >= 0) { - env->v7m.faultmask[exc_secure] = 0; - } - } else { - env->v7m.faultmask[M_REG_NS] = 0; - } - } - - switch (armv7m_nvic_complete_irq(env->nvic, env->v7m.exception, - exc_secure)) { - case -1: - /* attempt to exit an exception that isn't active */ - ufault = true; - break; - case 0: - /* still an irq active now */ - break; - case 1: - /* - * We returned to base exception level, no nesting. - * (In the pseudocode this is written using "NestedActivation != 1" - * where we have 'rettobase == false'.) 
- */ - rettobase = true; - break; - default: - g_assert_not_reached(); - } - - return_to_handler = !(excret & R_V7M_EXCRET_MODE_MASK); - return_to_sp_process = excret & R_V7M_EXCRET_SPSEL_MASK; - return_to_secure = arm_feature(env, ARM_FEATURE_M_SECURITY) && - (excret & R_V7M_EXCRET_S_MASK); - - if (arm_feature(env, ARM_FEATURE_V8)) { - if (!arm_feature(env, ARM_FEATURE_M_SECURITY)) { - /* - * UNPREDICTABLE if S == 1 or DCRS == 0 or ES == 1 (R_XLCP); - * we choose to take the UsageFault. - */ - if ((excret & R_V7M_EXCRET_S_MASK) || - (excret & R_V7M_EXCRET_ES_MASK) || - !(excret & R_V7M_EXCRET_DCRS_MASK)) { - ufault = true; - } - } - if (excret & R_V7M_EXCRET_RES0_MASK) { - ufault = true; - } - } else { - /* For v7M we only recognize certain combinations of the low bits */ - switch (excret & 0xf) { - case 1: /* Return to Handler */ - break; - case 13: /* Return to Thread using Process stack */ - case 9: /* Return to Thread using Main stack */ - /* - * We only need to check NONBASETHRDENA for v7M, because in - * v8M this bit does not exist (it is RES1). - */ - if (!rettobase && - !(env->v7m.ccr[env->v7m.secure] & - R_V7M_CCR_NONBASETHRDENA_MASK)) { - ufault = true; - } - break; - default: - ufault = true; - } - } - - /* - * Set CONTROL.SPSEL from excret.SPSEL. Since we're still in - * Handler mode (and will be until we write the new XPSR.Interrupt - * field) this does not switch around the current stack pointer. - * We must do this before we do any kind of tailchaining, including - * for the derived exceptions on integrity check failures, or we will - * give the guest an incorrect EXCRET.SPSEL value on exception entry. - */ - write_v7m_control_spsel_for_secstate(env, return_to_sp_process, exc_secure); - - /* - * Clear scratch FP values left in caller saved registers; this - * must happen before any kind of tail chaining. 
- */ - if ((env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_CLRONRET_MASK) && - (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { - if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " - "stackframe: error during lazy state deactivation\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } else { - if (arm_feature(env, ARM_FEATURE_V8_1M)) { - /* v8.1M adds this NOCP check */ - bool nsacr_pass = exc_secure || - extract32(env->v7m.nsacr, 10, 1); - bool cpacr_pass = v7m_cpacr_pass(env, exc_secure, true); - if (!nsacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: NSACR prevents clearing FPU registers\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } else if (!cpacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - exc_secure); - env->v7m.cfsr[exc_secure] |= R_V7M_CFSR_NOCP_MASK; - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: CPACR prevents clearing FPU registers\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - } - /* Clear s0..s15, FPSCR and VPR */ - int i; - - for (i = 0; i < 16; i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } - - if (sfault) { - env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " - "stackframe: failed EXC_RETURN.ES validity check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - if (ufault) { - /* - * Bad exception return: instead of popping the exception - * stack, directly take a usage fault on the current stack. - */ - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: failed exception return integrity check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - /* - * Tailchaining: if there is currently a pending exception that - * is high enough priority to preempt execution at the level we're - * about to return to, then just directly take that exception now, - * avoiding an unstack-and-then-stack. Note that now we have - * deactivated the previous exception by calling armv7m_nvic_complete_irq() - * our current execution priority is already the execution priority we are - * returning to -- none of the state we would unstack or set based on - * the EXCRET value affects it. - */ - if (armv7m_nvic_can_take_pending_exception(env->nvic)) { - qemu_log_mask(CPU_LOG_INT, "...tailchaining to pending exception\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - switch_v7m_security_state(env, return_to_secure); - - { - /* - * The stack pointer we should be reading the exception frame from - * depends on bits in the magic exception return type value (and - * for v8M isn't necessarily the stack pointer we will eventually - * end up resuming execution with). Get a pointer to the location - * in the CPU state struct where the SP we need is currently being - * stored; we will use and modify it in place. 
- * We use this limited C variable scope so we don't accidentally - * use 'frame_sp_p' after we do something that makes it invalid. - */ - bool spsel = env->v7m.control[return_to_secure] & R_V7M_CONTROL_SPSEL_MASK; - uint32_t *frame_sp_p = get_v7m_sp_ptr(env, - return_to_secure, - !return_to_handler, - spsel); - uint32_t frameptr = *frame_sp_p; - bool pop_ok = true; - ARMMMUIdx mmu_idx; - bool return_to_priv = return_to_handler || - !(env->v7m.control[return_to_secure] & R_V7M_CONTROL_NPRIV_MASK); - - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure, - return_to_priv); - - if (!QEMU_IS_ALIGNED(frameptr, 8) && - arm_feature(env, ARM_FEATURE_V8)) { - qemu_log_mask(LOG_GUEST_ERROR, - "M profile exception return with non-8-aligned SP " - "for destination state is UNPREDICTABLE\n"); - } - - /* Do we need to pop callee-saved registers? */ - if (return_to_secure && - ((excret & R_V7M_EXCRET_ES_MASK) == 0 || - (excret & R_V7M_EXCRET_DCRS_MASK) == 0)) { - uint32_t actual_sig; - - pop_ok = v7m_stack_read(cpu, &actual_sig, frameptr, mmu_idx); - - if (pop_ok && v7m_integrity_sig(env, excret) != actual_sig) { - /* Take a SecureFault on the current stack */ - env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " - "stackframe: failed exception return integrity " - "signature check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - pop_ok = pop_ok && - v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) && - v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) && - v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) && - v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) && - v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) && - v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) && - v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) && - v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx); - - frameptr += 0x28; - } - - /* Pop registers */ - pop_ok = pop_ok && - v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) && - v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) && - v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) && - v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) && - v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) && - v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) && - v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) && - v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx); - - if (!pop_ok) { - /* - * v7m_stack_read() pended a fault, so take it (as a tail - * chained exception on the same stack frame) - */ - qemu_log_mask(CPU_LOG_INT, "...derived exception on unstacking\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - /* - * Returning from an exception with a PC with bit 0 set is defined - * behaviour on v8M (bit 0 is ignored), but for v7M it was specified - * to be UNPREDICTABLE. In practice actual v7M hardware seems to ignore - * the lsbit, and there are several RTOSes out there which incorrectly - * assume the r15 in the stack frame should be a Thumb-style "lsbit - * indicates ARM/Thumb" value, so ignore the bit on v7M as well, but - * complain about the badly behaved guest. 
- */ - if (env->regs[15] & 1) { - env->regs[15] &= ~1U; - if (!arm_feature(env, ARM_FEATURE_V8)) { - qemu_log_mask(LOG_GUEST_ERROR, - "M profile return from interrupt with misaligned " - "PC is UNPREDICTABLE on v7M\n"); - } - } - - if (arm_feature(env, ARM_FEATURE_V8)) { - /* - * For v8M we have to check whether the xPSR exception field - * matches the EXCRET value for return to handler/thread - * before we commit to changing the SP and xPSR. - */ - bool will_be_handler = (xpsr & XPSR_EXCP) != 0; - if (return_to_handler != will_be_handler) { - /* - * Take an INVPC UsageFault on the current stack. - * By this point we will have switched to the security state - * for the background state, so this UsageFault will target - * that state. - */ - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: failed exception return integrity " - "check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - } - - if (!ftype) { - /* FP present and we need to handle it */ - if (!return_to_secure && - (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK)) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - qemu_log_mask(CPU_LOG_INT, - "...taking SecureFault on existing stackframe: " - "Secure LSPACT set but exception return is " - "not to secure state\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - restore_s16_s31 = return_to_secure && - (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); - - if (env->v7m.fpccr[return_to_secure] & R_V7M_FPCCR_LSPACT_MASK) { - /* State in FPU is still valid, just clear LSPACT */ - env->v7m.fpccr[return_to_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; - } else { - int i; - uint32_t fpscr; - bool cpacr_pass, nsacr_pass; - - cpacr_pass = v7m_cpacr_pass(env, return_to_secure, - return_to_priv); - nsacr_pass = return_to_secure || - extract32(env->v7m.nsacr, 10, 1); - - if (!cpacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - return_to_secure); - env->v7m.cfsr[return_to_secure] |= R_V7M_CFSR_NOCP_MASK; - qemu_log_mask(CPU_LOG_INT, - "...taking UsageFault on existing " - "stackframe: CPACR.CP10 prevents unstacking " - "FP regs\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } else if (!nsacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_INVPC_MASK; - qemu_log_mask(CPU_LOG_INT, - "...taking Secure UsageFault on existing " - "stackframe: NSACR.CP10 prevents unstacking " - "FP regs\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { - uint32_t slo, shi; - uint64_t dn; - uint32_t faddr = frameptr + 0x20 + 4 * i; - - if (i >= 16) { - faddr += 8; /* Skip the slot for the FPSCR and VPR */ - } - - pop_ok = pop_ok && - v7m_stack_read(cpu, &slo, faddr, mmu_idx) && - v7m_stack_read(cpu, &shi, faddr + 4, mmu_idx); - - if (!pop_ok) { - break; - } - - dn = (uint64_t)shi << 32 | slo; - *aa32_vfp_dreg(env, i / 2) = dn; - } - pop_ok = pop_ok && - v7m_stack_read(cpu, &fpscr, frameptr + 0x60, mmu_idx); - if (pop_ok) { - vfp_set_fpscr(env, fpscr); - } - if (cpu_isar_feature(aa32_mve, cpu)) { - pop_ok = pop_ok && - v7m_stack_read(cpu, &env->v7m.vpr, - frameptr + 0x64, mmu_idx); - } - if (!pop_ok) { - /* - * These regs are 0 if security extension present; - * otherwise merely UNKNOWN. 
We zero always. - */ - for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } - } - env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], - V7M_CONTROL, FPCA, !ftype); - - /* Commit to consuming the stack frame */ - frameptr += 0x20; - if (!ftype) { - frameptr += 0x48; - if (restore_s16_s31) { - frameptr += 0x40; - } - } - /* - * Undo stack alignment (the SPREALIGN bit indicates that the original - * pre-exception SP was not 8-aligned and we added a padding word to - * align it, so we undo this by ORing in the bit that increases it - * from the current 8-aligned value to the 8-unaligned value. (Adding 4 - * would work too but a logical OR is how the pseudocode specifies it.) - */ - if (xpsr & XPSR_SPREALIGN) { - frameptr |= 4; - } - *frame_sp_p = frameptr; - } - - xpsr_mask = ~(XPSR_SPREALIGN | XPSR_SFPA); - if (!arm_feature(env, ARM_FEATURE_THUMB_DSP)) { - xpsr_mask &= ~XPSR_GE; - } - /* This xpsr_write() will invalidate frame_sp_p as it may switch stack */ - xpsr_write(env, xpsr, xpsr_mask); - - if (env->v7m.secure) { - bool sfpa = xpsr & XPSR_SFPA; - - env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], - V7M_CONTROL, SFPA, sfpa); - } - - /* - * The restored xPSR exception field will be zero if we're - * resuming in Thread mode. If that doesn't match what the - * exception return excret specified then this is a UsageFault. - * v7M requires we make this check here; v8M did it earlier. - */ - if (return_to_handler != arm_v7m_is_handler_mode(env)) { - /* - * Take an INVPC UsageFault by pushing the stack again; - * we know we're v7M so this is never a Secure UsageFault. - */ - bool ignore_stackfaults; - - assert(!arm_feature(env, ARM_FEATURE_V8)); - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - ignore_stackfaults = v7m_push_stack(cpu); - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: " - "failed exception return integrity check\n"); - v7m_exception_taken(cpu, excret, false, ignore_stackfaults); - return; - } - - /* Otherwise, we have a successful exception exit. */ - arm_clear_exclusive(env); - arm_rebuild_hflags(env); - qemu_log_mask(CPU_LOG_INT, "...successful exception return\n"); -} - -static bool do_v7m_function_return(ARMCPU *cpu) -{ - /* - * v8M security extensions magic function return. - * We may either: - * (1) throw an exception (longjump) - * (2) return true if we successfully handled the function return - * (3) return false if we failed a consistency check and have - * pended a UsageFault that needs to be taken now - * - * At this point the magic return value is split between env->regs[15] - * and env->thumb. We don't bother to reconstitute it because we don't - * need it (all values are handled the same way). - */ - CPUARMState *env = &cpu->env; - uint32_t newpc, newpsr, newpsr_exc; - - qemu_log_mask(CPU_LOG_INT, "...really v7M secure function return\n"); - - { - bool threadmode, spsel; - MemOpIdx oi; - ARMMMUIdx mmu_idx; - uint32_t *frame_sp_p; - uint32_t frameptr; - - /* Pull the return address and IPSR from the Secure stack */ - threadmode = !arm_v7m_is_handler_mode(env); - spsel = env->v7m.control[M_REG_S] & R_V7M_CONTROL_SPSEL_MASK; - - frame_sp_p = get_v7m_sp_ptr(env, true, threadmode, spsel); - frameptr = *frame_sp_p; - - /* - * These loads may throw an exception (for MPU faults). 
We want to - * do them as secure, so work out what MMU index that is. - */ - mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); - oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx)); - newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0); - newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0); - - /* Consistency checks on new IPSR */ - newpsr_exc = newpsr & XPSR_EXCP; - if (!((env->v7m.exception == 0 && newpsr_exc == 0) || - (env->v7m.exception == 1 && newpsr_exc != 0))) { - /* Pend the fault and tell our caller to take it */ - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - qemu_log_mask(CPU_LOG_INT, - "...taking INVPC UsageFault: " - "IPSR consistency check failed\n"); - return false; - } - - *frame_sp_p = frameptr + 8; - } - - /* This invalidates frame_sp_p */ - switch_v7m_security_state(env, true); - env->v7m.exception = newpsr_exc; - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - if (newpsr & XPSR_SFPA) { - env->v7m.control[M_REG_S] |= R_V7M_CONTROL_SFPA_MASK; - } - xpsr_write(env, 0, XPSR_IT); - env->thumb = newpc & 1; - env->regs[15] = newpc & ~1; - arm_rebuild_hflags(env); - - qemu_log_mask(CPU_LOG_INT, "...function return successful\n"); - return true; -} - -static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, bool secure, - uint32_t addr, uint16_t *insn) -{ - /* - * Load a 16-bit portion of a v7M instruction, returning true on success, - * or false on failure (in which case we will have pended the appropriate - * exception). - * We need to do the instruction fetch's MPU and SAU checks - * like this because there is no MMU index that would allow - * doing the load with a single function call. Instead we must - * first check that the security attributes permit the load - * and that they don't mismatch on the two halves of the instruction, - * and then we do the load as a secure load (ie using the security - * attributes of the address, not the CPU, as architecturally required). - */ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - V8M_SAttributes sattrs = {}; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - MemTxResult txres; - - v8m_security_lookup(env, addr, MMU_INST_FETCH, mmu_idx, secure, &sattrs); - if (!sattrs.nsc || sattrs.ns) { - /* - * This must be the second half of the insn, and it straddles a - * region boundary with the second half not being S&NSC. - */ - env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVEP\n"); - return false; - } - if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &res, &fi)) { - /* the MPU lookup failed */ - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure); - qemu_log_mask(CPU_LOG_INT, "...really MemManage with CFSR.IACCVIOL\n"); - return false; - } - *insn = address_space_lduw_le(arm_addressspace(cs, res.f.attrs), - res.f.phys_addr, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); - qemu_log_mask(CPU_LOG_INT, "...really BusFault with CFSR.IBUSERR\n"); - return false; - } - return true; -} - -static bool v7m_read_sg_stack_word(ARMCPU *cpu, ARMMMUIdx mmu_idx, - uint32_t addr, uint32_t *spdata) -{ - /* - * Read a word of data from the stack for the SG instruction, - * writing the value into *spdata. 
If the load succeeds, return - * true; otherwise pend an appropriate exception and return false. - * (We can't use data load helpers here that throw an exception - * because of the context we're called in, which is halfway through - * arm_v7m_cpu_do_interrupt().) - */ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult txres; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - uint32_t value; - - if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { - /* MPU/SAU lookup failed */ - if (fi.type == ARMFault_QEMU_SFault) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault during stack word read\n"); - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; - env->v7m.sfar = addr; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - } else { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault during stack word read\n"); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_DACCVIOL_MASK | - R_V7M_CFSR_MMARVALID_MASK; - env->v7m.mmfar[M_REG_S] = addr; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, false); - } - return false; - } - value = address_space_ldl(arm_addressspace(cs, res.f.attrs), - res.f.phys_addr, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - /* BusFault trying to read the data */ - qemu_log_mask(CPU_LOG_INT, - "...BusFault during stack word read\n"); - env->v7m.cfsr[M_REG_NS] |= - (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); - env->v7m.bfar = addr; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); - return false; - } - - *spdata = value; - return true; -} - -static bool v7m_handle_execute_nsc(ARMCPU *cpu) -{ - /* - * Check whether this attempt to execute code in a Secure & NS-Callable - * memory region is for an SG instruction; if so, then emulate the - * effect of the SG instruction and return true. Otherwise pend - * the correct kind of exception and return false. - */ - CPUARMState *env = &cpu->env; - ARMMMUIdx mmu_idx; - uint16_t insn; - - /* - * We should never get here unless get_phys_addr_pmsav8() caused - * an exception for NS executing in S&NSC memory. - */ - assert(!env->v7m.secure); - assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); - - /* We want to do the MPU lookup as secure; work out what mmu_idx that is */ - mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); - - if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15], &insn)) { - return false; - } - - if (!env->thumb) { - goto gen_invep; - } - - if (insn != 0xe97f) { - /* - * Not an SG instruction first half (we choose the IMPDEF - * early-SG-check option). - */ - goto gen_invep; - } - - if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15] + 2, &insn)) { - return false; - } - - if (insn != 0xe97f) { - /* - * Not an SG instruction second half (yes, both halves of the SG - * insn have the same hex value) - */ - goto gen_invep; - } - - /* - * OK, we have confirmed that we really have an SG instruction. - * We know we're NS in S memory so don't need to repeat those checks. - */ - qemu_log_mask(CPU_LOG_INT, "...really an SG instruction at 0x%08" PRIx32 - ", executing it\n", env->regs[15]); - - if (cpu_isar_feature(aa32_m_sec_state, cpu) && - !arm_v7m_is_handler_mode(env)) { - /* - * v8.1M exception stack frame integrity check. Note that we - * must perform the memory access even if CCR_S.TRD is zero - * and we aren't going to check what the data loaded is. - */ - uint32_t spdata, sp; - - /* - * We know we are currently NS, so the S stack pointers must be - * in other_ss_{psp,msp}, not in regs[13]/other_sp. 
- */ - sp = v7m_using_psp(env) ? env->v7m.other_ss_psp : env->v7m.other_ss_msp; - if (!v7m_read_sg_stack_word(cpu, mmu_idx, sp, &spdata)) { - /* Stack access failed and an exception has been pended */ - return false; - } - - if (env->v7m.ccr[M_REG_S] & R_V7M_CCR_TRD_MASK) { - if (((spdata & ~1) == 0xfefa125a) || - !(env->v7m.control[M_REG_S] & 1)) { - goto gen_invep; - } - } - } - - env->regs[14] &= ~1; - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - switch_v7m_security_state(env, true); - xpsr_write(env, 0, XPSR_IT); - env->regs[15] += 4; - arm_rebuild_hflags(env); - return true; - -gen_invep: - env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVEP\n"); - return false; -} - -void arm_v7m_cpu_do_interrupt(CPUState *cs) -{ - ARMCPU *cpu = ARM_CPU(cs); - CPUARMState *env = &cpu->env; - uint32_t lr; - bool ignore_stackfaults; - - arm_log_exception(cs); - - /* - * For exceptions we just mark as pending on the NVIC, and let that - * handle it. - */ - switch (cs->exception_index) { - case EXCP_UDEF: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNDEFINSTR_MASK; - break; - case EXCP_NOCP: - { - /* - * NOCP might be directed to something other than the current - * security state if this fault is because of NSACR; we indicate - * the target security state using exception.target_el. - */ - int target_secstate; - - if (env->exception.target_el == 3) { - target_secstate = M_REG_S; - } else { - target_secstate = env->v7m.secure; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, target_secstate); - env->v7m.cfsr[target_secstate] |= R_V7M_CFSR_NOCP_MASK; - break; - } - case EXCP_INVSTATE: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVSTATE_MASK; - break; - case EXCP_STKOF: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; - break; - case EXCP_LSERR: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - break; - case EXCP_UNALIGNED: - /* Unaligned faults reported by M-profile aware code */ - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; - break; - case EXCP_DIVBYZERO: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK; - break; - case EXCP_SWI: - /* The PC already points to the next instruction. */ - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure); - break; - case EXCP_PREFETCH_ABORT: - case EXCP_DATA_ABORT: - /* - * Note that for M profile we don't have a guest facing FSR, but - * the env->exception.fsr will be populated by the code that - * raises the fault, in the A profile short-descriptor format. - * - * Log the exception.vaddress now regardless of subtype, because - * logging below only logs it when it goes into a guest visible - * register. - */ - qemu_log_mask(CPU_LOG_INT, "...at fault address 0x%x\n", - (uint32_t)env->exception.vaddress); - switch (env->exception.fsr & 0xf) { - case M_FAKE_FSR_NSC_EXEC: - /* - * Exception generated when we try to execute code at an address - * which is marked as Secure & Non-Secure Callable and the CPU - * is in the Non-Secure state. 
The only instruction which can - * be executed like this is SG (and that only if both halves of - * the SG instruction have the same security attributes.) - * Everything else must generate an INVEP SecureFault, so we - * emulate the SG instruction here. - */ - if (v7m_handle_execute_nsc(cpu)) { - return; - } - break; - case M_FAKE_FSR_SFAULT: - /* - * Various flavours of SecureFault for attempts to execute or - * access data in the wrong security state. - */ - switch (cs->exception_index) { - case EXCP_PREFETCH_ABORT: - if (env->v7m.secure) { - env->v7m.sfsr |= R_V7M_SFSR_INVTRAN_MASK; - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVTRAN\n"); - } else { - env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVEP\n"); - } - break; - case EXCP_DATA_ABORT: - /* This must be an NS access to S memory */ - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.AUVIOL\n"); - break; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - break; - case 0x8: /* External Abort */ - switch (cs->exception_index) { - case EXCP_PREFETCH_ABORT: - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; - qemu_log_mask(CPU_LOG_INT, "...with CFSR.IBUSERR\n"); - break; - case EXCP_DATA_ABORT: - env->v7m.cfsr[M_REG_NS] |= - (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); - env->v7m.bfar = env->exception.vaddress; - qemu_log_mask(CPU_LOG_INT, - "...with CFSR.PRECISERR and BFAR 0x%x\n", - env->v7m.bfar); - break; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); - break; - case 0x1: /* Alignment fault reported by generic code */ - qemu_log_mask(CPU_LOG_INT, - "...really UsageFault with UFSR.UNALIGNED\n"); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - break; - default: - /* - * All other FSR values are either MPU faults or "can't happen - * for M profile" cases. - */ - switch (cs->exception_index) { - case EXCP_PREFETCH_ABORT: - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; - qemu_log_mask(CPU_LOG_INT, "...with CFSR.IACCVIOL\n"); - break; - case EXCP_DATA_ABORT: - env->v7m.cfsr[env->v7m.secure] |= - (R_V7M_CFSR_DACCVIOL_MASK | R_V7M_CFSR_MMARVALID_MASK); - env->v7m.mmfar[env->v7m.secure] = env->exception.vaddress; - qemu_log_mask(CPU_LOG_INT, - "...with CFSR.DACCVIOL and MMFAR 0x%x\n", - env->v7m.mmfar[env->v7m.secure]); - break; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, - env->v7m.secure); - break; - } - break; - case EXCP_SEMIHOST: - qemu_log_mask(CPU_LOG_INT, - "...handling as semihosting call 0x%x\n", - env->regs[0]); -#ifdef CONFIG_TCG - do_common_semihosting(cs); -#else - g_assert_not_reached(); -#endif - env->regs[15] += env->thumb ? 2 : 4; - return; - case EXCP_BKPT: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_DEBUG, false); - break; - case EXCP_IRQ: - break; - case EXCP_EXCEPTION_EXIT: - if (env->regs[15] < EXC_RETURN_MIN_MAGIC) { - /* Must be v8M security extension function return */ - assert(env->regs[15] >= FNC_RETURN_MIN_MAGIC); - assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); - if (do_v7m_function_return(cpu)) { - return; - } - } else { - do_v7m_exception_exit(cpu); - return; - } - break; - case EXCP_LAZYFP: - /* - * We already pended the specific exception in the NVIC in the - * v7m_preserve_fp_state() helper function. 
- */ - break; - default: - cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index); - return; /* Never happens. Keep compiler happy. */ - } - - if (arm_feature(env, ARM_FEATURE_V8)) { - lr = R_V7M_EXCRET_RES1_MASK | - R_V7M_EXCRET_DCRS_MASK; - /* - * The S bit indicates whether we should return to Secure - * or NonSecure (ie our current state). - * The ES bit indicates whether we're taking this exception - * to Secure or NonSecure (ie our target state). We set it - * later, in v7m_exception_taken(). - * The SPSEL bit is also set in v7m_exception_taken() for v8M. - * This corresponds to the ARM ARM pseudocode for v8M setting - * some LR bits in PushStack() and some in ExceptionTaken(); - * the distinction matters for the tailchain cases where we - * can take an exception without pushing the stack. - */ - if (env->v7m.secure) { - lr |= R_V7M_EXCRET_S_MASK; - } - } else { - lr = R_V7M_EXCRET_RES1_MASK | - R_V7M_EXCRET_S_MASK | - R_V7M_EXCRET_DCRS_MASK | - R_V7M_EXCRET_ES_MASK; - if (env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK) { - lr |= R_V7M_EXCRET_SPSEL_MASK; - } - } - if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { - lr |= R_V7M_EXCRET_FTYPE_MASK; - } - if (!arm_v7m_is_handler_mode(env)) { - lr |= R_V7M_EXCRET_MODE_MASK; - } - - ignore_stackfaults = v7m_push_stack(cpu); - v7m_exception_taken(cpu, lr, false, ignore_stackfaults); -} - -uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) -{ - unsigned el = arm_current_el(env); - - /* First handle registers which unprivileged can read */ - switch (reg) { - case 0 ... 7: /* xPSR sub-fields */ - return v7m_mrs_xpsr(env, reg, el); - case 20: /* CONTROL */ - return v7m_mrs_control(env, env->v7m.secure); - case 0x94: /* CONTROL_NS */ - /* - * We have to handle this here because unprivileged Secure code - * can read the NS CONTROL register. - */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.control[M_REG_NS] | - (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK); - } - - if (el == 0) { - return 0; /* unprivileged reads others as zero */ - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - switch (reg) { - case 0x88: /* MSP_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.other_ss_msp; - case 0x89: /* PSP_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.other_ss_psp; - case 0x8a: /* MSPLIM_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.msplim[M_REG_NS]; - case 0x8b: /* PSPLIM_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.psplim[M_REG_NS]; - case 0x90: /* PRIMASK_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.primask[M_REG_NS]; - case 0x91: /* BASEPRI_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return 0; - } - return env->v7m.basepri[M_REG_NS]; - case 0x93: /* FAULTMASK_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return 0; - } - return env->v7m.faultmask[M_REG_NS]; - case 0x98: /* SP_NS */ - { - /* - * This gives the non-secure SP selected based on whether we're - * currently in handler mode or not, using the NS CONTROL.SPSEL. - */ - bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; - - if (!env->v7m.secure) { - return 0; - } - if (!arm_v7m_is_handler_mode(env) && spsel) { - return env->v7m.other_ss_psp; - } else { - return env->v7m.other_ss_msp; - } - } - default: - break; - } - } - - switch (reg) { - case 8: /* MSP */ - return v7m_using_psp(env) ? 
env->v7m.other_sp : env->regs[13]; - case 9: /* PSP */ - return v7m_using_psp(env) ? env->regs[13] : env->v7m.other_sp; - case 10: /* MSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - return env->v7m.msplim[env->v7m.secure]; - case 11: /* PSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - return env->v7m.psplim[env->v7m.secure]; - case 16: /* PRIMASK */ - return env->v7m.primask[env->v7m.secure]; - case 17: /* BASEPRI */ - case 18: /* BASEPRI_MAX */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - return env->v7m.basepri[env->v7m.secure]; - case 19: /* FAULTMASK */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - return env->v7m.faultmask[env->v7m.secure]; - default: - bad_reg: - qemu_log_mask(LOG_GUEST_ERROR, "Attempt to read unknown special" - " register %d\n", reg); - return 0; - } -} - -void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) -{ - /* - * We're passed bits [11..0] of the instruction; extract - * SYSm and the mask bits. - * Invalid combinations of SYSm and mask are UNPREDICTABLE; - * we choose to treat them as if the mask bits were valid. - * NB that the pseudocode 'mask' variable is bits [11..10], - * whereas ours is [11..8]. - */ - uint32_t mask = extract32(maskreg, 8, 4); - uint32_t reg = extract32(maskreg, 0, 8); - int cur_el = arm_current_el(env); - - if (cur_el == 0 && reg > 7 && reg != 20) { - /* - * only xPSR sub-fields and CONTROL.SFPA may be written by - * unprivileged code - */ - return; - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - switch (reg) { - case 0x88: /* MSP_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.other_ss_msp = val & ~3; - return; - case 0x89: /* PSP_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.other_ss_psp = val & ~3; - return; - case 0x8a: /* MSPLIM_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.msplim[M_REG_NS] = val & ~7; - return; - case 0x8b: /* PSPLIM_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.psplim[M_REG_NS] = val & ~7; - return; - case 0x90: /* PRIMASK_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.primask[M_REG_NS] = val & 1; - return; - case 0x91: /* BASEPRI_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return; - } - env->v7m.basepri[M_REG_NS] = val & 0xff; - return; - case 0x93: /* FAULTMASK_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return; - } - env->v7m.faultmask[M_REG_NS] = val & 1; - return; - case 0x94: /* CONTROL_NS */ - if (!env->v7m.secure) { - return; - } - write_v7m_control_spsel_for_secstate(env, - val & R_V7M_CONTROL_SPSEL_MASK, - M_REG_NS); - if (arm_feature(env, ARM_FEATURE_M_MAIN)) { - env->v7m.control[M_REG_NS] &= ~R_V7M_CONTROL_NPRIV_MASK; - env->v7m.control[M_REG_NS] |= val & R_V7M_CONTROL_NPRIV_MASK; - } - /* - * SFPA is RAZ/WI from NS. FPCA is RO if NSACR.CP10 == 0, - * RES0 if the FPU is not present, and is stored in the S bank - */ - if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) && - extract32(env->v7m.nsacr, 10, 1)) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; - env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; - } - return; - case 0x98: /* SP_NS */ - { - /* - * This gives the non-secure SP selected based on whether we're - * currently in handler mode or not, using the NS CONTROL.SPSEL. 
- */ - bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; - bool is_psp = !arm_v7m_is_handler_mode(env) && spsel; - uint32_t limit; - - if (!env->v7m.secure) { - return; - } - - limit = is_psp ? env->v7m.psplim[false] : env->v7m.msplim[false]; - - val &= ~0x3; - - if (val < limit) { - raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); - } - - if (is_psp) { - env->v7m.other_ss_psp = val; - } else { - env->v7m.other_ss_msp = val; - } - return; - } - default: - break; - } - } - - switch (reg) { - case 0 ... 7: /* xPSR sub-fields */ - v7m_msr_xpsr(env, mask, reg, val); - break; - case 8: /* MSP */ - if (v7m_using_psp(env)) { - env->v7m.other_sp = val & ~3; - } else { - env->regs[13] = val & ~3; - } - break; - case 9: /* PSP */ - if (v7m_using_psp(env)) { - env->regs[13] = val & ~3; - } else { - env->v7m.other_sp = val & ~3; - } - break; - case 10: /* MSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - env->v7m.msplim[env->v7m.secure] = val & ~7; - break; - case 11: /* PSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - env->v7m.psplim[env->v7m.secure] = val & ~7; - break; - case 16: /* PRIMASK */ - env->v7m.primask[env->v7m.secure] = val & 1; - break; - case 17: /* BASEPRI */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - env->v7m.basepri[env->v7m.secure] = val & 0xff; - break; - case 18: /* BASEPRI_MAX */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - val &= 0xff; - if (val != 0 && (val < env->v7m.basepri[env->v7m.secure] - || env->v7m.basepri[env->v7m.secure] == 0)) { - env->v7m.basepri[env->v7m.secure] = val; - } - break; - case 19: /* FAULTMASK */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - env->v7m.faultmask[env->v7m.secure] = val & 1; - break; - case 20: /* CONTROL */ - /* - * Writing to the SPSEL bit only has an effect if we are in - * thread mode; other bits can be updated by any privileged code. - * write_v7m_control_spsel() deals with updating the SPSEL bit in - * env->v7m.control, so we only need update the others. - * For v7M, we must just ignore explicit writes to SPSEL in handler - * mode; for v8M the write is permitted but will have no effect. - * All these bits are writes-ignored from non-privileged code, - * except for SFPA. - */ - if (cur_el > 0 && (arm_feature(env, ARM_FEATURE_V8) || - !arm_v7m_is_handler_mode(env))) { - write_v7m_control_spsel(env, (val & R_V7M_CONTROL_SPSEL_MASK) != 0); - } - if (cur_el > 0 && arm_feature(env, ARM_FEATURE_M_MAIN)) { - env->v7m.control[env->v7m.secure] &= ~R_V7M_CONTROL_NPRIV_MASK; - env->v7m.control[env->v7m.secure] |= val & R_V7M_CONTROL_NPRIV_MASK; - } - if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) { - /* - * SFPA is RAZ/WI from NS or if no FPU. - * FPCA is RO if NSACR.CP10 == 0, RES0 if the FPU is not present. - * Both are stored in the S bank. - */ - if (env->v7m.secure) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_SFPA_MASK; - } - if (cur_el > 0 && - (env->v7m.secure || !arm_feature(env, ARM_FEATURE_M_SECURITY) || - extract32(env->v7m.nsacr, 10, 1))) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; - env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; - } - } - break; - default: - bad_reg: - qemu_log_mask(LOG_GUEST_ERROR, "Attempt to write unknown special" - " register %d\n", reg); - return; - } -} - -uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) -{ - /* Implement the TT instruction. 
op is bits [7:6] of the insn. */ - bool forceunpriv = op & 1; - bool alt = op & 2; - V8M_SAttributes sattrs = {}; - uint32_t tt_resp; - bool r, rw, nsr, nsrw, mrvalid; - ARMMMUIdx mmu_idx; - uint32_t mregion; - bool targetpriv; - bool targetsec = env->v7m.secure; - - /* - * Work out what the security state and privilege level we're - * interested in is... - */ - if (alt) { - targetsec = !targetsec; - } - - if (forceunpriv) { - targetpriv = false; - } else { - targetpriv = arm_v7m_is_handler_mode(env) || - !(env->v7m.control[targetsec] & R_V7M_CONTROL_NPRIV_MASK); - } - - /* ...and then figure out which MMU index this is */ - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targetsec, targetpriv); - - /* - * We know that the MPU and SAU don't care about the access type - * for our purposes beyond that we don't want to claim to be - * an insn fetch, so we arbitrarily call this a read. - */ - - /* - * MPU region info only available for privileged or if - * inspecting the other MPU state. - */ - if (arm_current_el(env) != 0 || alt) { - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - - /* We can ignore the return value as prot is always set */ - pmsav8_mpu_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, targetsec, - &res, &fi, &mregion); - if (mregion == -1) { - mrvalid = false; - mregion = 0; - } else { - mrvalid = true; - } - r = res.f.prot & PAGE_READ; - rw = res.f.prot & PAGE_WRITE; - } else { - r = false; - rw = false; - mrvalid = false; - mregion = 0; - } - - if (env->v7m.secure) { - v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, - targetsec, &sattrs); - nsr = sattrs.ns && r; - nsrw = sattrs.ns && rw; - } else { - sattrs.ns = true; - nsr = false; - nsrw = false; - } - - tt_resp = (sattrs.iregion << 24) | - (sattrs.irvalid << 23) | - ((!sattrs.ns) << 22) | - (nsrw << 21) | - (nsr << 20) | - (rw << 19) | - (r << 18) | - (sattrs.srvalid << 17) | - (mrvalid << 16) | - (sattrs.sregion << 8) | - mregion; - - return tt_resp; -} - -#endif /* !CONFIG_USER_ONLY */ diff --git a/target/arm/meson.build b/target/arm/meson.build index b2904b6..3e2f403 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -1,17 +1,9 @@ arm_ss = ss.source_set() arm_ss.add(files( 'cpu.c', - 'crypto_helper.c', 'debug_helper.c', 'gdbstub.c', 'helper.c', - 'iwmmxt_helper.c', - 'm_helper.c', - 'mve_helper.c', - 'neon_helper.c', - 'op_helper.c', - 'tlb_helper.c', - 'vec_helper.c', 'vfp_helper.c', 'cpu_tcg.c', )) @@ -22,11 +14,6 @@ arm_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c', 'kvm64.c'), if_false: fil arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'cpu64.c', 'gdbstub64.c', - 'helper-a64.c', - 'mte_helper.c', - 'pauth_helper.c', - 'sve_helper.c', - 'sme_helper.c', )) arm_softmmu_ss = ss.source_set() @@ -43,6 +30,8 @@ subdir('hvf') if 'CONFIG_TCG' in config_all subdir('tcg') +else + arm_ss.add(files('tcg-stubs.c')) endif target_arch += {'arm': arm_ss} diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c deleted file mode 100644 index 98bcf59..0000000 --- a/target/arm/mte_helper.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * ARM v8.5-MemTag Operations - * - * Copyright (c) 2020 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "qemu/log.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/ram_addr.h" -#include "exec/cpu_ldst.h" -#include "exec/helper-proto.h" -#include "qapi/error.h" -#include "qemu/guest-random.h" - - -static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude) -{ - if (exclude == 0xffff) { - return 0; - } - if (offset == 0) { - while (exclude & (1 << tag)) { - tag = (tag + 1) & 15; - } - } else { - do { - do { - tag = (tag + 1) & 15; - } while (exclude & (1 << tag)); - } while (--offset > 0); - } - return tag; -} - -/** - * allocation_tag_mem: - * @env: the cpu environment - * @ptr_mmu_idx: the addressing regime to use for the virtual address - * @ptr: the virtual address for which to look up tag memory - * @ptr_access: the access to use for the virtual address - * @ptr_size: the number of bytes in the normal memory access - * @tag_access: the access to use for the tag memory - * @tag_size: the number of bytes in the tag memory access - * @ra: the return address for exception handling - * - * Our tag memory is formatted as a sequence of little-endian nibbles. - * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two - * tags, with the tag at [3:0] for the lower addr and the tag at [7:4] - * for the higher addr. - * - * Here, resolve the physical address from the virtual address, and return - * a pointer to the corresponding tag byte. Exit with exception if the - * virtual address is not accessible for @ptr_access. - * - * The @ptr_size and @tag_size values may not have an obvious relation - * due to the alignment of @ptr, and the number of tag checks required. - * - * If there is no tag storage corresponding to @ptr, return NULL. - */ -static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx, - uint64_t ptr, MMUAccessType ptr_access, - int ptr_size, MMUAccessType tag_access, - int tag_size, uintptr_t ra) -{ -#ifdef CONFIG_USER_ONLY - uint64_t clean_ptr = useronly_clean_ptr(ptr); - int flags = page_get_flags(clean_ptr); - uint8_t *tags; - uintptr_t index; - - if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE_ORG : PAGE_READ))) { - cpu_loop_exit_sigsegv(env_cpu(env), ptr, ptr_access, - !(flags & PAGE_VALID), ra); - } - - /* Require both MAP_ANON and PROT_MTE for the page. */ - if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) { - return NULL; - } - - tags = page_get_target_data(clean_ptr); - - index = extract32(ptr, LOG2_TAG_GRANULE + 1, - TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1); - return tags + index; -#else - CPUTLBEntryFull *full; - MemTxAttrs attrs; - int in_page, flags; - hwaddr ptr_paddr, tag_paddr, xlat; - MemoryRegion *mr; - ARMASIdx tag_asi; - AddressSpace *tag_as; - void *host; - - /* - * Probe the first byte of the virtual address. This raises an - * exception for inaccessible pages, and resolves the virtual address - * into the softmmu tlb. - * - * When RA == 0, this is for mte_probe. The page is expected to be - * valid. Indicate to probe_access_flags no-fault, then assert that - * we received a valid page. 
- */ - flags = probe_access_full(env, ptr, ptr_access, ptr_mmu_idx, - ra == 0, &host, &full, ra); - assert(!(flags & TLB_INVALID_MASK)); - - /* If the virtual page MemAttr != Tagged, access unchecked. */ - if (full->pte_attrs != 0xf0) { - return NULL; - } - - /* - * If not backed by host ram, there is no tag storage: access unchecked. - * This is probably a guest os bug though, so log it. - */ - if (unlikely(flags & TLB_MMIO)) { - qemu_log_mask(LOG_GUEST_ERROR, - "Page @ 0x%" PRIx64 " indicates Tagged Normal memory " - "but is not backed by host ram\n", ptr); - return NULL; - } - - /* - * Remember these values across the second lookup below, - * which may invalidate this pointer via tlb resize. - */ - ptr_paddr = full->phys_addr | (ptr & ~TARGET_PAGE_MASK); - attrs = full->attrs; - full = NULL; - - /* - * The Normal memory access can extend to the next page. E.g. a single - * 8-byte access to the last byte of a page will check only the last - * tag on the first page. - * Any page access exception has priority over tag check exception. - */ - in_page = -(ptr | TARGET_PAGE_MASK); - if (unlikely(ptr_size > in_page)) { - flags |= probe_access_full(env, ptr + in_page, ptr_access, - ptr_mmu_idx, ra == 0, &host, &full, ra); - assert(!(flags & TLB_INVALID_MASK)); - } - - /* Any debug exception has priority over a tag check exception. */ - if (unlikely(flags & TLB_WATCHPOINT)) { - int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE; - assert(ra != 0); - cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, attrs, wp, ra); - } - - /* Convert to the physical address in tag space. */ - tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1); - - /* Look up the address in tag space. */ - tag_asi = attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS; - tag_as = cpu_get_address_space(env_cpu(env), tag_asi); - mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL, - tag_access == MMU_DATA_STORE, attrs); - - /* - * Note that @mr will never be NULL. If there is nothing in the address - * space at @tag_paddr, the translation will return the unallocated memory - * region. For our purposes, the result must be ram. - */ - if (unlikely(!memory_region_is_ram(mr))) { - /* ??? Failure is a board configuration error. */ - qemu_log_mask(LOG_UNIMP, - "Tag Memory @ 0x%" HWADDR_PRIx " not found for " - "Normal Memory @ 0x%" HWADDR_PRIx "\n", - tag_paddr, ptr_paddr); - return NULL; - } - - /* - * Ensure the tag memory is dirty on write, for migration. - * Tag memory can never contain code or display memory (vga). - */ - if (tag_access == MMU_DATA_STORE) { - ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat; - cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION); - } - - return memory_region_get_ram_ptr(mr) + xlat; -#endif -} - -uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm) -{ - uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16); - int rrnd = extract32(env->cp15.gcr_el1, 16, 1); - int start = extract32(env->cp15.rgsr_el1, 0, 4); - int seed = extract32(env->cp15.rgsr_el1, 8, 16); - int offset, i, rtag; - - /* - * Our IMPDEF choice for GCR_EL1.RRND==1 is to continue to use the - * deterministic algorithm. Except that with RRND==1 the kernel is - * not required to have set RGSR_EL1.SEED != 0, which is required for - * the deterministic algorithm to function. So we force a non-zero - * SEED for that case. 
- */ - if (unlikely(seed == 0) && rrnd) { - do { - Error *err = NULL; - uint16_t two; - - if (qemu_guest_getrandom(&two, sizeof(two), &err) < 0) { - /* - * Failed, for unknown reasons in the crypto subsystem. - * Best we can do is log the reason and use a constant seed. - */ - qemu_log_mask(LOG_UNIMP, "IRG: Crypto failure: %s\n", - error_get_pretty(err)); - error_free(err); - two = 1; - } - seed = two; - } while (seed == 0); - } - - /* RandomTag */ - for (i = offset = 0; i < 4; ++i) { - /* NextRandomTagBit */ - int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^ - extract32(seed, 2, 1) ^ extract32(seed, 0, 1)); - seed = (top << 15) | (seed >> 1); - offset |= top << i; - } - rtag = choose_nonexcluded_tag(start, offset, exclude); - env->cp15.rgsr_el1 = rtag | (seed << 8); - - return address_with_allocation_tag(rn, rtag); -} - -uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr, - int32_t offset, uint32_t tag_offset) -{ - int start_tag = allocation_tag_from_addr(ptr); - uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16); - int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude); - - return address_with_allocation_tag(ptr + offset, rtag); -} - -static int load_tag1(uint64_t ptr, uint8_t *mem) -{ - int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; - return extract32(*mem, ofs, 4); -} - -uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - int mmu_idx = cpu_mmu_index(env, false); - uint8_t *mem; - int rtag = 0; - - /* Trap if accessing an invalid page. */ - mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1, - MMU_DATA_LOAD, 1, GETPC()); - - /* Load if page supports tags. */ - if (mem) { - rtag = load_tag1(ptr, mem); - } - - return address_with_allocation_tag(xt, rtag); -} - -static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra) -{ - if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) { - arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE, - cpu_mmu_index(env, false), ra); - g_assert_not_reached(); - } -} - -/* For use in a non-parallel context, store to the given nibble. */ -static void store_tag1(uint64_t ptr, uint8_t *mem, int tag) -{ - int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; - *mem = deposit32(*mem, ofs, 4, tag); -} - -/* For use in a parallel context, atomically store to the given nibble. */ -static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag) -{ - int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; - uint8_t old = qatomic_read(mem); - - while (1) { - uint8_t new = deposit32(old, ofs, 4, tag); - uint8_t cmp = qatomic_cmpxchg(mem, old, new); - if (likely(cmp == old)) { - return; - } - old = cmp; - } -} - -typedef void stg_store1(uint64_t, uint8_t *, int); - -static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt, - uintptr_t ra, stg_store1 store1) -{ - int mmu_idx = cpu_mmu_index(env, false); - uint8_t *mem; - - check_tag_aligned(env, ptr, ra); - - /* Trap if accessing an invalid page. */ - mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE, - MMU_DATA_STORE, 1, ra); - - /* Store if page supports tags. 
*/ - if (mem) { - store1(ptr, mem, allocation_tag_from_addr(xt)); - } -} - -void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_stg(env, ptr, xt, GETPC(), store_tag1); -} - -void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_stg(env, ptr, xt, GETPC(), store_tag1_parallel); -} - -void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - - check_tag_aligned(env, ptr, ra); - probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); -} - -static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt, - uintptr_t ra, stg_store1 store1) -{ - int mmu_idx = cpu_mmu_index(env, false); - int tag = allocation_tag_from_addr(xt); - uint8_t *mem1, *mem2; - - check_tag_aligned(env, ptr, ra); - - /* - * Trap if accessing an invalid page(s). - * This takes priority over !allocation_tag_access_enabled. - */ - if (ptr & TAG_GRANULE) { - /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */ - mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, - TAG_GRANULE, MMU_DATA_STORE, 1, ra); - mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE, - MMU_DATA_STORE, TAG_GRANULE, - MMU_DATA_STORE, 1, ra); - - /* Store if page(s) support tags. */ - if (mem1) { - store1(TAG_GRANULE, mem1, tag); - } - if (mem2) { - store1(0, mem2, tag); - } - } else { - /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */ - mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, - 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra); - if (mem1) { - tag |= tag << 4; - qatomic_set(mem1, tag); - } - } -} - -void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_st2g(env, ptr, xt, GETPC(), store_tag1); -} - -void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel); -} - -void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - int in_page = -(ptr | TARGET_PAGE_MASK); - - check_tag_aligned(env, ptr, ra); - - if (likely(in_page >= 2 * TAG_GRANULE)) { - probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra); - } else { - probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); - probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra); - } -} - -#define LDGM_STGM_SIZE (4 << GMID_EL1_BS) - -uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - void *tag_mem; - - ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); - - /* Trap if accessing an invalid page. */ - tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, - LDGM_STGM_SIZE, MMU_DATA_LOAD, - LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); - - /* The tag is squashed to zero if the page does not support tags. */ - if (!tag_mem) { - return 0; - } - - QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); - /* - * We are loading 64-bits worth of tags. The ordering of elements - * within the word corresponds to a 64-bit little-endian operation. - */ - return ldq_le_p(tag_mem); -} - -void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - void *tag_mem; - - ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); - - /* Trap if accessing an invalid page. 
*/ - tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, - LDGM_STGM_SIZE, MMU_DATA_LOAD, - LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); - - /* - * Tag store only happens if the page support tags, - * and if the OS has enabled access to the tags. - */ - if (!tag_mem) { - return; - } - - QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); - /* - * We are storing 64-bits worth of tags. The ordering of elements - * within the word corresponds to a 64-bit little-endian operation. - */ - stq_le_p(tag_mem, val); -} - -void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) -{ - uintptr_t ra = GETPC(); - int mmu_idx = cpu_mmu_index(env, false); - int log2_dcz_bytes, log2_tag_bytes; - intptr_t dcz_bytes, tag_bytes; - uint8_t *mem; - - /* - * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1, - * i.e. 32 bytes, which is an unreasonably small dcz anyway, - * to make sure that we can access one complete tag byte here. - */ - log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; - log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); - dcz_bytes = (intptr_t)1 << log2_dcz_bytes; - tag_bytes = (intptr_t)1 << log2_tag_bytes; - ptr &= -dcz_bytes; - - mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes, - MMU_DATA_STORE, tag_bytes, ra); - if (mem) { - int tag_pair = (val & 0xf) * 0x11; - memset(mem, tag_pair, tag_bytes); - } -} - -static void mte_sync_check_fail(CPUARMState *env, uint32_t desc, - uint64_t dirty_ptr, uintptr_t ra) -{ - int is_write, syn; - - env->exception.vaddress = dirty_ptr; - - is_write = FIELD_EX32(desc, MTEDESC, WRITE); - syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write, - 0x11); - raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra); - g_assert_not_reached(); -} - -static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr, - uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el) -{ - int select; - - if (regime_has_2_ranges(arm_mmu_idx)) { - select = extract64(dirty_ptr, 55, 1); - } else { - select = 0; - } - env->cp15.tfsr_el[el] |= 1 << select; -#ifdef CONFIG_USER_ONLY - /* - * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, - * which then sends a SIGSEGV when the thread is next scheduled. - * This cpu will return to the main loop at the end of the TB, - * which is rather sooner than "normal". But the alternative - * is waiting until the next syscall. - */ - qemu_cpu_kick(env_cpu(env)); -#endif -} - -/* Record a tag check failure. */ -static void mte_check_fail(CPUARMState *env, uint32_t desc, - uint64_t dirty_ptr, uintptr_t ra) -{ - int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); - ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); - int el, reg_el, tcf; - uint64_t sctlr; - - reg_el = regime_el(env, arm_mmu_idx); - sctlr = env->cp15.sctlr_el[reg_el]; - - switch (arm_mmu_idx) { - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E20_0: - el = 0; - tcf = extract64(sctlr, 38, 2); - break; - default: - el = reg_el; - tcf = extract64(sctlr, 40, 2); - } - - switch (tcf) { - case 1: - /* Tag check fail causes a synchronous exception. */ - mte_sync_check_fail(env, desc, dirty_ptr, ra); - break; - - case 0: - /* - * Tag check fail does not affect the PE. - * We eliminate this case by not setting MTE_ACTIVE - * in tb_flags, so that we never make this runtime call. - */ - g_assert_not_reached(); - - case 2: - /* Tag check fail causes asynchronous flag set. 
*/ - mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); - break; - - case 3: - /* - * Tag check fail causes asynchronous flag set for stores, or - * a synchronous exception for loads. - */ - if (FIELD_EX32(desc, MTEDESC, WRITE)) { - mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); - } else { - mte_sync_check_fail(env, desc, dirty_ptr, ra); - } - break; - } -} - -/** - * checkN: - * @tag: tag memory to test - * @odd: true to begin testing at tags at odd nibble - * @cmp: the tag to compare against - * @count: number of tags to test - * - * Return the number of successful tests. - * Thus a return value < @count indicates a failure. - * - * A note about sizes: count is expected to be small. - * - * The most common use will be LDP/STP of two integer registers, - * which means 16 bytes of memory touching at most 2 tags, but - * often the access is aligned and thus just 1 tag. - * - * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory, - * touching at most 5 tags. SVE LDR/STR (vector) with the default - * vector length is also 64 bytes; the maximum architectural length - * is 256 bytes touching at most 9 tags. - * - * The loop below uses 7 logical operations and 1 memory operation - * per tag pair. An implementation that loads an aligned word and - * uses masking to ignore adjacent tags requires 18 logical operations - * and thus does not begin to pay off until 6 tags. - * Which, according to the survey above, is unlikely to be common. - */ -static int checkN(uint8_t *mem, int odd, int cmp, int count) -{ - int n = 0, diff; - - /* Replicate the test tag and compare. */ - cmp *= 0x11; - diff = *mem++ ^ cmp; - - if (odd) { - goto start_odd; - } - - while (1) { - /* Test even tag. */ - if (unlikely((diff) & 0x0f)) { - break; - } - if (++n == count) { - break; - } - - start_odd: - /* Test odd tag. */ - if (unlikely((diff) & 0xf0)) { - break; - } - if (++n == count) { - break; - } - - diff = *mem++ ^ cmp; - } - return n; -} - -/** - * mte_probe_int() - helper for mte_probe and mte_check - * @env: CPU environment - * @desc: MTEDESC descriptor - * @ptr: virtual address of the base of the access - * @fault: return virtual address of the first check failure - * - * Internal routine for both mte_probe and mte_check. - * Return zero on failure, filling in *fault. - * Return negative on trivial success for tbi disabled. - * Return positive on success with tbi enabled. - */ -static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr, - uintptr_t ra, uint64_t *fault) -{ - int mmu_idx, ptr_tag, bit55; - uint64_t ptr_last, prev_page, next_page; - uint64_t tag_first, tag_last; - uint64_t tag_byte_first, tag_byte_last; - uint32_t sizem1, tag_count, tag_size, n, c; - uint8_t *mem1, *mem2; - MMUAccessType type; - - bit55 = extract64(ptr, 55, 1); - *fault = ptr; - - /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ - if (unlikely(!tbi_check(desc, bit55))) { - return -1; - } - - ptr_tag = allocation_tag_from_addr(ptr); - - if (tcma_check(desc, bit55, ptr_tag)) { - return 1; - } - - mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); - type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD; - sizem1 = FIELD_EX32(desc, MTEDESC, SIZEM1); - - /* Find the addr of the end of the access */ - ptr_last = ptr + sizem1; - - /* Round the bounds to the tag granule, and compute the number of tags. 
*/ - tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE); - tag_last = QEMU_ALIGN_DOWN(ptr_last, TAG_GRANULE); - tag_count = ((tag_last - tag_first) / TAG_GRANULE) + 1; - - /* Round the bounds to twice the tag granule, and compute the bytes. */ - tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE); - tag_byte_last = QEMU_ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE); - - /* Locate the page boundaries. */ - prev_page = ptr & TARGET_PAGE_MASK; - next_page = prev_page + TARGET_PAGE_SIZE; - - if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) { - /* Memory access stays on one page. */ - tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1; - mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1, - MMU_DATA_LOAD, tag_size, ra); - if (!mem1) { - return 1; - } - /* Perform all of the comparisons. */ - n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count); - } else { - /* Memory access crosses to next page. */ - tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE); - mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr, - MMU_DATA_LOAD, tag_size, ra); - - tag_size = ((tag_byte_last - next_page) / (2 * TAG_GRANULE)) + 1; - mem2 = allocation_tag_mem(env, mmu_idx, next_page, type, - ptr_last - next_page + 1, - MMU_DATA_LOAD, tag_size, ra); - - /* - * Perform all of the comparisons. - * Note the possible but unlikely case of the operation spanning - * two pages that do not both have tagging enabled. - */ - n = c = (next_page - tag_first) / TAG_GRANULE; - if (mem1) { - n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c); - } - if (n == c) { - if (!mem2) { - return 1; - } - n += checkN(mem2, 0, ptr_tag, tag_count - c); - } - } - - if (likely(n == tag_count)) { - return 1; - } - - /* - * If we failed, we know which granule. For the first granule, the - * failure address is @ptr, the first byte accessed. Otherwise the - * failure address is the first byte of the nth granule. - */ - if (n > 0) { - *fault = tag_first + n * TAG_GRANULE; - } - return 0; -} - -uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra) -{ - uint64_t fault; - int ret = mte_probe_int(env, desc, ptr, ra, &fault); - - if (unlikely(ret == 0)) { - mte_check_fail(env, desc, fault, ra); - } else if (ret < 0) { - return ptr; - } - return useronly_clean_ptr(ptr); -} - -uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr) -{ - return mte_check(env, desc, ptr, GETPC()); -} - -/* - * No-fault version of mte_check, to be used by SVE for MemSingleNF. - * Returns false if the access is Checked and the check failed. This - * is only intended to probe the tag -- the validity of the page must - * be checked beforehand. - */ -bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr) -{ - uint64_t fault; - int ret = mte_probe_int(env, desc, ptr, 0, &fault); - - return ret != 0; -} - -/* - * Perform an MTE checked access for DC_ZVA. - */ -uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr) -{ - uintptr_t ra = GETPC(); - int log2_dcz_bytes, log2_tag_bytes; - int mmu_idx, bit55; - intptr_t dcz_bytes, tag_bytes, i; - void *mem; - uint64_t ptr_tag, mem_tag, align_ptr; - - bit55 = extract64(ptr, 55, 1); - - /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ - if (unlikely(!tbi_check(desc, bit55))) { - return ptr; - } - - ptr_tag = allocation_tag_from_addr(ptr); - - if (tcma_check(desc, bit55, ptr_tag)) { - goto done; - } - - /* - * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1, - * i.e. 
32 bytes, which is an unreasonably small dcz anyway, to make - * sure that we can access one complete tag byte here. - */ - log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; - log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); - dcz_bytes = (intptr_t)1 << log2_dcz_bytes; - tag_bytes = (intptr_t)1 << log2_tag_bytes; - align_ptr = ptr & -dcz_bytes; - - /* - * Trap if accessing an invalid page. DC_ZVA requires that we supply - * the original pointer for an invalid page. But watchpoints require - * that we probe the actual space. So do both. - */ - mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); - (void) probe_write(env, ptr, 1, mmu_idx, ra); - mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE, - dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra); - if (!mem) { - goto done; - } - - /* - * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus - * it is quite easy to perform all of the comparisons at once without - * any extra masking. - * - * The most common zva block size is 64; some of the thunderx cpus use - * a block size of 128. For user-only, aarch64_max_initfn will set the - * block size to 512. Fill out the other cases for future-proofing. - * - * In order to be able to find the first miscompare later, we want the - * tag bytes to be in little-endian order. - */ - switch (log2_tag_bytes) { - case 0: /* zva_blocksize 32 */ - mem_tag = *(uint8_t *)mem; - ptr_tag *= 0x11u; - break; - case 1: /* zva_blocksize 64 */ - mem_tag = cpu_to_le16(*(uint16_t *)mem); - ptr_tag *= 0x1111u; - break; - case 2: /* zva_blocksize 128 */ - mem_tag = cpu_to_le32(*(uint32_t *)mem); - ptr_tag *= 0x11111111u; - break; - case 3: /* zva_blocksize 256 */ - mem_tag = cpu_to_le64(*(uint64_t *)mem); - ptr_tag *= 0x1111111111111111ull; - break; - - default: /* zva_blocksize 512, 1024, 2048 */ - ptr_tag *= 0x1111111111111111ull; - i = 0; - do { - mem_tag = cpu_to_le64(*(uint64_t *)(mem + i)); - if (unlikely(mem_tag != ptr_tag)) { - goto fail; - } - i += 8; - align_ptr += 16 * TAG_GRANULE; - } while (i < tag_bytes); - goto done; - } - - if (likely(mem_tag == ptr_tag)) { - goto done; - } - - fail: - /* Locate the first nibble that differs. */ - i = ctz64(mem_tag ^ ptr_tag) >> 4; - mte_check_fail(env, desc, align_ptr + i * TAG_GRANULE, ra); - - done: - return useronly_clean_ptr(ptr); -} diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c deleted file mode 100644 index 403b345..0000000 --- a/target/arm/mve_helper.c +++ /dev/null @@ -1,3450 +0,0 @@ -/* - * M-profile MVE Operations - * - * Copyright (c) 2021 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . 
- */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "vec_internal.h" -#include "exec/helper-proto.h" -#include "exec/cpu_ldst.h" -#include "exec/exec-all.h" -#include "tcg/tcg.h" -#include "fpu/softfloat.h" - -static uint16_t mve_eci_mask(CPUARMState *env) -{ - /* - * Return the mask of which elements in the MVE vector correspond - * to beats being executed. The mask has 1 bits for executed lanes - * and 0 bits where ECI says this beat was already executed. - */ - int eci; - - if ((env->condexec_bits & 0xf) != 0) { - return 0xffff; - } - - eci = env->condexec_bits >> 4; - switch (eci) { - case ECI_NONE: - return 0xffff; - case ECI_A0: - return 0xfff0; - case ECI_A0A1: - return 0xff00; - case ECI_A0A1A2: - case ECI_A0A1A2B0: - return 0xf000; - default: - g_assert_not_reached(); - } -} - -static uint16_t mve_element_mask(CPUARMState *env) -{ - /* - * Return the mask of which elements in the MVE vector should be - * updated. This is a combination of multiple things: - * (1) by default, we update every lane in the vector - * (2) VPT predication stores its state in the VPR register; - * (3) low-overhead-branch tail predication will mask out part - * the vector on the final iteration of the loop - * (4) if EPSR.ECI is set then we must execute only some beats - * of the insn - * We combine all these into a 16-bit result with the same semantics - * as VPR.P0: 0 to mask the lane, 1 if it is active. - * 8-bit vector ops will look at all bits of the result; - * 16-bit ops will look at bits 0, 2, 4, ...; - * 32-bit ops will look at bits 0, 4, 8 and 12. - * Compare pseudocode GetCurInstrBeat(), though that only returns - * the 4-bit slice of the mask corresponding to a single beat. - */ - uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); - - if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) { - mask |= 0xff; - } - if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) { - mask |= 0xff00; - } - - if (env->v7m.ltpsize < 4 && - env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) { - /* - * Tail predication active, and this is the last loop iteration. - * The element size is (1 << ltpsize), and we only want to process - * loopcount elements, so we want to retain the least significant - * (loopcount * esize) predicate bits and zero out bits above that. - */ - int masklen = env->regs[14] << env->v7m.ltpsize; - assert(masklen <= 16); - uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; - mask &= ltpmask; - } - - /* - * ECI bits indicate which beats are already executed; - * we handle this by effectively predicating them out. - */ - mask &= mve_eci_mask(env); - return mask; -} - -static void mve_advance_vpt(CPUARMState *env) -{ - /* Advance the VPT and ECI state if necessary */ - uint32_t vpr = env->v7m.vpr; - unsigned mask01, mask23; - uint16_t inv_mask; - uint16_t eci_mask = mve_eci_mask(env); - - if ((env->condexec_bits & 0xf) == 0) { - env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ? 
- (ECI_A0 << 4) : (ECI_NONE << 4); - } - - if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) { - /* VPT not enabled, nothing to do */ - return; - } - - /* Invert P0 bits if needed, but only for beats we actually executed */ - mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01); - mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23); - /* Start by assuming we invert all bits corresponding to executed beats */ - inv_mask = eci_mask; - if (mask01 <= 8) { - /* MASK01 says don't invert low half of P0 */ - inv_mask &= ~0xff; - } - if (mask23 <= 8) { - /* MASK23 says don't invert high half of P0 */ - inv_mask &= ~0xff00; - } - vpr ^= inv_mask; - /* Only update MASK01 if beat 1 executed */ - if (eci_mask & 0xf0) { - vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1); - } - /* Beat 3 always executes, so update MASK23 */ - vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1); - env->v7m.vpr = vpr; -} - -/* For loads, predicated lanes are zeroed instead of keeping their old values */ -#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned b, e; \ - /* \ - * R_SXTM allows the dest reg to become UNKNOWN for abandoned \ - * beats so we don't care if we update part of the dest and \ - * then take an exception. \ - */ \ - for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ - if (eci_mask & (1 << b)) { \ - d[H##ESIZE(e)] = (mask & (1 << b)) ? \ - cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ - } \ - addr += MSIZE; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned b, e; \ - for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ - if (mask & (1 << b)) { \ - cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ - } \ - addr += MSIZE; \ - } \ - mve_advance_vpt(env); \ - } - -DO_VLDR(vldrb, 1, ldub, 1, uint8_t) -DO_VLDR(vldrh, 2, lduw, 2, uint16_t) -DO_VLDR(vldrw, 4, ldl, 4, uint32_t) - -DO_VSTR(vstrb, 1, stb, 1, uint8_t) -DO_VSTR(vstrh, 2, stw, 2, uint16_t) -DO_VSTR(vstrw, 4, stl, 4, uint32_t) - -DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) -DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) -DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) -DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) -DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) -DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) - -DO_VSTR(vstrb_h, 1, stb, 2, int16_t) -DO_VSTR(vstrb_w, 1, stb, 4, int32_t) -DO_VSTR(vstrh_w, 2, stw, 4, int32_t) - -#undef DO_VLDR -#undef DO_VSTR - -/* - * Gather loads/scatter stores. Here each element of Qm specifies - * an offset to use from the base register Rm. In the _os_ versions - * that offset is scaled by the element size. - * For loads, predicated lanes are zeroed instead of retaining - * their previous values. - */ -#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - TYPE *d = vd; \ - OFFTYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H##ESIZE(e)]); \ - d[H##ESIZE(e)] = (mask & 1) ? 
\ - cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ - if (WB) { \ - m[H##ESIZE(e)] = addr; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -/* We know here TYPE is unsigned so always the same as the offset type */ -#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - TYPE *d = vd; \ - TYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H##ESIZE(e)]); \ - if (mask & 1) { \ - cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ - } \ - if (WB) { \ - m[H##ESIZE(e)] = addr; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -/* - * 64-bit accesses are slightly different: they are done as two 32-bit - * accesses, controlled by the predicate mask for the relevant beat, - * and with a single 32-bit offset in the first of the two Qm elements. - * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little). - * Address writeback happens on the odd beats and updates the address - * stored in the even-beat element. - */ -#define DO_VLDR64_SG(OP, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - uint32_t *d = vd; \ - uint32_t *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H4(e & ~1)]); \ - addr += 4 * (e & 1); \ - d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \ - if (WB && (e & 1)) { \ - m[H4(e & ~1)] = addr - 4; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSTR64_SG(OP, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - uint32_t *d = vd; \ - uint32_t *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H4(e & ~1)]); \ - addr += 4 * (e & 1); \ - if (mask & 1) { \ - cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \ - } \ - if (WB && (e & 1)) { \ - m[H4(e & ~1)] = addr - 4; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET)) -#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1)) -#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2)) -#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3)) - -DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false) - -DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false) - -DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false) 
-DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false) -DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false) - -DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false) -DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false) -DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false) -DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false) -DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false) - -DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false) -DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false) -DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false) -DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false) - -DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) -DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true) -DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) -DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) - -/* - * Deinterleaving loads/interleaving stores. - * - * For these helpers we are passed the index of the first Qreg - * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3) - * and the value of the base address register Rn. - * The helpers are specialized for pattern and element size, so - * for instance vld42h is VLD4 with pattern 2, element size MO_16. - * - * These insns are beatwise but not predicated, so we must honour ECI, - * but need not look at mve_element_mask(). - * - * The pseudocode implements these insns with multiple memory accesses - * of the element size, but rules R_VVVG and R_FXDM permit us to make - * one 32-bit memory access per beat. 
- */ -#define DO_VLD4B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - for (e = 0; e < 4; e++, data >>= 8) { \ - uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ - qd[H1(off[beat])] = data; \ - } \ - } \ - } - -#define DO_VLD4H(OP, O1, O2) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O1, O2, O2 }; \ - uint32_t addr, data; \ - int y; /* y counts 0 2 0 2 */ \ - uint16_t *qd; \ - for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 8 + (beat & 1) * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ - qd[H2(off[beat])] = data; \ - data >>= 16; \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ - qd[H2(off[beat])] = data; \ - } \ - } - -#define DO_VLD4W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - int y; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - y = (beat + (O1 & 2)) & 3; \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ - qd[H4(off[beat] >> 2)] = data; \ - } \ - } - -DO_VLD4B(vld40b, 0, 1, 10, 11) -DO_VLD4B(vld41b, 2, 3, 12, 13) -DO_VLD4B(vld42b, 4, 5, 14, 15) -DO_VLD4B(vld43b, 6, 7, 8, 9) - -DO_VLD4H(vld40h, 0, 5) -DO_VLD4H(vld41h, 1, 6) -DO_VLD4H(vld42h, 2, 7) -DO_VLD4H(vld43h, 3, 4) - -DO_VLD4W(vld40w, 0, 1, 10, 11) -DO_VLD4W(vld41w, 2, 3, 12, 13) -DO_VLD4W(vld42w, 4, 5, 14, 15) -DO_VLD4W(vld43w, 6, 7, 8, 9) - -#define DO_VLD2B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint8_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 2; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - for (e = 0; e < 4; e++, data >>= 8) { \ - qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ - qd[H1(off[beat] + (e >> 1))] = data; \ - } \ - } \ - } - -#define DO_VLD2H(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - int e; \ - uint16_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - for (e = 0; e < 2; e++, data >>= 16) 
{ \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ - qd[H2(off[beat])] = data; \ - } \ - } \ - } - -#define DO_VLD2W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat]; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ - qd[H4(off[beat] >> 3)] = data; \ - } \ - } - -DO_VLD2B(vld20b, 0, 2, 12, 14) -DO_VLD2B(vld21b, 4, 6, 8, 10) - -DO_VLD2H(vld20h, 0, 1, 6, 7) -DO_VLD2H(vld21h, 2, 3, 4, 5) - -DO_VLD2W(vld20w, 0, 4, 24, 28) -DO_VLD2W(vld21w, 8, 12, 16, 20) - -#define DO_VST4B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = 0; \ - for (e = 3; e >= 0; e--) { \ - uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ - data = (data << 8) | qd[H1(off[beat])]; \ - } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST4H(OP, O1, O2) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O1, O2, O2 }; \ - uint32_t addr, data; \ - int y; /* y counts 0 2 0 2 */ \ - uint16_t *qd; \ - for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 8 + (beat & 1) * 4; \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ - data = qd[H2(off[beat])]; \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ - data |= qd[H2(off[beat])] << 16; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST4W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - int y; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - y = (beat + (O1 & 2)) & 3; \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ - data = qd[H4(off[beat] >> 2)]; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -DO_VST4B(vst40b, 0, 1, 10, 11) -DO_VST4B(vst41b, 2, 3, 12, 13) -DO_VST4B(vst42b, 4, 5, 14, 15) -DO_VST4B(vst43b, 6, 7, 8, 9) - -DO_VST4H(vst40h, 0, 5) -DO_VST4H(vst41h, 1, 6) -DO_VST4H(vst42h, 2, 7) -DO_VST4H(vst43h, 3, 4) - -DO_VST4W(vst40w, 0, 1, 10, 11) -DO_VST4W(vst41w, 2, 3, 12, 13) -DO_VST4W(vst42w, 4, 5, 14, 15) -DO_VST4W(vst43w, 6, 7, 8, 9) - -#define DO_VST2B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint8_t *qd; \ - for (beat = 0; beat 
< 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 2; \ - data = 0; \ - for (e = 3; e >= 0; e--) { \ - qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ - data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \ - } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST2H(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - int e; \ - uint16_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = 0; \ - for (e = 1; e >= 0; e--) { \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ - data = (data << 16) | qd[H2(off[beat])]; \ - } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST2W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat]; \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ - data = qd[H4(off[beat] >> 3)]; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -DO_VST2B(vst20b, 0, 2, 12, 14) -DO_VST2B(vst21b, 4, 6, 8, 10) - -DO_VST2H(vst20h, 0, 1, 6, 7) -DO_VST2H(vst21h, 2, 3, 4, 5) - -DO_VST2W(vst20w, 0, 4, 24, 28) -DO_VST2W(vst21w, 8, 12, 16, 20) - -/* - * The mergemask(D, R, M) macro performs the operation "*D = R" but - * storing only the bytes which correspond to 1 bits in M, - * leaving other bytes in *D unchanged. We use _Generic - * to select the correct implementation based on the type of D. 
- */ - -static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask) -{ - if (mask & 1) { - *d = r; - } -} - -static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask) -{ - mergemask_ub((uint8_t *)d, r, mask); -} - -static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask) -{ - uint16_t bmask = expand_pred_b(mask); - *d = (*d & ~bmask) | (r & bmask); -} - -static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask) -{ - mergemask_uh((uint16_t *)d, r, mask); -} - -static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask) -{ - uint32_t bmask = expand_pred_b(mask); - *d = (*d & ~bmask) | (r & bmask); -} - -static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask) -{ - mergemask_uw((uint32_t *)d, r, mask); -} - -static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask) -{ - uint64_t bmask = expand_pred_b(mask); - *d = (*d & ~bmask) | (r & bmask); -} - -static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) -{ - mergemask_uq((uint64_t *)d, r, mask); -} - -#define mergemask(D, R, M) \ - _Generic(D, \ - uint8_t *: mergemask_ub, \ - int8_t *: mergemask_sb, \ - uint16_t *: mergemask_uh, \ - int16_t *: mergemask_sh, \ - uint32_t *: mergemask_uw, \ - int32_t *: mergemask_sw, \ - uint64_t *: mergemask_uq, \ - int64_t *: mergemask_sq)(D, R, M) - -void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val) -{ - /* - * The generated code already replicated an 8 or 16 bit constant - * into the 32-bit value, so we only need to write the 32-bit - * value to all elements of the Qreg, allowing for predication. - */ - uint32_t *d = vd; - uint16_t mask = mve_element_mask(env); - unsigned e; - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - mergemask(&d[H4(e)], val, mask); - } - mve_advance_vpt(env); -} - -#define DO_1OP(OP, ESIZE, TYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_CLS_B(N) (clrsb32(N) - 24) -#define DO_CLS_H(N) (clrsb32(N) - 16) - -DO_1OP(vclsb, 1, int8_t, DO_CLS_B) -DO_1OP(vclsh, 2, int16_t, DO_CLS_H) -DO_1OP(vclsw, 4, int32_t, clrsb32) - -#define DO_CLZ_B(N) (clz32(N) - 24) -#define DO_CLZ_H(N) (clz32(N) - 16) - -DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) -DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) -DO_1OP(vclzw, 4, uint32_t, clz32) - -DO_1OP(vrev16b, 2, uint16_t, bswap16) -DO_1OP(vrev32b, 4, uint32_t, bswap32) -DO_1OP(vrev32h, 4, uint32_t, hswap32) -DO_1OP(vrev64b, 8, uint64_t, bswap64) -DO_1OP(vrev64h, 8, uint64_t, hswap64) -DO_1OP(vrev64w, 8, uint64_t, wswap64) - -#define DO_NOT(N) (~(N)) - -DO_1OP(vmvn, 8, uint64_t, DO_NOT) - -#define DO_ABS(N) ((N) < 0 ? 
-(N) : (N)) -#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff)) -#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff)) - -DO_1OP(vabsb, 1, int8_t, DO_ABS) -DO_1OP(vabsh, 2, int16_t, DO_ABS) -DO_1OP(vabsw, 4, int32_t, DO_ABS) - -/* We can do these 64 bits at a time */ -DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) -DO_1OP(vfabss, 8, uint64_t, DO_FABSS) - -#define DO_NEG(N) (-(N)) -#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000)) -#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000)) - -DO_1OP(vnegb, 1, int8_t, DO_NEG) -DO_1OP(vnegh, 2, int16_t, DO_NEG) -DO_1OP(vnegw, 4, int32_t, DO_NEG) - -/* We can do these 64 bits at a time */ -DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) -DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) - -/* - * 1 operand immediates: Vda is destination and possibly also one source. - * All these insns work at 64-bit widths. - */ -#define DO_1OP_IMM(OP, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \ - { \ - uint64_t *da = vda; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ - mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_MOVI(N, I) (I) -#define DO_ANDI(N, I) ((N) & (I)) -#define DO_ORRI(N, I) ((N) | (I)) - -DO_1OP_IMM(vmovi, DO_MOVI) -DO_1OP_IMM(vandi, DO_ANDI) -DO_1OP_IMM(vorri, DO_ORRI) - -#define DO_2OP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], \ - FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op helpers for all sizes */ -#define DO_2OP_U(OP, FN) \ - DO_2OP(OP##b, 1, uint8_t, FN) \ - DO_2OP(OP##h, 2, uint16_t, FN) \ - DO_2OP(OP##w, 4, uint32_t, FN) - -/* provide signed 2-op helpers for all sizes */ -#define DO_2OP_S(OP, FN) \ - DO_2OP(OP##b, 1, int8_t, FN) \ - DO_2OP(OP##h, 2, int16_t, FN) \ - DO_2OP(OP##w, 4, int32_t, FN) - -/* - * "Long" operations where two half-sized inputs (taken from either the - * top or the bottom of the input vector) produce a double-width result. - * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output. 
- */ -#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ - { \ - LTYPE *d = vd; \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \ - m[H##ESIZE(le * 2 + TOP)]); \ - mergemask(&d[H##LESIZE(le)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op helpers for all sizes */ -#define DO_2OP_SAT_U(OP, FN) \ - DO_2OP_SAT(OP##b, 1, uint8_t, FN) \ - DO_2OP_SAT(OP##h, 2, uint16_t, FN) \ - DO_2OP_SAT(OP##w, 4, uint32_t, FN) - -/* provide signed 2-op helpers for all sizes */ -#define DO_2OP_SAT_S(OP, FN) \ - DO_2OP_SAT(OP##b, 1, int8_t, FN) \ - DO_2OP_SAT(OP##h, 2, int16_t, FN) \ - DO_2OP_SAT(OP##w, 4, int32_t, FN) - -#define DO_AND(N, M) ((N) & (M)) -#define DO_BIC(N, M) ((N) & ~(M)) -#define DO_ORR(N, M) ((N) | (M)) -#define DO_ORN(N, M) ((N) | ~(M)) -#define DO_EOR(N, M) ((N) ^ (M)) - -DO_2OP(vand, 8, uint64_t, DO_AND) -DO_2OP(vbic, 8, uint64_t, DO_BIC) -DO_2OP(vorr, 8, uint64_t, DO_ORR) -DO_2OP(vorn, 8, uint64_t, DO_ORN) -DO_2OP(veor, 8, uint64_t, DO_EOR) - -#define DO_ADD(N, M) ((N) + (M)) -#define DO_SUB(N, M) ((N) - (M)) -#define DO_MUL(N, M) ((N) * (M)) - -DO_2OP_U(vadd, DO_ADD) -DO_2OP_U(vsub, DO_SUB) -DO_2OP_U(vmul, DO_MUL) - -DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL) -DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL) -DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL) -DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL) -DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL) -DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL) - -DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL) -DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL) -DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL) -DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) -DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) -DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) - -/* - * Polynomial multiply. We can always do this generating 64 bits - * of the result at a time, so we don't need to use DO_2OP_L. - */ -#define VMULLPH_MASK 0x00ff00ff00ff00ffULL -#define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) -#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) -#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) -#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) - -DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) -DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) -DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) -DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) - -/* - * Because the computation type is at least twice as large as required, - * these work for both signed and unsigned source types. 
- */ -static inline uint8_t do_mulh_b(int32_t n, int32_t m) -{ - return (n * m) >> 8; -} - -static inline uint16_t do_mulh_h(int32_t n, int32_t m) -{ - return (n * m) >> 16; -} - -static inline uint32_t do_mulh_w(int64_t n, int64_t m) -{ - return (n * m) >> 32; -} - -static inline uint8_t do_rmulh_b(int32_t n, int32_t m) -{ - return (n * m + (1U << 7)) >> 8; -} - -static inline uint16_t do_rmulh_h(int32_t n, int32_t m) -{ - return (n * m + (1U << 15)) >> 16; -} - -static inline uint32_t do_rmulh_w(int64_t n, int64_t m) -{ - return (n * m + (1U << 31)) >> 32; -} - -DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) -DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) -DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) -DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) -DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) -DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) - -DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b) -DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h) -DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) -DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) -DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) -DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) - -#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) -#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) - -DO_2OP_S(vmaxs, DO_MAX) -DO_2OP_U(vmaxu, DO_MAX) -DO_2OP_S(vmins, DO_MIN) -DO_2OP_U(vminu, DO_MIN) - -#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) - -DO_2OP_S(vabds, DO_ABD) -DO_2OP_U(vabdu, DO_ABD) - -static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m) -{ - return ((uint64_t)n + m) >> 1; -} - -static inline int32_t do_vhadd_s(int32_t n, int32_t m) -{ - return ((int64_t)n + m) >> 1; -} - -static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m) -{ - return ((uint64_t)n - m) >> 1; -} - -static inline int32_t do_vhsub_s(int32_t n, int32_t m) -{ - return ((int64_t)n - m) >> 1; -} - -DO_2OP_S(vhadds, do_vhadd_s) -DO_2OP_U(vhaddu, do_vhadd_u) -DO_2OP_S(vhsubs, do_vhsub_s) -DO_2OP_U(vhsubu, do_vhsub_u) - -#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) -#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) -#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) -#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) - -DO_2OP_S(vshls, DO_VSHLS) -DO_2OP_U(vshlu, DO_VSHLU) -DO_2OP_S(vrshls, DO_VRSHLS) -DO_2OP_U(vrshlu, DO_VRSHLU) - -#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1) -#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1) - -DO_2OP_S(vrhadds, DO_RHADD_S) -DO_2OP_U(vrhaddu, DO_RHADD_U) - -static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m, - uint32_t inv, uint32_t carry_in, bool update_flags) -{ - uint16_t mask = mve_element_mask(env); - unsigned e; - - /* If any additions trigger, we will update flags. */ - if (mask & 0x1111) { - update_flags = true; - } - - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - uint64_t r = carry_in; - r += n[H4(e)]; - r += m[H4(e)] ^ inv; - if (mask & 1) { - carry_in = r >> 32; - } - mergemask(&d[H4(e)], r, mask); - } - - if (update_flags) { - /* Store C, clear NZV. 
*/ - env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK; - env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C; - } - mve_advance_vpt(env); -} - -void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; - do_vadc(env, vd, vn, vm, 0, carry_in, false); -} - -void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; - do_vadc(env, vd, vn, vm, -1, carry_in, false); -} - - -void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - do_vadc(env, vd, vn, vm, 0, 0, true); -} - -void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - do_vadc(env, vd, vn, vm, -1, 1, true); -} - -#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE r[16 / ESIZE]; \ - /* Calculate all results first to avoid overwriting inputs */ \ - for (e = 0; e < 16 / ESIZE; e++) { \ - if (!(e & 1)) { \ - r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \ - } else { \ - r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \ - } \ - } \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], r[e], mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VCADD_ALL(OP, FN0, FN1) \ - DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \ - DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \ - DO_VCADD(OP##w, 4, int32_t, FN0, FN1) - -DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) -DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) -DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s) -DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s) - -static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) -{ - if (val > max) { - *s = true; - return max; - } else if (val < min) { - *s = true; - return min; - } - return val; -} - -#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s) -#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s) -#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s) - -#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s) -#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s) -#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s) - -#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s) -#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s) -#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s) - -#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s) -#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) -#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) - -/* - * For QDMULH and QRDMULH we simplify "double and shift by esize" into - * "shift by esize-1", adjusting the QRDMULH rounding constant to match. 
- */ -#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ - INT8_MIN, INT8_MAX, s) -#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ - INT16_MIN, INT16_MAX, s) -#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ - INT32_MIN, INT32_MAX, s) - -#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ - INT8_MIN, INT8_MAX, s) -#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ - INT16_MIN, INT16_MAX, s) -#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ - INT32_MIN, INT32_MAX, s) - -DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B) -DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H) -DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W) - -DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) -DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) -DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) - -DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B) -DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H) -DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W) -DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B) -DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H) -DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W) - -DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B) -DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H) -DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W) -DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) -DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) -DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) - -/* - * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs() - * and friends wanting a uint32_t* sat and our needing a bool*. - */ -#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \ - ({ \ - uint32_t su32 = 0; \ - typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \ - if (su32) { \ - *satp = true; \ - } \ - r; \ - }) - -#define DO_SQSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) -#define DO_UQSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) -#define DO_SQRSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp) -#define DO_UQRSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp) -#define DO_SUQSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp) - -DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) -DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) -DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) -DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) - -/* - * Multiply add dual returning high half - * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of - * whether to add the rounding constant, and the pointer to the - * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant", - * saturate to twice the input size and return the high half; or - * (A * B - C * D) etc for VQDMLSDH. 
- */ -#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - if ((e & 1) == XCHG) { \ - TYPE r = FN(n[H##ESIZE(e)], \ - m[H##ESIZE(e - XCHG)], \ - n[H##ESIZE(e + (1 - 2 * XCHG))], \ - m[H##ESIZE(e + (1 - XCHG))], \ - ROUND, &sat); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7); - return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; -} - -static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15); - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; -} - -static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d, - int round, bool *sat) -{ - int64_t m1 = (int64_t)a * b; - int64_t m2 = (int64_t)c * d; - int64_t r; - /* - * Architecturally we should do the entire add, double, round - * and then check for saturation. We do three saturating adds, - * but we need to be careful about the order. If the first - * m1 + m2 saturates then it's impossible for the *2+rc to - * bring it back into the non-saturated range. However, if - * m1 + m2 is negative then it's possible that doing the doubling - * would take the intermediate result below INT64_MAX and the - * addition of the rounding constant then brings it back in range. - * So we add half the rounding constant before doubling rather - * than adding the rounding constant after the doubling. - */ - if (sadd64_overflow(m1, m2, &r) || - sadd64_overflow(r, (round << 30), &r) || - sadd64_overflow(r, r, &r)) { - *sat = true; - return r < 0 ? INT32_MAX : INT32_MIN; - } - return r >> 32; -} - -static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7); - return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; -} - -static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15); - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; -} - -static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d, - int round, bool *sat) -{ - int64_t m1 = (int64_t)a * b; - int64_t m2 = (int64_t)c * d; - int64_t r; - /* The same ordering issue as in do_vqdmladh_w applies here too */ - if (ssub64_overflow(m1, m2, &r) || - sadd64_overflow(r, (round << 30), &r) || - sadd64_overflow(r, r, &r)) { - *sat = true; - return r < 0 ? 
INT32_MAX : INT32_MIN; - } - return r >> 32; -} - -DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) -DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) -DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) -DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b) -DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h) -DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w) - -DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b) -DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h) -DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w) -DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) -DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) -DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) - -DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w) -DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w) - -DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w) -DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w) - -#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \ - mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* "accumulating" version where FN takes d as well as n and m */ -#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], \ - FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], \ - FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \ - mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op scalar helpers for all sizes */ -#define DO_2OP_SCALAR_U(OP, 
FN) \ - DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ - DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ - DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) -#define DO_2OP_SCALAR_S(OP, FN) \ - DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \ - DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \ - DO_2OP_SCALAR(OP##w, 4, int32_t, FN) - -#define DO_2OP_ACC_SCALAR_U(OP, FN) \ - DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN) \ - DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN) \ - DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN) - -DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) -DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) -DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) -DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s) -DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) -DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) -DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) - -DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B) -DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H) -DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W) -DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B) -DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H) -DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W) - -DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B) -DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H) -DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W) -DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) -DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) -DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) - -DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B) -DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H) -DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W) -DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) -DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) -DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) - -static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat) -{ - int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7); - return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; -} - -static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c, - int round, bool *sat) -{ - int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15); - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; -} - -static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c, - int round, bool *sat) -{ - /* - * Architecturally we should do the entire add, double, round - * and then check for saturation. We do three saturating adds, - * but we need to be careful about the order. If the first - * m1 + m2 saturates then it's impossible for the *2+rc to - * bring it back into the non-saturated range. However, if - * m1 + m2 is negative then it's possible that doing the doubling - * would take the intermediate result below INT64_MAX and the - * addition of the rounding constant then brings it back in range. - * So we add half the rounding constant and half the "c << esize" - * before doubling rather than adding the rounding constant after - * the doubling. - */ - int64_t m1 = (int64_t)a * b; - int64_t m2 = (int64_t)c << 31; - int64_t r; - if (sadd64_overflow(m1, m2, &r) || - sadd64_overflow(r, (round << 30), &r) || - sadd64_overflow(r, r, &r)) { - *sat = true; - return r < 0 ? 
INT32_MAX : INT32_MIN; - } - return r >> 32; -} - -/* - * The *MLAH insns are vector * scalar + vector; - * the *MLASH insns are vector * vector + scalar - */ -#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S) -#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S) -#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S) -#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S) -#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S) -#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S) - -#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S) -#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S) -#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S) -#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S) -#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S) -#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S) - -DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B) -DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H) -DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W) -DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B) -DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H) -DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W) - -DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B) -DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H) -DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W) -DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B) -DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H) -DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W) - -/* Vector by scalar plus vector */ -#define DO_VMLA(D, N, M) ((N) * (M) + (D)) - -DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA) - -/* Vector by vector plus scalar */ -#define DO_VMLAS(D, N, M) ((N) * (D) + (M)) - -DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS) - -/* - * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the - * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. - * SATMASK specifies which bits of the predicate mask matter for determining - * whether to propagate a saturation indication into FPSCR.QC -- for - * the 16x16->32 case we must check only the bit corresponding to the T or B - * half that we used, but for the 32x32->64 case we propagate if the mask - * bit is set for either half. 
- */ -#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - LTYPE *d = vd; \ - TYPE *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - bool qc = false; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \ - mergemask(&d[H##LESIZE(le)], r, mask); \ - qc |= sat && (mask & SATMASK); \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat) -{ - int64_t r = ((int64_t)n * m) * 2; - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat); -} - -static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat) -{ - /* The multiply can't overflow, but the doubling might */ - int64_t r = (int64_t)n * m; - if (r > INT64_MAX / 2) { - *sat = true; - return INT64_MAX; - } else if (r < INT64_MIN / 2) { - *sat = true; - return INT64_MIN; - } else { - return r * 2; - } -} - -#define SATMASK16B 1 -#define SATMASK16T (1 << 2) -#define SATMASK32 ((1 << 4) | 1) - -DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \ - do_qdmullh, SATMASK16B) -DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \ - do_qdmullw, SATMASK32) -DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ - do_qdmullh, SATMASK16T) -DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ - do_qdmullw, SATMASK32) - -/* - * Long saturating ops - */ -#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - void *vm) \ - { \ - LTYPE *d = vd; \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - bool qc = false; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \ - LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \ - mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \ - qc |= sat && (mask & SATMASK); \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B) -DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) -DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T) -DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) - -static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) -{ - m &= 0xff; - if (m == 0) { - return 0; - } - n = revbit8(n); - if (m < 8) { - n >>= 8 - m; - } - return n; -} - -static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m) -{ - m &= 0xff; - if (m == 0) { - return 0; - } - n = revbit16(n); - if (m < 16) { - n >>= 16 - m; - } - return n; -} - -static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m) -{ - m &= 0xff; - if (m == 0) { - return 0; - } - n = revbit32(n); - if (m < 32) { - n >>= 32 - m; - } - return n; -} - -DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb) -DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh) -DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw) - -/* - * Multiply add long dual accumulate ops. 
- */ -#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ - uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint64_t a) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *n = vn, *m = vm; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - if (e & 1) { \ - a ODDACC \ - (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ - } else { \ - a EVENACC \ - (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ - } \ - } \ - } \ - mve_advance_vpt(env); \ - return a; \ - } - -DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=) -DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=) -DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=) -DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) - -DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) -DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) - -DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) -DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) -DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) -DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) - -/* - * Multiply add dual accumulate ops - */ -#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint32_t a) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *n = vn, *m = vm; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - if (e & 1) { \ - a ODDACC \ - n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ - } else { \ - a EVENACC \ - n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ - } \ - } \ - } \ - mve_advance_vpt(env); \ - return a; \ - } - -#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC) - -#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC) - -DO_DAV_S(vmladavs, false, +=, +=) -DO_DAV_U(vmladavu, false, +=, +=) -DO_DAV_S(vmlsdav, false, +=, -=) -DO_DAV_S(vmladavsx, true, +=, +=) -DO_DAV_S(vmlsdavx, true, +=, -=) - -/* - * Rounding multiply add long dual accumulate high. In the pseudocode - * this is implemented with a 72-bit internal accumulator value of which - * the top 64 bits are returned. We optimize this to avoid having to - * use 128-bit arithmetic -- we can do this because the 74-bit accumulator - * is squashed back into 64-bits after each beat. 
- */ -#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \ - uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint64_t a) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *n = vn, *m = vm; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ - if (mask & 1) { \ - LTYPE mul; \ - if (e & 1) { \ - mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \ - if (SUB) { \ - mul = -mul; \ - } \ - } else { \ - mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \ - } \ - mul = (mul >> 8) + ((mul >> 7) & 1); \ - a += mul; \ - } \ - } \ - mve_advance_vpt(env); \ - return a; \ - } - -DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false) -DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false) - -DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false) - -DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true) -DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true) - -/* Vector add across vector */ -#define DO_VADDV(OP, ESIZE, TYPE) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint32_t ra) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - ra += m[H##ESIZE(e)]; \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -DO_VADDV(vaddvsb, 1, int8_t) -DO_VADDV(vaddvsh, 2, int16_t) -DO_VADDV(vaddvsw, 4, int32_t) -DO_VADDV(vaddvub, 1, uint8_t) -DO_VADDV(vaddvuh, 2, uint16_t) -DO_VADDV(vaddvuw, 4, uint32_t) - -/* - * Vector max/min across vector. Unlike VADDV, we must - * read ra as the element size, not its full width. - * We work with int64_t internally for simplicity. - */ -#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint32_t ra_in) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - int64_t ra = (RATYPE)ra_in; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - ra = FN(ra, m[H##ESIZE(e)]); \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -#define DO_VMAXMINV_U(INSN, FN) \ - DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \ - DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \ - DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN) -#define DO_VMAXMINV_S(INSN, FN) \ - DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \ - DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \ - DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN) - -/* - * Helpers for max and min of absolute values across vector: - * note that we only take the absolute value of 'm', not 'n' - */ -static int64_t do_maxa(int64_t n, int64_t m) -{ - if (m < 0) { - m = -m; - } - return MAX(n, m); -} - -static int64_t do_mina(int64_t n, int64_t m) -{ - if (m < 0) { - m = -m; - } - return MIN(n, m); -} - -DO_VMAXMINV_S(vmaxvs, DO_MAX) -DO_VMAXMINV_U(vmaxvu, DO_MAX) -DO_VMAXMINV_S(vminvs, DO_MIN) -DO_VMAXMINV_U(vminvu, DO_MIN) -/* - * VMAXAV, VMINAV treat the general purpose input as unsigned - * and the vector elements as signed. 
- */ -DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa) -DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa) -DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa) -DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina) -DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina) -DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina) - -#define DO_VABAV(OP, ESIZE, TYPE) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint32_t ra) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm, *n = vn; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - int64_t n0 = n[H##ESIZE(e)]; \ - int64_t m0 = m[H##ESIZE(e)]; \ - uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \ - ra += r; \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } - -DO_VABAV(vabavsb, 1, int8_t) -DO_VABAV(vabavsh, 2, int16_t) -DO_VABAV(vabavsw, 4, int32_t) -DO_VABAV(vabavub, 1, uint8_t) -DO_VABAV(vabavuh, 2, uint16_t) -DO_VABAV(vabavuw, 4, uint32_t) - -#define DO_VADDLV(OP, TYPE, LTYPE) \ - uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint64_t ra) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ - if (mask & 1) { \ - ra += (LTYPE)m[H4(e)]; \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -DO_VADDLV(vaddlv_s, int32_t, int64_t) -DO_VADDLV(vaddlv_u, uint32_t, uint64_t) - -/* Shifts by immediate */ -#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], \ - FN(m[H##ESIZE(e)], shift), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], \ - FN(m[H##ESIZE(e)], shift, &sat), mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op shift helpers for all sizes */ -#define DO_2SHIFT_U(OP, FN) \ - DO_2SHIFT(OP##b, 1, uint8_t, FN) \ - DO_2SHIFT(OP##h, 2, uint16_t, FN) \ - DO_2SHIFT(OP##w, 4, uint32_t, FN) -#define DO_2SHIFT_S(OP, FN) \ - DO_2SHIFT(OP##b, 1, int8_t, FN) \ - DO_2SHIFT(OP##h, 2, int16_t, FN) \ - DO_2SHIFT(OP##w, 4, int32_t, FN) - -#define DO_2SHIFT_SAT_U(OP, FN) \ - DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \ - DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \ - DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN) -#define DO_2SHIFT_SAT_S(OP, FN) \ - DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \ - DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \ - DO_2SHIFT_SAT(OP##w, 4, int32_t, FN) - -DO_2SHIFT_U(vshli_u, DO_VSHLU) -DO_2SHIFT_S(vshli_s, DO_VSHLS) -DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP) -DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP) -DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP) -DO_2SHIFT_U(vrshli_u, DO_VRSHLU) -DO_2SHIFT_S(vrshli_s, DO_VRSHLS) -DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP) -DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP) - -/* Shift-and-insert; we always work with 64 bits at a time */ -#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t 
shift) \ - { \ - uint64_t *d = vd, *m = vm; \ - uint16_t mask; \ - uint64_t shiftmask; \ - unsigned e; \ - if (shift == ESIZE * 8) { \ - /* \ - * Only VSRI can shift by <dt>
; it should mean "don't \ - * update the destination". The generic logic can't handle \ - * this because it would try to shift by an out-of-range \ - * amount, so special case it here. \ - */ \ - goto done; \ - } \ - assert(shift < ESIZE * 8); \ - mask = mve_element_mask(env); \ - /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \ - shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \ - for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ - uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \ - (d[H8(e)] & ~shiftmask); \ - mergemask(&d[H8(e)], r, mask); \ - } \ -done: \ - mve_advance_vpt(env); \ - } - -#define DO_SHL(N, SHIFT) ((N) << (SHIFT)) -#define DO_SHR(N, SHIFT) ((N) >> (SHIFT)) -#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT)) -#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT)) - -DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK) -DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK) -DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK) -DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK) -DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK) -DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK) - -/* - * Long shifts taking half-sized inputs from top or bottom of the input - * vector and producing a double-width result. ESIZE, TYPE are for - * the input, and LESIZE, LTYPE for the output. - * Unlike the normal shift helpers, we do not handle negative shift counts, - * because the long shift is strictly left-only. - */ -#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - LTYPE *d = vd; \ - TYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - assert(shift <= 16); \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \ - mergemask(&d[H##LESIZE(le)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSHLL_ALL(OP, TOP) \ - DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \ - DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \ - DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \ - DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \ - -DO_VSHLL_ALL(vshllb, false) -DO_VSHLL_ALL(vshllt, true) - -/* - * Narrowing right shifts, taking a double sized input, shifting it - * and putting the result in either the top or bottom half of the output. - * ESIZE, TYPE are the output, and LESIZE, LTYPE the input. 
- */ -#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - TYPE r = FN(m[H##LESIZE(le)], shift); \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSHRN_ALL(OP, FN) \ - DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \ - DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \ - DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \ - DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN) - -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. */ - return 0; - } -} - -DO_VSHRN_ALL(vshrn, DO_SHR) -DO_VSHRN_ALL(vrshrn, do_urshr) - -static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max, - bool *satp) -{ - if (val > max) { - *satp = true; - return max; - } else if (val < min) { - *satp = true; - return min; - } else { - return val; - } -} - -/* Saturating narrowing right shifts */ -#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - bool qc = false; \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ - DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) - -#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ - DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) - -#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ - DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) - -#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ - DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) - -#define DO_SHRN_SB(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP) -#define DO_SHRN_UB(N, M, SATP) \ - do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP) -#define DO_SHRUN_B(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP) - -#define DO_SHRN_SH(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP) -#define DO_SHRN_UH(N, M, SATP) \ - do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP) -#define DO_SHRUN_H(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP) - -#define DO_RSHRN_SB(N, M, SATP) \ - do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP) -#define DO_RSHRN_UB(N, M, SATP) \ - do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP) -#define DO_RSHRUN_B(N, M, SATP) \ - do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP) - -#define DO_RSHRN_SH(N, M, SATP) \ - 
do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP) -#define DO_RSHRN_UH(N, M, SATP) \ - do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP) -#define DO_RSHRUN_H(N, M, SATP) \ - do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP) - -DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB) -DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH) -DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB) -DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH) -DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B) -DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H) - -DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB) -DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH) -DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB) -DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH) -DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B) -DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H) - -#define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], \ - m[H##LESIZE(le)], mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t) -DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t) -DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t) -DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t) - -#define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - bool qc = false; \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - TYPE r = FN(m[H##LESIZE(le)], &sat); \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VMOVN_SAT_UB(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ - DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) - -#define DO_VMOVN_SAT_UH(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ - DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) - -#define DO_VMOVN_SAT_SB(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ - DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) - -#define DO_VMOVN_SAT_SH(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ - DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) - -#define DO_VQMOVN_SB(N, SATP) \ - do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP) -#define DO_VQMOVN_UB(N, SATP) \ - do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP) -#define DO_VQMOVUN_B(N, SATP) \ - do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP) - -#define DO_VQMOVN_SH(N, SATP) \ - do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP) -#define DO_VQMOVN_UH(N, SATP) \ - do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP) -#define DO_VQMOVUN_H(N, SATP) \ - do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP) - -DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB) -DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH) -DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB) -DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH) -DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B) -DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H) - -uint32_t HELPER(mve_vshlc)(CPUARMState *env, 
void *vd, uint32_t rdm, - uint32_t shift) -{ - uint32_t *d = vd; - uint16_t mask = mve_element_mask(env); - unsigned e; - uint32_t r; - - /* - * For each 32-bit element, we shift it left, bringing in the - * low 'shift' bits of rdm at the bottom. Bits shifted out at - * the top become the new rdm, if the predicate mask permits. - * The final rdm value is returned to update the register. - * shift == 0 here means "shift by 32 bits". - */ - if (shift == 0) { - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - r = rdm; - if (mask & 1) { - rdm = d[H4(e)]; - } - mergemask(&d[H4(e)], r, mask); - } - } else { - uint32_t shiftmask = MAKE_64BIT_MASK(0, shift); - - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - r = (d[H4(e)] << shift) | (rdm & shiftmask); - if (mask & 1) { - rdm = d[H4(e)] >> (32 - shift); - } - mergemask(&d[H4(e)], r, mask); - } - } - mve_advance_vpt(env); - return rdm; -} - -uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl_d(n, -(int8_t)shift, false, NULL); -} - -uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl_d(n, (int8_t)shift, false, NULL); -} - -uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl_d(n, (int8_t)shift, false, &env->QF); -} - -uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl_d(n, (int8_t)shift, false, &env->QF); -} - -uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF); -} - -uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl_d(n, (int8_t)shift, true, &env->QF); -} - -/* Operate on 64-bit values, but saturate at 48 bits */ -static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - int64_t val, extval; - - if (shift <= -48) { - /* Rounding the sign bit always produces 0. */ - if (round) { - return 0; - } - return src >> 63; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - val = (src >> 1) + (src & 1); - } else { - val = src >> -shift; - } - extval = sextract64(val, 0, 48); - if (!sat || val == extval) { - return extval; - } - } else if (shift < 48) { - int64_t extval = sextract64(src << shift, 0, 48); - if (!sat || src == (extval >> shift)) { - return extval; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return src >= 0 ? 
MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17); -} - -/* Operate on 64-bit values, but saturate at 48 bits */ -static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - uint64_t val, extval; - - if (shift <= -(48 + round)) { - return 0; - } else if (shift < 0) { - if (round) { - val = src >> (-shift - 1); - val = (val >> 1) + (val & 1); - } else { - val = src >> -shift; - } - extval = extract64(val, 0, 48); - if (!sat || val == extval) { - return extval; - } - } else if (shift < 48) { - uint64_t extval = extract64(src << shift, 0, 48); - if (!sat || src == (extval >> shift)) { - return extval; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return MAKE_64BIT_MASK(0, 48); -} - -uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF); -} - -uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF); -} - -uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); -} - -uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); -} - -uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF); -} - -uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF); -} - -#define DO_VIDUP(OP, ESIZE, TYPE, FN) \ - uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ - uint32_t offset, uint32_t imm) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], offset, mask); \ - offset = FN(offset, imm); \ - } \ - mve_advance_vpt(env); \ - return offset; \ - } - -#define DO_VIWDUP(OP, ESIZE, TYPE, FN) \ - uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ - uint32_t offset, uint32_t wrap, \ - uint32_t imm) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], offset, mask); \ - offset = FN(offset, wrap, imm); \ - } \ - mve_advance_vpt(env); \ - return offset; \ - } - -#define DO_VIDUP_ALL(OP, FN) \ - DO_VIDUP(OP##b, 1, int8_t, FN) \ - DO_VIDUP(OP##h, 2, int16_t, FN) \ - DO_VIDUP(OP##w, 4, int32_t, FN) - -#define DO_VIWDUP_ALL(OP, FN) \ - DO_VIWDUP(OP##b, 1, int8_t, FN) \ - DO_VIWDUP(OP##h, 2, int16_t, FN) \ - DO_VIWDUP(OP##w, 4, int32_t, FN) - -static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) -{ - offset += imm; - if (offset == wrap) { - offset = 0; - } - return offset; -} - -static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) -{ - if (offset == 0) { - offset = wrap; - } - offset -= imm; - return offset; -} - -DO_VIDUP_ALL(vidup, DO_ADD) -DO_VIWDUP_ALL(viwdup, do_add_wrap) -DO_VIWDUP_ALL(vdwdup, do_sub_wrap) - -/* - * Vector comparison. - * P0 bits for non-executed beats (where eci_mask is 0) are unchanged. - * P0 bits for predicated lanes in executed beats (where mask is 0) are 0. - * P0 bits otherwise are updated with the results of the comparisons. - * We must also keep unchanged the MASK fields at the top of v7m.vpr. 
- */ -#define DO_VCMP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ - { \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++) { \ - bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - emask <<= ESIZE; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - uint32_t rm) \ - { \ - TYPE *n = vn; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++) { \ - bool r = FN(n[H##ESIZE(e)], (TYPE)rm); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - emask <<= ESIZE; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_S(OP, FN) \ - DO_VCMP(OP##b, 1, int8_t, FN) \ - DO_VCMP(OP##h, 2, int16_t, FN) \ - DO_VCMP(OP##w, 4, int32_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN) - -#define DO_VCMP_U(OP, FN) \ - DO_VCMP(OP##b, 1, uint8_t, FN) \ - DO_VCMP(OP##h, 2, uint16_t, FN) \ - DO_VCMP(OP##w, 4, uint32_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN) - -#define DO_EQ(N, M) ((N) == (M)) -#define DO_NE(N, M) ((N) != (M)) -#define DO_GE(N, M) ((N) >= (M)) -#define DO_LT(N, M) ((N) < (M)) -#define DO_GT(N, M) ((N) > (M)) -#define DO_LE(N, M) ((N) <= (M)) - -DO_VCMP_U(vcmpeq, DO_EQ) -DO_VCMP_U(vcmpne, DO_NE) -DO_VCMP_U(vcmpcs, DO_GE) -DO_VCMP_U(vcmphi, DO_GT) -DO_VCMP_S(vcmpge, DO_GE) -DO_VCMP_S(vcmplt, DO_LT) -DO_VCMP_S(vcmpgt, DO_GT) -DO_VCMP_S(vcmple, DO_LE) - -void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - /* - * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n] - * but note that whether bytes are written to Qd is still subject - * to (all forms of) predication in the usual way. - */ - uint64_t *d = vd, *n = vn, *m = vm; - uint16_t mask = mve_element_mask(env); - uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); - unsigned e; - for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) { - uint64_t r = m[H8(e)]; - mergemask(&r, n[H8(e)], p0); - mergemask(&d[H8(e)], r, mask); - } - mve_advance_vpt(env); -} - -void HELPER(mve_vpnot)(CPUARMState *env) -{ - /* - * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged. - * P0 bits for predicated lanes in executed bits (where mask is 0) are 0. - * P0 bits otherwise are inverted. - * (This is the same logic as VCMP.) - * This insn is itself subject to predication and to beat-wise execution, - * and after it executes VPT state advances in the usual way.
- */ - uint16_t mask = mve_element_mask(env); - uint16_t eci_mask = mve_eci_mask(env); - uint16_t beatpred = ~env->v7m.vpr & mask; - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask); - mve_advance_vpt(env); -} - -/* - * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed, - * otherwise set according to value of Rn. The calculation of - * newmask here works in the same way as the calculation of the - * ltpmask in mve_element_mask(), but we have pre-calculated - * the masklen in the generated code. - */ -void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen) -{ - uint16_t mask = mve_element_mask(env); - uint16_t eci_mask = mve_eci_mask(env); - uint16_t newmask; - - assert(masklen <= 16); - newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; - newmask &= mask; - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask); - mve_advance_vpt(env); -} - -#define DO_1OP_SAT(OP, ESIZE, TYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VQABS_B(N, SATP) \ - do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP) -#define DO_VQABS_H(N, SATP) \ - do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP) -#define DO_VQABS_W(N, SATP) \ - do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP) - -#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP) -#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP) -#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP) - -DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B) -DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H) -DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W) - -DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B) -DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H) -DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W) - -/* - * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its - * absolute value; we then do an unsigned comparison. - */ -#define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - UTYPE *d = vd; \ - STYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - UTYPE r = DO_ABS(m[H##ESIZE(e)]); \ - r = FN(d[H##ESIZE(e)], r); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX) -DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX) -DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX) -DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN) -DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN) -DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN) - -/* - * 2-operand floating point. Note that if an element is partially - * predicated we must do the FP operation to update the non-predicated - * bytes, but we must be careful to avoid updating the FP exception - * state unless byte 0 of the element was unpredicated. 
- */ -#define DO_2OP_FP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_FP_ALL(OP, FN) \ - DO_2OP_FP(OP##h, 2, float16, float16_##FN) \ - DO_2OP_FP(OP##s, 4, float32, float32_##FN) - -DO_2OP_FP_ALL(vfadd, add) -DO_2OP_FP_ALL(vfsub, sub) -DO_2OP_FP_ALL(vfmul, mul) - -static inline float16 float16_abd(float16 a, float16 b, float_status *s) -{ - return float16_abs(float16_sub(a, b, s)); -} - -static inline float32 float32_abd(float32 a, float32 b, float_status *s) -{ - return float32_abs(float32_sub(a, b, s)); -} - -DO_2OP_FP_ALL(vfabd, abd) -DO_2OP_FP_ALL(vmaxnm, maxnum) -DO_2OP_FP_ALL(vminnm, minnum) - -static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s) -{ - return float16_maxnum(float16_abs(a), float16_abs(b), s); -} - -static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s) -{ - return float32_maxnum(float32_abs(a), float32_abs(b), s); -} - -static inline float16 float16_minnuma(float16 a, float16 b, float_status *s) -{ - return float16_minnum(float16_abs(a), float16_abs(b), s); -} - -static inline float32 float32_minnuma(float32 a, float32 b, float_status *s) -{ - return float32_minnum(float32_abs(a), float32_abs(b), s); -} - -DO_2OP_FP_ALL(vmaxnma, maxnuma) -DO_2OP_FP_ALL(vminnma, minnuma) - -#define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r[16 / ESIZE]; \ - uint16_t tm, mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - /* Calculate all results first to avoid overwriting inputs */ \ - for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) { \ - if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - r[e] = 0; \ - continue; \ - } \ - fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(tm & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - if (!(e & 1)) { \ - r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \ - } else { \ - r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \ - } \ - } \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], r[e], mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add) -DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add) -DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub) -DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub) - -#define DO_VFMA(OP, ESIZE, TYPE, CHS) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = n[H##ESIZE(e)]; \ - if (CHS) { \ - r = TYPE##_chs(r); \ - } \ - r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \ - 0, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VFMA(vfmah, 2, float16, false) -DO_VFMA(vfmas, 4, float32, false) -DO_VFMA(vfmsh, 2, float16, true) -DO_VFMA(vfmss, 4, float32, true) - -#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r0, r1, e1, e2, e3, e4; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst0, *fpst1; \ - float_status scratch_fpst; \ - /* We loop through pairs of elements at a time */ \ - for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) { \ - continue; \ - } \ - fpst0 = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - fpst1 = fpst0; \ - if (!(mask & 1)) { \ - scratch_fpst = *fpst0; \ - fpst0 = &scratch_fpst; \ - } \ - if (!(mask & (1 << ESIZE))) { \ - scratch_fpst = *fpst1; \ - fpst1 = &scratch_fpst; \ - } \ - switch (ROT) { \ - case 0: \ - e1 = m[H##ESIZE(e)]; \ - e2 = n[H##ESIZE(e)]; \ - e3 = m[H##ESIZE(e + 1)]; \ - e4 = n[H##ESIZE(e)]; \ - break; \ - case 1: \ - e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ - e2 = n[H##ESIZE(e + 1)]; \ - e3 = m[H##ESIZE(e)]; \ - e4 = n[H##ESIZE(e + 1)]; \ - break; \ - case 2: \ - e1 = TYPE##_chs(m[H##ESIZE(e)]); \ - e2 = n[H##ESIZE(e)]; \ - e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ - e4 = n[H##ESIZE(e)]; \ - break; \ - case 3: \ - e1 = m[H##ESIZE(e + 1)]; \ - e2 = n[H##ESIZE(e + 1)]; \ - e3 = TYPE##_chs(m[H##ESIZE(e)]); \ - e4 = n[H##ESIZE(e + 1)]; \ - break; \ - default: \ - g_assert_not_reached(); \ - } \ - r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0); \ - r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1); \ - mergemask(&d[H##ESIZE(e)], r0, mask); \ - mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S) -#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S) - -#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S) -#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S) - -DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH) -DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS) -DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH) -DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS) -DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH) -DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS) -DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH) -DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS) - -DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH) -DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS) -DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH) -DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS) -DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH) -DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS) -DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH) -DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS) - -#define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE r, m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_FP_SCALAR_ALL(OP, FN) \ - DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN) \ - DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN) - -DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add) -DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub) -DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul) - -#define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE r, m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -/* VFMAS is vector * vector + scalar, so swap op2 and op3 */ -#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S) -#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S) - -/* VFMA is vector * scalar + vector */ -DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd) -DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd) -DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH) -DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS) - -/* Floating point max/min across vector. */ -#define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint32_t ra_in) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - TYPE ra = (TYPE)ra_in; \ - float_status *fpst = (ESIZE == 2) ? 
\ - &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - TYPE v = m[H##ESIZE(e)]; \ - if (TYPE##_is_signaling_nan(ra, fpst)) { \ - ra = TYPE##_silence_nan(ra, fpst); \ - float_raise(float_flag_invalid, fpst); \ - } \ - if (TYPE##_is_signaling_nan(v, fpst)) { \ - v = TYPE##_silence_nan(v, fpst); \ - float_raise(float_flag_invalid, fpst); \ - } \ - if (ABS) { \ - v = TYPE##_abs(v); \ - } \ - ra = FN(ra, v, fpst); \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -#define NOP(X) (X) - -DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum) -DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum) -DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum) -DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum) -DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum) -DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum) -DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum) -DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum) - -/* FP compares; note that all comparisons signal InvalidOp for QNaNs */ -#define DO_VCMP_FP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ - { \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - bool r; \ - for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ - if ((mask & emask) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & (1 << (e * ESIZE)))) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - uint32_t rm) \ - { \ - TYPE *n = vn; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - bool r; \ - for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ - if ((mask & emask) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & (1 << (e * ESIZE)))) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN) \ - DO_VCMP_FP(VOP, ESIZE, TYPE, FN) \ - DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN) - -/* - * Some care is needed here to get the correct result for the unordered case. 
- * Architecturally EQ, GE and GT are defined to be false for unordered, but - * the NE, LT and LE comparisons are defined as simple logical inverses of - * EQ, GE and GT and so they must return true for unordered. The softfloat - * comparison functions float*_{eq,le,lt} all return false for unordered. - */ -#define DO_GE16(X, Y, S) float16_le(Y, X, S) -#define DO_GE32(X, Y, S) float32_le(Y, X, S) -#define DO_GT16(X, Y, S) float16_lt(Y, X, S) -#define DO_GT32(X, Y, S) float32_lt(Y, X, S) - -DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq) -DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq) - -DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq) -DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq) - -DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16) -DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32) - -DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16) -DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32) - -DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16) -DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32) - -DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16) -DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32) - -#define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm, \ - uint32_t shift) \ - { \ - TYPE *d = vd, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(m[H##ESIZE(e)], shift, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh) -DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh) -DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero) -DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero) -DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos) -DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos) -DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero) -DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero) - -/* VCVT with specified rmode */ -#define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vm, uint32_t rmode) \ - { \ - TYPE *d = vd, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - float_status *base_fpst = (ESIZE == 2) ? 
\ - &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - uint32_t prev_rmode = get_float_rounding_mode(base_fpst); \ - set_float_rounding_mode(rmode, base_fpst); \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = base_fpst; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(m[H##ESIZE(e)], 0, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - set_float_rounding_mode(prev_rmode, base_fpst); \ - mve_advance_vpt(env); \ - } - -DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh) -DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh) -DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls) -DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls) - -#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S) -#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S) - -DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H) -DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S) - -/* - * VCVT between halfprec and singleprec. As usual for halfprec - * conversions, FZ16 is ignored and AHP is observed. - */ -static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top) -{ - uint16_t *d = vd; - uint32_t *m = vm; - uint16_t r; - uint16_t mask = mve_element_mask(env); - bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); - unsigned e; - float_status *fpst; - float_status scratch_fpst; - float_status *base_fpst = &env->vfp.standard_fp_status; - bool old_fz = get_flush_to_zero(base_fpst); - set_flush_to_zero(false, base_fpst); - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { - continue; - } - fpst = base_fpst; - if (!(mask & 1)) { - /* We need the result but without updating flags */ - scratch_fpst = *fpst; - fpst = &scratch_fpst; - } - r = float32_to_float16(m[H4(e)], ieee, fpst); - mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2)); - } - set_flush_to_zero(old_fz, base_fpst); - mve_advance_vpt(env); -} - -static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top) -{ - uint32_t *d = vd; - uint16_t *m = vm; - uint32_t r; - uint16_t mask = mve_element_mask(env); - bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); - unsigned e; - float_status *fpst; - float_status scratch_fpst; - float_status *base_fpst = &env->vfp.standard_fp_status; - bool old_fiz = get_flush_inputs_to_zero(base_fpst); - set_flush_inputs_to_zero(false, base_fpst); - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { - continue; - } - fpst = base_fpst; - if (!(mask & (1 << (top * 2)))) { - /* We need the result but without updating flags */ - scratch_fpst = *fpst; - fpst = &scratch_fpst; - } - r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst); - mergemask(&d[H4(e)], r, mask); - } - set_flush_inputs_to_zero(old_fiz, base_fpst); - mve_advance_vpt(env); -} - -void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_sh(env, vd, vm, 0); -} -void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_sh(env, vd, vm, 1); -} -void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_hs(env, vd, vm, 0); -} -void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_hs(env, vd, vm, 1); -} - -#define DO_1OP_FP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm) \ - { \ - TYPE *d = vd, *m = vm; \ - TYPE r; \ - uint16_t mask = 
mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(m[H##ESIZE(e)], fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int) -DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int) diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c deleted file mode 100644 index bc6c4a5..0000000 --- a/target/arm/neon_helper.c +++ /dev/null @@ -1,1740 +0,0 @@ -/* - * ARM NEON vector operations. - * - * Copyright (c) 2007, 2008 CodeSourcery. - * Written by Paul Brook - * - * This code is licensed under the GNU GPL v2. - */ -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/helper-proto.h" -#include "fpu/softfloat.h" -#include "vec_internal.h" - -#define SIGNBIT (uint32_t)0x80000000 -#define SIGNBIT64 ((uint64_t)1 << 63) - -#define SET_QC() env->vfp.qc[0] = 1 - -#define NEON_TYPE1(name, type) \ -typedef struct \ -{ \ - type v1; \ -} neon_##name; -#if HOST_BIG_ENDIAN -#define NEON_TYPE2(name, type) \ -typedef struct \ -{ \ - type v2; \ - type v1; \ -} neon_##name; -#define NEON_TYPE4(name, type) \ -typedef struct \ -{ \ - type v4; \ - type v3; \ - type v2; \ - type v1; \ -} neon_##name; -#else -#define NEON_TYPE2(name, type) \ -typedef struct \ -{ \ - type v1; \ - type v2; \ -} neon_##name; -#define NEON_TYPE4(name, type) \ -typedef struct \ -{ \ - type v1; \ - type v2; \ - type v3; \ - type v4; \ -} neon_##name; -#endif - -NEON_TYPE4(s8, int8_t) -NEON_TYPE4(u8, uint8_t) -NEON_TYPE2(s16, int16_t) -NEON_TYPE2(u16, uint16_t) -NEON_TYPE1(s32, int32_t) -NEON_TYPE1(u32, uint32_t) -#undef NEON_TYPE4 -#undef NEON_TYPE2 -#undef NEON_TYPE1 - -/* Copy from a uint32_t to a vector structure type. */ -#define NEON_UNPACK(vtype, dest, val) do { \ - union { \ - vtype v; \ - uint32_t i; \ - } conv_u; \ - conv_u.i = (val); \ - dest = conv_u.v; \ - } while(0) - -/* Copy from a vector structure type to a uint32_t. */ -#define NEON_PACK(vtype, dest, val) do { \ - union { \ - vtype v; \ - uint32_t i; \ - } conv_u; \ - conv_u.v = (val); \ - dest = conv_u.i; \ - } while(0) - -#define NEON_DO1 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); -#define NEON_DO2 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); -#define NEON_DO4 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); - -#define NEON_VOP_BODY(vtype, n) \ -{ \ - uint32_t res; \ - vtype vsrc1; \ - vtype vsrc2; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, arg1); \ - NEON_UNPACK(vtype, vsrc2, arg2); \ - NEON_DO##n; \ - NEON_PACK(vtype, res, vdest); \ - return res; \ -} - -#define NEON_VOP(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ -NEON_VOP_BODY(vtype, n) - -#define NEON_VOP_ENV(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ -NEON_VOP_BODY(vtype, n) - -/* Pairwise operations. */ -/* For 32-bit elements each segment only contains a single element, so - the elementwise and pairwise operations are the same. 
*/ -#define NEON_PDO2 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); -#define NEON_PDO4 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ - -#define NEON_POP(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ -{ \ - uint32_t res; \ - vtype vsrc1; \ - vtype vsrc2; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, arg1); \ - NEON_UNPACK(vtype, vsrc2, arg2); \ - NEON_PDO##n; \ - NEON_PACK(vtype, res, vdest); \ - return res; \ -} - -/* Unary operators. */ -#define NEON_VOP1(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ -{ \ - vtype vsrc1; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, arg); \ - NEON_DO##n; \ - NEON_PACK(vtype, arg, vdest); \ - return arg; \ -} - - -#define NEON_USAT(dest, src1, src2, type) do { \ - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - dest = ~0; \ - } else { \ - dest = tmp; \ - }} while(0) -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) -NEON_VOP_ENV(qadd_u8, neon_u8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) -NEON_VOP_ENV(qadd_u16, neon_u16, 2) -#undef NEON_FN -#undef NEON_USAT - -uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (res < a) { - SET_QC(); - res = ~0; - } - return res; -} - -uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - res = src1 + src2; - if (res < src1) { - SET_QC(); - res = ~(uint64_t)0; - } - return res; -} - -#define NEON_SSAT(dest, src1, src2, type) do { \ - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - if (src2 > 0) { \ - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ - } else { \ - tmp = 1 << (sizeof(type) * 8 - 1); \ - } \ - } \ - dest = tmp; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) -NEON_VOP_ENV(qadd_s8, neon_s8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) -NEON_VOP_ENV(qadd_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_SSAT - -uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { - SET_QC(); - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - res = src1 + src2; - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { - SET_QC(); - res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; - } - return res; -} - -/* Unsigned saturating accumulate of signed value - * - * Op1/Rn is treated as signed - * Op2/Rd is treated as unsigned - * - * Explicit casting is used to ensure the correct sign extension of - * inputs. The result is treated as a unsigned value and saturated as such. - * - * We use a macro for the 8/16 bit cases which expects signed integers of va, - * vb, and vr for interim calculation and an unsigned 32 bit result value r. 
- */ - -#define USATACC(bits, shift) \ - do { \ - va = sextract32(a, shift, bits); \ - vb = extract32(b, shift, bits); \ - vr = va + vb; \ - if (vr > UINT##bits##_MAX) { \ - SET_QC(); \ - vr = UINT##bits##_MAX; \ - } else if (vr < 0) { \ - SET_QC(); \ - vr = 0; \ - } \ - r = deposit32(r, shift, bits, vr); \ - } while (0) - -uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int16_t va, vb, vr; - uint32_t r = 0; - - USATACC(8, 0); - USATACC(8, 8); - USATACC(8, 16); - USATACC(8, 24); - return r; -} - -uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int32_t va, vb, vr; - uint64_t r = 0; - - USATACC(16, 0); - USATACC(16, 16); - return r; -} - -#undef USATACC - -uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int64_t va = (int32_t)a; - int64_t vb = (uint32_t)b; - int64_t vr = va + vb; - if (vr > UINT32_MAX) { - SET_QC(); - vr = UINT32_MAX; - } else if (vr < 0) { - SET_QC(); - vr = 0; - } - return vr; -} - -uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint64_t res; - res = a + b; - /* We only need to look at the pattern of SIGN bits to detect - * +ve/-ve saturation - */ - if (~a & b & ~res & SIGNBIT64) { - SET_QC(); - res = UINT64_MAX; - } else if (a & ~b & res & SIGNBIT64) { - SET_QC(); - res = 0; - } - return res; -} - -/* Signed saturating accumulate of unsigned value - * - * Op1/Rn is treated as unsigned - * Op2/Rd is treated as signed - * - * The result is treated as a signed value and saturated as such - * - * We use a macro for the 8/16 bit cases which expects signed integers of va, - * vb, and vr for interim calculation and an unsigned 32 bit result value r. - */ - -#define SSATACC(bits, shift) \ - do { \ - va = extract32(a, shift, bits); \ - vb = sextract32(b, shift, bits); \ - vr = va + vb; \ - if (vr > INT##bits##_MAX) { \ - SET_QC(); \ - vr = INT##bits##_MAX; \ - } else if (vr < INT##bits##_MIN) { \ - SET_QC(); \ - vr = INT##bits##_MIN; \ - } \ - r = deposit32(r, shift, bits, vr); \ - } while (0) - -uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int16_t va, vb, vr; - uint32_t r = 0; - - SSATACC(8, 0); - SSATACC(8, 8); - SSATACC(8, 16); - SSATACC(8, 24); - return r; -} - -uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int32_t va, vb, vr; - uint32_t r = 0; - - SSATACC(16, 0); - SSATACC(16, 16); - - return r; -} - -#undef SSATACC - -uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int64_t res; - int64_t op1 = (uint32_t)a; - int64_t op2 = (int32_t)b; - res = op1 + op2; - if (res > INT32_MAX) { - SET_QC(); - res = INT32_MAX; - } else if (res < INT32_MIN) { - SET_QC(); - res = INT32_MIN; - } - return res; -} - -uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint64_t res; - res = a + b; - /* We only need to look at the pattern of SIGN bits to detect an overflow */ - if (((a & res) - | (~b & res) - | (a & ~b)) & SIGNBIT64) { - SET_QC(); - res = INT64_MAX; - } - return res; -} - - -#define NEON_USAT(dest, src1, src2, type) do { \ - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - dest = 0; \ - } else { \ - dest = tmp; \ - }} while(0) -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) -NEON_VOP_ENV(qsub_u8, neon_u8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) -NEON_VOP_ENV(qsub_u16, neon_u16, 2) -#undef NEON_FN -#undef NEON_USAT - 
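The NEON_USAT/NEON_SSAT macros above (and the qsub_* helpers that follow) all rely on the same pattern: do the arithmetic in a wider type, then clamp and set QC whenever the narrow lane type cannot represent the result. A minimal standalone sketch of that pattern in plain C follows; it is an illustration only, not part of the patch, and the names qsub_u8_lane and the local qc flag are invented stand-ins for the per-lane expansion and env->vfp.qc[0] used by the real helpers.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Illustrative sketch: unsigned saturating subtract of one 8-bit lane,
 * computed in a wider type and clamped, with 'qc' standing in for the
 * cumulative saturation flag the real helpers keep in env->vfp.qc. */
static uint8_t qsub_u8_lane(uint8_t a, uint8_t b, bool *qc)
{
    int16_t tmp = (int16_t)a - (int16_t)b;   /* widen before subtracting */
    if (tmp < 0) {                           /* would wrap: saturate to 0 */
        *qc = true;
        return 0;
    }
    return (uint8_t)tmp;
}

int main(void)
{
    bool qc = false;
    printf("%u\n", qsub_u8_lane(10, 3, &qc));   /* 7, qc unchanged */
    printf("%u\n", qsub_u8_lane(3, 10, &qc));   /* 0, qc now set   */
    printf("qc=%d\n", qc);
    return 0;
}

The helpers being removed here apply the same check to each lane via the NEON_VOP_ENV expansion and record saturation in QC rather than a local flag.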
-uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (res > a) { - SET_QC(); - res = 0; - } - return res; -} - -uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - if (src1 < src2) { - SET_QC(); - res = 0; - } else { - res = src1 - src2; - } - return res; -} - -#define NEON_SSAT(dest, src1, src2, type) do { \ - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - if (src2 < 0) { \ - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ - } else { \ - tmp = 1 << (sizeof(type) * 8 - 1); \ - } \ - } \ - dest = tmp; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) -NEON_VOP_ENV(qsub_s8, neon_s8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) -NEON_VOP_ENV(qsub_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_SSAT - -uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { - SET_QC(); - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - res = src1 - src2; - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { - SET_QC(); - res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; - } - return res; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 -NEON_VOP(hadd_s8, neon_s8, 4) -NEON_VOP(hadd_u8, neon_u8, 4) -NEON_VOP(hadd_s16, neon_s16, 2) -NEON_VOP(hadd_u16, neon_u16, 2) -#undef NEON_FN - -int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) -{ - int32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if (src1 & src2 & 1) - dest++; - return dest; -} - -uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) -{ - uint32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if (src1 & src2 & 1) - dest++; - return dest; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 -NEON_VOP(rhadd_s8, neon_s8, 4) -NEON_VOP(rhadd_u8, neon_u8, 4) -NEON_VOP(rhadd_s16, neon_s16, 2) -NEON_VOP(rhadd_u16, neon_u16, 2) -#undef NEON_FN - -int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) -{ - int32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if ((src1 | src2) & 1) - dest++; - return dest; -} - -uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) -{ - uint32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if ((src1 | src2) & 1) - dest++; - return dest; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 -NEON_VOP(hsub_s8, neon_s8, 4) -NEON_VOP(hsub_u8, neon_u8, 4) -NEON_VOP(hsub_s16, neon_s16, 2) -NEON_VOP(hsub_u16, neon_u16, 2) -#undef NEON_FN - -int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) -{ - int32_t dest; - - dest = (src1 >> 1) - (src2 >> 1); - if ((~src1) & src2 & 1) - dest--; - return dest; -} - -uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) -{ - uint32_t dest; - - dest = (src1 >> 1) - (src2 >> 1); - if ((~src1) & src2 & 1) - dest--; - return dest; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 -NEON_POP(pmin_s8, neon_s8, 4) -NEON_POP(pmin_u8, neon_u8, 4) -NEON_POP(pmin_s16, neon_s16, 2) -NEON_POP(pmin_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? 
src1 : src2 -NEON_POP(pmax_s8, neon_s8, 4) -NEON_POP(pmax_u8, neon_u8, 4) -NEON_POP(pmax_s16, neon_s16, 2) -NEON_POP(pmax_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) -NEON_VOP(shl_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) -NEON_VOP(shl_s16, neon_s16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) -NEON_VOP(rshl_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) -NEON_VOP(rshl_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) -{ - return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); -} - -uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) -{ - return do_sqrshl_d(val, (int8_t)shift, true, NULL); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) -NEON_VOP(rshl_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) -NEON_VOP(rshl_u16, neon_u16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) -{ - return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); -} - -uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) -{ - return do_uqrshl_d(val, (int8_t)shift, true, NULL); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_u16, neon_u16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); -} - -uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); -} - -uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) -NEON_VOP_ENV(qshlu_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) -NEON_VOP_ENV(qshlu_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); -} - -uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 
-NEON_VOP_ENV(qrshl_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) -NEON_VOP_ENV(qrshl_u16, neon_u16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); -} - -uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) -NEON_VOP_ENV(qrshl_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) -NEON_VOP_ENV(qrshl_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); -} - -uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); -} - -uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) -{ - uint32_t mask; - mask = (a ^ b) & 0x80808080u; - a &= ~0x80808080u; - b &= ~0x80808080u; - return (a + b) ^ mask; -} - -uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) -{ - uint32_t mask; - mask = (a ^ b) & 0x80008000u; - a &= ~0x80008000u; - b &= ~0x80008000u; - return (a + b) ^ mask; -} - -#define NEON_FN(dest, src1, src2) dest = src1 + src2 -NEON_POP(padd_u8, neon_u8, 4) -NEON_POP(padd_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = src1 - src2 -NEON_VOP(sub_u8, neon_u8, 4) -NEON_VOP(sub_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = src1 * src2 -NEON_VOP(mul_u8, neon_u8, 4) -NEON_VOP(mul_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 -NEON_VOP(tst_u8, neon_u8, 4) -NEON_VOP(tst_u16, neon_u16, 2) -NEON_VOP(tst_u32, neon_u32, 1) -#undef NEON_FN - -/* Count Leading Sign/Zero Bits. */ -static inline int do_clz8(uint8_t x) -{ - int n; - for (n = 8; x; n--) - x >>= 1; - return n; -} - -static inline int do_clz16(uint16_t x) -{ - int n; - for (n = 16; x; n--) - x >>= 1; - return n; -} - -#define NEON_FN(dest, src, dummy) dest = do_clz8(src) -NEON_VOP1(clz_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src, dummy) dest = do_clz16(src) -NEON_VOP1(clz_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 -NEON_VOP1(cls_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 -NEON_VOP1(cls_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_cls_s32)(uint32_t x) -{ - int count; - if ((int32_t)x < 0) - x = ~x; - for (count = 32; x; count--) - x = x >> 1; - return count - 1; -} - -/* Bit count. 
*/ -uint32_t HELPER(neon_cnt_u8)(uint32_t x) -{ - x = (x & 0x55555555) + ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); - return x; -} - -/* Reverse bits in each 8 bit word */ -uint32_t HELPER(neon_rbit_u8)(uint32_t x) -{ - x = ((x & 0xf0f0f0f0) >> 4) - | ((x & 0x0f0f0f0f) << 4); - x = ((x & 0x88888888) >> 3) - | ((x & 0x44444444) >> 1) - | ((x & 0x22222222) << 1) - | ((x & 0x11111111) << 3); - return x; -} - -#define NEON_QDMULH16(dest, src1, src2, round) do { \ - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ - SET_QC(); \ - tmp = (tmp >> 31) ^ ~SIGNBIT; \ - } else { \ - tmp <<= 1; \ - } \ - if (round) { \ - int32_t old = tmp; \ - tmp += 1 << 15; \ - if ((int32_t)tmp < old) { \ - SET_QC(); \ - tmp = SIGNBIT - 1; \ - } \ - } \ - dest = tmp >> 16; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) -NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) -NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_QDMULH16 - -#define NEON_QDMULH32(dest, src1, src2, round) do { \ - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ - SET_QC(); \ - tmp = (tmp >> 63) ^ ~SIGNBIT64; \ - } else { \ - tmp <<= 1; \ - } \ - if (round) { \ - int64_t old = tmp; \ - tmp += (int64_t)1 << 31; \ - if ((int64_t)tmp < old) { \ - SET_QC(); \ - tmp = SIGNBIT64 - 1; \ - } \ - } \ - dest = tmp >> 32; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) -NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) -NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) -#undef NEON_FN -#undef NEON_QDMULH32 - -uint32_t HELPER(neon_narrow_u8)(uint64_t x) -{ - return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) - | ((x >> 24) & 0xff000000u); -} - -uint32_t HELPER(neon_narrow_u16)(uint64_t x) -{ - return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); -} - -uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) -{ - return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) - | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); -} - -uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) -{ - return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); -} - -uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) -{ - x &= 0xff80ff80ff80ff80ull; - x += 0x0080008000800080ull; - return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) - | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); -} - -uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) -{ - x &= 0xffff8000ffff8000ull; - x += 0x0000800000008000ull; - return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); -} - -uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) -{ - uint16_t s; - uint8_t d; - uint32_t res = 0; -#define SAT8(n) \ - s = x >> n; \ - if (s & 0x8000) { \ - SET_QC(); \ - } else { \ - if (s > 0xff) { \ - d = 0xff; \ - SET_QC(); \ - } else { \ - d = s; \ - } \ - res |= (uint32_t)d << (n / 2); \ - } - - SAT8(0); - SAT8(16); - SAT8(32); - SAT8(48); -#undef SAT8 - return res; -} - -uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) -{ - uint16_t s; - uint8_t d; - uint32_t res = 0; -#define SAT8(n) \ - s = x >> n; \ - if (s > 0xff) { \ - d = 0xff; \ - SET_QC(); \ - } else { \ - d = s; \ - } \ - res |= (uint32_t)d << (n / 2); - - SAT8(0); - SAT8(16); - SAT8(32); - SAT8(48); -#undef SAT8 
- return res; -} - -uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) -{ - int16_t s; - uint8_t d; - uint32_t res = 0; -#define SAT8(n) \ - s = x >> n; \ - if (s != (int8_t)s) { \ - d = (s >> 15) ^ 0x7f; \ - SET_QC(); \ - } else { \ - d = s; \ - } \ - res |= (uint32_t)d << (n / 2); - - SAT8(0); - SAT8(16); - SAT8(32); - SAT8(48); -#undef SAT8 - return res; -} - -uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) -{ - uint32_t high; - uint32_t low; - low = x; - if (low & 0x80000000) { - low = 0; - SET_QC(); - } else if (low > 0xffff) { - low = 0xffff; - SET_QC(); - } - high = x >> 32; - if (high & 0x80000000) { - high = 0; - SET_QC(); - } else if (high > 0xffff) { - high = 0xffff; - SET_QC(); - } - return low | (high << 16); -} - -uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) -{ - uint32_t high; - uint32_t low; - low = x; - if (low > 0xffff) { - low = 0xffff; - SET_QC(); - } - high = x >> 32; - if (high > 0xffff) { - high = 0xffff; - SET_QC(); - } - return low | (high << 16); -} - -uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) -{ - int32_t low; - int32_t high; - low = x; - if (low != (int16_t)low) { - low = (low >> 31) ^ 0x7fff; - SET_QC(); - } - high = x >> 32; - if (high != (int16_t)high) { - high = (high >> 31) ^ 0x7fff; - SET_QC(); - } - return (uint16_t)low | (high << 16); -} - -uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) -{ - if (x & 0x8000000000000000ull) { - SET_QC(); - return 0; - } - if (x > 0xffffffffu) { - SET_QC(); - return 0xffffffffu; - } - return x; -} - -uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) -{ - if (x > 0xffffffffu) { - SET_QC(); - return 0xffffffffu; - } - return x; -} - -uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) -{ - if ((int64_t)x != (int32_t)x) { - SET_QC(); - return ((int64_t)x >> 63) ^ 0x7fffffff; - } - return x; -} - -uint64_t HELPER(neon_widen_u8)(uint32_t x) -{ - uint64_t tmp; - uint64_t ret; - ret = (uint8_t)x; - tmp = (uint8_t)(x >> 8); - ret |= tmp << 16; - tmp = (uint8_t)(x >> 16); - ret |= tmp << 32; - tmp = (uint8_t)(x >> 24); - ret |= tmp << 48; - return ret; -} - -uint64_t HELPER(neon_widen_s8)(uint32_t x) -{ - uint64_t tmp; - uint64_t ret; - ret = (uint16_t)(int8_t)x; - tmp = (uint16_t)(int8_t)(x >> 8); - ret |= tmp << 16; - tmp = (uint16_t)(int8_t)(x >> 16); - ret |= tmp << 32; - tmp = (uint16_t)(int8_t)(x >> 24); - ret |= tmp << 48; - return ret; -} - -uint64_t HELPER(neon_widen_u16)(uint32_t x) -{ - uint64_t high = (uint16_t)(x >> 16); - return ((uint16_t)x) | (high << 32); -} - -uint64_t HELPER(neon_widen_s16)(uint32_t x) -{ - uint64_t high = (int16_t)(x >> 16); - return ((uint32_t)(int16_t)x) | (high << 32); -} - -uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ b) & 0x8000800080008000ull; - a &= ~0x8000800080008000ull; - b &= ~0x8000800080008000ull; - return (a + b) ^ mask; -} - -uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ b) & 0x8000000080000000ull; - a &= ~0x8000000080000000ull; - b &= ~0x8000000080000000ull; - return (a + b) ^ mask; -} - -uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) -{ - uint64_t tmp; - uint64_t tmp2; - - tmp = a & 0x0000ffff0000ffffull; - tmp += (a >> 16) & 0x0000ffff0000ffffull; - tmp2 = b & 0xffff0000ffff0000ull; - tmp2 += (b << 16) & 0xffff0000ffff0000ull; - return ( tmp & 0xffff) - | ((tmp >> 16) & 0xffff0000ull) - | ((tmp2 << 16) & 0xffff00000000ull) - | ( tmp2 & 
0xffff000000000000ull); -} - -uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) -{ - uint32_t low = a + (a >> 32); - uint32_t high = b + (b >> 32); - return low + ((uint64_t)high << 32); -} - -uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ ~b) & 0x8000800080008000ull; - a |= 0x8000800080008000ull; - b &= ~0x8000800080008000ull; - return (a - b) ^ mask; -} - -uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ ~b) & 0x8000000080000000ull; - a |= 0x8000000080000000ull; - b &= ~0x8000000080000000ull; - return (a - b) ^ mask; -} - -uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint32_t x, y; - uint32_t low, high; - - x = a; - y = b; - low = x + y; - if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { - SET_QC(); - low = ((int32_t)x >> 31) ^ ~SIGNBIT; - } - x = a >> 32; - y = b >> 32; - high = x + y; - if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { - SET_QC(); - high = ((int32_t)x >> 31) ^ ~SIGNBIT; - } - return low | ((uint64_t)high << 32); -} - -uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint64_t result; - - result = a + b; - if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { - SET_QC(); - result = ((int64_t)a >> 63) ^ ~SIGNBIT64; - } - return result; -} - -/* We have to do the arithmetic in a larger type than - * the input type, because for example with a signed 32 bit - * op the absolute difference can overflow a signed 32 bit value. - */ -#define DO_ABD(dest, x, y, intype, arithtype) do { \ - arithtype tmp_x = (intype)(x); \ - arithtype tmp_y = (intype)(y); \ - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ - } while(0) - -uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, uint8_t, uint32_t); - DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); - result |= tmp << 16; - DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); - result |= tmp << 32; - DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, int8_t, int32_t); - DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); - result |= tmp << 16; - DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); - result |= tmp << 32; - DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, uint16_t, uint32_t); - DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, int16_t, int32_t); - DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) -{ - uint64_t result; - DO_ABD(result, a, b, uint32_t, uint64_t); - return result; -} - -uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) -{ - uint64_t result; - DO_ABD(result, a, b, int32_t, int64_t); - return result; -} -#undef DO_ABD - -/* Widening multiply. Named type is the source type. 
*/ -#define DO_MULL(dest, x, y, type1, type2) do { \ - type1 tmp_x = x; \ - type1 tmp_y = y; \ - dest = (type2)((type2)tmp_x * (type2)tmp_y); \ - } while(0) - -uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, uint8_t, uint16_t); - DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); - result |= tmp << 16; - DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); - result |= tmp << 32; - DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, int8_t, uint16_t); - DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); - result |= tmp << 16; - DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); - result |= tmp << 32; - DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, uint16_t, uint32_t); - DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, int16_t, uint32_t); - DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_negl_u16)(uint64_t x) -{ - uint16_t tmp; - uint64_t result; - result = (uint16_t)-x; - tmp = -(x >> 16); - result |= (uint64_t)tmp << 16; - tmp = -(x >> 32); - result |= (uint64_t)tmp << 32; - tmp = -(x >> 48); - result |= (uint64_t)tmp << 48; - return result; -} - -uint64_t HELPER(neon_negl_u32)(uint64_t x) -{ - uint32_t low = -x; - uint32_t high = -(x >> 32); - return low | ((uint64_t)high << 32); -} - -/* Saturating sign manipulation. */ -/* ??? 
Make these use NEON_VOP1 */ -#define DO_QABS8(x) do { \ - if (x == (int8_t)0x80) { \ - x = 0x7f; \ - SET_QC(); \ - } else if (x < 0) { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) -{ - neon_s8 vec; - NEON_UNPACK(neon_s8, vec, x); - DO_QABS8(vec.v1); - DO_QABS8(vec.v2); - DO_QABS8(vec.v3); - DO_QABS8(vec.v4); - NEON_PACK(neon_s8, x, vec); - return x; -} -#undef DO_QABS8 - -#define DO_QNEG8(x) do { \ - if (x == (int8_t)0x80) { \ - x = 0x7f; \ - SET_QC(); \ - } else { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) -{ - neon_s8 vec; - NEON_UNPACK(neon_s8, vec, x); - DO_QNEG8(vec.v1); - DO_QNEG8(vec.v2); - DO_QNEG8(vec.v3); - DO_QNEG8(vec.v4); - NEON_PACK(neon_s8, x, vec); - return x; -} -#undef DO_QNEG8 - -#define DO_QABS16(x) do { \ - if (x == (int16_t)0x8000) { \ - x = 0x7fff; \ - SET_QC(); \ - } else if (x < 0) { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) -{ - neon_s16 vec; - NEON_UNPACK(neon_s16, vec, x); - DO_QABS16(vec.v1); - DO_QABS16(vec.v2); - NEON_PACK(neon_s16, x, vec); - return x; -} -#undef DO_QABS16 - -#define DO_QNEG16(x) do { \ - if (x == (int16_t)0x8000) { \ - x = 0x7fff; \ - SET_QC(); \ - } else { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) -{ - neon_s16 vec; - NEON_UNPACK(neon_s16, vec, x); - DO_QNEG16(vec.v1); - DO_QNEG16(vec.v2); - NEON_PACK(neon_s16, x, vec); - return x; -} -#undef DO_QNEG16 - -uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) -{ - if (x == SIGNBIT) { - SET_QC(); - x = ~SIGNBIT; - } else if ((int32_t)x < 0) { - x = -x; - } - return x; -} - -uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) -{ - if (x == SIGNBIT) { - SET_QC(); - x = ~SIGNBIT; - } else { - x = -x; - } - return x; -} - -uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) -{ - if (x == SIGNBIT64) { - SET_QC(); - x = ~SIGNBIT64; - } else if ((int64_t)x < 0) { - x = -x; - } - return x; -} - -uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) -{ - if (x == SIGNBIT64) { - SET_QC(); - x = ~SIGNBIT64; - } else { - x = -x; - } - return x; -} - -/* NEON Float helpers. */ - -/* Floating point comparisons produce an integer result. - * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. - * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
- */ -uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); -} - -uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float32_le(make_float32(b), make_float32(a), fpst); -} - -uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float32_lt(make_float32(b), make_float32(a), fpst); -} - -uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float32 f0 = float32_abs(make_float32(a)); - float32 f1 = float32_abs(make_float32(b)); - return -float32_le(f1, f0, fpst); -} - -uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float32 f0 = float32_abs(make_float32(a)); - float32 f1 = float32_abs(make_float32(b)); - return -float32_lt(f1, f0, fpst); -} - -uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float64 f0 = float64_abs(make_float64(a)); - float64 f1 = float64_abs(make_float64(b)); - return -float64_le(f1, f0, fpst); -} - -uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float64 f0 = float64_abs(make_float64(a)); - float64 f1 = float64_abs(make_float64(b)); - return -float64_lt(f1, f0, fpst); -} - -#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) - -void HELPER(neon_qunzip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) - | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) - | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) - | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); - uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) - | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) - | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) - | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); - uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) - | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) - | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) - | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); - uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) - | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) - | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) - | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qunzip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) - | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); - uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) - | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); - uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) - | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); - uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) - | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qunzip32)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); - uint64_t d1 = 
ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); - uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); - uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_unzip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) - | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) - | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) - | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); - uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) - | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) - | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) - | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); - - rm[0] = m0; - rd[0] = d0; -} - -void HELPER(neon_unzip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) - | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); - uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) - | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); - - rm[0] = m0; - rd[0] = d0; -} - -void HELPER(neon_qzip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) - | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) - | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) - | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); - uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) - | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) - | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) - | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); - uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) - | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) - | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) - | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); - uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) - | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) - | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) - | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qzip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) - | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); - uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) - | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); - uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) - | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); - uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) - | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qzip32)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); - uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); - uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); - uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_zip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - 
uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) - | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) - | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) - | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); - uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) - | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) - | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) - | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); - - rm[0] = m0; - rd[0] = d0; -} - -void HELPER(neon_zip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) - | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); - uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) - | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); - - rm[0] = m0; - rd[0] = d0; -} diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c deleted file mode 100644 index 3baf800..0000000 --- a/target/arm/op_helper.c +++ /dev/null @@ -1,1069 +0,0 @@ -/* - * ARM helper routines - * - * Copyright (c) 2005-2007 CodeSourcery, LLC - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ -#include "qemu/osdep.h" -#include "qemu/main-loop.h" -#include "cpu.h" -#include "exec/helper-proto.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" -#include "cpregs.h" - -#define SIGNBIT (uint32_t)0x80000000 -#define SIGNBIT64 ((uint64_t)1 << 63) - -int exception_target_el(CPUARMState *env) -{ - int target_el = MAX(1, arm_current_el(env)); - - /* - * No such thing as secure EL1 if EL3 is aarch32, - * so update the target EL to EL3 in this case. - */ - if (arm_is_secure(env) && !arm_el_is_aa64(env, 3) && target_el == 1) { - target_el = 3; - } - - return target_el; -} - -void raise_exception(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el) -{ - CPUState *cs = env_cpu(env); - - if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { - /* - * Redirect NS EL1 exceptions to NS EL2. These are reported with - * their original syndrome register value, with the exception of - * SIMD/FP access traps, which are reported as uncategorized - * (see DDI0478C.a D1.10.4) - */ - target_el = 2; - if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) { - syndrome = syn_uncategorized(); - } - } - - assert(!excp_is_internal(excp)); - cs->exception_index = excp; - env->exception.syndrome = syndrome; - env->exception.target_el = target_el; - cpu_loop_exit(cs); -} - -void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, - uint32_t target_el, uintptr_t ra) -{ - CPUState *cs = env_cpu(env); - - /* - * restore_state_to_opc() will set env->exception.syndrome, so - * we must restore CPU state here before setting the syndrome - * the caller passed us, and cannot use cpu_loop_exit_restore(). 
- */ - cpu_restore_state(cs, ra); - raise_exception(env, excp, syndrome, target_el); -} - -uint64_t HELPER(neon_tbl)(CPUARMState *env, uint32_t desc, - uint64_t ireg, uint64_t def) -{ - uint64_t tmp, val = 0; - uint32_t maxindex = ((desc & 3) + 1) * 8; - uint32_t base_reg = desc >> 2; - uint32_t shift, index, reg; - - for (shift = 0; shift < 64; shift += 8) { - index = (ireg >> shift) & 0xff; - if (index < maxindex) { - reg = base_reg + (index >> 3); - tmp = *aa32_vfp_dreg(env, reg); - tmp = ((tmp >> ((index & 7) << 3)) & 0xff) << shift; - } else { - tmp = def & (0xffull << shift); - } - val |= tmp; - } - return val; -} - -void HELPER(v8m_stackcheck)(CPUARMState *env, uint32_t newvalue) -{ - /* - * Perform the v8M stack limit check for SP updates from translated code, - * raising an exception if the limit is breached. - */ - if (newvalue < v7m_sp_limit(env)) { - /* - * Stack limit exceptions are a rare case, so rather than syncing - * PC/condbits before the call, we use raise_exception_ra() so - * that cpu_restore_state() will sort them out. - */ - raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); - } -} - -uint32_t HELPER(add_setq)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) - env->QF = 1; - return res; -} - -uint32_t HELPER(add_saturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { - env->QF = 1; - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint32_t HELPER(sub_saturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { - env->QF = 1; - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint32_t HELPER(add_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (res < a) { - env->QF = 1; - res = ~0; - } - return res; -} - -uint32_t HELPER(sub_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (res > a) { - env->QF = 1; - res = 0; - } - return res; -} - -/* Signed saturation. */ -static inline uint32_t do_ssat(CPUARMState *env, int32_t val, int shift) -{ - int32_t top; - uint32_t mask; - - top = val >> shift; - mask = (1u << shift) - 1; - if (top > 0) { - env->QF = 1; - return mask; - } else if (top < -1) { - env->QF = 1; - return ~mask; - } - return val; -} - -/* Unsigned saturation. */ -static inline uint32_t do_usat(CPUARMState *env, int32_t val, int shift) -{ - uint32_t max; - - max = (1u << shift) - 1; - if (val < 0) { - env->QF = 1; - return 0; - } else if (val > max) { - env->QF = 1; - return max; - } - return val; -} - -/* Signed saturate. */ -uint32_t HELPER(ssat)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - return do_ssat(env, x, shift); -} - -/* Dual halfword signed saturate. */ -uint32_t HELPER(ssat16)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - uint32_t res; - - res = (uint16_t)do_ssat(env, (int16_t)x, shift); - res |= do_ssat(env, ((int32_t)x) >> 16, shift) << 16; - return res; -} - -/* Unsigned saturate. */ -uint32_t HELPER(usat)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - return do_usat(env, x, shift); -} - -/* Dual halfword unsigned saturate. 
*/ -uint32_t HELPER(usat16)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - uint32_t res; - - res = (uint16_t)do_usat(env, (int16_t)x, shift); - res |= do_usat(env, ((int32_t)x) >> 16, shift) << 16; - return res; -} - -void HELPER(setend)(CPUARMState *env) -{ - env->uncached_cpsr ^= CPSR_E; - arm_rebuild_hflags(env); -} - -void HELPER(check_bxj_trap)(CPUARMState *env, uint32_t rm) -{ - /* - * Only called if in NS EL0 or EL1 for a BXJ for a v7A CPU; - * check if HSTR.TJDBX means we need to trap to EL2. - */ - if (env->cp15.hstr_el2 & HSTR_TJDBX) { - /* - * We know the condition code check passed, so take the IMPDEF - * choice to always report CV=1 COND 0xe - */ - uint32_t syn = syn_bxjtrap(1, 0xe, rm); - raise_exception_ra(env, EXCP_HYP_TRAP, syn, 2, GETPC()); - } -} - -#ifndef CONFIG_USER_ONLY -/* Function checks whether WFx (WFI/WFE) instructions are set up to be trapped. - * The function returns the target EL (1-3) if the instruction is to be trapped; - * otherwise it returns 0 indicating it is not trapped. - */ -static inline int check_wfx_trap(CPUARMState *env, bool is_wfe) -{ - int cur_el = arm_current_el(env); - uint64_t mask; - - if (arm_feature(env, ARM_FEATURE_M)) { - /* M profile cores can never trap WFI/WFE. */ - return 0; - } - - /* If we are currently in EL0 then we need to check if SCTLR is set up for - * WFx instructions being trapped to EL1. These trap bits don't exist in v7. - */ - if (cur_el < 1 && arm_feature(env, ARM_FEATURE_V8)) { - int target_el; - - mask = is_wfe ? SCTLR_nTWE : SCTLR_nTWI; - if (arm_is_secure_below_el3(env) && !arm_el_is_aa64(env, 3)) { - /* Secure EL0 and Secure PL1 is at EL3 */ - target_el = 3; - } else { - target_el = 1; - } - - if (!(env->cp15.sctlr_el[target_el] & mask)) { - return target_el; - } - } - - /* We are not trapping to EL1; trap to EL2 if HCR_EL2 requires it - * No need for ARM_FEATURE check as if HCR_EL2 doesn't exist the - * bits will be zero indicating no trap. - */ - if (cur_el < 2) { - mask = is_wfe ? HCR_TWE : HCR_TWI; - if (arm_hcr_el2_eff(env) & mask) { - return 2; - } - } - - /* We are not trapping to EL1 or EL2; trap to EL3 if SCR_EL3 requires it */ - if (cur_el < 3) { - mask = (is_wfe) ? SCR_TWE : SCR_TWI; - if (env->cp15.scr_el3 & mask) { - return 3; - } - } - - return 0; -} -#endif - -void HELPER(wfi)(CPUARMState *env, uint32_t insn_len) -{ -#ifdef CONFIG_USER_ONLY - /* - * WFI in the user-mode emulator is technically permitted but not - * something any real-world code would do. AArch64 Linux kernels - * trap it via SCTRL_EL1.nTWI and make it an (expensive) NOP; - * AArch32 kernels don't trap it so it will delay a bit. - * For QEMU, make it NOP here, because trying to raise EXCP_HLT - * would trigger an abort. - */ - return; -#else - CPUState *cs = env_cpu(env); - int target_el = check_wfx_trap(env, false); - - if (cpu_has_work(cs)) { - /* Don't bother to go into our "low power state" if - * we would just wake up immediately. - */ - return; - } - - if (target_el) { - if (env->aarch64) { - env->pc -= insn_len; - } else { - env->regs[15] -= insn_len; - } - - raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2), - target_el); - } - - cs->exception_index = EXCP_HLT; - cs->halted = 1; - cpu_loop_exit(cs); -#endif -} - -void HELPER(wfe)(CPUARMState *env) -{ - /* This is a hint instruction that is semantically different - * from YIELD even though we currently implement it identically. - * Don't actually halt the CPU, just yield back to top - * level loop. 
This is not going into a "low power state" - * (ie halting until some event occurs), so we never take - * a configurable trap to a different exception level. - */ - HELPER(yield)(env); -} - -void HELPER(yield)(CPUARMState *env) -{ - CPUState *cs = env_cpu(env); - - /* This is a non-trappable hint instruction that generally indicates - * that the guest is currently busy-looping. Yield control back to the - * top level loop so that a more deserving VCPU has a chance to run. - */ - cs->exception_index = EXCP_YIELD; - cpu_loop_exit(cs); -} - -/* Raise an internal-to-QEMU exception. This is limited to only - * those EXCP values which are special cases for QEMU to interrupt - * execution and not to be used for exceptions which are passed to - * the guest (those must all have syndrome information and thus should - * use exception_with_syndrome*). - */ -void HELPER(exception_internal)(CPUARMState *env, uint32_t excp) -{ - CPUState *cs = env_cpu(env); - - assert(excp_is_internal(excp)); - cs->exception_index = excp; - cpu_loop_exit(cs); -} - -/* Raise an exception with the specified syndrome register value */ -void HELPER(exception_with_syndrome_el)(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el) -{ - raise_exception(env, excp, syndrome, target_el); -} - -/* - * Raise an exception with the specified syndrome register value - * to the default target el. - */ -void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp, - uint32_t syndrome) -{ - raise_exception(env, excp, syndrome, exception_target_el(env)); -} - -uint32_t HELPER(cpsr_read)(CPUARMState *env) -{ - return cpsr_read(env) & ~CPSR_EXEC; -} - -void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask) -{ - cpsr_write(env, val, mask, CPSRWriteByInstr); - /* TODO: Not all cpsr bits are relevant to hflags. */ - arm_rebuild_hflags(env); -} - -/* Write the CPSR for a 32-bit exception return */ -void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val) -{ - uint32_t mask; - - qemu_mutex_lock_iothread(); - arm_call_pre_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); - - mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar); - cpsr_write(env, val, mask, CPSRWriteExceptionReturn); - - /* Generated code has already stored the new PC value, but - * without masking out its low bits, because which bits need - * masking depends on whether we're returning to Thumb or ARM - * state. Do the masking now. - */ - env->regs[15] &= (env->thumb ? ~1 : ~3); - arm_rebuild_hflags(env); - - qemu_mutex_lock_iothread(); - arm_call_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); -} - -/* Access to user mode registers from privileged modes. 
*/ -uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno) -{ - uint32_t val; - - if (regno == 13) { - val = env->banked_r13[BANK_USRSYS]; - } else if (regno == 14) { - val = env->banked_r14[BANK_USRSYS]; - } else if (regno >= 8 - && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { - val = env->usr_regs[regno - 8]; - } else { - val = env->regs[regno]; - } - return val; -} - -void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val) -{ - if (regno == 13) { - env->banked_r13[BANK_USRSYS] = val; - } else if (regno == 14) { - env->banked_r14[BANK_USRSYS] = val; - } else if (regno >= 8 - && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { - env->usr_regs[regno - 8] = val; - } else { - env->regs[regno] = val; - } -} - -void HELPER(set_r13_banked)(CPUARMState *env, uint32_t mode, uint32_t val) -{ - if ((env->uncached_cpsr & CPSR_M) == mode) { - env->regs[13] = val; - } else { - env->banked_r13[bank_number(mode)] = val; - } -} - -uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode) -{ - if ((env->uncached_cpsr & CPSR_M) == ARM_CPU_MODE_SYS) { - /* SRS instruction is UNPREDICTABLE from System mode; we UNDEF. - * Other UNPREDICTABLE and UNDEF cases were caught at translate time. - */ - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } - - if ((env->uncached_cpsr & CPSR_M) == mode) { - return env->regs[13]; - } else { - return env->banked_r13[bank_number(mode)]; - } -} - -static void msr_mrs_banked_exc_checks(CPUARMState *env, uint32_t tgtmode, - uint32_t regno) -{ - /* Raise an exception if the requested access is one of the UNPREDICTABLE - * cases; otherwise return. This broadly corresponds to the pseudocode - * BankedRegisterAccessValid() and SPSRAccessValid(), - * except that we have already handled some cases at translate time. - */ - int curmode = env->uncached_cpsr & CPSR_M; - - if (regno == 17) { - /* ELR_Hyp: a special case because access from tgtmode is OK */ - if (curmode != ARM_CPU_MODE_HYP && curmode != ARM_CPU_MODE_MON) { - goto undef; - } - return; - } - - if (curmode == tgtmode) { - goto undef; - } - - if (tgtmode == ARM_CPU_MODE_USR) { - switch (regno) { - case 8 ... 12: - if (curmode != ARM_CPU_MODE_FIQ) { - goto undef; - } - break; - case 13: - if (curmode == ARM_CPU_MODE_SYS) { - goto undef; - } - break; - case 14: - if (curmode == ARM_CPU_MODE_HYP || curmode == ARM_CPU_MODE_SYS) { - goto undef; - } - break; - default: - break; - } - } - - if (tgtmode == ARM_CPU_MODE_HYP) { - /* SPSR_Hyp, r13_hyp: accessible from Monitor mode only */ - if (curmode != ARM_CPU_MODE_MON) { - goto undef; - } - } - - return; - -undef: - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); -} - -void HELPER(msr_banked)(CPUARMState *env, uint32_t value, uint32_t tgtmode, - uint32_t regno) -{ - msr_mrs_banked_exc_checks(env, tgtmode, regno); - - switch (regno) { - case 16: /* SPSRs */ - env->banked_spsr[bank_number(tgtmode)] = value; - break; - case 17: /* ELR_Hyp */ - env->elr_el[2] = value; - break; - case 13: - env->banked_r13[bank_number(tgtmode)] = value; - break; - case 14: - env->banked_r14[r14_bank_number(tgtmode)] = value; - break; - case 8 ... 
12: - switch (tgtmode) { - case ARM_CPU_MODE_USR: - env->usr_regs[regno - 8] = value; - break; - case ARM_CPU_MODE_FIQ: - env->fiq_regs[regno - 8] = value; - break; - default: - g_assert_not_reached(); - } - break; - default: - g_assert_not_reached(); - } -} - -uint32_t HELPER(mrs_banked)(CPUARMState *env, uint32_t tgtmode, uint32_t regno) -{ - msr_mrs_banked_exc_checks(env, tgtmode, regno); - - switch (regno) { - case 16: /* SPSRs */ - return env->banked_spsr[bank_number(tgtmode)]; - case 17: /* ELR_Hyp */ - return env->elr_el[2]; - case 13: - return env->banked_r13[bank_number(tgtmode)]; - case 14: - return env->banked_r14[r14_bank_number(tgtmode)]; - case 8 ... 12: - switch (tgtmode) { - case ARM_CPU_MODE_USR: - return env->usr_regs[regno - 8]; - case ARM_CPU_MODE_FIQ: - return env->fiq_regs[regno - 8]; - default: - g_assert_not_reached(); - } - default: - g_assert_not_reached(); - } -} - -const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key, - uint32_t syndrome, uint32_t isread) -{ - ARMCPU *cpu = env_archcpu(env); - const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); - CPAccessResult res = CP_ACCESS_OK; - int target_el; - - assert(ri != NULL); - - if (arm_feature(env, ARM_FEATURE_XSCALE) && ri->cp < 14 - && extract32(env->cp15.c15_cpar, ri->cp, 1) == 0) { - res = CP_ACCESS_TRAP; - goto fail; - } - - if (ri->accessfn) { - res = ri->accessfn(env, ri, isread); - } - - /* - * If the access function indicates a trap from EL0 to EL1 then - * that always takes priority over the HSTR_EL2 trap. (If it indicates - * a trap to EL3, then the HSTR_EL2 trap takes priority; if it indicates - * a trap to EL2, then the syndrome is the same either way so we don't - * care whether technically the architecture says that HSTR_EL2 trap or - * the other trap takes priority. So we take the "check HSTR_EL2" path - * for all of those cases.) - */ - if (res != CP_ACCESS_OK && ((res & CP_ACCESS_EL_MASK) == 0) && - arm_current_el(env) == 0) { - goto fail; - } - - /* - * HSTR_EL2 traps from EL1 are checked earlier, in generated code; - * we only need to check here for traps from EL0. - */ - if (!is_a64(env) && arm_current_el(env) == 0 && ri->cp == 15 && - arm_is_el2_enabled(env) && - (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { - uint32_t mask = 1 << ri->crn; - - if (ri->type & ARM_CP_64BIT) { - mask = 1 << ri->crm; - } - - /* T4 and T14 are RES0 */ - mask &= ~((1 << 4) | (1 << 14)); - - if (env->cp15.hstr_el2 & mask) { - res = CP_ACCESS_TRAP_EL2; - goto fail; - } - } - - /* - * Fine-grained traps also are lower priority than undef-to-EL1, - * higher priority than trap-to-EL3, and we don't care about priority - * order with other EL2 traps because the syndrome value is the same. 
- */ - if (arm_fgt_active(env, arm_current_el(env))) { - uint64_t trapword = 0; - unsigned int idx = FIELD_EX32(ri->fgt, FGT, IDX); - unsigned int bitpos = FIELD_EX32(ri->fgt, FGT, BITPOS); - bool rev = FIELD_EX32(ri->fgt, FGT, REV); - bool trapbit; - - if (ri->fgt & FGT_EXEC) { - assert(idx < ARRAY_SIZE(env->cp15.fgt_exec)); - trapword = env->cp15.fgt_exec[idx]; - } else if (isread && (ri->fgt & FGT_R)) { - assert(idx < ARRAY_SIZE(env->cp15.fgt_read)); - trapword = env->cp15.fgt_read[idx]; - } else if (!isread && (ri->fgt & FGT_W)) { - assert(idx < ARRAY_SIZE(env->cp15.fgt_write)); - trapword = env->cp15.fgt_write[idx]; - } - - trapbit = extract64(trapword, bitpos, 1); - if (trapbit != rev) { - res = CP_ACCESS_TRAP_EL2; - goto fail; - } - } - - if (likely(res == CP_ACCESS_OK)) { - return ri; - } - - fail: - switch (res & ~CP_ACCESS_EL_MASK) { - case CP_ACCESS_TRAP: - break; - case CP_ACCESS_TRAP_UNCATEGORIZED: - /* Only CP_ACCESS_TRAP traps are direct to a specified EL */ - assert((res & CP_ACCESS_EL_MASK) == 0); - if (cpu_isar_feature(aa64_ids, cpu) && isread && - arm_cpreg_in_idspace(ri)) { - /* - * FEAT_IDST says this should be reported as EC_SYSTEMREGISTERTRAP, - * not EC_UNCATEGORIZED - */ - break; - } - syndrome = syn_uncategorized(); - break; - default: - g_assert_not_reached(); - } - - target_el = res & CP_ACCESS_EL_MASK; - switch (target_el) { - case 0: - target_el = exception_target_el(env); - break; - case 2: - assert(arm_current_el(env) != 3); - assert(arm_is_el2_enabled(env)); - break; - case 3: - assert(arm_feature(env, ARM_FEATURE_EL3)); - break; - default: - /* No "direct" traps to EL1 */ - g_assert_not_reached(); - } - - raise_exception(env, EXCP_UDEF, syndrome, target_el); -} - -const void *HELPER(lookup_cp_reg)(CPUARMState *env, uint32_t key) -{ - ARMCPU *cpu = env_archcpu(env); - const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); - - assert(ri != NULL); - return ri; -} - -void HELPER(set_cp_reg)(CPUARMState *env, const void *rip, uint32_t value) -{ - const ARMCPRegInfo *ri = rip; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - ri->writefn(env, ri, value); - qemu_mutex_unlock_iothread(); - } else { - ri->writefn(env, ri, value); - } -} - -uint32_t HELPER(get_cp_reg)(CPUARMState *env, const void *rip) -{ - const ARMCPRegInfo *ri = rip; - uint32_t res; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - res = ri->readfn(env, ri); - qemu_mutex_unlock_iothread(); - } else { - res = ri->readfn(env, ri); - } - - return res; -} - -void HELPER(set_cp_reg64)(CPUARMState *env, const void *rip, uint64_t value) -{ - const ARMCPRegInfo *ri = rip; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - ri->writefn(env, ri, value); - qemu_mutex_unlock_iothread(); - } else { - ri->writefn(env, ri, value); - } -} - -uint64_t HELPER(get_cp_reg64)(CPUARMState *env, const void *rip) -{ - const ARMCPRegInfo *ri = rip; - uint64_t res; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - res = ri->readfn(env, ri); - qemu_mutex_unlock_iothread(); - } else { - res = ri->readfn(env, ri); - } - - return res; -} - -void HELPER(pre_hvc)(CPUARMState *env) -{ - ARMCPU *cpu = env_archcpu(env); - int cur_el = arm_current_el(env); - /* FIXME: Use actual secure state. */ - bool secure = false; - bool undef; - - if (arm_is_psci_call(cpu, EXCP_HVC)) { - /* If PSCI is enabled and this looks like a valid PSCI call then - * that overrides the architecturally mandated HVC behaviour. 
- */ - return; - } - - if (!arm_feature(env, ARM_FEATURE_EL2)) { - /* If EL2 doesn't exist, HVC always UNDEFs */ - undef = true; - } else if (arm_feature(env, ARM_FEATURE_EL3)) { - /* EL3.HCE has priority over EL2.HCD. */ - undef = !(env->cp15.scr_el3 & SCR_HCE); - } else { - undef = env->cp15.hcr_el2 & HCR_HCD; - } - - /* In ARMv7 and ARMv8/AArch32, HVC is undef in secure state. - * For ARMv8/AArch64, HVC is allowed in EL3. - * Note that we've already trapped HVC from EL0 at translation - * time. - */ - if (secure && (!is_a64(env) || cur_el == 1)) { - undef = true; - } - - if (undef) { - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } -} - -void HELPER(pre_smc)(CPUARMState *env, uint32_t syndrome) -{ - ARMCPU *cpu = env_archcpu(env); - int cur_el = arm_current_el(env); - bool secure = arm_is_secure(env); - bool smd_flag = env->cp15.scr_el3 & SCR_SMD; - - /* - * SMC behaviour is summarized in the following table. - * This helper handles the "Trap to EL2" and "Undef insn" cases. - * The "Trap to EL3" and "PSCI call" cases are handled in the exception - * helper. - * - * -> ARM_FEATURE_EL3 and !SMD - * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 - * - * Conduit SMC, valid call Trap to EL2 PSCI Call - * Conduit SMC, inval call Trap to EL2 Trap to EL3 - * Conduit not SMC Trap to EL2 Trap to EL3 - * - * - * -> ARM_FEATURE_EL3 and SMD - * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 - * - * Conduit SMC, valid call Trap to EL2 PSCI Call - * Conduit SMC, inval call Trap to EL2 Undef insn - * Conduit not SMC Trap to EL2 Undef insn - * - * - * -> !ARM_FEATURE_EL3 - * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 - * - * Conduit SMC, valid call Trap to EL2 PSCI Call - * Conduit SMC, inval call Trap to EL2 Undef insn - * Conduit not SMC Undef insn Undef insn - */ - - /* On ARMv8 with EL3 AArch64, SMD applies to both S and NS state. - * On ARMv8 with EL3 AArch32, or ARMv7 with the Virtualization - * extensions, SMD only applies to NS state. - * On ARMv7 without the Virtualization extensions, the SMD bit - * doesn't exist, but we forbid the guest to set it to 1 in scr_write(), - * so we need not special case this here. - */ - bool smd = arm_feature(env, ARM_FEATURE_AARCH64) ? smd_flag - : smd_flag && !secure; - - if (!arm_feature(env, ARM_FEATURE_EL3) && - cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) { - /* If we have no EL3 then SMC always UNDEFs and can't be - * trapped to EL2. PSCI-via-SMC is a sort of ersatz EL3 - * firmware within QEMU, and we want an EL2 guest to be able - * to forbid its EL1 from making PSCI calls into QEMU's - * "firmware" via HCR.TSC, so for these purposes treat - * PSCI-via-SMC as implying an EL3. - * This handles the very last line of the previous table. - */ - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } - - if (cur_el == 1 && (arm_hcr_el2_eff(env) & HCR_TSC)) { - /* In NS EL1, HCR controlled routing to EL2 has priority over SMD. - * We also want an EL2 guest to be able to forbid its EL1 from - * making PSCI calls into QEMU's "firmware" via HCR.TSC. - * This handles all the "Trap to EL2" cases of the previous table. - */ - raise_exception(env, EXCP_HYP_TRAP, syndrome, 2); - } - - /* Catch the two remaining "Undef insn" cases of the previous table: - * - PSCI conduit is SMC but we don't have a valid PCSI call, - * - We don't have EL3 or SMD is set. 
- */ - if (!arm_is_psci_call(cpu, EXCP_SMC) && - (smd || !arm_feature(env, ARM_FEATURE_EL3))) { - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } -} - -/* ??? Flag setting arithmetic is awkward because we need to do comparisons. - The only way to do that in TCG is a conditional branch, which clobbers - all our temporaries. For now implement these as helper functions. */ - -/* Similarly for variable shift instructions. */ - -uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) { - if (shift == 32) - env->CF = x & 1; - else - env->CF = 0; - return 0; - } else if (shift != 0) { - env->CF = (x >> (32 - shift)) & 1; - return x << shift; - } - return x; -} - -uint32_t HELPER(shr_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) { - if (shift == 32) - env->CF = (x >> 31) & 1; - else - env->CF = 0; - return 0; - } else if (shift != 0) { - env->CF = (x >> (shift - 1)) & 1; - return x >> shift; - } - return x; -} - -uint32_t HELPER(sar_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) { - env->CF = (x >> 31) & 1; - return (int32_t)x >> 31; - } else if (shift != 0) { - env->CF = (x >> (shift - 1)) & 1; - return (int32_t)x >> shift; - } - return x; -} - -uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift1, shift; - shift1 = i & 0xff; - shift = shift1 & 0x1f; - if (shift == 0) { - if (shift1 != 0) - env->CF = (x >> 31) & 1; - return x; - } else { - env->CF = (x >> (shift - 1)) & 1; - return ((uint32_t)x >> shift) | (x << (32 - shift)); - } -} - -void HELPER(probe_access)(CPUARMState *env, target_ulong ptr, - uint32_t access_type, uint32_t mmu_idx, - uint32_t size) -{ - uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE); - uintptr_t ra = GETPC(); - - if (likely(size <= in_page)) { - probe_access(env, ptr, size, access_type, mmu_idx, ra); - } else { - probe_access(env, ptr, in_page, access_type, mmu_idx, ra); - probe_access(env, ptr + in_page, size - in_page, - access_type, mmu_idx, ra); - } -} - -/* - * This function corresponds to AArch64.vESBOperation(). - * Note that the AArch32 version is not functionally different. - */ -void HELPER(vesb)(CPUARMState *env) -{ - /* - * The EL2Enabled() check is done inside arm_hcr_el2_eff, - * and will return HCR_EL2.VSE == 0, so nothing happens. - */ - uint64_t hcr = arm_hcr_el2_eff(env); - bool enabled = !(hcr & HCR_TGE) && (hcr & HCR_AMO); - bool pending = enabled && (hcr & HCR_VSE); - bool masked = (env->daif & PSTATE_A); - - /* If VSE pending and masked, defer the exception. */ - if (pending && masked) { - uint32_t syndrome; - - if (arm_el_is_aa64(env, 1)) { - /* Copy across IDS and ISS from VSESR. */ - syndrome = env->cp15.vsesr_el2 & 0x1ffffff; - } else { - ARMMMUFaultInfo fi = { .type = ARMFault_AsyncExternal }; - - if (extended_addresses_enabled(env)) { - syndrome = arm_fi_to_lfsc(&fi); - } else { - syndrome = arm_fi_to_sfsc(&fi); - } - /* Copy across AET and ExT from VSESR. */ - syndrome |= env->cp15.vsesr_el2 & 0xd000; - } - - /* Set VDISR_EL2.A along with the syndrome. 
*/ - env->cp15.vdisr_el2 = syndrome | (1u << 31); - - /* Clear pending virtual SError */ - env->cp15.hcr_el2 &= ~HCR_VSE; - cpu_reset_interrupt(env_cpu(env), CPU_INTERRUPT_VSERR); - } -} diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c deleted file mode 100644 index d0483bf..0000000 --- a/target/arm/pauth_helper.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * ARM v8.3-PAuth Operations - * - * Copyright (c) 2019 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "qemu/xxhash.h" - - -static uint64_t pac_cell_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= extract64(i, 52, 4); - o |= extract64(i, 24, 4) << 4; - o |= extract64(i, 44, 4) << 8; - o |= extract64(i, 0, 4) << 12; - - o |= extract64(i, 28, 4) << 16; - o |= extract64(i, 48, 4) << 20; - o |= extract64(i, 4, 4) << 24; - o |= extract64(i, 40, 4) << 28; - - o |= extract64(i, 32, 4) << 32; - o |= extract64(i, 12, 4) << 36; - o |= extract64(i, 56, 4) << 40; - o |= extract64(i, 20, 4) << 44; - - o |= extract64(i, 8, 4) << 48; - o |= extract64(i, 36, 4) << 52; - o |= extract64(i, 16, 4) << 56; - o |= extract64(i, 60, 4) << 60; - - return o; -} - -static uint64_t pac_cell_inv_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= extract64(i, 12, 4); - o |= extract64(i, 24, 4) << 4; - o |= extract64(i, 48, 4) << 8; - o |= extract64(i, 36, 4) << 12; - - o |= extract64(i, 56, 4) << 16; - o |= extract64(i, 44, 4) << 20; - o |= extract64(i, 4, 4) << 24; - o |= extract64(i, 16, 4) << 28; - - o |= i & MAKE_64BIT_MASK(32, 4); - o |= extract64(i, 52, 4) << 36; - o |= extract64(i, 28, 4) << 40; - o |= extract64(i, 8, 4) << 44; - - o |= extract64(i, 20, 4) << 48; - o |= extract64(i, 0, 4) << 52; - o |= extract64(i, 40, 4) << 56; - o |= i & MAKE_64BIT_MASK(60, 4); - - return o; -} - -static uint64_t pac_sub(uint64_t i) -{ - static const uint8_t sub[16] = { - 0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe, - 0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa, - }; - uint64_t o = 0; - int b; - - for (b = 0; b < 64; b += 4) { - o |= (uint64_t)sub[(i >> b) & 0xf] << b; - } - return o; -} - -static uint64_t pac_inv_sub(uint64_t i) -{ - static const uint8_t inv_sub[16] = { - 0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9, - 0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3, - }; - uint64_t o = 0; - int b; - - for (b = 0; b < 64; b += 4) { - o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; - } - return o; -} - -static int rot_cell(int cell, int n) -{ - /* 4-bit rotate left by n. 
*/ - cell |= cell << 4; - return extract32(cell, 4 - n, 4); -} - -static uint64_t pac_mult(uint64_t i) -{ - uint64_t o = 0; - int b; - - for (b = 0; b < 4 * 4; b += 4) { - int i0, i4, i8, ic, t0, t1, t2, t3; - - i0 = extract64(i, b, 4); - i4 = extract64(i, b + 4 * 4, 4); - i8 = extract64(i, b + 8 * 4, 4); - ic = extract64(i, b + 12 * 4, 4); - - t0 = rot_cell(i8, 1) ^ rot_cell(i4, 2) ^ rot_cell(i0, 1); - t1 = rot_cell(ic, 1) ^ rot_cell(i4, 1) ^ rot_cell(i0, 2); - t2 = rot_cell(ic, 2) ^ rot_cell(i8, 1) ^ rot_cell(i0, 1); - t3 = rot_cell(ic, 1) ^ rot_cell(i8, 2) ^ rot_cell(i4, 1); - - o |= (uint64_t)t3 << b; - o |= (uint64_t)t2 << (b + 4 * 4); - o |= (uint64_t)t1 << (b + 8 * 4); - o |= (uint64_t)t0 << (b + 12 * 4); - } - return o; -} - -static uint64_t tweak_cell_rot(uint64_t cell) -{ - return (cell >> 1) | (((cell ^ (cell >> 1)) & 1) << 3); -} - -static uint64_t tweak_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= extract64(i, 16, 4) << 0; - o |= extract64(i, 20, 4) << 4; - o |= tweak_cell_rot(extract64(i, 24, 4)) << 8; - o |= extract64(i, 28, 4) << 12; - - o |= tweak_cell_rot(extract64(i, 44, 4)) << 16; - o |= extract64(i, 8, 4) << 20; - o |= extract64(i, 12, 4) << 24; - o |= tweak_cell_rot(extract64(i, 32, 4)) << 28; - - o |= extract64(i, 48, 4) << 32; - o |= extract64(i, 52, 4) << 36; - o |= extract64(i, 56, 4) << 40; - o |= tweak_cell_rot(extract64(i, 60, 4)) << 44; - - o |= tweak_cell_rot(extract64(i, 0, 4)) << 48; - o |= extract64(i, 4, 4) << 52; - o |= tweak_cell_rot(extract64(i, 40, 4)) << 56; - o |= tweak_cell_rot(extract64(i, 36, 4)) << 60; - - return o; -} - -static uint64_t tweak_cell_inv_rot(uint64_t cell) -{ - return ((cell << 1) & 0xf) | ((cell & 1) ^ (cell >> 3)); -} - -static uint64_t tweak_inv_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= tweak_cell_inv_rot(extract64(i, 48, 4)); - o |= extract64(i, 52, 4) << 4; - o |= extract64(i, 20, 4) << 8; - o |= extract64(i, 24, 4) << 12; - - o |= extract64(i, 0, 4) << 16; - o |= extract64(i, 4, 4) << 20; - o |= tweak_cell_inv_rot(extract64(i, 8, 4)) << 24; - o |= extract64(i, 12, 4) << 28; - - o |= tweak_cell_inv_rot(extract64(i, 28, 4)) << 32; - o |= tweak_cell_inv_rot(extract64(i, 60, 4)) << 36; - o |= tweak_cell_inv_rot(extract64(i, 56, 4)) << 40; - o |= tweak_cell_inv_rot(extract64(i, 16, 4)) << 44; - - o |= extract64(i, 32, 4) << 48; - o |= extract64(i, 36, 4) << 52; - o |= extract64(i, 40, 4) << 56; - o |= tweak_cell_inv_rot(extract64(i, 44, 4)) << 60; - - return o; -} - -static uint64_t pauth_computepac_architected(uint64_t data, uint64_t modifier, - ARMPACKey key) -{ - static const uint64_t RC[5] = { - 0x0000000000000000ull, - 0x13198A2E03707344ull, - 0xA4093822299F31D0ull, - 0x082EFA98EC4E6C89ull, - 0x452821E638D01377ull, - }; - const uint64_t alpha = 0xC0AC29B7C97C50DDull; - /* - * Note that in the ARM pseudocode, key0 contains bits <127:64> - * and key1 contains bits <63:0> of the 128-bit key. 
- */ - uint64_t key0 = key.hi, key1 = key.lo; - uint64_t workingval, runningmod, roundkey, modk0; - int i; - - modk0 = (key0 << 63) | ((key0 >> 1) ^ (key0 >> 63)); - runningmod = modifier; - workingval = data ^ key0; - - for (i = 0; i <= 4; ++i) { - roundkey = key1 ^ runningmod; - workingval ^= roundkey; - workingval ^= RC[i]; - if (i > 0) { - workingval = pac_cell_shuffle(workingval); - workingval = pac_mult(workingval); - } - workingval = pac_sub(workingval); - runningmod = tweak_shuffle(runningmod); - } - roundkey = modk0 ^ runningmod; - workingval ^= roundkey; - workingval = pac_cell_shuffle(workingval); - workingval = pac_mult(workingval); - workingval = pac_sub(workingval); - workingval = pac_cell_shuffle(workingval); - workingval = pac_mult(workingval); - workingval ^= key1; - workingval = pac_cell_inv_shuffle(workingval); - workingval = pac_inv_sub(workingval); - workingval = pac_mult(workingval); - workingval = pac_cell_inv_shuffle(workingval); - workingval ^= key0; - workingval ^= runningmod; - for (i = 0; i <= 4; ++i) { - workingval = pac_inv_sub(workingval); - if (i < 4) { - workingval = pac_mult(workingval); - workingval = pac_cell_inv_shuffle(workingval); - } - runningmod = tweak_inv_shuffle(runningmod); - roundkey = key1 ^ runningmod; - workingval ^= RC[4 - i]; - workingval ^= roundkey; - workingval ^= alpha; - } - workingval ^= modk0; - - return workingval; -} - -static uint64_t pauth_computepac_impdef(uint64_t data, uint64_t modifier, - ARMPACKey key) -{ - return qemu_xxhash64_4(data, modifier, key.lo, key.hi); -} - -static uint64_t pauth_computepac(CPUARMState *env, uint64_t data, - uint64_t modifier, ARMPACKey key) -{ - if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) { - return pauth_computepac_architected(data, modifier, key); - } else { - return pauth_computepac_impdef(data, modifier, key); - } -} - -static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier, - ARMPACKey *key, bool data) -{ - ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); - uint64_t pac, ext_ptr, ext, test; - int bot_bit, top_bit; - - /* If tagged pointers are in use, use ptr<55>, otherwise ptr<63>. */ - if (param.tbi) { - ext = sextract64(ptr, 55, 1); - } else { - ext = sextract64(ptr, 63, 1); - } - - /* Build a pointer with known good extension bits. */ - top_bit = 64 - 8 * param.tbi; - bot_bit = 64 - param.tsz; - ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext); - - pac = pauth_computepac(env, ext_ptr, modifier, *key); - - /* - * Check if the ptr has good extension bits and corrupt the - * pointer authentication code if not. - */ - test = sextract64(ptr, bot_bit, top_bit - bot_bit); - if (test != 0 && test != -1) { - /* - * Note that our top_bit is one greater than the pseudocode's - * version, hence "- 2" here. - */ - pac ^= MAKE_64BIT_MASK(top_bit - 2, 1); - } - - /* - * Preserve the determination between upper and lower at bit 55, - * and insert pointer authentication code. - */ - if (param.tbi) { - ptr &= ~MAKE_64BIT_MASK(bot_bit, 55 - bot_bit + 1); - pac &= MAKE_64BIT_MASK(bot_bit, 54 - bot_bit + 1); - } else { - ptr &= MAKE_64BIT_MASK(0, bot_bit); - pac &= ~(MAKE_64BIT_MASK(55, 1) | MAKE_64BIT_MASK(0, bot_bit)); - } - ext &= MAKE_64BIT_MASK(55, 1); - return pac | ext | ptr; -} - -static uint64_t pauth_original_ptr(uint64_t ptr, ARMVAParameters param) -{ - /* Note that bit 55 is used whether or not the regime has 2 ranges. 
*/ - uint64_t extfield = sextract64(ptr, 55, 1); - int bot_pac_bit = 64 - param.tsz; - int top_pac_bit = 64 - 8 * param.tbi; - - return deposit64(ptr, bot_pac_bit, top_pac_bit - bot_pac_bit, extfield); -} - -static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier, - ARMPACKey *key, bool data, int keynumber) -{ - ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); - int bot_bit, top_bit; - uint64_t pac, orig_ptr, test; - - orig_ptr = pauth_original_ptr(ptr, param); - pac = pauth_computepac(env, orig_ptr, modifier, *key); - bot_bit = 64 - param.tsz; - top_bit = 64 - 8 * param.tbi; - - test = (pac ^ ptr) & ~MAKE_64BIT_MASK(55, 1); - if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) { - int error_code = (keynumber << 1) | (keynumber ^ 1); - if (param.tbi) { - return deposit64(orig_ptr, 53, 2, error_code); - } else { - return deposit64(orig_ptr, 61, 2, error_code); - } - } - return orig_ptr; -} - -static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data) -{ - ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); - - return pauth_original_ptr(ptr, param); -} - -static G_NORETURN -void pauth_trap(CPUARMState *env, int target_el, uintptr_t ra) -{ - raise_exception_ra(env, EXCP_UDEF, syn_pactrap(), target_el, ra); -} - -static void pauth_check_trap(CPUARMState *env, int el, uintptr_t ra) -{ - if (el < 2 && arm_is_el2_enabled(env)) { - uint64_t hcr = arm_hcr_el2_eff(env); - bool trap = !(hcr & HCR_API); - if (el == 0) { - /* Trap only applies to EL1&0 regime. */ - trap &= (hcr & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE); - } - /* FIXME: ARMv8.3-NV: HCR_NV trap takes precedence for ERETA[AB]. 
*/ - if (trap) { - pauth_trap(env, 2, ra); - } - } - if (el < 3 && arm_feature(env, ARM_FEATURE_EL3)) { - if (!(env->cp15.scr_el3 & SCR_API)) { - pauth_trap(env, 3, ra); - } - } -} - -static bool pauth_key_enabled(CPUARMState *env, int el, uint32_t bit) -{ - return (arm_sctlr(env, el) & bit) != 0; -} - -uint64_t HELPER(pacia)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apia, false); -} - -uint64_t HELPER(pacib)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apib, false); -} - -uint64_t HELPER(pacda)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apda, true); -} - -uint64_t HELPER(pacdb)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apdb, true); -} - -uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y) -{ - uint64_t pac; - - pauth_check_trap(env, arm_current_el(env), GETPC()); - pac = pauth_computepac(env, x, y, env->keys.apga); - - return pac & 0xffffffff00000000ull; -} - -uint64_t HELPER(autia)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apia, false, 0); -} - -uint64_t HELPER(autib)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apib, false, 1); -} - -uint64_t HELPER(autda)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apda, true, 0); -} - -uint64_t HELPER(autdb)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apdb, true, 1); -} - -uint64_t HELPER(xpaci)(CPUARMState *env, uint64_t a) -{ - return pauth_strip(env, a, false); -} - -uint64_t HELPER(xpacd)(CPUARMState *env, uint64_t a) -{ - return pauth_strip(env, a, true); -} diff --git a/target/arm/sme_helper.c b/target/arm/sme_helper.c deleted file mode 100644 index 1e67fca..0000000 --- a/target/arm/sme_helper.c +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * ARM SME Operations - * - * Copyright (c) 2022 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "tcg/tcg-gvec-desc.h" -#include "exec/helper-proto.h" -#include "exec/cpu_ldst.h" -#include "exec/exec-all.h" -#include "qemu/int128.h" -#include "fpu/softfloat.h" -#include "vec_internal.h" -#include "sve_ldst_internal.h" - -void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask) -{ - aarch64_set_svcr(env, val, mask); -} - -void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl) -{ - uint32_t i; - - /* - * Special case clearing the entire ZA space. - * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any - * parts of the ZA storage outside of SVL. - */ - if (imm == 0xff) { - memset(env->zarray, 0, sizeof(env->zarray)); - return; - } - - /* - * Recall that ZAnH.D[m] is spread across ZA[n+8*m], - * so each row is discontiguous within ZA[]. - */ - for (i = 0; i < svl; i++) { - if (imm & (1 << (i % 8))) { - memset(&env->zarray[i], 0, svl); - } - } -} - - -/* - * When considering the ZA storage as an array of elements of - * type T, the index within that array of the Nth element of - * a vertical slice of a tile can be calculated like this, - * regardless of the size of type T. This is because the tiles - * are interleaved, so if type T is size N bytes then row 1 of - * the tile is N rows away from row 0. The division by N to - * convert a byte offset into an array index and the multiplication - * by N to convert from vslice-index-within-the-tile to - * the index within the ZA storage cancel out. - */ -#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg)) - -/* - * When doing byte arithmetic on the ZA storage, the element - * byteoff bytes away in a tile vertical slice is always this - * many bytes away in the ZA storage, regardless of the - * size of the tile element, assuming that byteoff is a multiple - * of the element size. Again this is because of the interleaving - * of the tiles. For instance if we have 1 byte per element then - * each row of the ZA storage has one byte of the vslice data, - * and (counting from 0) byte 8 goes in row 8 of the storage - * at offset (8 * row-size-in-bytes). - * If we have 8 bytes per element then each row of the ZA storage - * has 8 bytes of the data, but there are 8 interleaved tiles and - * so byte 8 of the data goes into row 1 of the tile, - * which is again row 8 of the storage, so the offset is still - * (8 * row-size-in-bytes). Similarly for other element sizes. - */ -#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg)) - - -/* - * Move Zreg vector to ZArray column. 
- */ -#define DO_MOVA_C(NAME, TYPE, H) \ -void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \ -{ \ - int i, oprsz = simd_oprsz(desc); \ - for (i = 0; i < oprsz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \ - } \ - i += sizeof(TYPE); \ - pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -DO_MOVA_C(sme_mova_cz_b, uint8_t, H1) -DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2) -DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4) - -void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 8; - uint8_t *pg = vg; - uint64_t *n = vn; - uint64_t *a = za; - - for (i = 0; i < oprsz; i++) { - if (pg[H1(i)] & 1) { - a[tile_vslice_index(i)] = n[i]; - } - } -} - -void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 16; - uint16_t *pg = vg; - Int128 *n = vn; - Int128 *a = za; - - /* - * Int128 is used here simply to copy 16 bytes, and to simplify - * the address arithmetic. - */ - for (i = 0; i < oprsz; i++) { - if (pg[H2(i)] & 1) { - a[tile_vslice_index(i)] = n[i]; - } - } -} - -#undef DO_MOVA_C - -/* - * Move ZArray column to Zreg vector. - */ -#define DO_MOVA_Z(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \ -{ \ - int i, oprsz = simd_oprsz(desc); \ - for (i = 0; i < oprsz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \ - } \ - i += sizeof(TYPE); \ - pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1) -DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2) -DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4) - -void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 8; - uint8_t *pg = vg; - uint64_t *d = vd; - uint64_t *a = za; - - for (i = 0; i < oprsz; i++) { - if (pg[H1(i)] & 1) { - d[i] = a[tile_vslice_index(i)]; - } - } -} - -void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 16; - uint16_t *pg = vg; - Int128 *d = vd; - Int128 *a = za; - - /* - * Int128 is used here simply to copy 16 bytes, and to simplify - * the address arithmetic. - */ - for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) { - if (pg[H2(i)] & 1) { - d[i] = a[tile_vslice_index(i)]; - } - } -} - -#undef DO_MOVA_Z - -/* - * Clear elements in a tile slice comprising len bytes. 
- */ - -typedef void ClearFn(void *ptr, size_t off, size_t len); - -static void clear_horizontal(void *ptr, size_t off, size_t len) -{ - memset(ptr + off, 0, len); -} - -static void clear_vertical_b(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; ++i) { - *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_h(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 2) { - *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_s(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 4) { - *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_d(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 8) { - *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_q(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 16) { - memset(vptr + tile_vslice_offset(i + off), 0, 16); - } -} - -/* - * Copy elements from an array into a tile slice comprising len bytes. - */ - -typedef void CopyFn(void *dst, const void *src, size_t len); - -static void copy_horizontal(void *dst, const void *src, size_t len) -{ - memcpy(dst, src, len); -} - -static void copy_vertical_b(void *vdst, const void *vsrc, size_t len) -{ - const uint8_t *src = vsrc; - uint8_t *dst = vdst; - size_t i; - - for (i = 0; i < len; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_h(void *vdst, const void *vsrc, size_t len) -{ - const uint16_t *src = vsrc; - uint16_t *dst = vdst; - size_t i; - - for (i = 0; i < len / 2; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_s(void *vdst, const void *vsrc, size_t len) -{ - const uint32_t *src = vsrc; - uint32_t *dst = vdst; - size_t i; - - for (i = 0; i < len / 4; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_d(void *vdst, const void *vsrc, size_t len) -{ - const uint64_t *src = vsrc; - uint64_t *dst = vdst; - size_t i; - - for (i = 0; i < len / 8; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_q(void *vdst, const void *vsrc, size_t len) -{ - for (size_t i = 0; i < len; i += 16) { - memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16); - } -} - -/* - * Host and TLB primitives for vertical tile slice addressing. - */ - -#define DO_LD(NAME, TYPE, HOST, TLB) \ -static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - TYPE val = HOST(host); \ - *(TYPE *)(za + tile_vslice_offset(off)) = val; \ -} \ -static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ - intptr_t off, target_ulong addr, uintptr_t ra) \ -{ \ - TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \ - *(TYPE *)(za + tile_vslice_offset(off)) = val; \ -} - -#define DO_ST(NAME, TYPE, HOST, TLB) \ -static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ - HOST(host, val); \ -} \ -static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ - intptr_t off, target_ulong addr, uintptr_t ra) \ -{ \ - TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ - TLB(env, useronly_clean_ptr(addr), val, ra); \ -} - -/* - * The ARMVectorReg elements are stored in host-endian 64-bit units. - * For 128-bit quantities, the sequence defined by the Elem[] pseudocode - * corresponds to storing the two 64-bit pieces in little-endian order. 
- */ -#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \ -static inline void HNAME##_host(void *za, intptr_t off, void *host) \ -{ \ - uint64_t val0 = HOST(host), val1 = HOST(host + 8); \ - uint64_t *ptr = za + off; \ - ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ -} \ -static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - HNAME##_host(za, tile_vslice_offset(off), host); \ -} \ -static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \ - uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \ - uint64_t *ptr = za + off; \ - ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ -} \ -static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ -} - -#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \ -static inline void HNAME##_host(void *za, intptr_t off, void *host) \ -{ \ - uint64_t *ptr = za + off; \ - HOST(host, ptr[BE]); \ - HOST(host + 1, ptr[!BE]); \ -} \ -static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - HNAME##_host(za, tile_vslice_offset(off), host); \ -} \ -static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - uint64_t *ptr = za + off; \ - TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \ - TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \ -} \ -static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ -} - -DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra) -DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra) -DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra) -DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra) -DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra) -DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra) -DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra) - -DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra) -DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra) - -DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra) -DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra) -DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra) -DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra) -DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra) -DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra) -DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra) - -DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra) -DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra) - -#undef DO_LD -#undef DO_ST -#undef DO_LDQ -#undef DO_STQ - -/* - * Common helper for all contiguous predicated loads. - */ - -static inline QEMU_ALWAYS_INLINE -void sme_ld1(CPUARMState *env, void *za, uint64_t *vg, - const target_ulong addr, uint32_t desc, const uintptr_t ra, - const int esz, uint32_t mtedesc, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn, - ClearFn *clr_fn, - CopyFn *cpy_fn) -{ - const intptr_t reg_max = simd_oprsz(desc); - const intptr_t esize = 1 << esz; - intptr_t reg_off, reg_last; - SVEContLdSt info; - void *host; - int flags; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { - /* The entire predicate was false; no load occurs. 
*/ - clr_fn(za, 0, reg_max); - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, - BP_MEM_READ, ra); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. - */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, - mtedesc, ra); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. Perform the load - * into scratch memory to preserve register state until the end. - */ - ARMVectorReg scratch = { }; - - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - tlb_fn(env, &scratch, reg_off, addr + reg_off, ra); - } - reg_off += esize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - - cpy_fn(za, &scratch, reg_max); - return; -#endif - } - - /* The entire operation is in RAM, on valid pages. */ - - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - if (!vertical) { - memset(za, 0, reg_max); - } else if (reg_off) { - clr_fn(za, 0, reg_off); - } - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } else if (vertical) { - clr_fn(za, reg_off, esize); - } - reg_off += esize; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. - * But we know this is RAM and cannot trap. - */ - reg_off = info.reg_off_split; - if (unlikely(reg_off >= 0)) { - tlb_fn(env, za, reg_off, addr + reg_off, ra); - } - - reg_off = info.reg_off_first[1]; - if (unlikely(reg_off >= 0)) { - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } else if (vertical) { - clr_fn(za, reg_off, esize); - } - reg_off += esize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, - target_ulong addr, uint32_t desc, uintptr_t ra, - const int esz, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn, - ClearFn *clr_fn, - CopyFn *cpy_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. 
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical, - host_fn, tlb_fn, clr_fn, cpy_fn); -} - -#define DO_LD(L, END, ESZ) \ -void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ - sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ - clear_horizontal, copy_horizontal); \ -} \ -void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ - sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ - clear_vertical_##L, copy_vertical_##L); \ -} \ -void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ - sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ - clear_horizontal, copy_horizontal); \ -} \ -void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ - sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ - clear_vertical_##L, copy_vertical_##L); \ -} - -DO_LD(b, , MO_8) -DO_LD(h, _be, MO_16) -DO_LD(h, _le, MO_16) -DO_LD(s, _be, MO_32) -DO_LD(s, _le, MO_32) -DO_LD(d, _be, MO_64) -DO_LD(d, _le, MO_64) -DO_LD(q, _be, MO_128) -DO_LD(q, _le, MO_128) - -#undef DO_LD - -/* - * Common helper for all contiguous predicated stores. - */ - -static inline QEMU_ALWAYS_INLINE -void sme_st1(CPUARMState *env, void *za, uint64_t *vg, - const target_ulong addr, uint32_t desc, const uintptr_t ra, - const int esz, uint32_t mtedesc, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const intptr_t reg_max = simd_oprsz(desc); - const intptr_t esize = 1 << esz; - intptr_t reg_off, reg_last; - SVEContLdSt info; - void *host; - int flags; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { - /* The entire predicate was false; no store occurs. */ - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, - BP_MEM_WRITE, ra); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. - */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, - mtedesc, ra); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. We cannot avoid - * this fault and will leave with the store incomplete. 
- */ - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - tlb_fn(env, za, reg_off, addr + reg_off, ra); - } - reg_off += esize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - return; -#endif - } - - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } - reg_off += 1 << esz; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. - * But we know this is RAM and cannot trap. - */ - reg_off = info.reg_off_split; - if (unlikely(reg_off >= 0)) { - tlb_fn(env, za, reg_off, addr + reg_off, ra); - } - - reg_off = info.reg_off_first[1]; - if (unlikely(reg_off >= 0)) { - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } - reg_off += 1 << esz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, - uint32_t desc, uintptr_t ra, int esz, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. 
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc, - vertical, host_fn, tlb_fn); -} - -#define DO_ST(L, END, ESZ) \ -void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ - sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ -} \ -void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ - sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ -} \ -void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ - sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ -} \ -void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ - sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ -} - -DO_ST(b, , MO_8) -DO_ST(h, _be, MO_16) -DO_ST(h, _le, MO_16) -DO_ST(s, _be, MO_32) -DO_ST(s, _le, MO_32) -DO_ST(d, _be, MO_64) -DO_ST(d, _le, MO_64) -DO_ST(q, _be, MO_128) -DO_ST(q, _le, MO_128) - -#undef DO_ST - -void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 4; - uint64_t *pn = vpn, *pm = vpm; - uint32_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ) { - uint64_t pa = pn[row >> 4]; - do { - if (pa & 1) { - for (col = 0; col < oprsz; ) { - uint64_t pb = pm[col >> 4]; - do { - if (pb & 1) { - zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)]; - } - pb >>= 4; - } while (++col & 15); - } - } - pa >>= 4; - } while (++row & 15); - } -} - -void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint8_t *pn = vpn, *pm = vpm; - uint64_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ++row) { - if (pn[H1(row)] & 1) { - for (col = 0; col < oprsz; ++col) { - if (pm[H1(col)] & 1) { - zda[tile_vslice_index(row) + col] += zn[col]; - } - } - } - } -} - -void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 4; - uint64_t *pn = vpn, *pm = vpm; - uint32_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ) { - uint64_t pa = pn[row >> 4]; - do { - if (pa & 1) { - uint32_t zn_row = zn[H4(row)]; - for (col = 0; col < oprsz; ) { - uint64_t pb = pm[col >> 4]; - do { - if (pb & 1) { - zda[tile_vslice_index(row) + H4(col)] += zn_row; - } - pb >>= 4; - } while (++col & 15); - } - } - pa >>= 4; - } while (++row & 15); - } -} - -void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint8_t *pn = vpn, *pm = vpm; - uint64_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ++row) { - if (pn[H1(row)] & 1) { - uint64_t zn_row = zn[row]; - for (col = 0; col < oprsz; ++col) { - if (pm[H1(col)] & 1) { - zda[tile_vslice_index(row) + col] += zn_row; - } - } - } - } -} - -void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) << 31; - 
uint16_t *pn = vpn, *pm = vpm; - float_status fpst; - - /* - * Make a copy of float_status because this operation does not - * update the cumulative fp exception status. It also produces - * default nans. - */ - fpst = *(float_status *)vst; - set_default_nan_mode(true, &fpst); - - for (row = 0; row < oprsz; ) { - uint16_t pa = pn[H2(row >> 4)]; - do { - if (pa & 1) { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg; - - for (col = 0; col < oprsz; ) { - uint16_t pb = pm[H2(col >> 4)]; - do { - if (pb & 1) { - uint32_t *a = vza_row + H1_4(col); - uint32_t *m = vzm + H1_4(col); - *a = float32_muladd(n, *m, *a, 0, vst); - } - col += 4; - pb >>= 4; - } while (col & 15); - } - } - row += 4; - pa >>= 4; - } while (row & 15); - } -} - -void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint64_t neg = (uint64_t)simd_data(desc) << 63; - uint64_t *za = vza, *zn = vzn, *zm = vzm; - uint8_t *pn = vpn, *pm = vpm; - float_status fpst = *(float_status *)vst; - - set_default_nan_mode(true, &fpst); - - for (row = 0; row < oprsz; ++row) { - if (pn[H1(row)] & 1) { - uint64_t *za_row = &za[tile_vslice_index(row)]; - uint64_t n = zn[row] ^ neg; - - for (col = 0; col < oprsz; ++col) { - if (pm[H1(col)] & 1) { - uint64_t *a = &za_row[col]; - *a = float64_muladd(n, zm[col], *a, 0, &fpst); - } - } - } - } -} - -/* - * Alter PAIR as needed for controlling predicates being false, - * and for NEG on an enabled row element. - */ -static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg) -{ - /* - * The pseudocode uses a conditional negate after the conditional zero. - * It is simpler here to unconditionally negate before conditional zero. - */ - pair ^= neg; - if (!(pg & 1)) { - pair &= 0xffff0000u; - } - if (!(pg & 4)) { - pair &= 0x0000ffffu; - } - return pair; -} - -static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, - float_status *s_std, float_status *s_odd) -{ - float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std); - float64 e1c = float16_to_float64(e1 >> 16, true, s_std); - float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std); - float64 e2c = float16_to_float64(e2 >> 16, true, s_std); - float64 t64; - float32 t32; - - /* - * The ARM pseudocode function FPDot performs both multiplies - * and the add with a single rounding operation. Emulate this - * by performing the first multiply in round-to-odd, then doing - * the second multiply as fused multiply-add, and rounding to - * float32 all in one step. - */ - t64 = float64_mul(e1r, e2r, s_odd); - t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std); - - /* This conversion is exact, because we've already rounded. */ - t32 = float64_to_float32(t64, s_std); - - /* The final accumulation step is not fused. */ - return float32_add(sum, t32, s_std); -} - -void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) * 0x80008000u; - uint16_t *pn = vpn, *pm = vpm; - float_status fpst_odd, fpst_std; - - /* - * Make a copy of float_status because this operation does not - * update the cumulative fp exception status. It also produces - * default nans. Make a second copy with round-to-odd -- see above. 
- */ - fpst_std = *(float_status *)vst; - set_default_nan_mode(true, &fpst_std); - fpst_odd = fpst_std; - set_float_rounding_mode(float_round_to_odd, &fpst_odd); - - for (row = 0; row < oprsz; ) { - uint16_t prow = pn[H2(row >> 4)]; - do { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - - n = f16mop_adj_pair(n, prow, neg); - - for (col = 0; col < oprsz; ) { - uint16_t pcol = pm[H2(col >> 4)]; - do { - if (prow & pcol & 0b0101) { - uint32_t *a = vza_row + H1_4(col); - uint32_t m = *(uint32_t *)(vzm + H1_4(col)); - - m = f16mop_adj_pair(m, pcol, 0); - *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd); - - col += 4; - pcol >>= 4; - } - } while (col & 15); - } - row += 4; - prow >>= 4; - } while (row & 15); - } -} - -void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) * 0x80008000u; - uint16_t *pn = vpn, *pm = vpm; - - for (row = 0; row < oprsz; ) { - uint16_t prow = pn[H2(row >> 4)]; - do { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - - n = f16mop_adj_pair(n, prow, neg); - - for (col = 0; col < oprsz; ) { - uint16_t pcol = pm[H2(col >> 4)]; - do { - if (prow & pcol & 0b0101) { - uint32_t *a = vza_row + H1_4(col); - uint32_t m = *(uint32_t *)(vzm + H1_4(col)); - - m = f16mop_adj_pair(m, pcol, 0); - *a = bfdotadd(*a, n, m); - - col += 4; - pcol >>= 4; - } - } while (col & 15); - } - row += 4; - prow >>= 4; - } while (row & 15); - } -} - -typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool); - -static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm, - uint8_t *pn, uint8_t *pm, - uint32_t desc, IMOPFn *fn) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - bool neg = simd_data(desc); - - for (row = 0; row < oprsz; ++row) { - uint8_t pa = pn[H1(row)]; - uint64_t *za_row = &za[tile_vslice_index(row)]; - uint64_t n = zn[row]; - - for (col = 0; col < oprsz; ++col) { - uint8_t pb = pm[H1(col)]; - uint64_t *a = &za_row[col]; - - *a = fn(n, zm[col], *a, pa & pb, neg); - } - } -} - -#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \ -static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ -{ \ - uint32_t sum0 = 0, sum1 = 0; \ - /* Apply P to N as a mask, making the inactive elements 0. */ \ - n &= expand_pred_b(p); \ - sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \ - sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \ - sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \ - sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ - sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \ - if (neg) { \ - sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \ - } else { \ - sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \ - } \ - return ((uint64_t)sum1 << 32) | sum0; \ -} - -#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \ -static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ -{ \ - uint64_t sum = 0; \ - /* Apply P to N as a mask, making the inactive elements 0. */ \ - n &= expand_pred_h(p); \ - sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ - return neg ? 
a - sum : a + sum; \ -} - -DEF_IMOP_32(smopa_s, int8_t, int8_t) -DEF_IMOP_32(umopa_s, uint8_t, uint8_t) -DEF_IMOP_32(sumopa_s, int8_t, uint8_t) -DEF_IMOP_32(usmopa_s, uint8_t, int8_t) - -DEF_IMOP_64(smopa_d, int16_t, int16_t) -DEF_IMOP_64(umopa_d, uint16_t, uint16_t) -DEF_IMOP_64(sumopa_d, int16_t, uint16_t) -DEF_IMOP_64(usmopa_d, uint16_t, int16_t) - -#define DEF_IMOPH(NAME) \ - void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \ - void *vpm, uint32_t desc) \ - { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); } - -DEF_IMOPH(smopa_s) -DEF_IMOPH(umopa_s) -DEF_IMOPH(sumopa_s) -DEF_IMOPH(usmopa_s) -DEF_IMOPH(smopa_d) -DEF_IMOPH(umopa_d) -DEF_IMOPH(sumopa_d) -DEF_IMOPH(usmopa_d) diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c deleted file mode 100644 index 521fc9b..0000000 --- a/target/arm/sve_helper.c +++ /dev/null @@ -1,7483 +0,0 @@ -/* - * ARM SVE Operations - * - * Copyright (c) 2018 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "fpu/softfloat.h" -#include "tcg/tcg.h" -#include "vec_internal.h" -#include "sve_ldst_internal.h" - - -/* Return a value for NZCV as per the ARM PredTest pseudofunction. - * - * The return value has bit 31 set if N is set, bit 1 set if Z is clear, - * and bit 0 set if C is set. Compare the definitions of these variables - * within CPUARMState. - */ - -/* For no G bits set, NZCV = C. */ -#define PREDTEST_INIT 1 - -/* This is an iterative function, called for each Pd and Pg word - * moving forward. - */ -static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) -{ - if (likely(g)) { - /* Compute N from first D & G. - Use bit 2 to signal first G bit seen. */ - if (!(flags & 4)) { - flags |= ((d & (g & -g)) != 0) << 31; - flags |= 4; - } - - /* Accumulate Z from each D & G. */ - flags |= ((d & g) != 0) << 1; - - /* Compute C from last !(D & G). Replace previous. */ - flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); - } - return flags; -} - -/* This is an iterative function, called for each Pd and Pg word - * moving backward. - */ -static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) -{ - if (likely(g)) { - /* Compute C from first (i.e last) !(D & G). - Use bit 2 to signal first G bit seen. */ - if (!(flags & 4)) { - flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ - flags |= (d & pow2floor(g)) == 0; - } - - /* Accumulate Z from each D & G. */ - flags |= ((d & g) != 0) << 1; - - /* Compute N from last (i.e first) D & G. Replace previous. */ - flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); - } - return flags; -} - -/* The same for a single word predicate. 
*/ -uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) -{ - return iter_predtest_fwd(d, g, PREDTEST_INIT); -} - -/* The same for a multi-word predicate. */ -uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) -{ - uint32_t flags = PREDTEST_INIT; - uint64_t *d = vd, *g = vg; - uintptr_t i = 0; - - do { - flags = iter_predtest_fwd(d[i], g[i], flags); - } while (++i < words); - - return flags; -} - -/* Similarly for single word elements. */ -static inline uint64_t expand_pred_s(uint8_t byte) -{ - static const uint64_t word[] = { - [0x01] = 0x00000000ffffffffull, - [0x10] = 0xffffffff00000000ull, - [0x11] = 0xffffffffffffffffull, - }; - return word[byte & 0x11]; -} - -#define LOGICAL_PPPP(NAME, FUNC) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - uintptr_t opr_sz = simd_oprsz(desc); \ - uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ - uintptr_t i; \ - for (i = 0; i < opr_sz / 8; ++i) { \ - d[i] = FUNC(n[i], m[i], g[i]); \ - } \ -} - -#define DO_AND(N, M, G) (((N) & (M)) & (G)) -#define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) -#define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) -#define DO_ORR(N, M, G) (((N) | (M)) & (G)) -#define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) -#define DO_NOR(N, M, G) (~((N) | (M)) & (G)) -#define DO_NAND(N, M, G) (~((N) & (M)) & (G)) -#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) - -LOGICAL_PPPP(sve_and_pppp, DO_AND) -LOGICAL_PPPP(sve_bic_pppp, DO_BIC) -LOGICAL_PPPP(sve_eor_pppp, DO_EOR) -LOGICAL_PPPP(sve_sel_pppp, DO_SEL) -LOGICAL_PPPP(sve_orr_pppp, DO_ORR) -LOGICAL_PPPP(sve_orn_pppp, DO_ORN) -LOGICAL_PPPP(sve_nor_pppp, DO_NOR) -LOGICAL_PPPP(sve_nand_pppp, DO_NAND) - -#undef DO_AND -#undef DO_BIC -#undef DO_EOR -#undef DO_ORR -#undef DO_ORN -#undef DO_NOR -#undef DO_NAND -#undef DO_SEL -#undef LOGICAL_PPPP - -/* Fully general three-operand expander, controlled by a predicate. - * This is complicated by the host-endian storage of the register file. - */ -/* ??? I don't expect the compiler could ever vectorize this itself. - * With some tables we can convert bit masks to byte masks, and with - * extra care wrt byte/word ordering we could use gcc generic vectors - * and do 16 bytes at a time. - */ -#define DO_ZPZZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZZ_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE nn = n[i], mm = m[i]; \ - d[i] = OP(nn, mm); \ - } \ - } \ -} - -#define DO_AND(N, M) (N & M) -#define DO_EOR(N, M) (N ^ M) -#define DO_ORR(N, M) (N | M) -#define DO_BIC(N, M) (N & ~M) -#define DO_ADD(N, M) (N + M) -#define DO_SUB(N, M) (N - M) -#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) -#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) -#define DO_ABD(N, M) ((N) >= (M) ? 
(N) - (M) : (M) - (N)) -#define DO_MUL(N, M) (N * M) - - -/* - * We must avoid the C undefined behaviour cases: division by - * zero and signed division of INT_MIN by -1. Both of these - * have architecturally defined required results for Arm. - * We special case all signed divisions by -1 to avoid having - * to deduce the minimum integer for the type involved. - */ -#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) -#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) - -DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) -DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) -DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) -DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) - -DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) -DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) -DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) -DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) - -DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) -DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) -DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) -DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) - -DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) -DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) -DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) -DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) - -DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) -DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) -DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) -DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) - -DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) -DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) -DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) -DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) - -DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) -DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) -DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) -DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) - -DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) -DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) -DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) -DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) - -DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) -DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) -DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) -DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) - -DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) -DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) -DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) -DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) - -DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) -DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) -DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) -DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) - -DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) -DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) -DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) -DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) - -/* Because the computation type is at least twice as large as required, - these work for both signed and unsigned source types. 
*/ -static inline uint8_t do_mulh_b(int32_t n, int32_t m) -{ - return (n * m) >> 8; -} - -static inline uint16_t do_mulh_h(int32_t n, int32_t m) -{ - return (n * m) >> 16; -} - -static inline uint32_t do_mulh_s(int64_t n, int64_t m) -{ - return (n * m) >> 32; -} - -static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) -{ - uint64_t lo, hi; - muls64(&lo, &hi, n, m); - return hi; -} - -static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) -{ - uint64_t lo, hi; - mulu64(&lo, &hi, n, m); - return hi; -} - -DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) -DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) -DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) -DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) - -DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) -DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) -DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) -DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) - -DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) -DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) -DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) -DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) - -DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) -DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) - -DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) -DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) - -/* Note that all bits of the shift are significant - and not modulo the element size. */ -#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) -#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) -#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) - -DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) -DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) -DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) - -DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) -DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) -DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) - -DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) -DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) -DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) - -DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) -DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) -DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) - -static inline uint16_t do_sadalp_h(int16_t n, int16_t m) -{ - int8_t n1 = n, n2 = n >> 8; - return m + n1 + n2; -} - -static inline uint32_t do_sadalp_s(int32_t n, int32_t m) -{ - int16_t n1 = n, n2 = n >> 16; - return m + n1 + n2; -} - -static inline uint64_t do_sadalp_d(int64_t n, int64_t m) -{ - int32_t n1 = n, n2 = n >> 32; - return m + n1 + n2; -} - -DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) -DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) -DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) - -static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) -{ - uint8_t n1 = n, n2 = n >> 8; - return m + n1 + n2; -} - -static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) -{ - uint16_t n1 = n, n2 = n >> 16; - return m + n1 + n2; -} - -static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) -{ - uint32_t n1 = n, n2 = n >> 32; - return m + n1 + n2; -} - -DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) -DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) -DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) - -#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL) -#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) -#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) -#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) - -DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) -DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, 
H1_2, do_srshl_h) -DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) -DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) - -#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) -#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) -#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) -#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) - -DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) -DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) -DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) -DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) - -/* - * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. - * We pass in a pointer to a dummy saturation field to trigger - * the saturating arithmetic but discard the information about - * whether it has occurred. - */ -#define do_sqshl_b(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) -#define do_sqshl_h(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) -#define do_sqshl_s(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) -#define do_sqshl_d(n, m) \ - ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) - -DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) -DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) -DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) -DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) - -#define do_uqshl_b(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) -#define do_uqshl_h(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) -#define do_uqshl_s(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) -#define do_uqshl_d(n, m) \ - ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) - -DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) -DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) -DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) -DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) - -#define do_sqrshl_b(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) -#define do_sqrshl_h(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) -#define do_sqrshl_s(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) -#define do_sqrshl_d(n, m) \ - ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) - -DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) -DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) -DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) -DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) - -#undef do_sqrshl_d - -#define do_uqrshl_b(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) -#define do_uqrshl_h(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) -#define do_uqrshl_s(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) -#define do_uqrshl_d(n, m) \ - ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); }) - -DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) -DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) -DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) -DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) - -#undef do_uqrshl_d - -#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) -#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) - -DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) -DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 
-DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) -DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) - -DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) -DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) -DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) -DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) - -#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) -#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) - -DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) -DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) -DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) -DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) - -DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) -DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) -DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) -DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) - -#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) -#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) - -DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) -DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) -DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) -DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) - -DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) -DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) -DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) -DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) - -static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) -{ - return val >= max ? max : val <= min ? min : val; -} - -#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) - -static inline int64_t do_sqadd_d(int64_t n, int64_t m) -{ - int64_t r = n + m; - if (((r ^ n) & ~(n ^ m)) < 0) { - /* Signed overflow. */ - return r < 0 ? INT64_MAX : INT64_MIN; - } - return r; -} - -DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) -DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) -DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) -DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) - -#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) -#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) -#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) - -static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) -{ - uint64_t r = n + m; - return r < n ? UINT64_MAX : r; -} - -DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) -DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) -DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) -DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) - -#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) -#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) -#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) - -static inline int64_t do_sqsub_d(int64_t n, int64_t m) -{ - int64_t r = n - m; - if (((r ^ n) & (n ^ m)) < 0) { - /* Signed overflow. */ - return r < 0 ? 
INT64_MAX : INT64_MIN; - } - return r; -} - -DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) -DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) -DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) -DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) - -#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) -#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) -#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) - -static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) -{ - return n > m ? n - m : 0; -} - -DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) -DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) -DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) -DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) - -#define DO_SUQADD_B(n, m) \ - do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SUQADD_H(n, m) \ - do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SUQADD_S(n, m) \ - do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) - -static inline int64_t do_suqadd_d(int64_t n, uint64_t m) -{ - uint64_t r = n + m; - - if (n < 0) { - /* Note that m - abs(n) cannot underflow. */ - if (r > INT64_MAX) { - /* Result is either very large positive or negative. */ - if (m > -n) { - /* m > abs(n), so r is a very large positive. */ - return INT64_MAX; - } - /* Result is negative. */ - } - } else { - /* Both inputs are positive: check for overflow. */ - if (r < m || r > INT64_MAX) { - return INT64_MAX; - } - } - return r; -} - -DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) -DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) -DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) -DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) - -#define DO_USQADD_B(n, m) \ - do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) -#define DO_USQADD_H(n, m) \ - do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) -#define DO_USQADD_S(n, m) \ - do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) - -static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) -{ - uint64_t r = n + m; - - if (m < 0) { - return n < -m ? 0 : r; - } - return r < n ? UINT64_MAX : r; -} - -DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) -DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) -DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) -DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) - -#undef DO_ZPZZ -#undef DO_ZPZZ_D - -/* - * Three operand expander, operating on element pairs. - * If the slot I is even, the elements from from VN {I, I+1}. - * If the slot I is odd, the elements from from VM {I-1, I}. - * Load all of the input elements in each pair before overwriting output. - */ -#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPE n0 = *(TYPE *)(vn + H(i)); \ - TYPE m0 = *(TYPE *)(vm + H(i)); \ - TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(n0, n1); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(m0, m1); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. 
*/ -#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 2) { \ - TYPE n0 = n[i], n1 = n[i + 1]; \ - TYPE m0 = m[i], m1 = m[i + 1]; \ - if (pg[H1(i)] & 1) { \ - d[i] = OP(n0, n1); \ - } \ - if (pg[H1(i + 1)] & 1) { \ - d[i + 1] = OP(m0, m1); \ - } \ - } \ -} - -DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) -DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) -DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) -DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) - -DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) -DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) -DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) -DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) - -DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) -DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) -DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) -DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) - -DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) -DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) -DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) -DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) - -DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) -DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) -DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) -DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) - -#undef DO_ZPZZ_PAIR -#undef DO_ZPZZ_PAIR_D - -#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPE n0 = *(TYPE *)(vn + H(i)); \ - TYPE m0 = *(TYPE *)(vm + H(i)); \ - TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) - -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) - -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) - -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) - -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) - -#undef DO_ZPZZ_PAIR_FP - -/* Three-operand expander, controlled by a predicate, in which the - * third operand is "wide". 
That is, for D = N op M, the same 64-bit - * value of M is used with all of the narrower values of N. - */ -#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ - TYPEW mm = *(TYPEW *)(vm + i); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 7); \ - } \ -} - -DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) -DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) -DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) - -DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) -DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) -DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) - -DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) -DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) -DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) - -#undef DO_ZPZW - -/* Fully general two-operand expander, controlled by a predicate. - */ -#define DO_ZPZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZ_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE nn = n[i]; \ - d[i] = OP(nn); \ - } \ - } \ -} - -#define DO_CLS_B(N) (clrsb32(N) - 24) -#define DO_CLS_H(N) (clrsb32(N) - 16) - -DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) -DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) -DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) -DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) - -#define DO_CLZ_B(N) (clz32(N) - 24) -#define DO_CLZ_H(N) (clz32(N) - 16) - -DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) -DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) -DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) -DO_ZPZ_D(sve_clz_d, uint64_t, clz64) - -DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) -DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) -DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) -DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) - -#define DO_CNOT(N) (N == 0) - -DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) -DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) -DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) -DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) - -#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) - -DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) -DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) -DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) - -#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) - -DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) -DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) -DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) - -#define DO_NOT(N) (~N) - -DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) -DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) -DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) -DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) - -#define DO_SXTB(N) ((int8_t)N) -#define DO_SXTH(N) ((int16_t)N) -#define DO_SXTS(N) ((int32_t)N) -#define 
DO_UXTB(N) ((uint8_t)N) -#define DO_UXTH(N) ((uint16_t)N) -#define DO_UXTS(N) ((uint32_t)N) - -DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) -DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) -DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) -DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) -DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) -DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) - -DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) -DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) -DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) -DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) -DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) -DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) - -#define DO_ABS(N) (N < 0 ? -N : N) - -DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) -DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) -DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) -DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) - -#define DO_NEG(N) (-N) - -DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) -DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) -DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) -DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) - -DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) -DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) -DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) - -DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) -DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) - -DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) - -void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 2) { - if (pg[H1(i)] & 1) { - uint64_t n0 = n[i + 0]; - uint64_t n1 = n[i + 1]; - d[i + 0] = n1; - d[i + 1] = n0; - } - } -} - -DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) -DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) -DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) -DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) - -#define DO_SQABS(X) \ - ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ - x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) - -DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) -DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) -DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) -DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) - -#define DO_SQNEG(X) \ - ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ - x_ == min_ ? -min_ - 1 : -x_; }) - -DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) -DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) -DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) -DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) - -DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) -DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) - -/* Three-operand expander, unpredicated, in which the third operand is "wide". 
- */ -#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - TYPEW mm = *(TYPEW *)(vm + i); \ - do { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm); \ - i += sizeof(TYPE); \ - } while (i & 7); \ - } \ -} - -DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) -DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) -DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) - -DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) -DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) -DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) - -DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) -DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) -DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) - -#undef DO_ZZW - -#undef DO_CLS_B -#undef DO_CLS_H -#undef DO_CLZ_B -#undef DO_CLZ_H -#undef DO_CNOT -#undef DO_FABS -#undef DO_FNEG -#undef DO_ABS -#undef DO_NEG -#undef DO_ZPZ -#undef DO_ZPZ_D - -/* - * Three-operand expander, unpredicated, in which the two inputs are - * selected from the top or bottom half of the wide column. - */ -#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ - } \ -} - -DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) -DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) -DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) - -DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) -DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) -DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) -DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) -DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) - -DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) -DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) -DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) -DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) - -DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) -DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) - -/* Note that the multiply cannot overflow, but the doubling can. 
*/ -static inline int16_t do_sqdmull_h(int16_t n, int16_t m) -{ - int16_t val = n * m; - return DO_SQADD_H(val, val); -} - -static inline int32_t do_sqdmull_s(int32_t n, int32_t m) -{ - int32_t val = n * m; - return DO_SQADD_S(val, val); -} - -static inline int64_t do_sqdmull_d(int64_t n, int64_t m) -{ - int64_t val = n * m; - return do_sqadd_d(val, val); -} - -DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) -DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) -DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) - -#undef DO_ZZZ_TB - -#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ - } \ -} - -DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) -DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) -DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) - -DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) -DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) -DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) - -#undef DO_ZZZ_WTB - -#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ - intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ - for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ - TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ - *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ - } \ -} - -DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) - -#undef DO_ZZZ_NTB - -#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ - TYPEW aa = *(TYPEW *)(va + HW(i)); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ - } \ -} - -DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) -DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) -DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) -DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) -DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) -DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 
-DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) - -DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) -DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) - -#define DO_NMUL(N, M) -(N * M) - -DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) -DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) -DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) - -DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) -DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) -DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) - -#undef DO_ZZZW_ACC - -#define DO_XTNB(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ - *(TYPE *)(vd + i) = nn; \ - } \ -} - -#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - *(TYPEN *)(vd + i + odd) = OP(nn); \ - } \ -} - -#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) -#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) -#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) - -DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) -DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) -DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) - -DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) -DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) -DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) - -#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) -#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) -#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) - -DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) -DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) -DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) - -DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) - -DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) -DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) -DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) - -DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) - -#undef DO_XTNB -#undef DO_XTNT - -void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); - uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t *a = va, *n = vn; - uint64_t *d = vd, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - uint32_t e1 = a[2 * i + H4(0)]; - uint32_t e2 = n[2 * i + sel] ^ inv; - uint64_t c = extract64(m[i], 32, 1); - /* Compute and store the entire 33-bit result at once. 
*/ - d[i] = c + e1 + e2; - } -} - -void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int sel = extract32(desc, SIMD_DATA_SHIFT, 1); - uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint64_t *d = vd, *a = va, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; i += 2) { - Int128 e1 = int128_make64(a[i]); - Int128 e2 = int128_make64(n[i + sel] ^ inv); - Int128 c = int128_make64(m[i + 1] & 1); - Int128 r = int128_add(int128_add(e1, e2), c); - d[i + 0] = int128_getlo(r); - d[i + 1] = int128_gethi(r); - } -} - -#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ - TYPEW aa = *(TYPEW *)(va + HW(i)); \ - *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ - } \ -} - -DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, - do_sqdmull_h, DO_SQADD_H) -DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, - do_sqdmull_s, DO_SQADD_S) -DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, - do_sqdmull_d, do_sqadd_d) - -DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, - do_sqdmull_h, DO_SQSUB_H) -DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, - do_sqdmull_s, DO_SQSUB_S) -DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, - do_sqdmull_d, do_sqsub_d) - -#undef DO_SQDMLAL - -#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ - int rot = simd_data(desc); \ - int sel_a = rot & 1, sel_b = sel_a ^ 1; \ - bool sub_r = rot == 1 || rot == 2; \ - bool sub_i = rot >= 2; \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - for (i = 0; i < opr_sz; i += 2) { \ - TYPE elt1_a = n[H(i + sel_a)]; \ - TYPE elt2_a = m[H(i + sel_a)]; \ - TYPE elt2_b = m[H(i + sel_b)]; \ - d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ - d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ - } \ -} - -#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) - -DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) - -#define DO_SQRDMLAH_B(N, M, A, S) \ - do_sqrdmlah_b(N, M, A, S, true) -#define DO_SQRDMLAH_H(N, M, A, S) \ - ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) -#define DO_SQRDMLAH_S(N, M, A, S) \ - ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) -#define DO_SQRDMLAH_D(N, M, A, S) \ - do_sqrdmlah_d(N, M, A, S, true) - -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) - -#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ - int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ - int sel_a = rot & 1, sel_b = sel_a ^ 1; \ - bool sub_r = rot == 1 || rot == 2; \ - bool sub_i = rot >= 2; \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ - TYPE elt2_a = m[H(i + idx + sel_a)]; \ - TYPE elt2_b = m[H(i + idx + sel_b)]; \ - for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ - TYPE elt1_a = n[H(i + j + sel_a)]; \ - d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ - d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ - } \ - } \ -} - -DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) -DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) - -DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) -DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) - -#undef DO_CMLA -#undef DO_CMLA_FUNC -#undef DO_CMLA_IDX_FUNC -#undef DO_SQRDMLAH_B -#undef DO_SQRDMLAH_H -#undef DO_SQRDMLAH_S -#undef DO_SQRDMLAH_D - -/* Note N and M are 4 elements bundled into one unit. */ -static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, - int sel_a, int sel_b, int sub_i) -{ - for (int i = 0; i <= 1; i++) { - int32_t elt1_r = (int8_t)(n >> (16 * i)); - int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); - int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); - int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); - - a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; - } - return a; -} - -static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, - int sel_a, int sel_b, int sub_i) -{ - for (int i = 0; i <= 1; i++) { - int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); - int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); - int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); - int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); - - a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; - } - return a; -} - -void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int rot = simd_data(desc); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); - uint32_t *d = vd, *n = vn, *m = vm, *a = va; - - for (int e = 0; e < opr_sz / 4; e++) { - d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); - } -} - -void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int rot = simd_data(desc); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? -1 : 1); - uint64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (int e = 0; e < opr_sz / 8; e++) { - d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); - } -} - -void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int rot = extract32(desc, SIMD_DATA_SHIFT, 2); - int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? -1 : 1); - uint32_t *d = vd, *n = vn, *m = vm, *a = va; - - for (int seg = 0; seg < opr_sz / 4; seg += 4) { - uint32_t seg_m = m[seg + idx]; - for (int e = 0; e < 4; e++) { - d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], - sel_a, sel_b, sub_i); - } - } -} - -void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int seg, opr_sz = simd_oprsz(desc); - int rot = extract32(desc, SIMD_DATA_SHIFT, 2); - int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? -1 : 1); - uint64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (seg = 0; seg < opr_sz / 8; seg += 2) { - uint64_t seg_m = m[seg + idx]; - for (int e = 0; e < 2; e++) { - d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], - sel_a, sel_b, sub_i); - } - } -} - -#define DO_ZZXZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ - intptr_t i, j, idx = simd_data(desc); \ - TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[i]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = OP(n[i + j], mm, a[i + j]); \ - } \ - } \ -} - -#define DO_SQRDMLAH_H(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) -#define DO_SQRDMLAH_S(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) -#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) - -DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) -DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) -DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) - -#define DO_SQRDMLSH_H(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) -#define DO_SQRDMLSH_S(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) -#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) - -DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) -DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) -DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) - -#undef DO_ZZXZ - -#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ - for (i = 0; i < oprsz; i += 16) { \ - TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ - for (j = 0; 
j < 16; j += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ - TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ - *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ - } \ - } \ -} - -#define DO_MLA(N, M, A) (A + N * M) - -DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) -DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) -DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) -DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) - -#define DO_MLS(N, M, A) (A - N * M) - -DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) -DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) -DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) -DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) - -#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) -#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) - -DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) -DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) - -#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) -#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) - -DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) -DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) - -#undef DO_MLA -#undef DO_MLS -#undef DO_ZZXW - -#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ - for (i = 0; i < oprsz; i += 16) { \ - TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ - for (j = 0; j < 16; j += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ - *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ - } \ - } \ -} - -DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) -DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) - -DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) - -DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) - -#undef DO_ZZX - -#define DO_BITPERM(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - TYPE mm = *(TYPE *)(vm + i); \ - *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ - } \ -} - -static uint64_t bitextract(uint64_t data, uint64_t mask, int n) -{ - uint64_t res = 0; - int db, rb = 0; - - for (db = 0; db < n; ++db) { - if ((mask >> db) & 1) { - res |= ((data >> db) & 1) << rb; - ++rb; - } - } - return res; -} - -DO_BITPERM(sve2_bext_b, uint8_t, bitextract) -DO_BITPERM(sve2_bext_h, uint16_t, bitextract) -DO_BITPERM(sve2_bext_s, uint32_t, bitextract) -DO_BITPERM(sve2_bext_d, uint64_t, bitextract) - -static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) -{ - uint64_t res = 0; - int rb, db = 0; - - for (rb = 0; rb < n; ++rb) { - if ((mask >> rb) & 1) { - res |= ((data >> db) & 1) << rb; - ++db; - } - } - return res; -} - -DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) -DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) -DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) -DO_BITPERM(sve2_bdep_d, uint64_t, 
bitdeposit) - -static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) -{ - uint64_t resm = 0, resu = 0; - int db, rbm = 0, rbu = 0; - - for (db = 0; db < n; ++db) { - uint64_t val = (data >> db) & 1; - if ((mask >> db) & 1) { - resm |= val << rbm++; - } else { - resu |= val << rbu++; - } - } - - return resm | (resu << rbm); -} - -DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) -DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) -DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) -DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) - -#undef DO_BITPERM - -#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sub_r = simd_data(desc); \ - if (sub_r) { \ - for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ - TYPE acc_r = *(TYPE *)(vn + H(i)); \ - TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE el2_r = *(TYPE *)(vm + H(i)); \ - TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - acc_r = ADD_OP(acc_r, el2_i); \ - acc_i = SUB_OP(acc_i, el2_r); \ - *(TYPE *)(vd + H(i)) = acc_r; \ - *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ - } \ - } else { \ - for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ - TYPE acc_r = *(TYPE *)(vn + H(i)); \ - TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE el2_r = *(TYPE *)(vm + H(i)); \ - TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - acc_r = SUB_OP(acc_r, el2_i); \ - acc_i = ADD_OP(acc_i, el2_r); \ - *(TYPE *)(vd + H(i)) = acc_r; \ - *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ - } \ - } \ -} - -DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) - -DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) -DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) -DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) -DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) - -#undef DO_CADD - -#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ - int shift = simd_data(desc) >> 1; \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ - *(TYPEW *)(vd + HW(i)) = nn << shift; \ - } \ -} - -DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) -DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) -DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) - -DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) -DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) -DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) - -#undef DO_ZZI_SHLL - -/* Two-operand reduction expander, controlled by a predicate. - * The difference between TYPERED and TYPERET has to do with - * sign-extension. E.g. for SMAX, TYPERED must be signed, - * but TYPERET must be unsigned so that e.g. a 32-bit value - * is not sign-extended to the ABI uint64_t return type. - */ -/* ??? If we were to vectorize this by hand the reduction ordering - * would change. For integer operands, this is perfectly fine. 
- */ -#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ -uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPERED ret = INIT; \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ - ret = OP(ret, nn); \ - } \ - i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ - } while (i & 15); \ - } \ - return (TYPERET)ret; \ -} - -#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ -uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPEE *n = vn; \ - uint8_t *pg = vg; \ - TYPER ret = INIT; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPEE nn = n[i]; \ - ret = OP(ret, nn); \ - } \ - } \ - return ret; \ -} - -DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) -DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) -DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) -DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) - -DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) -DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) -DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) -DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) - -DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) -DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) -DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) -DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) - -DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) -DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) -DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) - -DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) -DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) -DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) -DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) - -DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) -DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) -DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) -DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) - -DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) -DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) -DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) -DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) - -DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) -DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) -DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) -DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) - -DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) -DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) -DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) -DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) - -#undef DO_VPZ -#undef DO_VPZ_D - -/* Two vector operand, one scalar operand, unpredicated. 
*/ -#define DO_ZZI(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ - TYPE s = s64, *d = vd, *n = vn; \ - for (i = 0; i < opr_sz; ++i) { \ - d[i] = OP(n[i], s); \ - } \ -} - -#define DO_SUBR(X, Y) (Y - X) - -DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) -DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) -DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) -DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) - -DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) -DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) -DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) -DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) - -DO_ZZI(sve_smini_b, int8_t, DO_MIN) -DO_ZZI(sve_smini_h, int16_t, DO_MIN) -DO_ZZI(sve_smini_s, int32_t, DO_MIN) -DO_ZZI(sve_smini_d, int64_t, DO_MIN) - -DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) -DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) -DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) -DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) - -DO_ZZI(sve_umini_b, uint8_t, DO_MIN) -DO_ZZI(sve_umini_h, uint16_t, DO_MIN) -DO_ZZI(sve_umini_s, uint32_t, DO_MIN) -DO_ZZI(sve_umini_d, uint64_t, DO_MIN) - -#undef DO_ZZI - -#undef DO_AND -#undef DO_ORR -#undef DO_EOR -#undef DO_BIC -#undef DO_ADD -#undef DO_SUB -#undef DO_MAX -#undef DO_MIN -#undef DO_ABD -#undef DO_MUL -#undef DO_DIV -#undef DO_ASR -#undef DO_LSR -#undef DO_LSL -#undef DO_SUBR - -/* Similar to the ARM LastActiveElement pseudocode function, except the - result is multiplied by the element size. This includes the not found - indication; e.g. not found for esz=3 is -8. */ -static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) -{ - uint64_t mask = pred_esz_masks[esz]; - intptr_t i = words; - - do { - uint64_t this_g = g[--i] & mask; - if (this_g) { - return i * 64 + (63 - clz64(this_g)); - } - } while (i > 0); - return (intptr_t)-1 << esz; -} - -uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - uint32_t flags = PREDTEST_INIT; - uint64_t *d = vd, *g = vg; - intptr_t i = 0; - - do { - uint64_t this_d = d[i]; - uint64_t this_g = g[i]; - - if (this_g) { - if (!(flags & 4)) { - /* Set in D the first bit of G. */ - this_d |= this_g & -this_g; - d[i] = this_d; - } - flags = iter_predtest_fwd(this_d, this_g, flags); - } - } while (++i < words); - - return flags; -} - -uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint32_t flags = PREDTEST_INIT; - uint64_t *d = vd, *g = vg, esz_mask; - intptr_t i, next; - - next = last_active_element(vd, words, esz) + (1 << esz); - esz_mask = pred_esz_masks[esz]; - - /* Similar to the pseudocode for pnext, but scaled by ESZ - so that we find the correct bit. */ - if (next < words * 64) { - uint64_t mask = -1; - - if (next & 63) { - mask = ~((1ull << (next & 63)) - 1); - next &= -64; - } - do { - uint64_t this_g = g[next / 64] & esz_mask & mask; - if (this_g != 0) { - next = (next & -64) + ctz64(this_g); - break; - } - next += 64; - mask = -1; - } while (next < words * 64); - } - - i = 0; - do { - uint64_t this_d = 0; - if (i == next / 64) { - this_d = 1ull << (next & 63); - } - d[i] = this_d; - flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); - } while (++i < words); - - return flags; -} - -/* - * Copy Zn into Zd, and store zero into inactive elements. - * If inv, store zeros into the active elements. 
- */ -void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t inv = -(uint64_t)(simd_data(desc) & 1); - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); - } -} - -void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t inv = -(uint64_t)(simd_data(desc) & 1); - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); - } -} - -void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t inv = -(uint64_t)(simd_data(desc) & 1); - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); - } -} - -void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - uint8_t inv = simd_data(desc); - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); - } -} - -/* Three-operand expander, immediate operand, controlled by a predicate. - */ -#define DO_ZPZI(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPE imm = simd_data(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, imm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZI_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn; \ - TYPE imm = simd_data(desc); \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE nn = n[i]; \ - d[i] = OP(nn, imm); \ - } \ - } \ -} - -#define DO_SHR(N, M) (N >> M) -#define DO_SHL(N, M) (N << M) - -/* Arithmetic shift right for division. This rounds negative numbers - toward zero as per signed division. Therefore before shifting, - when N is negative, add 2**M-1. */ -#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) - -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. 
*/ - return 0; - } -} - -DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) -DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) -DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) -DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) - -DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) -DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) -DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) -DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) - -DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) -DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) -DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) -DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) - -DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) -DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) -DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) -DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) - -/* SVE2 bitwise shift by immediate */ -DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) -DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) -DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) -DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) - -DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) -DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) -DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) -DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) - -DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) -DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) -DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) -DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) - -DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) -DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) -DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) -DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) - -#define do_suqrshl_b(n, m) \ - ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) -#define do_suqrshl_h(n, m) \ - ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) -#define do_suqrshl_s(n, m) \ - ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) -#define do_suqrshl_d(n, m) \ - ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) - -DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) -DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) -DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) -DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) - -#undef DO_ASRD -#undef DO_ZPZI -#undef DO_ZPZI_D - -#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + i); \ - *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ - } \ -} - -#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ - } \ -} - -DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) -DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) -DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) - -DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) -DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) -DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) - -DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) -DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) -DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) - -DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 
-DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) -DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) - -#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) -#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) -#define DO_SQSHRUN_D(x, sh) \ - do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX) - -DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) -DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) -DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) - -DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) -DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) -DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) - -#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) -#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) -#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) - -DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) -DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) -DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) - -DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) -DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) -DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) - -#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) -#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) -#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) - -DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) -DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) -DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) - -DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) -DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) -DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) - -#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) -#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) -#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) - -DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) -DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) -DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) - -DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) -DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) -DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) - -#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) -#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) -#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) - -DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) -DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) -DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) - -DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) -DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) -DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) - -#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) -#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) -#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) - -DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) -DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) -DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) - 
-DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) -DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) -DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) - -#undef DO_SHRNB -#undef DO_SHRNT - -#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + i); \ - TYPEW mm = *(TYPEW *)(vm + i); \ - *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ - } \ -} - -#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - TYPEW mm = *(TYPEW *)(vm + HW(i)); \ - *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ - } \ -} - -#define DO_ADDHN(N, M, SH) ((N + M) >> SH) -#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) -#define DO_SUBHN(N, M, SH) ((N - M) >> SH) -#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) - -DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) -DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) -DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) - -DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) -DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) -DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) - -DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) -DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) -DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) - -DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) -DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) -DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) - -DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) -DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) -DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) - -DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) -DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) -DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) - -DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) -DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) -DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) - -DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) -DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) -DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) - -#undef DO_RSUBHN -#undef DO_SUBHN -#undef DO_RADDHN -#undef DO_ADDHN - -#undef DO_BINOPNB - -/* Fully general four-operand expander, controlled by a predicate. 
- */ -#define DO_ZPZZZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ - void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - TYPE aa = *(TYPE *)(va + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZZZ_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ - void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *a = va, *n = vn, *m = vm; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE aa = a[i], nn = n[i], mm = m[i]; \ - d[i] = OP(aa, nn, mm); \ - } \ - } \ -} - -#define DO_MLA(A, N, M) (A + N * M) -#define DO_MLS(A, N, M) (A - N * M) - -DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) -DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) - -DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) -DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) - -DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) -DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) - -DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) -DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) - -#undef DO_MLA -#undef DO_MLS -#undef DO_ZPZZZ -#undef DO_ZPZZZ_D - -void HELPER(sve_index_b)(void *vd, uint32_t start, - uint32_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint8_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[H1(i)] = start + i * incr; - } -} - -void HELPER(sve_index_h)(void *vd, uint32_t start, - uint32_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 2; - uint16_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[H2(i)] = start + i * incr; - } -} - -void HELPER(sve_index_s)(void *vd, uint32_t start, - uint32_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[H4(i)] = start + i * incr; - } -} - -void HELPER(sve_index_d)(void *vd, uint64_t start, - uint64_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[i] = start + i * incr; - } -} - -void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t sh = simd_data(desc); - uint32_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + (m[i] << sh); - } -} - -void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t sh = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + (m[i] << sh); - } -} - -void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t sh = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); - } -} - -void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t sh = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); - } -} - -void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) -{ - /* These 
constants are cut-and-paste directly from the ARM pseudocode. */ - static const uint16_t coeff[] = { - 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, - 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, - 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, - 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / 2; - uint16_t *d = vd, *n = vn; - - for (i = 0; i < opr_sz; i++) { - uint16_t nn = n[i]; - intptr_t idx = extract32(nn, 0, 5); - uint16_t exp = extract32(nn, 5, 5); - d[i] = coeff[idx] | (exp << 10); - } -} - -void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) -{ - /* These constants are cut-and-paste directly from the ARM pseudocode. */ - static const uint32_t coeff[] = { - 0x000000, 0x0164d2, 0x02cd87, 0x043a29, - 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, - 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, - 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, - 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, - 0x1ef532, 0x20b051, 0x227043, 0x243516, - 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, - 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, - 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, - 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, - 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, - 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, - 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, - 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, - 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, - 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd, *n = vn; - - for (i = 0; i < opr_sz; i++) { - uint32_t nn = n[i]; - intptr_t idx = extract32(nn, 0, 6); - uint32_t exp = extract32(nn, 6, 8); - d[i] = coeff[idx] | (exp << 23); - } -} - -void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) -{ - /* These constants are cut-and-paste directly from the ARM pseudocode. 
*/ - static const uint64_t coeff[] = { - 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, - 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, - 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, - 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, - 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, - 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, - 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, - 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, - 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, - 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, - 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, - 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, - 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, - 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, - 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, - 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, - 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, - 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, - 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, - 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, - 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, - 0xFA7C1819E90D8ull, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - - for (i = 0; i < opr_sz; i++) { - uint64_t nn = n[i]; - intptr_t idx = extract32(nn, 0, 6); - uint64_t exp = extract32(nn, 6, 11); - d[i] = coeff[idx] | (exp << 52); - } -} - -void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 2; - uint16_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - uint16_t nn = n[i]; - uint16_t mm = m[i]; - if (mm & 1) { - nn = float16_one; - } - d[i] = nn ^ (mm & 2) << 14; - } -} - -void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - uint32_t nn = n[i]; - uint32_t mm = m[i]; - if (mm & 1) { - nn = float32_one; - } - d[i] = nn ^ (mm & 2) << 30; - } -} - -void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - if (mm & 1) { - nn = float64_one; - } - d[i] = nn ^ (mm & 2) << 62; - } -} - -/* - * Signed saturating addition with scalar operand. 
- */ - -void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int8_t)) { - *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); - } -} - -void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int16_t)) { - *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); - } -} - -void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int32_t)) { - *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); - } -} - -void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int64_t)) { - *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); - } -} - -/* - * Unsigned saturating addition with scalar operand. - */ - -void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint8_t)) { - *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); - } -} - -void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint16_t)) { - *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); - } -} - -void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint32_t)) { - *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); - } -} - -void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint64_t)) { - *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); - } -} - -void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint64_t)) { - *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); - } -} - -/* Two operand predicated copy immediate with merge. All valid immediates - * can fit within 17 signed bits in the simd_data field. 
- */ -void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - mm = dup_const(MO_8, mm); - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t pp = expand_pred_b(pg[H1(i)]); - d[i] = (mm & pp) | (nn & ~pp); - } -} - -void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - mm = dup_const(MO_16, mm); - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t pp = expand_pred_h(pg[H1(i)]); - d[i] = (mm & pp) | (nn & ~pp); - } -} - -void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - mm = dup_const(MO_32, mm); - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t pp = expand_pred_s(pg[H1(i)]); - d[i] = (mm & pp) | (nn & ~pp); - } -} - -void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - d[i] = (pg[H1(i)] & 1 ? mm : nn); - } -} - -void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - val = dup_const(MO_8, val); - for (i = 0; i < opr_sz; i += 1) { - d[i] = val & expand_pred_b(pg[H1(i)]); - } -} - -void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - val = dup_const(MO_16, val); - for (i = 0; i < opr_sz; i += 1) { - d[i] = val & expand_pred_h(pg[H1(i)]); - } -} - -void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - val = dup_const(MO_32, val); - for (i = 0; i < opr_sz; i += 1) { - d[i] = val & expand_pred_s(pg[H1(i)]); - } -} - -void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = (pg[H1(i)] & 1 ? val : 0); - } -} - -/* Big-endian hosts need to frob the byte indices. If the copy - * happens to be 8-byte aligned, then no frobbing necessary. 
- */ -static void swap_memmove(void *vd, void *vs, size_t n) -{ - uintptr_t d = (uintptr_t)vd; - uintptr_t s = (uintptr_t)vs; - uintptr_t o = (d | s | n) & 7; - size_t i; - -#if !HOST_BIG_ENDIAN - o = 0; -#endif - switch (o) { - case 0: - memmove(vd, vs, n); - break; - - case 4: - if (d < s || d >= s + n) { - for (i = 0; i < n; i += 4) { - *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); - } - } else { - for (i = n; i > 0; ) { - i -= 4; - *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); - } - } - break; - - case 2: - case 6: - if (d < s || d >= s + n) { - for (i = 0; i < n; i += 2) { - *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); - } - } else { - for (i = n; i > 0; ) { - i -= 2; - *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); - } - } - break; - - default: - if (d < s || d >= s + n) { - for (i = 0; i < n; i++) { - *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); - } - } else { - for (i = n; i > 0; ) { - i -= 1; - *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); - } - } - break; - } -} - -/* Similarly for memset of 0. */ -static void swap_memzero(void *vd, size_t n) -{ - uintptr_t d = (uintptr_t)vd; - uintptr_t o = (d | n) & 7; - size_t i; - - /* Usually, the first bit of a predicate is set, so N is 0. */ - if (likely(n == 0)) { - return; - } - -#if !HOST_BIG_ENDIAN - o = 0; -#endif - switch (o) { - case 0: - memset(vd, 0, n); - break; - - case 4: - for (i = 0; i < n; i += 4) { - *(uint32_t *)H1_4(d + i) = 0; - } - break; - - case 2: - case 6: - for (i = 0; i < n; i += 2) { - *(uint16_t *)H1_2(d + i) = 0; - } - break; - - default: - for (i = 0; i < n; i++) { - *(uint8_t *)H1(d + i) = 0; - } - break; - } -} - -void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t opr_sz = simd_oprsz(desc); - size_t n_ofs = simd_data(desc); - size_t n_siz = opr_sz - n_ofs; - - if (vd != vm) { - swap_memmove(vd, vn + n_ofs, n_siz); - swap_memmove(vd + n_siz, vm, n_ofs); - } else if (vd != vn) { - swap_memmove(vd + n_siz, vd, n_ofs); - swap_memmove(vd, vn + n_ofs, n_siz); - } else { - /* vd == vn == vm. Need temp space. 
*/ - ARMVectorReg tmp; - swap_memmove(&tmp, vm, n_ofs); - swap_memmove(vd, vd + n_ofs, n_siz); - memcpy(vd + n_siz, &tmp, n_ofs); - } -} - -#define DO_INSR(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ - *(TYPE *)(vd + H(0)) = val; \ -} - -DO_INSR(sve_insr_b, uint8_t, H1) -DO_INSR(sve_insr_h, uint16_t, H1_2) -DO_INSR(sve_insr_s, uint32_t, H1_4) -DO_INSR(sve_insr_d, uint64_t, H1_8) - -#undef DO_INSR - -void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = bswap64(b); - *(uint64_t *)(vd + j) = bswap64(f); - } -} - -void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = hswap64(b); - *(uint64_t *)(vd + j) = hswap64(f); - } -} - -void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = rol64(b, 32); - *(uint64_t *)(vd + j) = rol64(f, 32); - } -} - -void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = b; - *(uint64_t *)(vd + j) = f; - } -} - -typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); - -static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, - bool is_tbx, tb_impl_fn *fn) -{ - ARMVectorReg scratch; - uintptr_t oprsz = simd_oprsz(desc); - - if (unlikely(vd == vn)) { - vn = memcpy(&scratch, vn, oprsz); - } - - fn(vd, vn, NULL, vm, oprsz, is_tbx); -} - -static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, - uint32_t desc, bool is_tbx, tb_impl_fn *fn) -{ - ARMVectorReg scratch; - uintptr_t oprsz = simd_oprsz(desc); - - if (unlikely(vd == vn0)) { - vn0 = memcpy(&scratch, vn0, oprsz); - if (vd == vn1) { - vn1 = vn0; - } - } else if (unlikely(vd == vn1)) { - vn1 = memcpy(&scratch, vn1, oprsz); - } - - fn(vd, vn0, vn1, vm, oprsz, is_tbx); -} - -#define DO_TB(SUFF, TYPE, H) \ -static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ - void *vm, uintptr_t oprsz, bool is_tbx) \ -{ \ - TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ - uintptr_t i, nelem = oprsz / sizeof(TYPE); \ - for (i = 0; i < nelem; ++i) { \ - TYPE index = indexes[H1(i)], val = 0; \ - if (index < nelem) { \ - val = tbl0[H(index)]; \ - } else { \ - index -= nelem; \ - if (tbl1 && index < nelem) { \ - val = tbl1[H(index)]; \ - } else if (is_tbx) { \ - continue; \ - } \ - } \ - d[H(i)] = val; \ - } \ -} \ -void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ -} \ -void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ - void *vm, uint32_t desc) \ -{ \ - do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ -} \ -void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, 
uint32_t desc) \ -{ \ - do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ -} - -DO_TB(b, uint8_t, H1) -DO_TB(h, uint16_t, H2) -DO_TB(s, uint32_t, H4) -DO_TB(d, uint64_t, H8) - -#undef DO_TB - -#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPED *d = vd; \ - TYPES *n = vn; \ - ARMVectorReg tmp; \ - if (unlikely(vn - vd < opr_sz)) { \ - n = memcpy(&tmp, n, opr_sz / 2); \ - } \ - for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ - d[HD(i)] = n[HS(i)]; \ - } \ -} - -DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) -DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) -DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) - -DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) -DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) -DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) - -#undef DO_UNPK - -/* Mask of bits included in the even numbered predicates of width esz. - * We also use this for expand_bits/compress_bits, and so extend the - * same pattern out to 16-bit units. - */ -static const uint64_t even_bit_esz_masks[5] = { - 0x5555555555555555ull, - 0x3333333333333333ull, - 0x0f0f0f0f0f0f0f0full, - 0x00ff00ff00ff00ffull, - 0x0000ffff0000ffffull, -}; - -/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. - * For N==0, this corresponds to the operation that in qemu/bitops.h - * we call half_shuffle64; this algorithm is from Hacker's Delight, - * section 7-2 Shuffling Bits. - */ -static uint64_t expand_bits(uint64_t x, int n) -{ - int i; - - x &= 0xffffffffu; - for (i = 4; i >= n; i--) { - int sh = 1 << i; - x = ((x << sh) | x) & even_bit_esz_masks[i]; - } - return x; -} - -/* Compress units of 2**(N+1) bits to units of 2**N bits. - * For N==0, this corresponds to the operation that in qemu/bitops.h - * we call half_unshuffle64; this algorithm is from Hacker's Delight, - * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. - */ -static uint64_t compress_bits(uint64_t x, int n) -{ - int i; - - for (i = n; i <= 4; i++) { - int sh = 1 << i; - x &= even_bit_esz_masks[i]; - x = (x >> sh) | x; - } - return x & 0xffffffffu; -} - -void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); - int esize = 1 << esz; - uint64_t *d = vd; - intptr_t i; - - if (oprsz <= 8) { - uint64_t nn = *(uint64_t *)vn; - uint64_t mm = *(uint64_t *)vm; - int half = 4 * oprsz; - - nn = extract64(nn, high * half, half); - mm = extract64(mm, high * half, half); - nn = expand_bits(nn, esz); - mm = expand_bits(mm, esz); - d[0] = nn | (mm << esize); - } else { - ARMPredicateReg tmp; - - /* We produce output faster than we consume input. - Therefore we must be mindful of possible overlap. 
*/ - if (vd == vn) { - vn = memcpy(&tmp, vn, oprsz); - if (vd == vm) { - vm = vn; - } - } else if (vd == vm) { - vm = memcpy(&tmp, vm, oprsz); - } - if (high) { - high = oprsz >> 1; - } - - if ((oprsz & 7) == 0) { - uint32_t *n = vn, *m = vm; - high >>= 2; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[H4(high + i)]; - uint64_t mm = m[H4(high + i)]; - - nn = expand_bits(nn, esz); - mm = expand_bits(mm, esz); - d[i] = nn | (mm << esize); - } - } else { - uint8_t *n = vn, *m = vm; - uint16_t *d16 = vd; - - for (i = 0; i < oprsz / 2; i++) { - uint16_t nn = n[H1(high + i)]; - uint16_t mm = m[H1(high + i)]; - - nn = expand_bits(nn, esz); - mm = expand_bits(mm, esz); - d16[H2(i)] = nn | (mm << esize); - } - } - } -} - -void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t l, h; - intptr_t i; - - if (oprsz <= 8) { - l = compress_bits(n[0] >> odd, esz); - h = compress_bits(m[0] >> odd, esz); - d[0] = l | (h << (4 * oprsz)); - } else { - ARMPredicateReg tmp_m; - intptr_t oprsz_16 = oprsz / 16; - - if ((vm - vd) < (uintptr_t)oprsz) { - m = memcpy(&tmp_m, vm, oprsz); - } - - for (i = 0; i < oprsz_16; i++) { - l = n[2 * i + 0]; - h = n[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - d[i] = l | (h << 32); - } - - /* - * For VL which is not a multiple of 512, the results from M do not - * align nicely with the uint64_t for D. Put the aligned results - * from M into TMP_M and then copy it into place afterward. - */ - if (oprsz & 15) { - int final_shift = (oprsz & 15) * 2; - - l = n[2 * i + 0]; - h = n[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - d[i] = l | (h << final_shift); - - for (i = 0; i < oprsz_16; i++) { - l = m[2 * i + 0]; - h = m[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - tmp_m.p[i] = l | (h << 32); - } - l = m[2 * i + 0]; - h = m[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - tmp_m.p[i] = l | (h << final_shift); - - swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); - } else { - for (i = 0; i < oprsz_16; i++) { - l = m[2 * i + 0]; - h = m[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - d[oprsz_16 + i] = l | (h << 32); - } - } - } -} - -void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t mask; - int shr, shl; - intptr_t i; - - shl = 1 << esz; - shr = 0; - mask = even_bit_esz_masks[esz]; - if (odd) { - mask <<= shl; - shr = shl; - shl = 0; - } - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { - uint64_t nn = (n[i] & mask) >> shr; - uint64_t mm = (m[i] & mask) << shl; - d[i] = nn + mm; - } -} - -/* Reverse units of 2**N bits. 
*/ -static uint64_t reverse_bits_64(uint64_t x, int n) -{ - int i, sh; - - x = bswap64(x); - for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { - uint64_t mask = even_bit_esz_masks[i]; - x = ((x & mask) << sh) | ((x >> sh) & mask); - } - return x; -} - -static uint8_t reverse_bits_8(uint8_t x, int n) -{ - static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; - int i, sh; - - for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { - x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); - } - return x; -} - -void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - intptr_t i, oprsz_2 = oprsz / 2; - - if (oprsz <= 8) { - uint64_t l = *(uint64_t *)vn; - l = reverse_bits_64(l << (64 - 8 * oprsz), esz); - *(uint64_t *)vd = l; - } else if ((oprsz & 15) == 0) { - for (i = 0; i < oprsz_2; i += 8) { - intptr_t ih = oprsz - 8 - i; - uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); - uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); - *(uint64_t *)(vd + i) = h; - *(uint64_t *)(vd + ih) = l; - } - } else { - for (i = 0; i < oprsz_2; i += 1) { - intptr_t il = H1(i); - intptr_t ih = H1(oprsz - 1 - i); - uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); - uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); - *(uint8_t *)(vd + il) = h; - *(uint8_t *)(vd + ih) = l; - } - } -} - -void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); - uint64_t *d = vd; - intptr_t i; - - if (oprsz <= 8) { - uint64_t nn = *(uint64_t *)vn; - int half = 4 * oprsz; - - nn = extract64(nn, high * half, half); - nn = expand_bits(nn, 0); - d[0] = nn; - } else { - ARMPredicateReg tmp_n; - - /* We produce output faster than we consume input. - Therefore we must be mindful of possible overlap. */ - if ((vn - vd) < (uintptr_t)oprsz) { - vn = memcpy(&tmp_n, vn, oprsz); - } - if (high) { - high = oprsz >> 1; - } - - if ((oprsz & 7) == 0) { - uint32_t *n = vn; - high >>= 2; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[H4(high + i)]; - d[i] = expand_bits(nn, 0); - } - } else { - uint16_t *d16 = vd; - uint8_t *n = vn; - - for (i = 0; i < oprsz / 2; i++) { - uint16_t nn = n[H1(high + i)]; - d16[H2(i)] = expand_bits(nn, 0); - } - } - } -} - -#define DO_ZIP(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc); \ - intptr_t odd_ofs = simd_data(desc); \ - intptr_t i, oprsz_2 = oprsz / 2; \ - ARMVectorReg tmp_n, tmp_m; \ - /* We produce output faster than we consume input. \ - Therefore we must be mindful of possible overlap. 
*/ \ - if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ - vn = memcpy(&tmp_n, vn, oprsz); \ - } \ - if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ - vm = memcpy(&tmp_m, vm, oprsz); \ - } \ - for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ - *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ - *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ - *(TYPE *)(vm + odd_ofs + H(i)); \ - } \ - if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ - memset(vd + oprsz - 16, 0, 16); \ - } \ -} - -DO_ZIP(sve_zip_b, uint8_t, H1) -DO_ZIP(sve_zip_h, uint16_t, H1_2) -DO_ZIP(sve_zip_s, uint32_t, H1_4) -DO_ZIP(sve_zip_d, uint64_t, H1_8) -DO_ZIP(sve2_zip_q, Int128, ) - -#define DO_UZP(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc); \ - intptr_t odd_ofs = simd_data(desc); \ - intptr_t i, p; \ - ARMVectorReg tmp_m; \ - if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ - vm = memcpy(&tmp_m, vm, oprsz); \ - } \ - i = 0, p = odd_ofs; \ - do { \ - *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ - i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ - } while (p < oprsz); \ - p -= oprsz; \ - do { \ - *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ - i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ - } while (p < oprsz); \ - tcg_debug_assert(i == oprsz); \ -} - -DO_UZP(sve_uzp_b, uint8_t, H1) -DO_UZP(sve_uzp_h, uint16_t, H1_2) -DO_UZP(sve_uzp_s, uint32_t, H1_4) -DO_UZP(sve_uzp_d, uint64_t, H1_8) -DO_UZP(sve2_uzp_q, Int128, ) - -#define DO_TRN(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc); \ - intptr_t odd_ofs = simd_data(desc); \ - intptr_t i; \ - for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ - TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ - TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ - *(TYPE *)(vd + H(i + 0)) = ae; \ - *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ - } \ - if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ - memset(vd + oprsz - 16, 0, 16); \ - } \ -} - -DO_TRN(sve_trn_b, uint8_t, H1) -DO_TRN(sve_trn_h, uint16_t, H1_2) -DO_TRN(sve_trn_s, uint32_t, H1_4) -DO_TRN(sve_trn_d, uint64_t, H1_8) -DO_TRN(sve2_trn_q, Int128, ) - -#undef DO_ZIP -#undef DO_UZP -#undef DO_TRN - -void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = j = 0; i < opr_sz; i++) { - if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { - d[H4(j)] = n[H4(i)]; - j++; - } - } - for (; j < opr_sz; j++) { - d[H4(j)] = 0; - } -} - -void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = j = 0; i < opr_sz; i++) { - if (pg[H1(i)] & 1) { - d[j] = n[i]; - j++; - } - } - for (; j < opr_sz; j++) { - d[j] = 0; - } -} - -/* Similar to the ARM LastActiveElement pseudocode function, except the - * result is multiplied by the element size. This includes the not found - * indication; e.g. not found for esz=3 is -8. 
- */ -int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - - return last_active_element(vg, words, esz); -} - -void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) -{ - intptr_t opr_sz = simd_oprsz(desc) / 8; - int esz = simd_data(desc); - uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; - intptr_t i, first_i, last_i; - ARMVectorReg tmp; - - first_i = last_i = 0; - first_g = last_g = 0; - - /* Find the extent of the active elements within VG. */ - for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { - pg = *(uint64_t *)(vg + i) & mask; - if (pg) { - if (last_g == 0) { - last_g = pg; - last_i = i; - } - first_g = pg; - first_i = i; - } - } - - len = 0; - if (first_g != 0) { - first_i = first_i * 8 + ctz64(first_g); - last_i = last_i * 8 + 63 - clz64(last_g); - len = last_i - first_i + (1 << esz); - if (vd == vm) { - vm = memcpy(&tmp, vm, opr_sz * 8); - } - swap_memmove(vd, vn + first_i, len); - } - swap_memmove(vd + len, vm, opr_sz * 8 - len); -} - -void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - uint64_t pp = expand_pred_b(pg[H1(i)]); - d[i] = (nn & pp) | (mm & ~pp); - } -} - -void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - uint64_t pp = expand_pred_h(pg[H1(i)]); - d[i] = (nn & pp) | (mm & ~pp); - } -} - -void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - uint64_t pp = expand_pred_s(pg[H1(i)]); - d[i] = (nn & pp) | (mm & ~pp); - } -} - -void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - d[i] = (pg[H1(i)] & 1 ? nn : mm); - } -} - -void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 16; - Int128 *d = vd, *n = vn, *m = vm; - uint16_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = (pg[H2(i)] & 1 ? n : m)[i]; - } -} - -/* Two operand comparison controlled by a predicate. - * ??? It is very tempting to want to be able to expand this inline - * with x86 instructions, e.g. - * - * vcmpeqw zm, zn, %ymm0 - * vpmovmskb %ymm0, %eax - * and $0x5555, %eax - * and pg, %eax - * - * or even aarch64, e.g. - * - * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 - * cmeq v0.8h, zn, zm - * and v0.8h, v0.8h, mask - * addv h0, v0.8h - * and v0.8b, pg - * - * However, coming up with an abstraction that allows vector inputs and - * a scalar output, and also handles the byte-ordering of sub-uint64_t - * scalar outputs, is tricky. 
- */ -#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - uint32_t flags = PREDTEST_INIT; \ - intptr_t i = opr_sz; \ - do { \ - uint64_t out = 0, pg; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - out |= nn OP mm; \ - } while (i & 63); \ - pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ - out &= pg; \ - *(uint64_t *)(vd + (i >> 3)) = out; \ - flags = iter_predtest_bwd(out, pg, flags); \ - } while (i > 0); \ - return flags; \ -} - -#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) -#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) -#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) -#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) - -DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) -DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) -DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) -DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) - -DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) -DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) -DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) -DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) - -DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) -DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) -DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) -DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) - -DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) -DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) -DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) -DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) - -DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) -DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) -DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) -DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) - -DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) -DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) -DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) -DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) - -#undef DO_CMP_PPZZ_B -#undef DO_CMP_PPZZ_H -#undef DO_CMP_PPZZ_S -#undef DO_CMP_PPZZ_D -#undef DO_CMP_PPZZ - -/* Similar, but the second source is "wide". 
*/ -#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - uint32_t flags = PREDTEST_INIT; \ - intptr_t i = opr_sz; \ - do { \ - uint64_t out = 0, pg; \ - do { \ - TYPEW mm = *(TYPEW *)(vm + i - 8); \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - out |= nn OP mm; \ - } while (i & 7); \ - } while (i & 63); \ - pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ - out &= pg; \ - *(uint64_t *)(vd + (i >> 3)) = out; \ - flags = iter_predtest_bwd(out, pg, flags); \ - } while (i > 0); \ - return flags; \ -} - -#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ - DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) -#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ - DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) -#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ - DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) - -DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) -DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) -DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) - -DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) -DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) -DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) - -DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) -DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) -DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) - -DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) -DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) -DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) - -DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) -DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) -DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) - -DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) -DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) -DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) - -DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) -DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) -DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) - -DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) -DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) -DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) - -DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) -DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) -DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) - -DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) -DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) -DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) - -#undef DO_CMP_PPZW_B -#undef DO_CMP_PPZW_H -#undef DO_CMP_PPZW_S -#undef DO_CMP_PPZW - -/* Similar, but the second source is immediate. 
*/ -#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - uint32_t flags = PREDTEST_INIT; \ - TYPE mm = simd_data(desc); \ - intptr_t i = opr_sz; \ - do { \ - uint64_t out = 0, pg; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - out |= nn OP mm; \ - } while (i & 63); \ - pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ - out &= pg; \ - *(uint64_t *)(vd + (i >> 3)) = out; \ - flags = iter_predtest_bwd(out, pg, flags); \ - } while (i > 0); \ - return flags; \ -} - -#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) -#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) -#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) -#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) - -DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) -DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) -DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) -DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) - -DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) -DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) -DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) -DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) - -DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) -DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) -DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) -DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) - -DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) -DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) -DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) -DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) - -DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) -DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) -DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) -DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) - -DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) -DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) -DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) -DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) - -DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) -DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) -DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) -DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) - -DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) -DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) -DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) -DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) - -DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) -DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) -DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) -DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) - -DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) -DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) -DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) -DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) - -#undef DO_CMP_PPZI_B -#undef DO_CMP_PPZI_H -#undef DO_CMP_PPZI_S -#undef DO_CMP_PPZI_D -#undef DO_CMP_PPZI - -/* Similar to the ARM LastActive pseudocode function. */ -static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) -{ - intptr_t i; - - for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { - uint64_t pg = *(uint64_t *)(vg + i); - if (pg) { - return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; - } - } - return 0; -} - -/* Compute a mask into RETB that is true for all G, up to and including - * (if after) or excluding (if !after) the first G & N. - * Return true if BRK found. 
- */ -static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, - bool brk, bool after) -{ - uint64_t b; - - if (brk) { - b = 0; - } else if ((g & n) == 0) { - /* For all G, no N are set; break not found. */ - b = g; - } else { - /* Break somewhere in N. Locate it. */ - b = g & n; /* guard true, pred true */ - b = b & -b; /* first such */ - if (after) { - b = b | (b - 1); /* break after same */ - } else { - b = b - 1; /* break before same */ - } - brk = true; - } - - *retb = b; - return brk; -} - -/* Compute a zeroing BRK. */ -static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - bool brk = false; - intptr_t i; - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { - uint64_t this_b, this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = this_b & this_g; - } -} - -/* Likewise, but also compute flags. */ -static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - uint32_t flags = PREDTEST_INIT; - bool brk = false; - intptr_t i; - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { - uint64_t this_b, this_d, this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = this_d = this_b & this_g; - flags = iter_predtest_fwd(this_d, this_g, flags); - } - return flags; -} - -/* Compute a merging BRK. */ -static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - bool brk = false; - intptr_t i; - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { - uint64_t this_b, this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = (this_b & this_g) | (d[i] & ~this_g); - } -} - -/* Likewise, but also compute flags. */ -static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - uint32_t flags = PREDTEST_INIT; - bool brk = false; - intptr_t i; - - for (i = 0; i < oprsz / 8; ++i) { - uint64_t this_b, this_d = d[i], this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); - flags = iter_predtest_fwd(this_d, this_g, flags); - } - return flags; -} - -static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) -{ - /* It is quicker to zero the whole predicate than loop on OPRSZ. - * The compiler should turn this into 4 64-bit integer stores. 
- */ - memset(d, 0, sizeof(ARMPredicateReg)); - return PREDTEST_INIT; -} - -void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - compute_brk_z(vd, vm, vg, oprsz, true); - } else { - do_zero(vd, oprsz); - } -} - -uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - return compute_brks_z(vd, vm, vg, oprsz, true); - } else { - return do_zero(vd, oprsz); - } -} - -void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - compute_brk_z(vd, vm, vg, oprsz, false); - } else { - do_zero(vd, oprsz); - } -} - -uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - return compute_brks_z(vd, vm, vg, oprsz, false); - } else { - return do_zero(vd, oprsz); - } -} - -void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_z(vd, vn, vg, oprsz, true); -} - -uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_z(vd, vn, vg, oprsz, true); -} - -void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_z(vd, vn, vg, oprsz, false); -} - -uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_z(vd, vn, vg, oprsz, false); -} - -void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_m(vd, vn, vg, oprsz, true); -} - -uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_m(vd, vn, vg, oprsz, true); -} - -void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_m(vd, vn, vg, oprsz, false); -} - -uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_m(vd, vn, vg, oprsz, false); -} - -void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (!last_active_pred(vn, vg, oprsz)) { - do_zero(vd, oprsz); - } -} - -/* As if PredTest(Ones(PL), D, esz). 
*/ -static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, - uint64_t esz_mask) -{ - uint32_t flags = PREDTEST_INIT; - intptr_t i; - - for (i = 0; i < oprsz / 8; i++) { - flags = iter_predtest_fwd(d->p[i], esz_mask, flags); - } - if (oprsz & 7) { - uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); - flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); - } - return flags; -} - -uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - return predtest_ones(vd, oprsz, -1); - } else { - return do_zero(vd, oprsz); - } -} - -uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; - intptr_t i; - - for (i = 0; i < words; ++i) { - uint64_t t = n[i] & g[i] & mask; - sum += ctpop64(t); - } - return sum; -} - -uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t esz_mask = pred_esz_masks[esz]; - ARMPredicateReg *d = vd; - uint32_t flags; - intptr_t i; - - /* Begin with a zero predicate register. */ - flags = do_zero(d, oprsz); - if (count == 0) { - return flags; - } - - /* Set all of the requested bits. */ - for (i = 0; i < count / 64; ++i) { - d->p[i] = esz_mask; - } - if (count & 63) { - d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; - } - - return predtest_ones(d, oprsz, esz_mask); -} - -uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t esz_mask = pred_esz_masks[esz]; - ARMPredicateReg *d = vd; - intptr_t i, invcount, oprbits; - uint64_t bits; - - if (count == 0) { - return do_zero(d, oprsz); - } - - oprbits = oprsz * 8; - tcg_debug_assert(count <= oprbits); - - bits = esz_mask; - if (oprbits & 63) { - bits &= MAKE_64BIT_MASK(0, oprbits & 63); - } - - invcount = oprbits - count; - for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { - d->p[i] = bits; - bits = esz_mask; - } - - d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); - - while (--i >= 0) { - d->p[i] = 0; - } - - return predtest_ones(d, oprsz, esz_mask); -} - -/* Recursive reduction on a function; - * C.f. the ARM ARM function ReducePredicated. - * - * While it would be possible to write this without the DATA temporary, - * it is much simpler to process the predicate register this way. - * The recursion is bounded to depth 7 (128 fp16 elements), so there's - * little to gain with a more complex non-recursive form. 
- */ -#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ -static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ -{ \ - if (n == 1) { \ - return *data; \ - } else { \ - uintptr_t half = n / 2; \ - TYPE lo = NAME##_reduce(data, status, half); \ - TYPE hi = NAME##_reduce(data + half, status, half); \ - return TYPE##_##FUNC(lo, hi, status); \ - } \ -} \ -uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ -{ \ - uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ - TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ - for (i = 0; i < oprsz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ - for (; i < maxsz; i += sizeof(TYPE)) { \ - *(TYPE *)((void *)data + i) = IDENT; \ - } \ - return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ -} - -DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) -DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) -DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) - -/* Identity is floatN_default_nan, without the function call. */ -DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) -DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) -DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) - -DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) -DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) -DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) - -DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) -DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) -DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) - -DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) -DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) -DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) - -#undef DO_REDUCE - -uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, - void *status, uint32_t desc) -{ - intptr_t i = 0, opr_sz = simd_oprsz(desc); - float16 result = nn; - - do { - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); - do { - if (pg & 1) { - float16 mm = *(float16 *)(vm + H1_2(i)); - result = float16_add(result, mm, status); - } - i += sizeof(float16), pg >>= sizeof(float16); - } while (i & 15); - } while (i < opr_sz); - - return result; -} - -uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, - void *status, uint32_t desc) -{ - intptr_t i = 0, opr_sz = simd_oprsz(desc); - float32 result = nn; - - do { - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); - do { - if (pg & 1) { - float32 mm = *(float32 *)(vm + H1_2(i)); - result = float32_add(result, mm, status); - } - i += sizeof(float32), pg >>= sizeof(float32); - } while (i & 15); - } while (i < opr_sz); - - return result; -} - -uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, - void *status, uint32_t desc) -{ - intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; - uint64_t *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i++) { - if (pg[H1(i)] & 1) { - nn = float64_add(nn, m[i], status); - } - } - - return nn; -} - -/* Fully general three-operand expander, controlled by a predicate, - * With the extra float_status parameter. 
- */ -#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) -DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) -DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) - -DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) -DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) -DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) - -DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) -DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) -DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) - -DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) -DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) -DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) - -DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) -DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) -DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) - -DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) -DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) -DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) - -DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) -DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) -DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) - -DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) -DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) -DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) - -static inline float16 abd_h(float16 a, float16 b, float_status *s) -{ - return float16_abs(float16_sub(a, b, s)); -} - -static inline float32 abd_s(float32 a, float32 b, float_status *s) -{ - return float32_abs(float32_sub(a, b, s)); -} - -static inline float64 abd_d(float64 a, float64 b, float_status *s) -{ - return float64_abs(float64_sub(a, b, s)); -} - -DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) -DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) -DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) - -static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) -{ - int b_int = MIN(MAX(b, INT_MIN), INT_MAX); - return float64_scalbn(a, b_int, s); -} - -DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) -DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) -DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) - -DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) -DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) -DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) - -#undef DO_ZPZZ_FP - -/* Three-operand expander, with one scalar operand, controlled by - * a predicate, with the extra float_status parameter. 
- */ -#define DO_ZPZS_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - TYPE mm = scalar; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) -DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) -DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) - -DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) -DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) -DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) - -DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) -DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) -DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) - -static inline float16 subr_h(float16 a, float16 b, float_status *s) -{ - return float16_sub(b, a, s); -} - -static inline float32 subr_s(float32 a, float32 b, float_status *s) -{ - return float32_sub(b, a, s); -} - -static inline float64 subr_d(float64 a, float64 b, float_status *s) -{ - return float64_sub(b, a, s); -} - -DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) -DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) -DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) - -DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) -DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) -DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) - -DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) -DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) -DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) - -DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) -DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) -DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) - -DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) -DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) -DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) - -/* Fully general two-operand expander, controlled by a predicate, - * With the extra float_status parameter. - */ -#define DO_ZPZ_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore - * FZ16. When converting from fp16, this affects flushing input denormals; - * when converting to fp16, this affects flushing output denormals. 
- */ -static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) -{ - bool save = get_flush_inputs_to_zero(fpst); - float32 ret; - - set_flush_inputs_to_zero(false, fpst); - ret = float16_to_float32(f, true, fpst); - set_flush_inputs_to_zero(save, fpst); - return ret; -} - -static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) -{ - bool save = get_flush_inputs_to_zero(fpst); - float64 ret; - - set_flush_inputs_to_zero(false, fpst); - ret = float16_to_float64(f, true, fpst); - set_flush_inputs_to_zero(save, fpst); - return ret; -} - -static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) -{ - bool save = get_flush_to_zero(fpst); - float16 ret; - - set_flush_to_zero(false, fpst); - ret = float32_to_float16(f, true, fpst); - set_flush_to_zero(save, fpst); - return ret; -} - -static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) -{ - bool save = get_flush_to_zero(fpst); - float16 ret; - - set_flush_to_zero(false, fpst); - ret = float64_to_float16(f, true, fpst); - set_flush_to_zero(save, fpst); - return ret; -} - -static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_int16_round_to_zero(f, s); -} - -static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_int64_round_to_zero(f, s); -} - -static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) -{ - if (float32_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float32_to_int64_round_to_zero(f, s); -} - -static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) -{ - if (float64_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float64_to_int64_round_to_zero(f, s); -} - -static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_uint16_round_to_zero(f, s); -} - -static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_uint64_round_to_zero(f, s); -} - -static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) -{ - if (float32_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float32_to_uint64_round_to_zero(f, s); -} - -static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) -{ - if (float64_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float64_to_uint64_round_to_zero(f, s); -} - -DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) -DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) -DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) -DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) -DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) -DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) -DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) - -DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) -DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) -DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) -DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) -DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 
-DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) -DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) - -DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) -DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) -DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) -DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) -DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) -DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) -DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) - -DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) -DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) -DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) - -DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) -DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) -DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) - -DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) -DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) -DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) - -DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) -DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) -DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) - -DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) -DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) -DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) -DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) -DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) -DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) -DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) - -DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) -DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) -DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) -DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) -DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) -DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) -DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) - -static int16_t do_float16_logb_as_int(float16 a, float_status *s) -{ - /* Extract frac to the top of the uint32_t. */ - uint32_t frac = (uint32_t)a << (16 + 6); - int16_t exp = extract32(a, 10, 5); - - if (unlikely(exp == 0)) { - if (frac != 0) { - if (!get_flush_inputs_to_zero(s)) { - /* denormal: bias - fractional_zeros */ - return -15 - clz32(frac); - } - /* flush to zero */ - float_raise(float_flag_input_denormal, s); - } - } else if (unlikely(exp == 0x1f)) { - if (frac == 0) { - return INT16_MAX; /* infinity */ - } - } else { - /* normal: exp - bias */ - return exp - 15; - } - /* nan or zero */ - float_raise(float_flag_invalid, s); - return INT16_MIN; -} - -static int32_t do_float32_logb_as_int(float32 a, float_status *s) -{ - /* Extract frac to the top of the uint32_t. 
*/ - uint32_t frac = a << 9; - int32_t exp = extract32(a, 23, 8); - - if (unlikely(exp == 0)) { - if (frac != 0) { - if (!get_flush_inputs_to_zero(s)) { - /* denormal: bias - fractional_zeros */ - return -127 - clz32(frac); - } - /* flush to zero */ - float_raise(float_flag_input_denormal, s); - } - } else if (unlikely(exp == 0xff)) { - if (frac == 0) { - return INT32_MAX; /* infinity */ - } - } else { - /* normal: exp - bias */ - return exp - 127; - } - /* nan or zero */ - float_raise(float_flag_invalid, s); - return INT32_MIN; -} - -static int64_t do_float64_logb_as_int(float64 a, float_status *s) -{ - /* Extract frac to the top of the uint64_t. */ - uint64_t frac = a << 12; - int64_t exp = extract64(a, 52, 11); - - if (unlikely(exp == 0)) { - if (frac != 0) { - if (!get_flush_inputs_to_zero(s)) { - /* denormal: bias - fractional_zeros */ - return -1023 - clz64(frac); - } - /* flush to zero */ - float_raise(float_flag_input_denormal, s); - } - } else if (unlikely(exp == 0x7ff)) { - if (frac == 0) { - return INT64_MAX; /* infinity */ - } - } else { - /* normal: exp - bias */ - return exp - 1023; - } - /* nan or zero */ - float_raise(float_flag_invalid, s); - return INT64_MIN; -} - -DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) -DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) -DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) - -#undef DO_ZPZ_FP - -static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, - float_status *status, uint32_t desc, - uint16_t neg1, uint16_t neg3) -{ - intptr_t i = simd_oprsz(desc); - uint64_t *g = vg; - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - i -= 2; - if (likely((pg >> (i & 63)) & 1)) { - float16 e1, e2, e3, r; - - e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; - e2 = *(uint16_t *)(vm + H1_2(i)); - e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; - r = float16_muladd(e1, e2, e3, 0, status); - *(uint16_t *)(vd + H1_2(i)) = r; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); -} - -void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); -} - -void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); -} - -void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); -} - -static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, - float_status *status, uint32_t desc, - uint32_t neg1, uint32_t neg3) -{ - intptr_t i = simd_oprsz(desc); - uint64_t *g = vg; - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - i -= 4; - if (likely((pg >> (i & 63)) & 1)) { - float32 e1, e2, e3, r; - - e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; - e2 = *(uint32_t *)(vm + H1_4(i)); - e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; - r = float32_muladd(e1, e2, e3, 0, status); - *(uint32_t *)(vd + H1_4(i)) = r; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); -} - -void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - 
void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); -} - -void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); -} - -void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); -} - -static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, - float_status *status, uint32_t desc, - uint64_t neg1, uint64_t neg3) -{ - intptr_t i = simd_oprsz(desc); - uint64_t *g = vg; - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - i -= 8; - if (likely((pg >> (i & 63)) & 1)) { - float64 e1, e2, e3, r; - - e1 = *(uint64_t *)(vn + i) ^ neg1; - e2 = *(uint64_t *)(vm + i); - e3 = *(uint64_t *)(va + i) ^ neg3; - r = float64_muladd(e1, e2, e3, 0, status); - *(uint64_t *)(vd + i) = r; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); -} - -void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); -} - -void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); -} - -void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); -} - -/* Two operand floating-point comparison controlled by a predicate. - * Unlike the integer version, we are not allowed to optimistically - * compare operands, since the comparison may have side effects wrt - * the FPSR. 
- */ -#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ - uint64_t *d = vd, *g = vg; \ - do { \ - uint64_t out = 0, pg = g[j]; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - out |= OP(TYPE, nn, mm, status); \ - } \ - } while (i & 63); \ - d[j--] = out; \ - } while (i > 0); \ -} - -#define DO_FPCMP_PPZZ_H(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) -#define DO_FPCMP_PPZZ_S(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) -#define DO_FPCMP_PPZZ_D(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) - -#define DO_FPCMP_PPZZ_ALL(NAME, OP) \ - DO_FPCMP_PPZZ_H(NAME, OP) \ - DO_FPCMP_PPZZ_S(NAME, OP) \ - DO_FPCMP_PPZZ_D(NAME, OP) - -#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 -#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 -#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 -#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 -#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 -#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 -#define DO_FCMUO(TYPE, X, Y, ST) \ - TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered -#define DO_FACGE(TYPE, X, Y, ST) \ - TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 -#define DO_FACGT(TYPE, X, Y, ST) \ - TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 - -DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) -DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) -DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) -DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) -DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) -DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) -DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) - -#undef DO_FPCMP_PPZZ_ALL -#undef DO_FPCMP_PPZZ_D -#undef DO_FPCMP_PPZZ_S -#undef DO_FPCMP_PPZZ_H -#undef DO_FPCMP_PPZZ - -/* One operand floating-point comparison against zero, controlled - * by a predicate. - */ -#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ - uint64_t *d = vd, *g = vg; \ - do { \ - uint64_t out = 0, pg = g[j]; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - if ((pg >> (i & 63)) & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - out |= OP(TYPE, nn, 0, status); \ - } \ - } while (i & 63); \ - d[j--] = out; \ - } while (i > 0); \ -} - -#define DO_FPCMP_PPZ0_H(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) -#define DO_FPCMP_PPZ0_S(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) -#define DO_FPCMP_PPZ0_D(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) - -#define DO_FPCMP_PPZ0_ALL(NAME, OP) \ - DO_FPCMP_PPZ0_H(NAME, OP) \ - DO_FPCMP_PPZ0_S(NAME, OP) \ - DO_FPCMP_PPZ0_D(NAME, OP) - -DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) -DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) -DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) -DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) -DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) -DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) - -/* FP Trig Multiply-Add. 
*/ - -void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) -{ - static const float16 coeff[16] = { - 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); - intptr_t x = simd_data(desc); - float16 *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i++) { - float16 mm = m[i]; - intptr_t xx = x; - if (float16_is_neg(mm)) { - mm = float16_abs(mm); - xx += 8; - } - d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); - } -} - -void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) -{ - static const float32 coeff[16] = { - 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, - 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, - 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, - 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); - intptr_t x = simd_data(desc); - float32 *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i++) { - float32 mm = m[i]; - intptr_t xx = x; - if (float32_is_neg(mm)) { - mm = float32_abs(mm); - xx += 8; - } - d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); - } -} - -void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) -{ - static const float64 coeff[16] = { - 0x3ff0000000000000ull, 0xbfc5555555555543ull, - 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, - 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, - 0x3de5d8408868552full, 0x0000000000000000ull, - 0x3ff0000000000000ull, 0xbfe0000000000000ull, - 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, - 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, - 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); - intptr_t x = simd_data(desc); - float64 *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i++) { - float64 mm = m[i]; - intptr_t xx = x; - if (float64_is_neg(mm)) { - mm = float64_abs(mm); - xx += 8; - } - d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs); - } -} - -/* - * FP Complex Add - */ - -void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, - void *vs, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - uint64_t *g = vg; - float16 neg_imag = float16_set_sign(0, simd_data(desc)); - float16 neg_real = float16_chs(neg_imag); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float16 e0, e1, e2, e3; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float16); - i -= 2 * sizeof(float16); - - e0 = *(float16 *)(vn + H1_2(i)); - e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; - e2 = *(float16 *)(vn + H1_2(j)); - e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); - } - if (likely((pg >> (j & 63)) & 1)) { - *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, - void *vs, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - uint64_t *g = vg; - float32 neg_imag = float32_set_sign(0, simd_data(desc)); - float32 neg_real = float32_chs(neg_imag); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float32 e0, e1, e2, e3; - - /* I holds the real index; J holds the imag index. 
*/ - j = i - sizeof(float32); - i -= 2 * sizeof(float32); - - e0 = *(float32 *)(vn + H1_2(i)); - e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; - e2 = *(float32 *)(vn + H1_2(j)); - e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); - } - if (likely((pg >> (j & 63)) & 1)) { - *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, - void *vs, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - uint64_t *g = vg; - float64 neg_imag = float64_set_sign(0, simd_data(desc)); - float64 neg_real = float64_chs(neg_imag); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float64 e0, e1, e2, e3; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float64); - i -= 2 * sizeof(float64); - - e0 = *(float64 *)(vn + H1_2(i)); - e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; - e2 = *(float64 *)(vn + H1_2(j)); - e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); - } - if (likely((pg >> (j & 63)) & 1)) { - *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); - } - } while (i & 63); - } while (i != 0); -} - -/* - * FP Complex Multiply - */ - -void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float16 neg_imag, neg_real; - uint64_t *g = vg; - - neg_imag = float16_set_sign(0, (rot & 2) != 0); - neg_real = float16_set_sign(0, rot == 1 || rot == 2); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float16 e1, e2, e3, e4, nr, ni, mr, mi, d; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float16); - i -= 2 * sizeof(float16); - - nr = *(float16 *)(vn + H1_2(i)); - ni = *(float16 *)(vn + H1_2(j)); - mr = *(float16 *)(vm + H1_2(i)); - mi = *(float16 *)(vm + H1_2(j)); - - e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; - e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - d = *(float16 *)(va + H1_2(i)); - d = float16_muladd(e2, e1, d, 0, status); - *(float16 *)(vd + H1_2(i)) = d; - } - if (likely((pg >> (j & 63)) & 1)) { - d = *(float16 *)(va + H1_2(j)); - d = float16_muladd(e4, e3, d, 0, status); - *(float16 *)(vd + H1_2(j)) = d; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float32 neg_imag, neg_real; - uint64_t *g = vg; - - neg_imag = float32_set_sign(0, (rot & 2) != 0); - neg_real = float32_set_sign(0, rot == 1 || rot == 2); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float32 e1, e2, e3, e4, nr, ni, mr, mi, d; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float32); - i -= 2 * sizeof(float32); - - nr = *(float32 *)(vn + H1_2(i)); - ni = *(float32 *)(vn + H1_2(j)); - mr = *(float32 *)(vm + H1_2(i)); - mi = *(float32 *)(vm + H1_2(j)); - - e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; - e4 = e2; - e3 = (flip ? 
mr : mi) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - d = *(float32 *)(va + H1_2(i)); - d = float32_muladd(e2, e1, d, 0, status); - *(float32 *)(vd + H1_2(i)) = d; - } - if (likely((pg >> (j & 63)) & 1)) { - d = *(float32 *)(va + H1_2(j)); - d = float32_muladd(e4, e3, d, 0, status); - *(float32 *)(vd + H1_2(j)) = d; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float64 neg_imag, neg_real; - uint64_t *g = vg; - - neg_imag = float64_set_sign(0, (rot & 2) != 0); - neg_real = float64_set_sign(0, rot == 1 || rot == 2); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float64 e1, e2, e3, e4, nr, ni, mr, mi, d; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float64); - i -= 2 * sizeof(float64); - - nr = *(float64 *)(vn + H1_2(i)); - ni = *(float64 *)(vn + H1_2(j)); - mr = *(float64 *)(vm + H1_2(i)); - mi = *(float64 *)(vm + H1_2(j)); - - e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; - e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - d = *(float64 *)(va + H1_2(i)); - d = float64_muladd(e2, e1, d, 0, status); - *(float64 *)(vd + H1_2(i)) = d; - } - if (likely((pg >> (j & 63)) & 1)) { - d = *(float64 *)(va + H1_2(j)); - d = float64_muladd(e4, e3, d, 0, status); - *(float64 *)(vd + H1_2(j)) = d; - } - } while (i & 63); - } while (i != 0); -} - -/* - * Load contiguous data, protected by a governing predicate. - */ - -/* - * Skip through a sequence of inactive elements in the guarding predicate @vg, - * beginning at @reg_off bounded by @reg_max. Return the offset of the active - * element >= @reg_off, or @reg_max if there were no active elements at all. - */ -static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, - intptr_t reg_max, int esz) -{ - uint64_t pg_mask = pred_esz_masks[esz]; - uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); - - /* In normal usage, the first element is active. */ - if (likely(pg & 1)) { - return reg_off; - } - - if (pg == 0) { - reg_off &= -64; - do { - reg_off += 64; - if (unlikely(reg_off >= reg_max)) { - /* The entire predicate was false. */ - return reg_max; - } - pg = vg[reg_off >> 6] & pg_mask; - } while (pg == 0); - } - reg_off += ctz64(pg); - - /* We should never see an out of range predicate bit set. */ - tcg_debug_assert(reg_off < reg_max); - return reg_off; -} - -/* - * Resolve the guest virtual address to info->host and info->flags. - * If @nofault, return false if the page is invalid, otherwise - * exit via page fault exception. - */ - -bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, - target_ulong addr, int mem_off, MMUAccessType access_type, - int mmu_idx, uintptr_t retaddr) -{ - int flags; - - addr += mem_off; - - /* - * User-only currently always issues with TBI. See the comment - * above useronly_clean_ptr. Usually we clean this top byte away - * during translation, but we can't do that for e.g. vector + imm - * addressing modes. - * - * We currently always enable TBI for user-only, and do not provide - * a way to turn it off. So clean the pointer unconditionally here, - * rather than look it up here, or pass it down from above. 
- */ - addr = useronly_clean_ptr(addr); - -#ifdef CONFIG_USER_ONLY - flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, - &info->host, retaddr); -#else - CPUTLBEntryFull *full; - flags = probe_access_full(env, addr, access_type, mmu_idx, nofault, - &info->host, &full, retaddr); -#endif - info->flags = flags; - - if (flags & TLB_INVALID_MASK) { - g_assert(nofault); - return false; - } - -#ifdef CONFIG_USER_ONLY - memset(&info->attrs, 0, sizeof(info->attrs)); - /* Require both ANON and MTE; see allocation_tag_mem(). */ - info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); -#else - info->attrs = full->attrs; - info->tagged = full->pte_attrs == 0xf0; -#endif - - /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ - info->host -= mem_off; - return true; -} - -/* - * Find first active element on each page, and a loose bound for the - * final element on each page. Identify any single element that spans - * the page boundary. Return true if there are any active elements. - */ -bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, - intptr_t reg_max, int esz, int msize) -{ - const int esize = 1 << esz; - const uint64_t pg_mask = pred_esz_masks[esz]; - intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; - intptr_t mem_off_last, mem_off_split; - intptr_t page_split, elt_split; - intptr_t i; - - /* Set all of the element indices to -1, and the TLB data to 0. */ - memset(info, -1, offsetof(SVEContLdSt, page)); - memset(info->page, 0, sizeof(info->page)); - - /* Gross scan over the entire predicate to find bounds. */ - i = 0; - do { - uint64_t pg = vg[i] & pg_mask; - if (pg) { - reg_off_last = i * 64 + 63 - clz64(pg); - if (reg_off_first < 0) { - reg_off_first = i * 64 + ctz64(pg); - } - } - } while (++i * 64 < reg_max); - - if (unlikely(reg_off_first < 0)) { - /* No active elements, no pages touched. */ - return false; - } - tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); - - info->reg_off_first[0] = reg_off_first; - info->mem_off_first[0] = (reg_off_first >> esz) * msize; - mem_off_last = (reg_off_last >> esz) * msize; - - page_split = -(addr | TARGET_PAGE_MASK); - if (likely(mem_off_last + msize <= page_split)) { - /* The entire operation fits within a single page. */ - info->reg_off_last[0] = reg_off_last; - return true; - } - - info->page_split = page_split; - elt_split = page_split / msize; - reg_off_split = elt_split << esz; - mem_off_split = elt_split * msize; - - /* - * This is the last full element on the first page, but it is not - * necessarily active. If there is no full element, i.e. the first - * active element is the one that's split, this value remains -1. - * It is useful as iteration bounds. - */ - if (elt_split != 0) { - info->reg_off_last[0] = reg_off_split - esize; - } - - /* Determine if an unaligned element spans the pages. */ - if (page_split % msize != 0) { - /* It is helpful to know if the split element is active. */ - if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { - info->reg_off_split = reg_off_split; - info->mem_off_split = mem_off_split; - - if (reg_off_split == reg_off_last) { - /* The page crossing element is last. */ - return true; - } - } - reg_off_split += esize; - mem_off_split += msize; - } - - /* - * We do want the first active element on the second page, because - * this may affect the address reported in an exception. 
- */ - reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); - tcg_debug_assert(reg_off_split <= reg_off_last); - info->reg_off_first[1] = reg_off_split; - info->mem_off_first[1] = (reg_off_split >> esz) * msize; - info->reg_off_last[1] = reg_off_last; - return true; -} - -/* - * Resolve the guest virtual addresses to info->page[]. - * Control the generation of page faults with @fault. Return false if - * there is no work to do, which can only happen with @fault == FAULT_NO. - */ -bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, - CPUARMState *env, target_ulong addr, - MMUAccessType access_type, uintptr_t retaddr) -{ - int mmu_idx = cpu_mmu_index(env, false); - int mem_off = info->mem_off_first[0]; - bool nofault = fault == FAULT_NO; - bool have_work = true; - - if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, - access_type, mmu_idx, retaddr)) { - /* No work to be done. */ - return false; - } - - if (likely(info->page_split < 0)) { - /* The entire operation was on the one page. */ - return true; - } - - /* - * If the second page is invalid, then we want the fault address to be - * the first byte on that page which is accessed. - */ - if (info->mem_off_split >= 0) { - /* - * There is an element split across the pages. The fault address - * should be the first byte of the second page. - */ - mem_off = info->page_split; - /* - * If the split element is also the first active element - * of the vector, then: For first-fault we should continue - * to generate faults for the second page. For no-fault, - * we have work only if the second page is valid. - */ - if (info->mem_off_first[0] < info->mem_off_split) { - nofault = FAULT_FIRST; - have_work = false; - } - } else { - /* - * There is no element split across the pages. The fault address - * should be the first active element on the second page. - */ - mem_off = info->mem_off_first[1]; - /* - * There must have been one active element on the first page, - * so we're out of first-fault territory. - */ - nofault = fault != FAULT_ALL; - } - - have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, - access_type, mmu_idx, retaddr); - return have_work; -} - -#ifndef CONFIG_USER_ONLY -void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, - uint64_t *vg, target_ulong addr, - int esize, int msize, int wp_access, - uintptr_t retaddr) -{ - intptr_t mem_off, reg_off, reg_last; - int flags0 = info->page[0].flags; - int flags1 = info->page[1].flags; - - if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { - return; - } - - /* Indicate that watchpoints are handled. 
*/ - info->page[0].flags = flags0 & ~TLB_WATCHPOINT; - info->page[1].flags = flags1 & ~TLB_WATCHPOINT; - - if (flags0 & TLB_WATCHPOINT) { - mem_off = info->mem_off_first[0]; - reg_off = info->reg_off_first[0]; - reg_last = info->reg_off_last[0]; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - cpu_check_watchpoint(env_cpu(env), addr + mem_off, - msize, info->page[0].attrs, - wp_access, retaddr); - } - reg_off += esize; - mem_off += msize; - } while (reg_off <= reg_last && (reg_off & 63)); - } - } - - mem_off = info->mem_off_split; - if (mem_off >= 0) { - cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, - info->page[0].attrs, wp_access, retaddr); - } - - mem_off = info->mem_off_first[1]; - if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { - reg_off = info->reg_off_first[1]; - reg_last = info->reg_off_last[1]; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - cpu_check_watchpoint(env_cpu(env), addr + mem_off, - msize, info->page[1].attrs, - wp_access, retaddr); - } - reg_off += esize; - mem_off += msize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} -#endif - -void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, - uint64_t *vg, target_ulong addr, int esize, - int msize, uint32_t mtedesc, uintptr_t ra) -{ - intptr_t mem_off, reg_off, reg_last; - - /* Process the page only if MemAttr == Tagged. */ - if (info->page[0].tagged) { - mem_off = info->mem_off_first[0]; - reg_off = info->reg_off_first[0]; - reg_last = info->reg_off_split; - if (reg_last < 0) { - reg_last = info->reg_off_last[0]; - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - mte_check(env, mtedesc, addr, ra); - } - reg_off += esize; - mem_off += msize; - } while (reg_off <= reg_last && (reg_off & 63)); - } while (reg_off <= reg_last); - } - - mem_off = info->mem_off_first[1]; - if (mem_off >= 0 && info->page[1].tagged) { - reg_off = info->reg_off_first[1]; - reg_last = info->reg_off_last[1]; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - mte_check(env, mtedesc, addr, ra); - } - reg_off += esize; - mem_off += msize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -/* - * Common helper for all contiguous 1,2,3,4-register predicated stores. - */ -static inline QEMU_ALWAYS_INLINE -void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, - uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const int N, uint32_t mtedesc, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const unsigned rd = simd_data(desc); - const intptr_t reg_max = simd_oprsz(desc); - intptr_t reg_off, reg_last, mem_off; - SVEContLdSt info; - void *host; - int flags, i; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { - /* The entire predicate was false; no load occurs. */ - for (i = 0; i < N; ++i) { - memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); - } - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, - BP_MEM_READ, retaddr); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. 
- */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, - mtedesc, retaddr); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. Perform the load - * into scratch memory to preserve register state until the end. - */ - ARMVectorReg scratch[4] = { }; - - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - tlb_fn(env, &scratch[i], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - - for (i = 0; i < N; ++i) { - memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); - } - return; -#endif - } - - /* The entire operation is in RAM, on valid pages. */ - - for (i = 0; i < N; ++i) { - memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); - } - - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. - * But we know this is RAM and cannot trap. - */ - mem_off = info.mem_off_split; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_split; - for (i = 0; i < N; ++i) { - tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - - mem_off = info.mem_off_first[1]; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_first[1]; - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t ra, - const int esz, const int msz, const int N, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. 
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); -} - -#define DO_LD1_1(NAME, ESZ) \ -void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ - sve_##NAME##_host, sve_##NAME##_tlb); \ -} \ -void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ - sve_##NAME##_host, sve_##NAME##_tlb); \ -} - -#define DO_LD1_2(NAME, ESZ, MSZ) \ -void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ - sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ -} \ -void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ - sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ -} \ -void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ - sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ -} \ -void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ - sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ -} - -DO_LD1_1(ld1bb, MO_8) -DO_LD1_1(ld1bhu, MO_16) -DO_LD1_1(ld1bhs, MO_16) -DO_LD1_1(ld1bsu, MO_32) -DO_LD1_1(ld1bss, MO_32) -DO_LD1_1(ld1bdu, MO_64) -DO_LD1_1(ld1bds, MO_64) - -DO_LD1_2(ld1hh, MO_16, MO_16) -DO_LD1_2(ld1hsu, MO_32, MO_16) -DO_LD1_2(ld1hss, MO_32, MO_16) -DO_LD1_2(ld1hdu, MO_64, MO_16) -DO_LD1_2(ld1hds, MO_64, MO_16) - -DO_LD1_2(ld1ss, MO_32, MO_32) -DO_LD1_2(ld1sdu, MO_64, MO_32) -DO_LD1_2(ld1sds, MO_64, MO_32) - -DO_LD1_2(ld1dd, MO_64, MO_64) - -#undef DO_LD1_1 -#undef DO_LD1_2 - -#define DO_LDN_1(N) \ -void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ - sve_ld1bb_host, sve_ld1bb_tlb); \ -} \ -void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ - sve_ld1bb_host, sve_ld1bb_tlb); \ -} - -#define DO_LDN_2(N, SUFF, ESZ) \ -void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ - sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ - sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ - sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ - sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ -} - -DO_LDN_1(2) -DO_LDN_1(3) -DO_LDN_1(4) - -DO_LDN_2(2, hh, MO_16) -DO_LDN_2(3, 
hh, MO_16) -DO_LDN_2(4, hh, MO_16) - -DO_LDN_2(2, ss, MO_32) -DO_LDN_2(3, ss, MO_32) -DO_LDN_2(4, ss, MO_32) - -DO_LDN_2(2, dd, MO_64) -DO_LDN_2(3, dd, MO_64) -DO_LDN_2(4, dd, MO_64) - -#undef DO_LDN_1 -#undef DO_LDN_2 - -/* - * Load contiguous data, first-fault and no-fault. - * - * For user-only, one could argue that we should hold the mmap_lock during - * the operation so that there is no race between page_check_range and the - * load operation. However, unmapping pages out from under a running thread - * is extraordinarily unlikely. This theoretical race condition also affects - * linux-user/ in its get_user/put_user macros. - * - * TODO: Construct some helpers, written in assembly, that interact with - * host_signal_handler to produce memory ops which can properly report errors - * without racing. - */ - -/* Fault on byte I. All bits in FFR from I are cleared. The vector - * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE - * option, which leaves subsequent data unchanged. - */ -static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) -{ - uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; - - if (i & 63) { - ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); - i = ROUND_UP(i, 64); - } - for (; i < oprsz; i += 64) { - ffr[i / 64] = 0; - } -} - -/* - * Common helper for all contiguous no-fault and first-fault loads. - */ -static inline QEMU_ALWAYS_INLINE -void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, - uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, - const int esz, const int msz, const SVEContFault fault, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const unsigned rd = simd_data(desc); - void *vd = &env->vfp.zregs[rd]; - const intptr_t reg_max = simd_oprsz(desc); - intptr_t reg_off, mem_off, reg_last; - SVEContLdSt info; - int flags; - void *host; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { - /* The entire predicate was false; no load occurs. */ - memset(vd, 0, reg_max); - return; - } - reg_off = info.reg_off_first[0]; - - /* Probe the page(s). */ - if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { - /* Fault on first element. */ - tcg_debug_assert(fault == FAULT_NO); - memset(vd, 0, reg_max); - goto do_fault; - } - - mem_off = info.mem_off_first[0]; - flags = info.page[0].flags; - - /* - * Disable MTE checking if the Tagged bit is not set. Since TBI must - * be set within MTEDESC for MTE, !mtedesc => !mte_active. - */ - if (!info.page[0].tagged) { - mtedesc = 0; - } - - if (fault == FAULT_FIRST) { - /* Trapping mte check for the first-fault element. */ - if (mtedesc) { - mte_check(env, mtedesc, addr + mem_off, retaddr); - } - - /* - * Special handling of the first active element, - * if it crosses a page boundary or is MMIO. - */ - bool is_split = mem_off == info.mem_off_split; - if (unlikely(flags != 0) || unlikely(is_split)) { - /* - * Use the slow path for cross-page handling. - * Might trap for MMIO or watchpoints. - */ - tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); - - /* After any fault, zero the other elements. */ - swap_memzero(vd, reg_off); - reg_off += 1 << esz; - mem_off += 1 << msz; - swap_memzero(vd + reg_off, reg_max - reg_off); - - if (is_split) { - goto second_page; - } - } else { - memset(vd, 0, reg_max); - } - } else { - memset(vd, 0, reg_max); - if (unlikely(mem_off == info.mem_off_split)) { - /* The first active element crosses a page boundary. 
*/ - flags |= info.page[1].flags; - if (unlikely(flags & TLB_MMIO)) { - /* Some page is MMIO, see below. */ - goto do_fault; - } - if (unlikely(flags & TLB_WATCHPOINT) && - (cpu_watchpoint_address_matches - (env_cpu(env), addr + mem_off, 1 << msz) - & BP_MEM_READ)) { - /* Watchpoint hit, see below. */ - goto do_fault; - } - if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { - goto do_fault; - } - /* - * Use the slow path for cross-page handling. - * This is RAM, without a watchpoint, and will not trap. - */ - tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); - goto second_page; - } - } - - /* - * From this point on, all memory operations are MemSingleNF. - * - * Per the MemSingleNF pseudocode, a no-fault load from Device memory - * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. - * - * Unfortuately we do not have access to the memory attributes from the - * PTE to tell Device memory from Normal memory. So we make a mostly - * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. - * This gives the right answer for the common cases of "Normal memory, - * backed by host RAM" and "Device memory, backed by MMIO". - * The architecture allows us to suppress an NF load and return - * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner - * case of "Normal memory, backed by MMIO" is permitted. The case we - * get wrong is "Device memory, backed by host RAM", for which we - * should return (UNKNOWN, FAULT) for but do not. - * - * Similarly, CPU_BP breakpoints would raise exceptions, and so - * return (UNKNOWN, FAULT). For simplicity, we consider gdb and - * architectural breakpoints the same. - */ - if (unlikely(flags & TLB_MMIO)) { - goto do_fault; - } - - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - do { - uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); - do { - if ((pg >> (reg_off & 63)) & 1) { - if (unlikely(flags & TLB_WATCHPOINT) && - (cpu_watchpoint_address_matches - (env_cpu(env), addr + mem_off, 1 << msz) - & BP_MEM_READ)) { - goto do_fault; - } - if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { - goto do_fault; - } - host_fn(vd, reg_off, host + mem_off); - } - reg_off += 1 << esz; - mem_off += 1 << msz; - } while (reg_off <= reg_last && (reg_off & 63)); - } while (reg_off <= reg_last); - - /* - * MemSingleNF is allowed to fail for any reason. We have special - * code above to handle the first element crossing a page boundary. - * As an implementation choice, decline to handle a cross-page element - * in any other position. - */ - reg_off = info.reg_off_split; - if (reg_off >= 0) { - goto do_fault; - } - - second_page: - reg_off = info.reg_off_first[1]; - if (likely(reg_off < 0)) { - /* No active elements on the second page. All done. */ - return; - } - - /* - * MemSingleNF is allowed to fail for any reason. As an implementation - * choice, decline to handle elements on the second page. This should - * be low frequency as the guest walks through memory -- the next - * iteration of the guest's loop should be aligned on the page boundary, - * and then all following iterations will stay aligned. 
- */ - - do_fault: - record_fault(env, reg_off, reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, - uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const SVEContFault fault, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, - esz, msz, fault, host_fn, tlb_fn); -} - -#define DO_LDFF1_LDNF1_1(PART, ESZ) \ -void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} - -#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ -void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} \ 
-void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} - -DO_LDFF1_LDNF1_1(bb, MO_8) -DO_LDFF1_LDNF1_1(bhu, MO_16) -DO_LDFF1_LDNF1_1(bhs, MO_16) -DO_LDFF1_LDNF1_1(bsu, MO_32) -DO_LDFF1_LDNF1_1(bss, MO_32) -DO_LDFF1_LDNF1_1(bdu, MO_64) -DO_LDFF1_LDNF1_1(bds, MO_64) - -DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) -DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) -DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) -DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) -DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) - -DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) -DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) -DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) - -DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) - -#undef DO_LDFF1_LDNF1_1 -#undef DO_LDFF1_LDNF1_2 - -/* - * Common helper for all contiguous 1,2,3,4-register predicated stores. - */ - -static inline QEMU_ALWAYS_INLINE -void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const int N, uint32_t mtedesc, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const unsigned rd = simd_data(desc); - const intptr_t reg_max = simd_oprsz(desc); - intptr_t reg_off, reg_last, mem_off; - SVEContLdSt info; - void *host; - int i, flags; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { - /* The entire predicate was false; no store occurs. */ - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, - BP_MEM_WRITE, retaddr); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. - */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, - mtedesc, retaddr); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. We cannot avoid - * this fault and will leave with the store incomplete. - */ - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - return; -#endif - } - - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. 
- * But we know this is RAM and cannot trap. - */ - mem_off = info.mem_off_split; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_split; - for (i = 0; i < N; ++i) { - tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - - mem_off = info.mem_off_first[1]; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_first[1]; - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t ra, - const int esz, const int msz, const int N, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); -} - -#define DO_STN_1(N, NAME, ESZ) \ -void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ - sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ - sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ -} - -#define DO_STN_2(N, NAME, ESZ, MSZ) \ -void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ - sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ - sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ - sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ - sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ -} - -DO_STN_1(1, bb, MO_8) -DO_STN_1(1, bh, MO_16) -DO_STN_1(1, bs, MO_32) -DO_STN_1(1, bd, MO_64) -DO_STN_1(2, bb, MO_8) -DO_STN_1(3, bb, MO_8) -DO_STN_1(4, bb, MO_8) - -DO_STN_2(1, hh, MO_16, MO_16) -DO_STN_2(1, hs, MO_32, MO_16) -DO_STN_2(1, hd, MO_64, MO_16) -DO_STN_2(2, hh, MO_16, MO_16) -DO_STN_2(3, hh, MO_16, MO_16) -DO_STN_2(4, hh, MO_16, MO_16) - -DO_STN_2(1, ss, MO_32, MO_32) -DO_STN_2(1, sd, MO_64, MO_32) -DO_STN_2(2, ss, MO_32, MO_32) -DO_STN_2(3, ss, MO_32, MO_32) -DO_STN_2(4, ss, MO_32, MO_32) - -DO_STN_2(1, dd, MO_64, MO_64) -DO_STN_2(2, dd, MO_64, MO_64) -DO_STN_2(3, dd, MO_64, MO_64) -DO_STN_2(4, dd, 
MO_64, MO_64) - -#undef DO_STN_1 -#undef DO_STN_2 - -/* - * Loads with a vector index. - */ - -/* - * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. - */ -typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); - -static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) -{ - return *(uint32_t *)(reg + H1_4(reg_ofs)); -} - -static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) -{ - return *(int32_t *)(reg + H1_4(reg_ofs)); -} - -static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) -{ - return (uint32_t)*(uint64_t *)(reg + reg_ofs); -} - -static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) -{ - return (int32_t)*(uint64_t *)(reg + reg_ofs); -} - -static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) -{ - return *(uint64_t *)(reg + reg_ofs); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - uint32_t mtedesc, int esize, int msize, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const int mmu_idx = cpu_mmu_index(env, false); - const intptr_t reg_max = simd_oprsz(desc); - const int scale = simd_data(desc); - ARMVectorReg scratch; - intptr_t reg_off; - SVEHostPage info, info2; - - memset(&scratch, 0, reg_max); - reg_off = 0; - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if (likely(pg & 1)) { - target_ulong addr = base + (off_fn(vm, reg_off) << scale); - target_ulong in_page = -(addr | TARGET_PAGE_MASK); - - sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, - mmu_idx, retaddr); - - if (likely(in_page >= msize)) { - if (unlikely(info.flags & TLB_WATCHPOINT)) { - cpu_check_watchpoint(env_cpu(env), addr, msize, - info.attrs, BP_MEM_READ, retaddr); - } - if (mtedesc && info.tagged) { - mte_check(env, mtedesc, addr, retaddr); - } - if (unlikely(info.flags & TLB_MMIO)) { - tlb_fn(env, &scratch, reg_off, addr, retaddr); - } else { - host_fn(&scratch, reg_off, info.host); - } - } else { - /* Element crosses the page boundary. */ - sve_probe_page(&info2, false, env, addr + in_page, 0, - MMU_DATA_LOAD, mmu_idx, retaddr); - if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { - cpu_check_watchpoint(env_cpu(env), addr, - msize, info.attrs, - BP_MEM_READ, retaddr); - } - if (mtedesc && info.tagged) { - mte_check(env, mtedesc, addr, retaddr); - } - tlb_fn(env, &scratch, reg_off, addr, retaddr); - } - } - reg_off += esize; - pg >>= esize; - } while (reg_off & 63); - } while (reg_off < reg_max); - - /* Wait until all exceptions have been raised to write back. */ - memcpy(vd, &scratch, reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - int esize, int msize, zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* - * ??? TODO: For the 32-bit offset extractions, base + ofs cannot - * offset base entirely over the address space hole to change the - * pointer tag, or change the bit55 selector. So we could here - * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
- */ - sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, - esize, msize, off_fn, host_fn, tlb_fn); -} - -#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -DO_LD1_ZPZ_S(bsu, zsu, MO_8) -DO_LD1_ZPZ_S(bsu, zss, MO_8) -DO_LD1_ZPZ_D(bdu, zsu, MO_8) -DO_LD1_ZPZ_D(bdu, zss, MO_8) -DO_LD1_ZPZ_D(bdu, zd, MO_8) - -DO_LD1_ZPZ_S(bss, zsu, MO_8) -DO_LD1_ZPZ_S(bss, zss, MO_8) -DO_LD1_ZPZ_D(bds, zsu, MO_8) -DO_LD1_ZPZ_D(bds, zss, MO_8) -DO_LD1_ZPZ_D(bds, zd, MO_8) - -DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) -DO_LD1_ZPZ_S(hsu_le, zss, MO_16) -DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) -DO_LD1_ZPZ_D(hdu_le, zss, MO_16) -DO_LD1_ZPZ_D(hdu_le, zd, MO_16) - -DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) -DO_LD1_ZPZ_S(hsu_be, zss, MO_16) -DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) -DO_LD1_ZPZ_D(hdu_be, zss, MO_16) -DO_LD1_ZPZ_D(hdu_be, zd, MO_16) - -DO_LD1_ZPZ_S(hss_le, zsu, MO_16) -DO_LD1_ZPZ_S(hss_le, zss, MO_16) -DO_LD1_ZPZ_D(hds_le, zsu, MO_16) -DO_LD1_ZPZ_D(hds_le, zss, MO_16) -DO_LD1_ZPZ_D(hds_le, zd, MO_16) - -DO_LD1_ZPZ_S(hss_be, zsu, MO_16) -DO_LD1_ZPZ_S(hss_be, zss, MO_16) -DO_LD1_ZPZ_D(hds_be, zsu, MO_16) -DO_LD1_ZPZ_D(hds_be, zss, MO_16) -DO_LD1_ZPZ_D(hds_be, zd, MO_16) - -DO_LD1_ZPZ_S(ss_le, zsu, MO_32) -DO_LD1_ZPZ_S(ss_le, zss, MO_32) -DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) -DO_LD1_ZPZ_D(sdu_le, zss, MO_32) -DO_LD1_ZPZ_D(sdu_le, zd, MO_32) - -DO_LD1_ZPZ_S(ss_be, zsu, MO_32) -DO_LD1_ZPZ_S(ss_be, zss, MO_32) -DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) -DO_LD1_ZPZ_D(sdu_be, zss, MO_32) -DO_LD1_ZPZ_D(sdu_be, zd, MO_32) - -DO_LD1_ZPZ_D(sds_le, zsu, MO_32) -DO_LD1_ZPZ_D(sds_le, zss, MO_32) -DO_LD1_ZPZ_D(sds_le, zd, MO_32) - -DO_LD1_ZPZ_D(sds_be, zsu, MO_32) -DO_LD1_ZPZ_D(sds_be, zss, MO_32) -DO_LD1_ZPZ_D(sds_be, zd, MO_32) - -DO_LD1_ZPZ_D(dd_le, zsu, MO_64) -DO_LD1_ZPZ_D(dd_le, zss, MO_64) -DO_LD1_ZPZ_D(dd_le, zd, MO_64) - -DO_LD1_ZPZ_D(dd_be, zsu, MO_64) -DO_LD1_ZPZ_D(dd_be, zss, MO_64) -DO_LD1_ZPZ_D(dd_be, zd, MO_64) - -#undef DO_LD1_ZPZ_S -#undef DO_LD1_ZPZ_D - -/* First fault loads with a vector index. */ - -/* - * Common helpers for all gather first-faulting loads. 
- */ - -static inline QEMU_ALWAYS_INLINE -void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - uint32_t mtedesc, const int esz, const int msz, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const int mmu_idx = cpu_mmu_index(env, false); - const intptr_t reg_max = simd_oprsz(desc); - const int scale = simd_data(desc); - const int esize = 1 << esz; - const int msize = 1 << msz; - intptr_t reg_off; - SVEHostPage info; - target_ulong addr, in_page; - - /* Skip to the first true predicate. */ - reg_off = find_next_active(vg, 0, reg_max, esz); - if (unlikely(reg_off >= reg_max)) { - /* The entire predicate was false; no load occurs. */ - memset(vd, 0, reg_max); - return; - } - - /* - * Probe the first element, allowing faults. - */ - addr = base + (off_fn(vm, reg_off) << scale); - if (mtedesc) { - mte_check(env, mtedesc, addr, retaddr); - } - tlb_fn(env, vd, reg_off, addr, retaddr); - - /* After any fault, zero the other elements. */ - swap_memzero(vd, reg_off); - reg_off += esize; - swap_memzero(vd + reg_off, reg_max - reg_off); - - /* - * Probe the remaining elements, not allowing faults. - */ - while (reg_off < reg_max) { - uint64_t pg = vg[reg_off >> 6]; - do { - if (likely((pg >> (reg_off & 63)) & 1)) { - addr = base + (off_fn(vm, reg_off) << scale); - in_page = -(addr | TARGET_PAGE_MASK); - - if (unlikely(in_page < msize)) { - /* Stop if the element crosses a page boundary. */ - goto fault; - } - - sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, - mmu_idx, retaddr); - if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { - goto fault; - } - if (unlikely(info.flags & TLB_WATCHPOINT) && - (cpu_watchpoint_address_matches - (env_cpu(env), addr, msize) & BP_MEM_READ)) { - goto fault; - } - if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { - goto fault; - } - - host_fn(vd, reg_off, info.host); - } - reg_off += esize; - } while (reg_off & 63); - } - return; - - fault: - record_fault(env, reg_off, reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - const int esz, const int msz, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* - * ??? TODO: For the 32-bit offset extractions, base + ofs cannot - * offset base entirely over the address space hole to change the - * pointer tag, or change the bit55 selector. So we could here - * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
- */ - sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, - esz, msz, off_fn, host_fn, tlb_fn); -} - -#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_ldff##MEM##_##OFS) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ldff##MEM##_##OFS##_mte) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_ldff##MEM##_##OFS) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ldff##MEM##_##OFS##_mte) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) -DO_LDFF1_ZPZ_S(bsu, zss, MO_8) -DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) -DO_LDFF1_ZPZ_D(bdu, zss, MO_8) -DO_LDFF1_ZPZ_D(bdu, zd, MO_8) - -DO_LDFF1_ZPZ_S(bss, zsu, MO_8) -DO_LDFF1_ZPZ_S(bss, zss, MO_8) -DO_LDFF1_ZPZ_D(bds, zsu, MO_8) -DO_LDFF1_ZPZ_D(bds, zss, MO_8) -DO_LDFF1_ZPZ_D(bds, zd, MO_8) - -DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) -DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) -DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) - -DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) -DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) -DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) - -DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) -DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) -DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) - -DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) -DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) -DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) - -DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) -DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) -DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) - -DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) -DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) -DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) - -DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) -DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) -DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) - -DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) -DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) -DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) - -DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) -DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) -DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) - -DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) -DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) -DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) - -/* Stores with a vector index. 
*/ - -static inline QEMU_ALWAYS_INLINE -void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - uint32_t mtedesc, int esize, int msize, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const int mmu_idx = cpu_mmu_index(env, false); - const intptr_t reg_max = simd_oprsz(desc); - const int scale = simd_data(desc); - void *host[ARM_MAX_VQ * 4]; - intptr_t reg_off, i; - SVEHostPage info, info2; - - /* - * Probe all of the elements for host addresses and flags. - */ - i = reg_off = 0; - do { - uint64_t pg = vg[reg_off >> 6]; - do { - target_ulong addr = base + (off_fn(vm, reg_off) << scale); - target_ulong in_page = -(addr | TARGET_PAGE_MASK); - - host[i] = NULL; - if (likely((pg >> (reg_off & 63)) & 1)) { - if (likely(in_page >= msize)) { - sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, - mmu_idx, retaddr); - if (!(info.flags & TLB_MMIO)) { - host[i] = info.host; - } - } else { - /* - * Element crosses the page boundary. - * Probe both pages, but do not record the host address, - * so that we use the slow path. - */ - sve_probe_page(&info, false, env, addr, 0, - MMU_DATA_STORE, mmu_idx, retaddr); - sve_probe_page(&info2, false, env, addr + in_page, 0, - MMU_DATA_STORE, mmu_idx, retaddr); - info.flags |= info2.flags; - } - - if (unlikely(info.flags & TLB_WATCHPOINT)) { - cpu_check_watchpoint(env_cpu(env), addr, msize, - info.attrs, BP_MEM_WRITE, retaddr); - } - - if (mtedesc && info.tagged) { - mte_check(env, mtedesc, addr, retaddr); - } - } - i += 1; - reg_off += esize; - } while (reg_off & 63); - } while (reg_off < reg_max); - - /* - * Now that we have recognized all exceptions except SyncExternal - * (from TLB_MMIO), which we cannot avoid, perform all of the stores. - * - * Note for the common case of an element in RAM, not crossing a page - * boundary, we have stored the host address in host[]. This doubles - * as a first-level check against the predicate, since only enabled - * elements have non-null host addresses. - */ - i = reg_off = 0; - do { - void *h = host[i]; - if (likely(h != NULL)) { - host_fn(vd, reg_off, h); - } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { - target_ulong addr = base + (off_fn(vm, reg_off) << scale); - tlb_fn(env, vd, reg_off, addr, retaddr); - } - i += 1; - reg_off += esize; - } while (reg_off < reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - int esize, int msize, zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* - * ??? TODO: For the 32-bit offset extractions, base + ofs cannot - * offset base entirely over the address space hole to change the - * pointer tag, or change the bit55 selector. So we could here - * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
- */ - sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, - esize, msize, off_fn, host_fn, tlb_fn); -} - -#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ - off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} \ -void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ - off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} - -#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ - off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} \ -void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ - off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} - -DO_ST1_ZPZ_S(bs, zsu, MO_8) -DO_ST1_ZPZ_S(hs_le, zsu, MO_16) -DO_ST1_ZPZ_S(hs_be, zsu, MO_16) -DO_ST1_ZPZ_S(ss_le, zsu, MO_32) -DO_ST1_ZPZ_S(ss_be, zsu, MO_32) - -DO_ST1_ZPZ_S(bs, zss, MO_8) -DO_ST1_ZPZ_S(hs_le, zss, MO_16) -DO_ST1_ZPZ_S(hs_be, zss, MO_16) -DO_ST1_ZPZ_S(ss_le, zss, MO_32) -DO_ST1_ZPZ_S(ss_be, zss, MO_32) - -DO_ST1_ZPZ_D(bd, zsu, MO_8) -DO_ST1_ZPZ_D(hd_le, zsu, MO_16) -DO_ST1_ZPZ_D(hd_be, zsu, MO_16) -DO_ST1_ZPZ_D(sd_le, zsu, MO_32) -DO_ST1_ZPZ_D(sd_be, zsu, MO_32) -DO_ST1_ZPZ_D(dd_le, zsu, MO_64) -DO_ST1_ZPZ_D(dd_be, zsu, MO_64) - -DO_ST1_ZPZ_D(bd, zss, MO_8) -DO_ST1_ZPZ_D(hd_le, zss, MO_16) -DO_ST1_ZPZ_D(hd_be, zss, MO_16) -DO_ST1_ZPZ_D(sd_le, zss, MO_32) -DO_ST1_ZPZ_D(sd_be, zss, MO_32) -DO_ST1_ZPZ_D(dd_le, zss, MO_64) -DO_ST1_ZPZ_D(dd_be, zss, MO_64) - -DO_ST1_ZPZ_D(bd, zd, MO_8) -DO_ST1_ZPZ_D(hd_le, zd, MO_16) -DO_ST1_ZPZ_D(hd_be, zd, MO_16) -DO_ST1_ZPZ_D(sd_le, zd, MO_32) -DO_ST1_ZPZ_D(sd_be, zd, MO_32) -DO_ST1_ZPZ_D(dd_le, zd, MO_64) -DO_ST1_ZPZ_D(dd_be, zd, MO_64) - -#undef DO_ST1_ZPZ_S -#undef DO_ST1_ZPZ_D - -void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = n[i] ^ m[i] ^ k[i]; - } -} - -void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = n[i] ^ (m[i] & ~k[i]); - } -} - -void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); - } -} - -void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); - } -} - -void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ~((n[i] & k[i]) | 
(m[i] & ~k[i])); - } -} - -/* - * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. - * See hasless(v,1) from - * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord - */ -static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) -{ - int bits = 8 << esz; - uint64_t ones = dup_const(esz, 1); - uint64_t signs = ones << (bits - 1); - uint64_t cmp0, cmp1; - - cmp1 = dup_const(esz, n); - cmp0 = cmp1 ^ m0; - cmp1 = cmp1 ^ m1; - cmp0 = (cmp0 - ones) & ~cmp0; - cmp1 = (cmp1 - ones) & ~cmp1; - return (cmp0 | cmp1) & signs; -} - -static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, - uint32_t desc, int esz, bool nmatch) -{ - uint16_t esz_mask = pred_esz_masks[esz]; - intptr_t opr_sz = simd_oprsz(desc); - uint32_t flags = PREDTEST_INIT; - intptr_t i, j, k; - - for (i = 0; i < opr_sz; i += 16) { - uint64_t m0 = *(uint64_t *)(vm + i); - uint64_t m1 = *(uint64_t *)(vm + i + 8); - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; - uint16_t out = 0; - - for (j = 0; j < 16; j += 8) { - uint64_t n = *(uint64_t *)(vn + i + j); - - for (k = 0; k < 8; k += 1 << esz) { - if (pg & (1 << (j + k))) { - bool o = do_match2(n >> (k * 8), m0, m1, esz); - out |= (o ^ nmatch) << (j + k); - } - } - } - *(uint16_t *)(vd + H1_2(i >> 3)) = out; - flags = iter_predtest_fwd(out, pg, flags); - } - return flags; -} - -#define DO_PPZZ_MATCH(NAME, ESZ, INV) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ -} - -DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) -DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) - -DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) -DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) - -#undef DO_PPZZ_MATCH - -void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, - uint32_t desc) -{ - ARMVectorReg scratch; - intptr_t i, j; - intptr_t opr_sz = simd_oprsz(desc); - uint32_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - if (d == n) { - n = memcpy(&scratch, n, opr_sz); - if (d == m) { - m = n; - } - } else if (d == m) { - m = memcpy(&scratch, m, opr_sz); - } - - for (i = 0; i < opr_sz; i += 4) { - uint64_t count = 0; - uint8_t pred; - - pred = pg[H1(i >> 3)] >> (i & 7); - if (pred & 1) { - uint32_t nn = n[H4(i >> 2)]; - - for (j = 0; j <= i; j += 4) { - pred = pg[H1(j >> 3)] >> (j & 7); - if ((pred & 1) && nn == m[H4(j >> 2)]) { - ++count; - } - } - } - d[H4(i >> 2)] = count; - } -} - -void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, - uint32_t desc) -{ - ARMVectorReg scratch; - intptr_t i, j; - intptr_t opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - if (d == n) { - n = memcpy(&scratch, n, opr_sz); - if (d == m) { - m = n; - } - } else if (d == m) { - m = memcpy(&scratch, m, opr_sz); - } - - for (i = 0; i < opr_sz / 8; ++i) { - uint64_t count = 0; - if (pg[H1(i)] & 1) { - uint64_t nn = n[i]; - for (j = 0; j <= i; ++j) { - if ((pg[H1(j)] & 1) && nn == m[j]) { - ++count; - } - } - } - d[i] = count; - } -} - -/* - * Returns the number of bytes in m0 and m1 that match n. - * Unlike do_match2 we don't just need true/false, we need an exact count. - * This requires two extra logical operations. 
- */ -static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) -{ - const uint64_t mask = dup_const(MO_8, 0x7f); - uint64_t cmp0, cmp1; - - cmp1 = dup_const(MO_8, n); - cmp0 = cmp1 ^ m0; - cmp1 = cmp1 ^ m1; - - /* - * 1: clear msb of each byte to avoid carry to next byte (& mask) - * 2: carry in to msb if byte != 0 (+ mask) - * 3: set msb if cmp has msb set (| cmp) - * 4: set ~msb to ignore them (| mask) - * We now have 0xff for byte != 0 or 0x7f for byte == 0. - * 5: invert, resulting in 0x80 if and only if byte == 0. - */ - cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); - cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); - - /* - * Combine the two compares in a way that the bits do - * not overlap, and so preserves the count of set bits. - * If the host has an efficient instruction for ctpop, - * then ctpop(x) + ctpop(y) has the same number of - * operations as ctpop(x | (y >> 1)). If the host does - * not have an efficient ctpop, then we only want to - * use it once. - */ - return ctpop64(cmp0 | (cmp1 >> 1)); -} - -void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j; - intptr_t opr_sz = simd_oprsz(desc); - - for (i = 0; i < opr_sz; i += 16) { - uint64_t n0 = *(uint64_t *)(vn + i); - uint64_t m0 = *(uint64_t *)(vm + i); - uint64_t n1 = *(uint64_t *)(vn + i + 8); - uint64_t m1 = *(uint64_t *)(vm + i + 8); - uint64_t out0 = 0; - uint64_t out1 = 0; - - for (j = 0; j < 64; j += 8) { - uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); - uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); - out0 |= cnt0 << j; - out1 |= cnt1 << j; - } - - *(uint64_t *)(vd + i) = out0; - *(uint64_t *)(vd + i + 8) = out1; - } -} - -void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - int shr = simd_data(desc); - int shl = 8 - shr; - uint64_t mask = dup_const(MO_8, 0xff >> shr); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - uint64_t t = n[i] ^ m[i]; - d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); - } -} - -void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - int shr = simd_data(desc); - int shl = 16 - shr; - uint64_t mask = dup_const(MO_16, 0xffff >> shr); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - uint64_t t = n[i] ^ m[i]; - d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); - } -} - -void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - int shr = simd_data(desc); - uint32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ror32(n[i] ^ m[i], shr); - } -} - -void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, - void *status, uint32_t desc) -{ - intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); - - for (s = 0; s < opr_sz; ++s) { - float32 *n = vn + s * sizeof(float32) * 4; - float32 *m = vm + s * sizeof(float32) * 4; - float32 *a = va + s * sizeof(float32) * 4; - float32 *d = vd + s * sizeof(float32) * 4; - float32 n00 = n[H4(0)], n01 = n[H4(1)]; - float32 n10 = n[H4(2)], n11 = n[H4(3)]; - float32 m00 = m[H4(0)], m01 = m[H4(1)]; - float32 m10 = m[H4(2)], m11 = m[H4(3)]; - float32 p0, p1; - - /* i = 0, j = 0 */ - p0 = float32_mul(n00, m00, status); - p1 = float32_mul(n01, m01, status); - d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); - - /* i = 0, j = 1 */ - p0 = float32_mul(n00, m10, status); - p1 = float32_mul(n01, m11, status); - d[H4(1)] 
= float32_add(a[H4(1)], float32_add(p0, p1, status), status); - - /* i = 1, j = 0 */ - p0 = float32_mul(n10, m00, status); - p1 = float32_mul(n11, m01, status); - d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); - - /* i = 1, j = 1 */ - p0 = float32_mul(n10, m10, status); - p1 = float32_mul(n11, m11, status); - d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); - } -} - -void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, - void *status, uint32_t desc) -{ - intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); - - for (s = 0; s < opr_sz; ++s) { - float64 *n = vn + s * sizeof(float64) * 4; - float64 *m = vm + s * sizeof(float64) * 4; - float64 *a = va + s * sizeof(float64) * 4; - float64 *d = vd + s * sizeof(float64) * 4; - float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; - float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; - float64 p0, p1; - - /* i = 0, j = 0 */ - p0 = float64_mul(n00, m00, status); - p1 = float64_mul(n01, m01, status); - d[0] = float64_add(a[0], float64_add(p0, p1, status), status); - - /* i = 0, j = 1 */ - p0 = float64_mul(n00, m10, status); - p1 = float64_mul(n01, m11, status); - d[1] = float64_add(a[1], float64_add(p0, p1, status), status); - - /* i = 1, j = 0 */ - p0 = float64_mul(n10, m00, status); - p1 = float64_mul(n11, m01, status); - d[2] = float64_add(a[2], float64_add(p0, p1, status), status); - - /* i = 1, j = 1 */ - p0 = float64_mul(n10, m10, status); - p1 = float64_mul(n11, m11, status); - d[3] = float64_add(a[3], float64_add(p0, p1, status), status); - } -} - -#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPEW); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) -DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) -DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) - -#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPEW); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) -DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) - -#undef DO_FCVTLT -#undef DO_FCVTNT diff --git a/target/arm/tcg-stubs.c b/target/arm/tcg-stubs.c new file mode 100644 index 0000000..1a7ddb3 --- /dev/null +++ b/target/arm/tcg-stubs.c @@ -0,0 +1,23 @@ +/* + * QEMU ARM stubs for some TCG helper functions + * + * Copyright 2021 SUSE LLC + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" + +void write_v7m_exception(CPUARMState *env, uint32_t new_exc) +{ + g_assert_not_reached(); +} + +void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, + uint32_t target_el, uintptr_t ra) +{ + g_assert_not_reached(); +} diff --git a/target/arm/tcg/crypto_helper.c b/target/arm/tcg/crypto_helper.c new file mode 100644 index 0000000..d286903 --- /dev/null +++ b/target/arm/tcg/crypto_helper.c @@ -0,0 +1,778 @@ +/* + * crypto_helper.c - emulate v8 Crypto Extensions instructions + * + * Copyright (C) 2013 - 2018 Linaro Ltd + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "crypto/aes.h" +#include "crypto/sm4.h" +#include "vec_internal.h" + +union CRYPTO_STATE { + uint8_t bytes[16]; + uint32_t words[4]; + uint64_t l[2]; +}; + +#if HOST_BIG_ENDIAN +#define CR_ST_BYTE(state, i) ((state).bytes[(15 - (i)) ^ 8]) +#define CR_ST_WORD(state, i) ((state).words[(3 - (i)) ^ 2]) +#else +#define CR_ST_BYTE(state, i) ((state).bytes[i]) +#define CR_ST_WORD(state, i) ((state).words[i]) +#endif + +/* + * The caller has not been converted to full gvec, and so only + * modifies the low 16 bytes of the vector register. + */ +static void clear_tail_16(void *vd, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int max_sz = simd_maxsz(desc); + + assert(opr_sz == 16); + clear_tail(vd, opr_sz, max_sz); +} + +static void do_crypto_aese(uint64_t *rd, uint64_t *rn, + uint64_t *rm, bool decrypt) +{ + static uint8_t const * const sbox[2] = { AES_sbox, AES_isbox }; + static uint8_t const * const shift[2] = { AES_shifts, AES_ishifts }; + union CRYPTO_STATE rk = { .l = { rm[0], rm[1] } }; + union CRYPTO_STATE st = { .l = { rn[0], rn[1] } }; + int i; + + /* xor state vector with round key */ + rk.l[0] ^= st.l[0]; + rk.l[1] ^= st.l[1]; + + /* combine ShiftRows operation and sbox substitution */ + for (i = 0; i < 16; i++) { + CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])]; + } + + rd[0] = st.l[0]; + rd[1] = st.l[1]; +} + +void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + bool decrypt = simd_data(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_aese(vd + i, vn + i, vm + i, decrypt); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt) +{ + static uint32_t const mc[][256] = { { + /* MixColumns lookup table */ + 0x00000000, 0x03010102, 0x06020204, 0x05030306, + 0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e, + 0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16, + 0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e, + 0x30101020, 0x33111122, 0x36121224, 0x35131326, + 0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e, + 0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36, + 0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e, + 0x60202040, 0x63212142, 0x66222244, 0x65232346, + 0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e, + 0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56, + 0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e, + 0x50303060, 0x53313162, 0x56323264, 0x55333366, + 0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e, + 0x48383870, 0x4b393972, 
0x4e3a3a74, 0x4d3b3b76, + 0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e, + 0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386, + 0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e, + 0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96, + 0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e, + 0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6, + 0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae, + 0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6, + 0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe, + 0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6, + 0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce, + 0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6, + 0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde, + 0x907070e0, 0x937171e2, 0x967272e4, 0x957373e6, + 0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee, + 0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6, + 0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe, + 0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d, + 0x97848413, 0x94858511, 0x91868617, 0x92878715, + 0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d, + 0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05, + 0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d, + 0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735, + 0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d, + 0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25, + 0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d, + 0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755, + 0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d, + 0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45, + 0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d, + 0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775, + 0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d, + 0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65, + 0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d, + 0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795, + 0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d, + 0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85, + 0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd, + 0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5, + 0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad, + 0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5, + 0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd, + 0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5, + 0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd, + 0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5, + 0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd, + 0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5, + 0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed, + 0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5, + }, { + /* Inverse MixColumns lookup table */ + 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12, + 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a, + 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362, + 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a, + 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2, + 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca, + 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382, + 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba, + 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9, + 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1, + 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9, + 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81, + 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029, + 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411, + 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859, + 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61, + 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf, + 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987, + 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf, + 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7, + 0x46bde64d, 0x4db0ef43, 
0x50a7f451, 0x5baafd5f, + 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967, + 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f, + 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117, + 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664, + 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c, + 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14, + 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c, + 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684, + 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc, + 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4, + 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc, + 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753, + 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b, + 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23, + 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b, + 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3, + 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b, + 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3, + 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb, + 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88, + 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0, + 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8, + 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0, + 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68, + 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850, + 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418, + 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020, + 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe, + 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6, + 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e, + 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6, + 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e, + 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526, + 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e, + 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56, + 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25, + 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d, + 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255, + 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d, + 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5, + 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd, + 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5, + 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d, + } }; + + union CRYPTO_STATE st = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 16; i += 4) { + CR_ST_WORD(st, i >> 2) = + mc[decrypt][CR_ST_BYTE(st, i)] ^ + rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^ + rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^ + rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24); + } + + rd[0] = st.l[0]; + rd[1] = st.l[1]; +} + +void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + bool decrypt = simd_data(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_aesmc(vd + i, vm + i, decrypt); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +/* + * SHA-1 logical functions + */ + +static uint32_t cho(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & (y ^ z)) ^ z; +} + +static uint32_t par(uint32_t x, uint32_t y, uint32_t z) +{ + return x ^ y ^ z; +} + +static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | ((x | y) & z); +} + +void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t d0, d1; + + d0 = d[1] ^ d[0] ^ m[0]; + d1 = n[0] ^ d[1] ^ m[1]; + d[0] = d0; + d[1] = d1; + + clear_tail_16(vd, desc); +} + +static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn, + uint64_t *rm, uint32_t desc, + uint32_t (*fn)(union CRYPTO_STATE *d)) +{ + union CRYPTO_STATE d = { .l = { 
rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 4; i++) { + uint32_t t = fn(&d); + + t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0) + + CR_ST_WORD(m, i); + + CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3); + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); + CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2); + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); + CR_ST_WORD(d, 0) = t; + } + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(rd, desc); +} + +static uint32_t do_sha1c(union CRYPTO_STATE *d) +{ + return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); +} + +void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc) +{ + crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c); +} + +static uint32_t do_sha1p(union CRYPTO_STATE *d) +{ + return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); +} + +void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc) +{ + crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p); +} + +static uint32_t do_sha1m(union CRYPTO_STATE *d) +{ + return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); +} + +void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc) +{ + crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m); +} + +void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rm = vm; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2); + CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0; + + rd[0] = m.l[0]; + rd[1] = m.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1); + CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1); + CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1); + CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +/* + * The SHA-256 logical functions, according to + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + */ + +static uint32_t S0(uint32_t x) +{ + return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); +} + +static uint32_t S1(uint32_t x) +{ + return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); +} + +static uint32_t s0(uint32_t x) +{ + return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); +} + +static uint32_t s1(uint32_t x) +{ + return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); +} + +void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 4; i++) { + uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2)) + + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0)) + + CR_ST_WORD(m, i); + + CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2); + CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1); + CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0); + CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t; + + t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) + + S0(CR_ST_WORD(d, 0)); + + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); + 
CR_ST_WORD(d, 0) = t; + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 4; i++) { + uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) + + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0)) + + CR_ST_WORD(m, i); + + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); + CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t; + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1)); + CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2)); + CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3)); + CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0)); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1); + CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2); + CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3); + CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +/* + * The SHA-512 logical functions (same as above but using 64-bit operands) + */ + +static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & (y ^ z)) ^ z; +} + +static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & y) | ((x | y) & z); +} + +static uint64_t S0_512(uint64_t x) +{ + return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); +} + +static uint64_t S1_512(uint64_t x) +{ + return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); +} + +static uint64_t s0_512(uint64_t x) +{ + return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); +} + +static uint64_t s1_512(uint64_t x) +{ + return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); +} + +void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]); + d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]); + + rd[0] = d0; + rd[1] = d1; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]); + d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]); + + rd[0] = d0; + rd[1] = d1; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d0 += s0_512(rd[1]); + d1 += s0_512(rn[0]); + + rd[0] = d0; + rd[1] = d1; + + clear_tail_16(vd, desc); +} 
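
As a point of reference (this sketch is not part of the patch): crypto_sha512su0 above and crypto_sha512su1 just below together compute one step of the FIPS 180-4 SHA-512 message schedule, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]. The standalone C program below re-implements the two sigma functions locally with arbitrary sample schedule words; the operand assignment for the low lane (rd holds {W[t-16], W[t-15]}, rn[0] holds W[t-2], rm[0] holds W[t-7]) is an illustrative assumption, not something stated by the patch itself.

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    static uint64_t ror64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    /* Same rotations/shifts as s0_512() and s1_512() in the helper above. */
    static uint64_t sigma0(uint64_t x)
    {
        return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
    }

    static uint64_t sigma1(uint64_t x)
    {
        return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
    }

    int main(void)
    {
        uint64_t W[16];             /* W[t-16] .. W[t-1], arbitrary sample values */
        for (int i = 0; i < 16; i++) {
            W[i] = 0x0123456789abcdefULL * (i + 1);
        }

        /* Direct evaluation of the schedule recurrence for t = 16. */
        uint64_t direct = sigma1(W[14]) + W[9] + sigma0(W[1]) + W[0];

        /*
         * The same computation split the way the two helpers split it:
         * the SU0 step folds in sigma0 of the following word (W[t-15]),
         * the SU1 step then adds sigma1(W[t-2]) and W[t-7].
         */
        uint64_t su0 = W[0] + sigma0(W[1]);          /* like crypto_sha512su0, low lane */
        uint64_t su1 = su0 + sigma1(W[14]) + W[9];   /* like crypto_sha512su1, low lane */

        printf("direct : %016" PRIx64 "\n", direct);
        printf("helpers: %016" PRIx64 "\n", su1);
        return direct == su1 ? 0 : 1;
    }

Because 64-bit addition is modular, commutative and associative, the two-step split necessarily produces the same W[t] as the direct evaluation, which is what the program checks.
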
+ +void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + + rd[0] += s1_512(rn[0]) + rm[0]; + rd[1] += s1_512(rn[1]) + rm[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t; + + t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17); + CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17); + CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17); + CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17); + CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25); + + CR_ST_WORD(d, 0) ^= t; + CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25); + CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25); + CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^ + ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +static inline void QEMU_ALWAYS_INLINE +crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm, + uint32_t desc, uint32_t opcode) +{ + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t imm2 = simd_data(desc); + uint32_t t; + + assert(imm2 < 4); + + if (opcode == 0 || opcode == 2) { + /* SM3TT1A, SM3TT2A */ + t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else if (opcode == 1) { + /* SM3TT1B */ + t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else if (opcode == 3) { + /* SM3TT2B */ + t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else { + qemu_build_not_reached(); + } + + t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2); + + CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1); + + if (opcode < 2) { + /* SM3TT1A, SM3TT1B */ + t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20); + + CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23); + } else { + /* SM3TT2A, SM3TT2B */ + t += CR_ST_WORD(n, 3); + t ^= rol32(t, 9) ^ rol32(t, 17); + + CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13); + } + + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3); + CR_ST_WORD(d, 3) = t; + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(rd, desc); +} + +#define DO_SM3TT(NAME, OPCODE) \ + void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ + { crypto_sm3tt(vd, vn, vm, desc, OPCODE); } + +DO_SM3TT(crypto_sm3tt1a, 0) +DO_SM3TT(crypto_sm3tt1b, 1) +DO_SM3TT(crypto_sm3tt2a, 2) +DO_SM3TT(crypto_sm3tt2b, 3) + +#undef DO_SM3TT + +static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm) +{ + union CRYPTO_STATE d = { .l = { rn[0], rn[1] } }; + 
union CRYPTO_STATE n = { .l = { rm[0], rm[1] } }; + uint32_t t, i; + + for (i = 0; i < 4; i++) { + t = CR_ST_WORD(d, (i + 1) % 4) ^ + CR_ST_WORD(d, (i + 2) % 4) ^ + CR_ST_WORD(d, (i + 3) % 4) ^ + CR_ST_WORD(n, i); + + t = sm4_sbox[t & 0xff] | + sm4_sbox[(t >> 8) & 0xff] << 8 | + sm4_sbox[(t >> 16) & 0xff] << 16 | + sm4_sbox[(t >> 24) & 0xff] << 24; + + CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ + rol32(t, 24); + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_sm4e(vd + i, vn + i, vm + i); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm) +{ + union CRYPTO_STATE d; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t, i; + + d = n; + for (i = 0; i < 4; i++) { + t = CR_ST_WORD(d, (i + 1) % 4) ^ + CR_ST_WORD(d, (i + 2) % 4) ^ + CR_ST_WORD(d, (i + 3) % 4) ^ + CR_ST_WORD(m, i); + + t = sm4_sbox[t & 0xff] | + sm4_sbox[(t >> 8) & 0xff] << 8 | + sm4_sbox[(t >> 16) & 0xff] << 16 | + sm4_sbox[(t >> 24) & 0xff] << 24; + + CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23); + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm4ekey)(void *vd, void *vn, void* vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_sm4ekey(vd + i, vn + i, vm + i); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = n[i] ^ rol64(m[i], 1); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c new file mode 100644 index 0000000..0972a4b --- /dev/null +++ b/target/arm/tcg/helper-a64.c @@ -0,0 +1,954 @@ +/* + * AArch64 specific helpers + * + * Copyright (c) 2013 Alexander Graf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "cpu.h" +#include "exec/gdbstub.h" +#include "exec/helper-proto.h" +#include "qemu/host-utils.h" +#include "qemu/log.h" +#include "qemu/main-loop.h" +#include "qemu/bitops.h" +#include "internals.h" +#include "qemu/crc32c.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "qemu/int128.h" +#include "qemu/atomic128.h" +#include "fpu/softfloat.h" +#include <zlib.h> /* For crc32 */ + +/* C2.4.7 Multiply and divide */ +/* special cases for 0 and LLONG_MIN are mandated by the standard */ +uint64_t HELPER(udiv64)(uint64_t num, uint64_t den) +{ + if (den == 0) { + return 0; + } + return num / den; +} + +int64_t HELPER(sdiv64)(int64_t num, int64_t den) +{ + if (den == 0) { + return 0; + } + if (num == LLONG_MIN && den == -1) { + return LLONG_MIN; + } + return num / den; +} + +uint64_t HELPER(rbit64)(uint64_t x) +{ + return revbit64(x); +} + +void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm) +{ + update_spsel(env, imm); +} + +static void daif_check(CPUARMState *env, uint32_t op, + uint32_t imm, uintptr_t ra) +{ + /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set. */ + if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) { + raise_exception_ra(env, EXCP_UDEF, + syn_aa64_sysregtrap(0, extract32(op, 0, 3), + extract32(op, 3, 3), 4, + imm, 0x1f, 0), + exception_target_el(env), ra); + } +} + +void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm) +{ + daif_check(env, 0x1e, imm, GETPC()); + env->daif |= (imm << 6) & PSTATE_DAIF; + arm_rebuild_hflags(env); +} + +void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm) +{ + daif_check(env, 0x1f, imm, GETPC()); + env->daif &= ~((imm << 6) & PSTATE_DAIF); + arm_rebuild_hflags(env); +} + +/* Convert a softfloat float_relation_ (as returned by + * the float*_compare functions) to the correct ARM + * NZCV flag state. 
+ */ +static inline uint32_t float_rel_to_flags(int res) +{ + uint64_t flags; + switch (res) { + case float_relation_equal: + flags = PSTATE_Z | PSTATE_C; + break; + case float_relation_less: + flags = PSTATE_N; + break; + case float_relation_greater: + flags = PSTATE_C; + break; + case float_relation_unordered: + default: + flags = PSTATE_C | PSTATE_V; + break; + } + return flags; +} + +uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status) +{ + return float_rel_to_flags(float16_compare_quiet(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status) +{ + return float_rel_to_flags(float16_compare(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status) +{ + return float_rel_to_flags(float32_compare_quiet(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status) +{ + return float_rel_to_flags(float32_compare(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status) +{ + return float_rel_to_flags(float64_compare_quiet(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status) +{ + return float_rel_to_flags(float64_compare(x, y, fp_status)); +} + +float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float32_squash_input_denormal(a, fpst); + b = float32_squash_input_denormal(b, fpst); + + if ((float32_is_zero(a) && float32_is_infinity(b)) || + (float32_is_infinity(a) && float32_is_zero(b))) { + /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ + return make_float32((1U << 30) | + ((float32_val(a) ^ float32_val(b)) & (1U << 31))); + } + return float32_mul(a, b, fpst); +} + +float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float64_squash_input_denormal(a, fpst); + b = float64_squash_input_denormal(b, fpst); + + if ((float64_is_zero(a) && float64_is_infinity(b)) || + (float64_is_infinity(a) && float64_is_zero(b))) { + /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ + return make_float64((1ULL << 62) | + ((float64_val(a) ^ float64_val(b)) & (1ULL << 63))); + } + return float64_mul(a, b, fpst); +} + +/* 64bit/double versions of the neon float compare functions */ +uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float64_eq_quiet(a, b, fpst); +} + +uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float64_le(b, a, fpst); +} + +uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float64_lt(b, a, fpst); +} + +/* Reciprocal step and sqrt step. Note that unlike the A32/T32 + * versions, these do a fully fused multiply-add or + * multiply-add-and-halve. 
+ */ + +uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float16_squash_input_denormal(a, fpst); + b = float16_squash_input_denormal(b, fpst); + + a = float16_chs(a); + if ((float16_is_infinity(a) && float16_is_zero(b)) || + (float16_is_infinity(b) && float16_is_zero(a))) { + return float16_two; + } + return float16_muladd(a, b, float16_two, 0, fpst); +} + +float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float32_squash_input_denormal(a, fpst); + b = float32_squash_input_denormal(b, fpst); + + a = float32_chs(a); + if ((float32_is_infinity(a) && float32_is_zero(b)) || + (float32_is_infinity(b) && float32_is_zero(a))) { + return float32_two; + } + return float32_muladd(a, b, float32_two, 0, fpst); +} + +float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float64_squash_input_denormal(a, fpst); + b = float64_squash_input_denormal(b, fpst); + + a = float64_chs(a); + if ((float64_is_infinity(a) && float64_is_zero(b)) || + (float64_is_infinity(b) && float64_is_zero(a))) { + return float64_two; + } + return float64_muladd(a, b, float64_two, 0, fpst); +} + +uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float16_squash_input_denormal(a, fpst); + b = float16_squash_input_denormal(b, fpst); + + a = float16_chs(a); + if ((float16_is_infinity(a) && float16_is_zero(b)) || + (float16_is_infinity(b) && float16_is_zero(a))) { + return float16_one_point_five; + } + return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst); +} + +float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float32_squash_input_denormal(a, fpst); + b = float32_squash_input_denormal(b, fpst); + + a = float32_chs(a); + if ((float32_is_infinity(a) && float32_is_zero(b)) || + (float32_is_infinity(b) && float32_is_zero(a))) { + return float32_one_point_five; + } + return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst); +} + +float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float64_squash_input_denormal(a, fpst); + b = float64_squash_input_denormal(b, fpst); + + a = float64_chs(a); + if ((float64_is_infinity(a) && float64_is_zero(b)) || + (float64_is_infinity(b) && float64_is_zero(a))) { + return float64_one_point_five; + } + return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst); +} + +/* Pairwise long add: add pairs of adjacent elements into + * double-width elements in the result (eg _s8 is an 8x8->16 op) + */ +uint64_t HELPER(neon_addlp_s8)(uint64_t a) +{ + uint64_t nsignmask = 0x0080008000800080ULL; + uint64_t wsignmask = 0x8000800080008000ULL; + uint64_t elementmask = 0x00ff00ff00ff00ffULL; + uint64_t tmp1, tmp2; + uint64_t res, signres; + + /* Extract odd elements, sign extend each to a 16 bit field */ + tmp1 = a & elementmask; + tmp1 ^= nsignmask; + tmp1 |= wsignmask; + tmp1 = (tmp1 - nsignmask) ^ wsignmask; + /* Ditto for the even elements */ + tmp2 = (a >> 8) & elementmask; + tmp2 ^= nsignmask; + tmp2 |= wsignmask; + tmp2 = (tmp2 - nsignmask) ^ wsignmask; + + /* calculate the result by summing bits 0..14, 16..22, etc, + * and then adjusting the sign bits 15, 23, etc manually. + * This ensures the addition can't overflow the 16 bit field. 
+ */ + signres = (tmp1 ^ tmp2) & wsignmask; + res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask); + res ^= signres; + + return res; +} + +uint64_t HELPER(neon_addlp_u8)(uint64_t a) +{ + uint64_t tmp; + + tmp = a & 0x00ff00ff00ff00ffULL; + tmp += (a >> 8) & 0x00ff00ff00ff00ffULL; + return tmp; +} + +uint64_t HELPER(neon_addlp_s16)(uint64_t a) +{ + int32_t reslo, reshi; + + reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16); + reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48); + + return (uint32_t)reslo | (((uint64_t)reshi) << 32); +} + +uint64_t HELPER(neon_addlp_u16)(uint64_t a) +{ + uint64_t tmp; + + tmp = a & 0x0000ffff0000ffffULL; + tmp += (a >> 16) & 0x0000ffff0000ffffULL; + return tmp; +} + +/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */ +uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp) +{ + float_status *fpst = fpstp; + uint16_t val16, sbit; + int16_t exp; + + if (float16_is_any_nan(a)) { + float16 nan = a; + if (float16_is_signaling_nan(a, fpst)) { + float_raise(float_flag_invalid, fpst); + if (!fpst->default_nan_mode) { + nan = float16_silence_nan(a, fpst); + } + } + if (fpst->default_nan_mode) { + nan = float16_default_nan(fpst); + } + return nan; + } + + a = float16_squash_input_denormal(a, fpst); + + val16 = float16_val(a); + sbit = 0x8000 & val16; + exp = extract32(val16, 10, 5); + + if (exp == 0) { + return make_float16(deposit32(sbit, 10, 5, 0x1e)); + } else { + return make_float16(deposit32(sbit, 10, 5, ~exp)); + } +} + +float32 HELPER(frecpx_f32)(float32 a, void *fpstp) +{ + float_status *fpst = fpstp; + uint32_t val32, sbit; + int32_t exp; + + if (float32_is_any_nan(a)) { + float32 nan = a; + if (float32_is_signaling_nan(a, fpst)) { + float_raise(float_flag_invalid, fpst); + if (!fpst->default_nan_mode) { + nan = float32_silence_nan(a, fpst); + } + } + if (fpst->default_nan_mode) { + nan = float32_default_nan(fpst); + } + return nan; + } + + a = float32_squash_input_denormal(a, fpst); + + val32 = float32_val(a); + sbit = 0x80000000ULL & val32; + exp = extract32(val32, 23, 8); + + if (exp == 0) { + return make_float32(sbit | (0xfe << 23)); + } else { + return make_float32(sbit | (~exp & 0xff) << 23); + } +} + +float64 HELPER(frecpx_f64)(float64 a, void *fpstp) +{ + float_status *fpst = fpstp; + uint64_t val64, sbit; + int64_t exp; + + if (float64_is_any_nan(a)) { + float64 nan = a; + if (float64_is_signaling_nan(a, fpst)) { + float_raise(float_flag_invalid, fpst); + if (!fpst->default_nan_mode) { + nan = float64_silence_nan(a, fpst); + } + } + if (fpst->default_nan_mode) { + nan = float64_default_nan(fpst); + } + return nan; + } + + a = float64_squash_input_denormal(a, fpst); + + val64 = float64_val(a); + sbit = 0x8000000000000000ULL & val64; + exp = extract64(float64_val(a), 52, 11); + + if (exp == 0) { + return make_float64(sbit | (0x7feULL << 52)); + } else { + return make_float64(sbit | (~exp & 0x7ffULL) << 52); + } +} + +float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env) +{ + /* Von Neumann rounding is implemented by using round-to-zero + * and then setting the LSB of the result if Inexact was raised. 
+ */ + float32 r; + float_status *fpst = &env->vfp.fp_status; + float_status tstat = *fpst; + int exflags; + + set_float_rounding_mode(float_round_to_zero, &tstat); + set_float_exception_flags(0, &tstat); + r = float64_to_float32(a, &tstat); + exflags = get_float_exception_flags(&tstat); + if (exflags & float_flag_inexact) { + r = make_float32(float32_val(r) | 1); + } + exflags |= get_float_exception_flags(fpst); + set_float_exception_flags(exflags, fpst); + return r; +} + +/* 64-bit versions of the CRC helpers. Note that although the operation + * (and the prototypes of crc32c() and crc32() mean that only the bottom + * 32 bits of the accumulator and result are used, we pass and return + * uint64_t for convenience of the generated code. Unlike the 32-bit + * instruction set versions, val may genuinely have 64 bits of data in it. + * The upper bytes of val (above the number specified by 'bytes') must have + * been zeroed out by the caller. + */ +uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes) +{ + uint8_t buf[8]; + + stq_le_p(buf, val); + + /* zlib crc32 converts the accumulator and output to one's complement. */ + return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff; +} + +uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes) +{ + uint8_t buf[8]; + + stq_le_p(buf, val); + + /* Linux crc32c converts the output to one's complement. */ + return crc32c(acc, buf, bytes) ^ 0xffffffff; +} + +/* + * AdvSIMD half-precision + */ + +#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix)) + +#define ADVSIMD_HALFOP(name) \ +uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \ +{ \ + float_status *fpst = fpstp; \ + return float16_ ## name(a, b, fpst); \ +} + +ADVSIMD_HALFOP(add) +ADVSIMD_HALFOP(sub) +ADVSIMD_HALFOP(mul) +ADVSIMD_HALFOP(div) +ADVSIMD_HALFOP(min) +ADVSIMD_HALFOP(max) +ADVSIMD_HALFOP(minnum) +ADVSIMD_HALFOP(maxnum) + +#define ADVSIMD_TWOHALFOP(name) \ +uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \ +{ \ + float16 a1, a2, b1, b2; \ + uint32_t r1, r2; \ + float_status *fpst = fpstp; \ + a1 = extract32(two_a, 0, 16); \ + a2 = extract32(two_a, 16, 16); \ + b1 = extract32(two_b, 0, 16); \ + b2 = extract32(two_b, 16, 16); \ + r1 = float16_ ## name(a1, b1, fpst); \ + r2 = float16_ ## name(a2, b2, fpst); \ + return deposit32(r1, 16, 16, r2); \ +} + +ADVSIMD_TWOHALFOP(add) +ADVSIMD_TWOHALFOP(sub) +ADVSIMD_TWOHALFOP(mul) +ADVSIMD_TWOHALFOP(div) +ADVSIMD_TWOHALFOP(min) +ADVSIMD_TWOHALFOP(max) +ADVSIMD_TWOHALFOP(minnum) +ADVSIMD_TWOHALFOP(maxnum) + +/* Data processing - scalar floating-point and advanced SIMD */ +static float16 float16_mulx(float16 a, float16 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float16_squash_input_denormal(a, fpst); + b = float16_squash_input_denormal(b, fpst); + + if ((float16_is_zero(a) && float16_is_infinity(b)) || + (float16_is_infinity(a) && float16_is_zero(b))) { + /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ + return make_float16((1U << 14) | + ((float16_val(a) ^ float16_val(b)) & (1U << 15))); + } + return float16_mul(a, b, fpst); +} + +ADVSIMD_HALFOP(mulx) +ADVSIMD_TWOHALFOP(mulx) + +/* fused multiply-accumulate */ +uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c, + void *fpstp) +{ + float_status *fpst = fpstp; + return float16_muladd(a, b, c, 0, fpst); +} + +uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b, + uint32_t two_c, void *fpstp) +{ + float_status *fpst = fpstp; + float16 a1, a2, 
b1, b2, c1, c2; + uint32_t r1, r2; + a1 = extract32(two_a, 0, 16); + a2 = extract32(two_a, 16, 16); + b1 = extract32(two_b, 0, 16); + b2 = extract32(two_b, 16, 16); + c1 = extract32(two_c, 0, 16); + c2 = extract32(two_c, 16, 16); + r1 = float16_muladd(a1, b1, c1, 0, fpst); + r2 = float16_muladd(a2, b2, c2, 0, fpst); + return deposit32(r1, 16, 16, r2); +} + +/* + * Floating point comparisons produce an integer result. Softfloat + * routines return float_relation types which we convert to the 0/-1 + * Neon requires. + */ + +#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0 + +uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + int compare = float16_compare_quiet(a, b, fpst); + return ADVSIMD_CMPRES(compare == float_relation_equal); +} + +uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + int compare = float16_compare(a, b, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater || + compare == float_relation_equal); +} + +uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + int compare = float16_compare(a, b, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater); +} + +uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float16 f0 = float16_abs(a); + float16 f1 = float16_abs(b); + int compare = float16_compare(f0, f1, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater || + compare == float_relation_equal); +} + +uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float16 f0 = float16_abs(a); + float16 f1 = float16_abs(b); + int compare = float16_compare(f0, f1, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater); +} + +/* round to integral */ +uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status) +{ + return float16_round_to_int(x, fp_status); +} + +uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status) +{ + int old_flags = get_float_exception_flags(fp_status), new_flags; + float16 ret; + + ret = float16_round_to_int(x, fp_status); + + /* Suppress any inexact exceptions the conversion produced */ + if (!(old_flags & float_flag_inexact)) { + new_flags = get_float_exception_flags(fp_status); + set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status); + } + + return ret; +} + +/* + * Half-precision floating point conversion functions + * + * There are a multitude of conversion functions with various + * different rounding modes. This is dealt with by the calling code + * setting the mode appropriately before calling the helper. 
+ */ + +uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp) +{ + float_status *fpst = fpstp; + + /* Invalid if we are passed a NaN */ + if (float16_is_any_nan(a)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_int16(a, fpst); +} + +uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp) +{ + float_status *fpst = fpstp; + + /* Invalid if we are passed a NaN */ + if (float16_is_any_nan(a)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_uint16(a, fpst); +} + +static int el_from_spsr(uint32_t spsr) +{ + /* Return the exception level that this SPSR is requesting a return to, + * or -1 if it is invalid (an illegal return) + */ + if (spsr & PSTATE_nRW) { + switch (spsr & CPSR_M) { + case ARM_CPU_MODE_USR: + return 0; + case ARM_CPU_MODE_HYP: + return 2; + case ARM_CPU_MODE_FIQ: + case ARM_CPU_MODE_IRQ: + case ARM_CPU_MODE_SVC: + case ARM_CPU_MODE_ABT: + case ARM_CPU_MODE_UND: + case ARM_CPU_MODE_SYS: + return 1; + case ARM_CPU_MODE_MON: + /* Returning to Mon from AArch64 is never possible, + * so this is an illegal return. + */ + default: + return -1; + } + } else { + if (extract32(spsr, 1, 1)) { + /* Return with reserved M[1] bit set */ + return -1; + } + if (extract32(spsr, 0, 4) == 1) { + /* return to EL0 with M[0] bit set */ + return -1; + } + return extract32(spsr, 2, 2); + } +} + +static void cpsr_write_from_spsr_elx(CPUARMState *env, + uint32_t val) +{ + uint32_t mask; + + /* Save SPSR_ELx.SS into PSTATE. */ + env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); + val &= ~PSTATE_SS; + + /* Move DIT to the correct location for CPSR */ + if (val & PSTATE_DIT) { + val &= ~PSTATE_DIT; + val |= CPSR_DIT; + } + + mask = aarch32_cpsr_valid_mask(env->features, \ + &env_archcpu(env)->isar); + cpsr_write(env, val, mask, CPSRWriteRaw); +} + +void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) +{ + int cur_el = arm_current_el(env); + unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el); + uint32_t spsr = env->banked_spsr[spsr_idx]; + int new_el; + bool return_to_aa64 = (spsr & PSTATE_nRW) == 0; + + aarch64_save_sp(env, cur_el); + + arm_clear_exclusive(env); + + /* We must squash the PSTATE.SS bit to zero unless both of the + * following hold: + * 1. debug exceptions are currently disabled + * 2. singlestep will be active in the EL we return to + * We check 1 here and 2 after we've done the pstate/cpsr write() to + * transition to the EL we're going to. + */ + if (arm_generate_debug_exceptions(env)) { + spsr &= ~PSTATE_SS; + } + + new_el = el_from_spsr(spsr); + if (new_el == -1) { + goto illegal_return; + } + if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) { + /* Disallow return to an EL which is unimplemented or higher + * than the current one. + */ + goto illegal_return; + } + + if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) { + /* Return to an EL which is configured for a different register width */ + goto illegal_return; + } + + if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { + goto illegal_return; + } + + qemu_mutex_lock_iothread(); + arm_call_pre_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); + + if (!return_to_aa64) { + env->aarch64 = false; + /* We do a raw CPSR write because aarch64_sync_64_to_32() + * will sort the register banks out for us, and we've already + * caught all the bad-mode cases in el_from_spsr(). 
+ */ + cpsr_write_from_spsr_elx(env, spsr); + if (!arm_singlestep_active(env)) { + env->pstate &= ~PSTATE_SS; + } + aarch64_sync_64_to_32(env); + + if (spsr & CPSR_T) { + env->regs[15] = new_pc & ~0x1; + } else { + env->regs[15] = new_pc & ~0x3; + } + helper_rebuild_hflags_a32(env, new_el); + qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " + "AArch32 EL%d PC 0x%" PRIx32 "\n", + cur_el, new_el, env->regs[15]); + } else { + int tbii; + + env->aarch64 = true; + spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar); + pstate_write(env, spsr); + if (!arm_singlestep_active(env)) { + env->pstate &= ~PSTATE_SS; + } + aarch64_restore_sp(env, new_el); + helper_rebuild_hflags_a64(env, new_el); + + /* + * Apply TBI to the exception return address. We had to delay this + * until after we selected the new EL, so that we could select the + * correct TBI+TBID bits. This is made easier by waiting until after + * the hflags rebuild, since we can pull the composite TBII field + * from there. + */ + tbii = EX_TBFLAG_A64(env->hflags, TBII); + if ((tbii >> extract64(new_pc, 55, 1)) & 1) { + /* TBI is enabled. */ + int core_mmu_idx = cpu_mmu_index(env, false); + if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) { + new_pc = sextract64(new_pc, 0, 56); + } else { + new_pc = extract64(new_pc, 0, 56); + } + } + env->pc = new_pc; + + qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " + "AArch64 EL%d PC 0x%" PRIx64 "\n", + cur_el, new_el, env->pc); + } + + /* + * Note that cur_el can never be 0. If new_el is 0, then + * el0_a64 is return_to_aa64, else el0_a64 is ignored. + */ + aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64); + + qemu_mutex_lock_iothread(); + arm_call_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); + + return; + +illegal_return: + /* Illegal return events of various kinds have architecturally + * mandated behaviour: + * restore NZCV and DAIF from SPSR_ELx + * set PSTATE.IL + * restore PC from ELR_ELx + * no change to exception level, execution state or stack pointer + */ + env->pstate |= PSTATE_IL; + env->pc = new_pc; + spsr &= PSTATE_NZCV | PSTATE_DAIF; + spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF); + pstate_write(env, spsr); + if (!arm_singlestep_active(env)) { + env->pstate &= ~PSTATE_SS; + } + helper_rebuild_hflags_a64(env, cur_el); + qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: " + "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc); +} + +/* + * Square Root and Reciprocal square root + */ + +uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp) +{ + float_status *s = fpstp; + + return float16_sqrt(a, s); +} + +void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) +{ + /* + * Implement DC ZVA, which zeroes a fixed-length block of memory. + * Note that we do not implement the (architecturally mandated) + * alignment fault for attempts to use this on Device memory + * (which matches the usual QEMU behaviour of not implementing either + * alignment faults or any memory attribute handling). + */ + int blocklen = 4 << env_archcpu(env)->dcz_blocksize; + uint64_t vaddr = vaddr_in & ~(blocklen - 1); + int mmu_idx = cpu_mmu_index(env, false); + void *mem; + + /* + * Trapless lookup. In addition to actual invalid page, may + * return NULL for I/O, watchpoints, clean pages, etc. + */ + mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx); + +#ifndef CONFIG_USER_ONLY + if (unlikely(!mem)) { + uintptr_t ra = GETPC(); + + /* + * Trap if accessing an invalid page. 
DC_ZVA requires that we supply + * the original pointer for an invalid page. But watchpoints require + * that we probe the actual space. So do both. + */ + (void) probe_write(env, vaddr_in, 1, mmu_idx, ra); + mem = probe_write(env, vaddr, blocklen, mmu_idx, ra); + + if (unlikely(!mem)) { + /* + * The only remaining reason for mem == NULL is I/O. + * Just do a series of byte writes as the architecture demands. + */ + for (int i = 0; i < blocklen; i++) { + cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra); + } + return; + } + } +#endif + + memset(mem, 0, blocklen); +} diff --git a/target/arm/tcg/iwmmxt_helper.c b/target/arm/tcg/iwmmxt_helper.c new file mode 100644 index 0000000..610b1b2 --- /dev/null +++ b/target/arm/tcg/iwmmxt_helper.c @@ -0,0 +1,670 @@ +/* + * iwMMXt micro operations for XScale. + * + * Copyright (c) 2007 OpenedHand, Ltd. + * Written by Andrzej Zaborowski + * Copyright (c) 2008 CodeSourcery + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/helper-proto.h" + +/* iwMMXt macros extracted from GNU gdb. */ + +/* Set the SIMD wCASF flags for 8, 16, 32 or 64-bit operations. */ +#define SIMD8_SET(v, n, b) ((v != 0) << ((((b) + 1) * 4) + (n))) +#define SIMD16_SET(v, n, h) ((v != 0) << ((((h) + 1) * 8) + (n))) +#define SIMD32_SET(v, n, w) ((v != 0) << ((((w) + 1) * 16) + (n))) +#define SIMD64_SET(v, n) ((v != 0) << (32 + (n))) +/* Flags to pass as "n" above. */ +#define SIMD_NBIT -1 +#define SIMD_ZBIT -2 +#define SIMD_CBIT -3 +#define SIMD_VBIT -4 +/* Various status bit macros. */ +#define NBIT8(x) ((x) & 0x80) +#define NBIT16(x) ((x) & 0x8000) +#define NBIT32(x) ((x) & 0x80000000) +#define NBIT64(x) ((x) & 0x8000000000000000ULL) +#define ZBIT8(x) (((x) & 0xff) == 0) +#define ZBIT16(x) (((x) & 0xffff) == 0) +#define ZBIT32(x) (((x) & 0xffffffff) == 0) +#define ZBIT64(x) (x == 0) +/* Sign extension macros. */ +#define EXTEND8H(a) ((uint16_t) (int8_t) (a)) +#define EXTEND8(a) ((uint32_t) (int8_t) (a)) +#define EXTEND16(a) ((uint32_t) (int16_t) (a)) +#define EXTEND16S(a) ((int32_t) (int16_t) (a)) +#define EXTEND32(a) ((uint64_t) (int32_t) (a)) + +uint64_t HELPER(iwmmxt_maddsq)(uint64_t a, uint64_t b) +{ + a = (( + EXTEND16S((a >> 0) & 0xffff) * EXTEND16S((b >> 0) & 0xffff) + + EXTEND16S((a >> 16) & 0xffff) * EXTEND16S((b >> 16) & 0xffff) + ) & 0xffffffff) | ((uint64_t) ( + EXTEND16S((a >> 32) & 0xffff) * EXTEND16S((b >> 32) & 0xffff) + + EXTEND16S((a >> 48) & 0xffff) * EXTEND16S((b >> 48) & 0xffff) + ) << 32); + return a; +} + +uint64_t HELPER(iwmmxt_madduq)(uint64_t a, uint64_t b) +{ + a = (( + ((a >> 0) & 0xffff) * ((b >> 0) & 0xffff) + + ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff) + ) & 0xffffffff) | (( + ((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) + + ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff) + ) << 32); + return a; +} + +uint64_t HELPER(iwmmxt_sadb)(uint64_t a, uint64_t b) +{ +#define abs(x) (((x) >= 0) ? 
x : -x) +#define SADB(SHR) abs((int) ((a >> SHR) & 0xff) - (int) ((b >> SHR) & 0xff)) + return + SADB(0) + SADB(8) + SADB(16) + SADB(24) + + SADB(32) + SADB(40) + SADB(48) + SADB(56); +#undef SADB +} + +uint64_t HELPER(iwmmxt_sadw)(uint64_t a, uint64_t b) +{ +#define SADW(SHR) \ + abs((int) ((a >> SHR) & 0xffff) - (int) ((b >> SHR) & 0xffff)) + return SADW(0) + SADW(16) + SADW(32) + SADW(48); +#undef SADW +} + +uint64_t HELPER(iwmmxt_mulslw)(uint64_t a, uint64_t b) +{ +#define MULS(SHR) ((uint64_t) ((( \ + EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ + ) >> 0) & 0xffff) << SHR) + return MULS(0) | MULS(16) | MULS(32) | MULS(48); +#undef MULS +} + +uint64_t HELPER(iwmmxt_mulshw)(uint64_t a, uint64_t b) +{ +#define MULS(SHR) ((uint64_t) ((( \ + EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ + ) >> 16) & 0xffff) << SHR) + return MULS(0) | MULS(16) | MULS(32) | MULS(48); +#undef MULS +} + +uint64_t HELPER(iwmmxt_mululw)(uint64_t a, uint64_t b) +{ +#define MULU(SHR) ((uint64_t) ((( \ + ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ + ) >> 0) & 0xffff) << SHR) + return MULU(0) | MULU(16) | MULU(32) | MULU(48); +#undef MULU +} + +uint64_t HELPER(iwmmxt_muluhw)(uint64_t a, uint64_t b) +{ +#define MULU(SHR) ((uint64_t) ((( \ + ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ + ) >> 16) & 0xffff) << SHR) + return MULU(0) | MULU(16) | MULU(32) | MULU(48); +#undef MULU +} + +uint64_t HELPER(iwmmxt_macsw)(uint64_t a, uint64_t b) +{ +#define MACS(SHR) ( \ + EXTEND16((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff)) + return (int64_t) (MACS(0) + MACS(16) + MACS(32) + MACS(48)); +#undef MACS +} + +uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b) +{ +#define MACU(SHR) ( \ + (uint32_t) ((a >> SHR) & 0xffff) * \ + (uint32_t) ((b >> SHR) & 0xffff)) + return MACU(0) + MACU(16) + MACU(32) + MACU(48); +#undef MACU +} + +#define NZBIT8(x, i) \ + SIMD8_SET(NBIT8((x) & 0xff), SIMD_NBIT, i) | \ + SIMD8_SET(ZBIT8((x) & 0xff), SIMD_ZBIT, i) +#define NZBIT16(x, i) \ + SIMD16_SET(NBIT16((x) & 0xffff), SIMD_NBIT, i) | \ + SIMD16_SET(ZBIT16((x) & 0xffff), SIMD_ZBIT, i) +#define NZBIT32(x, i) \ + SIMD32_SET(NBIT32((x) & 0xffffffff), SIMD_NBIT, i) | \ + SIMD32_SET(ZBIT32((x) & 0xffffffff), SIMD_ZBIT, i) +#define NZBIT64(x) \ + SIMD64_SET(NBIT64(x), SIMD_NBIT) | \ + SIMD64_SET(ZBIT64(x), SIMD_ZBIT) +#define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3) \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) | \ + (((a >> SH1) & 0xff) << 16) | (((b >> SH1) & 0xff) << 24) | \ + (((a >> SH2) & 0xff) << 32) | (((b >> SH2) & 0xff) << 40) | \ + (((a >> SH3) & 0xff) << 48) | (((b >> SH3) & 0xff) << 56); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + (((a >> SH0) & 0xffff) << 0) | \ + (((b >> SH0) & 0xffff) << 16) | \ + (((a >> SH2) & 0xffff) << 32) | \ + (((b >> SH2) & 0xffff) << 48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT8(a >> 0, 0) | NZBIT8(a >> 16, 1) | \ + NZBIT8(a >> 32, 2) | NZBIT8(a >> 48, 3); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + (((a >> SH0) & 0xffffffff) 
<< 0) | \ + (((b >> SH0) & 0xffffffff) << 32); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + (((x >> SH0) & 0xff) << 0) | \ + (((x >> SH1) & 0xff) << 16) | \ + (((x >> SH2) & 0xff) << 32) | \ + (((x >> SH3) & 0xff) << 48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + (((x >> SH0) & 0xffff) << 0) | \ + (((x >> SH2) & 0xffff) << 32); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = (((x >> SH0) & 0xffffffff) << 0); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) | \ + ((uint64_t) EXTEND8H((x >> SH1) & 0xff) << 16) | \ + ((uint64_t) EXTEND8H((x >> SH2) & 0xff) << 32) | \ + ((uint64_t) EXTEND8H((x >> SH3) & 0xff) << 48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) | \ + ((uint64_t) EXTEND16((x >> SH2) & 0xffff) << 32); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = EXTEND32((x >> SH0) & 0xffffffff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ + return x; \ +} +IWMMXT_OP_UNPACK(l, 0, 8, 16, 24) +IWMMXT_OP_UNPACK(h, 32, 40, 48, 56) + +#define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O) \ +uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) | \ + CMP(16, Tb, O, 0xff) | CMP(24, Tb, O, 0xff) | \ + CMP(32, Tb, O, 0xff) | CMP(40, Tb, O, 0xff) | \ + CMP(48, Tb, O, 0xff) | CMP(56, Tb, O, 0xff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) | \ + CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \ + NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = CMP(0, Tl, O, 0xffffffff) | \ + CMP(32, Tl, O, 0xffffffff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ + return a; \ +} +#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ + (TYPE) ((b >> SHR) & MASK)) ? 
(uint64_t) MASK : 0) << SHR) +IWMMXT_OP_CMP(cmpeq, uint8_t, uint16_t, uint32_t, ==) +IWMMXT_OP_CMP(cmpgts, int8_t, int16_t, int32_t, >) +IWMMXT_OP_CMP(cmpgtu, uint8_t, uint16_t, uint32_t, >) +#undef CMP +#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ + (TYPE) ((b >> SHR) & MASK)) ? a : b) & ((uint64_t) MASK << SHR)) +IWMMXT_OP_CMP(mins, int8_t, int16_t, int32_t, <) +IWMMXT_OP_CMP(minu, uint8_t, uint16_t, uint32_t, <) +IWMMXT_OP_CMP(maxs, int8_t, int16_t, int32_t, >) +IWMMXT_OP_CMP(maxu, uint8_t, uint16_t, uint32_t, >) +#undef CMP +#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ + OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) +IWMMXT_OP_CMP(subn, uint8_t, uint16_t, uint32_t, -) +IWMMXT_OP_CMP(addn, uint8_t, uint16_t, uint32_t, +) +#undef CMP +/* TODO Signed- and Unsigned-Saturation */ +#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ + OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) +IWMMXT_OP_CMP(subu, uint8_t, uint16_t, uint32_t, -) +IWMMXT_OP_CMP(addu, uint8_t, uint16_t, uint32_t, +) +IWMMXT_OP_CMP(subs, int8_t, int16_t, int32_t, -) +IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +) +#undef CMP +#undef IWMMXT_OP_CMP + +#define AVGB(SHR) ((( \ + ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR) +#define IWMMXT_OP_AVGB(r) \ +uint64_t HELPER(iwmmxt_avgb##r)(CPUARMState *env, uint64_t a, uint64_t b) \ +{ \ + const int round = r; \ + a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) | \ + AVGB(32) | AVGB(40) | AVGB(48) | AVGB(56); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + SIMD8_SET(ZBIT8((a >> 0) & 0xff), SIMD_ZBIT, 0) | \ + SIMD8_SET(ZBIT8((a >> 8) & 0xff), SIMD_ZBIT, 1) | \ + SIMD8_SET(ZBIT8((a >> 16) & 0xff), SIMD_ZBIT, 2) | \ + SIMD8_SET(ZBIT8((a >> 24) & 0xff), SIMD_ZBIT, 3) | \ + SIMD8_SET(ZBIT8((a >> 32) & 0xff), SIMD_ZBIT, 4) | \ + SIMD8_SET(ZBIT8((a >> 40) & 0xff), SIMD_ZBIT, 5) | \ + SIMD8_SET(ZBIT8((a >> 48) & 0xff), SIMD_ZBIT, 6) | \ + SIMD8_SET(ZBIT8((a >> 56) & 0xff), SIMD_ZBIT, 7); \ + return a; \ +} +IWMMXT_OP_AVGB(0) +IWMMXT_OP_AVGB(1) +#undef IWMMXT_OP_AVGB +#undef AVGB + +#define AVGW(SHR) ((( \ + ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR) +#define IWMMXT_OP_AVGW(r) \ +uint64_t HELPER(iwmmxt_avgw##r)(CPUARMState *env, uint64_t a, uint64_t b) \ +{ \ + const int round = r; \ + a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + SIMD16_SET(ZBIT16((a >> 0) & 0xffff), SIMD_ZBIT, 0) | \ + SIMD16_SET(ZBIT16((a >> 16) & 0xffff), SIMD_ZBIT, 1) | \ + SIMD16_SET(ZBIT16((a >> 32) & 0xffff), SIMD_ZBIT, 2) | \ + SIMD16_SET(ZBIT16((a >> 48) & 0xffff), SIMD_ZBIT, 3); \ + return a; \ +} +IWMMXT_OP_AVGW(0) +IWMMXT_OP_AVGW(1) +#undef IWMMXT_OP_AVGW +#undef AVGW + +uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n) +{ + a >>= n << 3; + a |= b << (64 - (n << 3)); + return a; +} + +uint64_t HELPER(iwmmxt_insr)(uint64_t x, uint32_t a, uint32_t b, uint32_t n) +{ + x &= ~((uint64_t) b << n); + x |= (uint64_t) (a & b) << n; + return x; +} + +uint32_t HELPER(iwmmxt_setpsr_nz)(uint64_t x) +{ + return SIMD64_SET((x == 0), SIMD_ZBIT) | + SIMD64_SET((x & (1ULL << 63)), SIMD_NBIT); +} + +uint64_t HELPER(iwmmxt_bcstb)(uint32_t arg) +{ + arg &= 0xff; + return + ((uint64_t) arg << 0 ) | ((uint64_t) arg << 8 ) | + ((uint64_t) arg << 16) | ((uint64_t) arg << 24) | + ((uint64_t) arg << 32) | ((uint64_t) arg << 40) | + ((uint64_t) arg << 48) | ((uint64_t) arg << 56); +} + +uint64_t HELPER(iwmmxt_bcstw)(uint32_t arg) +{ + arg &= 
0xffff; + return + ((uint64_t) arg << 0 ) | ((uint64_t) arg << 16) | + ((uint64_t) arg << 32) | ((uint64_t) arg << 48); +} + +uint64_t HELPER(iwmmxt_bcstl)(uint32_t arg) +{ + return arg | ((uint64_t) arg << 32); +} + +uint64_t HELPER(iwmmxt_addcb)(uint64_t x) +{ + return + ((x >> 0) & 0xff) + ((x >> 8) & 0xff) + + ((x >> 16) & 0xff) + ((x >> 24) & 0xff) + + ((x >> 32) & 0xff) + ((x >> 40) & 0xff) + + ((x >> 48) & 0xff) + ((x >> 56) & 0xff); +} + +uint64_t HELPER(iwmmxt_addcw)(uint64_t x) +{ + return + ((x >> 0) & 0xffff) + ((x >> 16) & 0xffff) + + ((x >> 32) & 0xffff) + ((x >> 48) & 0xffff); +} + +uint64_t HELPER(iwmmxt_addcl)(uint64_t x) +{ + return (x & 0xffffffff) + (x >> 32); +} + +uint32_t HELPER(iwmmxt_msbb)(uint64_t x) +{ + return + ((x >> 7) & 0x01) | ((x >> 14) & 0x02) | + ((x >> 21) & 0x04) | ((x >> 28) & 0x08) | + ((x >> 35) & 0x10) | ((x >> 42) & 0x20) | + ((x >> 49) & 0x40) | ((x >> 56) & 0x80); +} + +uint32_t HELPER(iwmmxt_msbw)(uint64_t x) +{ + return + ((x >> 15) & 0x01) | ((x >> 30) & 0x02) | + ((x >> 45) & 0x04) | ((x >> 52) & 0x08); +} + +uint32_t HELPER(iwmmxt_msbl)(uint64_t x) +{ + return ((x >> 31) & 0x01) | ((x >> 62) & 0x02); +} + +/* FIXME: Split wCASF setting into a separate op to avoid env use. */ +uint64_t HELPER(iwmmxt_srlw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) | + (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) | + (((x & (0xffffll << 32)) >> n) & (0xffffll << 32)) | + (((x & (0xffffll << 48)) >> n) & (0xffffll << 48)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_srll)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((x & (0xffffffffll << 0)) >> n) | + ((x >> n) & (0xffffffffll << 32)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_srlq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x >>= n; + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_sllw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) | + (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) | + (((x & (0xffffll << 32)) << n) & (0xffffll << 32)) | + (((x & (0xffffll << 48)) << n) & (0xffffll << 48)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_slll)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((x << n) & (0xffffffffll << 0)) | + ((x & (0xffffffffll << 32)) << n); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_sllq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x <<= n; + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_sraw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) | + ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) | + ((uint64_t) ((EXTEND16(x >> 32) >> n) & 0xffff) << 32) | + ((uint64_t) ((EXTEND16(x >> 48) >> n) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_sral)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) | + (((EXTEND32(x >> 
32) >> n) & 0xffffffff) << 32); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_sraq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (int64_t) x >> n; + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_rorw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((((x & (0xffffll << 0)) >> n) | + ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) | + ((((x & (0xffffll << 16)) >> n) | + ((x & (0xffffll << 16)) << (16 - n))) & (0xffffll << 16)) | + ((((x & (0xffffll << 32)) >> n) | + ((x & (0xffffll << 32)) << (16 - n))) & (0xffffll << 32)) | + ((((x & (0xffffll << 48)) >> n) | + ((x & (0xffffll << 48)) << (16 - n))) & (0xffffll << 48)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_rorl)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((x & (0xffffffffll << 0)) >> n) | + ((x >> n) & (0xffffffffll << 32)) | + ((x << (32 - n)) & (0xffffffffll << 0)) | + ((x & (0xffffffffll << 32)) << (32 - n)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_rorq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ror64(x, n); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_shufh)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) | + (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) | + (((x >> ((n << 0) & 0x30)) & 0xffff) << 32) | + (((x >> ((n >> 2) & 0x30)) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +/* TODO: Unsigned-Saturation */ +uint64_t HELPER(iwmmxt_packuw)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | + (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | + (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | + (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); + return a; +} + +uint64_t HELPER(iwmmxt_packul)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | + (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | + NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); + return a; +} + +uint64_t HELPER(iwmmxt_packuq)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); + return a; +} + +/* TODO: Signed-Saturation */ +uint64_t HELPER(iwmmxt_packsw)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | + (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | + (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | + (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 
7); + return a; +} + +uint64_t HELPER(iwmmxt_packsl)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | + (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | + NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); + return a; +} + +uint64_t HELPER(iwmmxt_packsq)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); + return a; +} + +uint64_t HELPER(iwmmxt_muladdsl)(uint64_t c, uint32_t a, uint32_t b) +{ + return c + ((int32_t) EXTEND32(a) * (int32_t) EXTEND32(b)); +} + +uint64_t HELPER(iwmmxt_muladdsw)(uint64_t c, uint32_t a, uint32_t b) +{ + c += EXTEND32(EXTEND16S((a >> 0) & 0xffff) * + EXTEND16S((b >> 0) & 0xffff)); + c += EXTEND32(EXTEND16S((a >> 16) & 0xffff) * + EXTEND16S((b >> 16) & 0xffff)); + return c; +} + +uint64_t HELPER(iwmmxt_muladdswl)(uint64_t c, uint32_t a, uint32_t b) +{ + return c + (EXTEND32(EXTEND16S(a & 0xffff) * + EXTEND16S(b & 0xffff))); +} diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c new file mode 100644 index 0000000..f94e87e --- /dev/null +++ b/target/arm/tcg/m_helper.c @@ -0,0 +1,2902 @@ +/* + * ARM generic helpers. + * + * This code is licensed under the GNU GPL v2 or later. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/helper-proto.h" +#include "qemu/main-loop.h" +#include "qemu/bitops.h" +#include "qemu/log.h" +#include "exec/exec-all.h" +#ifdef CONFIG_TCG +#include "exec/cpu_ldst.h" +#include "semihosting/common-semi.h" +#endif +#if !defined(CONFIG_USER_ONLY) +#include "hw/intc/armv7m_nvic.h" +#endif + +static void v7m_msr_xpsr(CPUARMState *env, uint32_t mask, + uint32_t reg, uint32_t val) +{ + /* Only APSR is actually writable */ + if (!(reg & 4)) { + uint32_t apsrmask = 0; + + if (mask & 8) { + apsrmask |= XPSR_NZCV | XPSR_Q; + } + if ((mask & 4) && arm_feature(env, ARM_FEATURE_THUMB_DSP)) { + apsrmask |= XPSR_GE; + } + xpsr_write(env, val, apsrmask); + } +} + +static uint32_t v7m_mrs_xpsr(CPUARMState *env, uint32_t reg, unsigned el) +{ + uint32_t mask = 0; + + if ((reg & 1) && el) { + mask |= XPSR_EXCP; /* IPSR (unpriv. reads as zero) */ + } + if (!(reg & 4)) { + mask |= XPSR_NZCV | XPSR_Q; /* APSR */ + if (arm_feature(env, ARM_FEATURE_THUMB_DSP)) { + mask |= XPSR_GE; + } + } + /* EPSR reads as zero */ + return xpsr_read(env) & mask; +} + +static uint32_t v7m_mrs_control(CPUARMState *env, uint32_t secure) +{ + uint32_t value = env->v7m.control[secure]; + + if (!secure) { + /* SFPA is RAZ/WI from NS; FPCA is stored in the M_REG_S bank */ + value |= env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK; + } + return value; +} + +#ifdef CONFIG_USER_ONLY + +void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) +{ + uint32_t mask = extract32(maskreg, 8, 4); + uint32_t reg = extract32(maskreg, 0, 8); + + switch (reg) { + case 0 ... 7: /* xPSR sub-fields */ + v7m_msr_xpsr(env, mask, reg, val); + break; + case 20: /* CONTROL */ + /* There are no sub-fields that are actually writable from EL0. */ + break; + default: + /* Unprivileged writes to other registers are ignored */ + break; + } +} + +uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) +{ + switch (reg) { + case 0 ... 
7: /* xPSR sub-fields */ + return v7m_mrs_xpsr(env, reg, 0); + case 20: /* CONTROL */ + return v7m_mrs_control(env, 0); + default: + /* Unprivileged reads others as zero. */ + return 0; + } +} + +void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) +{ + /* translate.c should never generate calls here in user-only mode */ + g_assert_not_reached(); +} + +void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) +{ + /* translate.c should never generate calls here in user-only mode */ + g_assert_not_reached(); +} + +void HELPER(v7m_preserve_fp_state)(CPUARMState *env) +{ + /* translate.c should never generate calls here in user-only mode */ + g_assert_not_reached(); +} + +void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) +{ + /* translate.c should never generate calls here in user-only mode */ + g_assert_not_reached(); +} + +void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) +{ + /* translate.c should never generate calls here in user-only mode */ + g_assert_not_reached(); +} + +uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) +{ + /* + * The TT instructions can be used by unprivileged code, but in + * user-only emulation we don't have the MPU. + * Luckily since we know we are NonSecure unprivileged (and that in + * turn means that the A flag wasn't specified), all the bits in the + * register must be zero: + * IREGION: 0 because IRVALID is 0 + * IRVALID: 0 because NS + * S: 0 because NS + * NSRW: 0 because NS + * NSR: 0 because NS + * RW: 0 because unpriv and A flag not set + * R: 0 because unpriv and A flag not set + * SRVALID: 0 because NS + * MRVALID: 0 because unpriv and A flag not set + * SREGION: 0 because SRVALID is 0 + * MREGION: 0 because MRVALID is 0 + */ + return 0; +} + +ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) +{ + return ARMMMUIdx_MUser; +} + +#else /* !CONFIG_USER_ONLY */ + +static ARMMMUIdx arm_v7m_mmu_idx_all(CPUARMState *env, + bool secstate, bool priv, bool negpri) +{ + ARMMMUIdx mmu_idx = ARM_MMU_IDX_M; + + if (priv) { + mmu_idx |= ARM_MMU_IDX_M_PRIV; + } + + if (negpri) { + mmu_idx |= ARM_MMU_IDX_M_NEGPRI; + } + + if (secstate) { + mmu_idx |= ARM_MMU_IDX_M_S; + } + + return mmu_idx; +} + +static ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env, + bool secstate, bool priv) +{ + bool negpri = armv7m_nvic_neg_prio_requested(env->nvic, secstate); + + return arm_v7m_mmu_idx_all(env, secstate, priv, negpri); +} + +/* Return the MMU index for a v7M CPU in the specified security state */ +ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) +{ + bool priv = arm_v7m_is_handler_mode(env) || + !(env->v7m.control[secstate] & 1); + + return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv); +} + +/* + * What kind of stack write are we doing? This affects how exceptions + * generated during the stacking are treated. 
+ */ +typedef enum StackingMode { + STACK_NORMAL, + STACK_IGNFAULTS, + STACK_LAZYFP, +} StackingMode; + +static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value, + ARMMMUIdx mmu_idx, StackingMode mode) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult txres; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + bool secure = mmu_idx & ARM_MMU_IDX_M_S; + int exc; + bool exc_secure; + + if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &res, &fi)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + if (mode == STACK_LAZYFP) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.LSPERR " + "during lazy stacking\n"); + env->v7m.sfsr |= R_V7M_SFSR_LSPERR_MASK; + } else { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.AUVIOL " + "during stacking\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; + } + env->v7m.sfsr |= R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + exc = ARMV7M_EXCP_SECURE; + exc_secure = false; + } else { + if (mode == STACK_LAZYFP) { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MLSPERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MLSPERR_MASK; + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MSTKERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK; + } + exc = ARMV7M_EXCP_MEM; + exc_secure = secure; + } + goto pend_fault; + } + address_space_stl_le(arm_addressspace(cs, res.f.attrs), res.f.phys_addr, + value, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to write the data */ + if (mode == STACK_LAZYFP) { + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.LSPERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_LSPERR_MASK; + } else { + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK; + } + exc = ARMV7M_EXCP_BUS; + exc_secure = false; + goto pend_fault; + } + return true; + +pend_fault: + /* + * By pending the exception at this point we are making + * the IMPDEF choice "overridden exceptions pended" (see the + * MergeExcInfo() pseudocode). The other choice would be to not + * pend them now and then make a choice about which to throw away + * later if we have two derived exceptions. + * The only case when we must not pend the exception but instead + * throw it away is if we are doing the push of the callee registers + * and we've already generated a derived exception (this is indicated + * by the caller passing STACK_IGNFAULTS). Even in this case we will + * still update the fault status registers. 
+ */ + switch (mode) { + case STACK_NORMAL: + armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure); + break; + case STACK_LAZYFP: + armv7m_nvic_set_pending_lazyfp(env->nvic, exc, exc_secure); + break; + case STACK_IGNFAULTS: + break; + } + return false; +} + +static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr, + ARMMMUIdx mmu_idx) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult txres; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + bool secure = mmu_idx & ARM_MMU_IDX_M_S; + int exc; + bool exc_secure; + uint32_t value; + + if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.AUVIOL during unstack\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + exc = ARMV7M_EXCP_SECURE; + exc_secure = false; + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MUNSTKERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK; + exc = ARMV7M_EXCP_MEM; + exc_secure = secure; + } + goto pend_fault; + } + + value = address_space_ldl(arm_addressspace(cs, res.f.attrs), + res.f.phys_addr, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to read the data */ + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK; + exc = ARMV7M_EXCP_BUS; + exc_secure = false; + goto pend_fault; + } + + *dest = value; + return true; + +pend_fault: + /* + * By pending the exception at this point we are making + * the IMPDEF choice "overridden exceptions pended" (see the + * MergeExcInfo() pseudocode). The other choice would be to not + * pend them now and then make a choice about which to throw away + * later if we have two derived exceptions. + */ + armv7m_nvic_set_pending(env->nvic, exc, exc_secure); + return false; +} + +void HELPER(v7m_preserve_fp_state)(CPUARMState *env) +{ + /* + * Preserve FP state (because LSPACT was set and we are about + * to execute an FP instruction). This corresponds to the + * PreserveFPState() pseudocode. + * We may throw an exception if the stacking fails. + */ + ARMCPU *cpu = env_archcpu(env); + bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + bool negpri = !(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_HFRDY_MASK); + bool is_priv = !(env->v7m.fpccr[is_secure] & R_V7M_FPCCR_USER_MASK); + bool splimviol = env->v7m.fpccr[is_secure] & R_V7M_FPCCR_SPLIMVIOL_MASK; + uint32_t fpcar = env->v7m.fpcar[is_secure]; + bool stacked_ok = true; + bool ts = is_secure && (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); + bool take_exception; + + /* Take the iothread lock as we are going to touch the NVIC */ + qemu_mutex_lock_iothread(); + + /* Check the background context had access to the FPU */ + if (!v7m_cpacr_pass(env, is_secure, is_priv)) { + armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, is_secure); + env->v7m.cfsr[is_secure] |= R_V7M_CFSR_NOCP_MASK; + stacked_ok = false; + } else if (!is_secure && !extract32(env->v7m.nsacr, 10, 1)) { + armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; + stacked_ok = false; + } + + if (!splimviol && stacked_ok) { + /* We only stack if the stack limit wasn't violated */ + int i; + ARMMMUIdx mmu_idx; + + mmu_idx = arm_v7m_mmu_idx_all(env, is_secure, is_priv, negpri); + for (i = 0; i < (ts ? 
32 : 16); i += 2) { + uint64_t dn = *aa32_vfp_dreg(env, i / 2); + uint32_t faddr = fpcar + 4 * i; + uint32_t slo = extract64(dn, 0, 32); + uint32_t shi = extract64(dn, 32, 32); + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR/VPR */ + } + stacked_ok = stacked_ok && + v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) && + v7m_stack_write(cpu, faddr + 4, shi, mmu_idx, STACK_LAZYFP); + } + + stacked_ok = stacked_ok && + v7m_stack_write(cpu, fpcar + 0x40, + vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP); + if (cpu_isar_feature(aa32_mve, cpu)) { + stacked_ok = stacked_ok && + v7m_stack_write(cpu, fpcar + 0x44, + env->v7m.vpr, mmu_idx, STACK_LAZYFP); + } + } + + /* + * We definitely pended an exception, but it's possible that it + * might not be able to be taken now. If its priority permits us + * to take it now, then we must not update the LSPACT or FP regs, + * but instead jump out to take the exception immediately. + * If it's just pending and won't be taken until the current + * handler exits, then we do update LSPACT and the FP regs. + */ + take_exception = !stacked_ok && + armv7m_nvic_can_take_pending_exception(env->nvic); + + qemu_mutex_unlock_iothread(); + + if (take_exception) { + raise_exception_ra(env, EXCP_LAZYFP, 0, 1, GETPC()); + } + + env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; + + if (ts) { + /* Clear s0 to s31 and the FPSCR and VPR */ + int i; + + for (i = 0; i < 32; i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + /* + * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them + * unchanged. + */ +} + +/* + * Write to v7M CONTROL.SPSEL bit for the specified security bank. + * This may change the current stack pointer between Main and Process + * stack pointers if it is done for the CONTROL register for the current + * security state. + */ +static void write_v7m_control_spsel_for_secstate(CPUARMState *env, + bool new_spsel, + bool secstate) +{ + bool old_is_psp = v7m_using_psp(env); + + env->v7m.control[secstate] = + deposit32(env->v7m.control[secstate], + R_V7M_CONTROL_SPSEL_SHIFT, + R_V7M_CONTROL_SPSEL_LENGTH, new_spsel); + + if (secstate == env->v7m.secure) { + bool new_is_psp = v7m_using_psp(env); + uint32_t tmp; + + if (old_is_psp != new_is_psp) { + tmp = env->v7m.other_sp; + env->v7m.other_sp = env->regs[13]; + env->regs[13] = tmp; + } + } +} + +/* + * Write to v7M CONTROL.SPSEL bit. This may change the current + * stack pointer between Main and Process stack pointers. + */ +static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel) +{ + write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure); +} + +void write_v7m_exception(CPUARMState *env, uint32_t new_exc) +{ + /* + * Write a new value to v7m.exception, thus transitioning into or out + * of Handler mode; this may result in a change of active stack pointer. 
+ */ + bool new_is_psp, old_is_psp = v7m_using_psp(env); + uint32_t tmp; + + env->v7m.exception = new_exc; + + new_is_psp = v7m_using_psp(env); + + if (old_is_psp != new_is_psp) { + tmp = env->v7m.other_sp; + env->v7m.other_sp = env->regs[13]; + env->regs[13] = tmp; + } +} + +/* Switch M profile security state between NS and S */ +static void switch_v7m_security_state(CPUARMState *env, bool new_secstate) +{ + uint32_t new_ss_msp, new_ss_psp; + + if (env->v7m.secure == new_secstate) { + return; + } + + /* + * All the banked state is accessed by looking at env->v7m.secure + * except for the stack pointer; rearrange the SP appropriately. + */ + new_ss_msp = env->v7m.other_ss_msp; + new_ss_psp = env->v7m.other_ss_psp; + + if (v7m_using_psp(env)) { + env->v7m.other_ss_psp = env->regs[13]; + env->v7m.other_ss_msp = env->v7m.other_sp; + } else { + env->v7m.other_ss_msp = env->regs[13]; + env->v7m.other_ss_psp = env->v7m.other_sp; + } + + env->v7m.secure = new_secstate; + + if (v7m_using_psp(env)) { + env->regs[13] = new_ss_psp; + env->v7m.other_sp = new_ss_msp; + } else { + env->regs[13] = new_ss_msp; + env->v7m.other_sp = new_ss_psp; + } +} + +void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) +{ + /* + * Handle v7M BXNS: + * - if the return value is a magic value, do exception return (like BX) + * - otherwise bit 0 of the return value is the target security state + */ + uint32_t min_magic; + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + /* Covers FNC_RETURN and EXC_RETURN magic */ + min_magic = FNC_RETURN_MIN_MAGIC; + } else { + /* EXC_RETURN magic only */ + min_magic = EXC_RETURN_MIN_MAGIC; + } + + if (dest >= min_magic) { + /* + * This is an exception return magic value; put it where + * do_v7m_exception_exit() expects and raise EXCEPTION_EXIT. + * Note that if we ever add gen_ss_advance() singlestep support to + * M profile this should count as an "instruction execution complete" + * event (compare gen_bx_excret_final_code()). + */ + env->regs[15] = dest & ~1; + env->thumb = dest & 1; + HELPER(exception_internal)(env, EXCP_EXCEPTION_EXIT); + /* notreached */ + } + + /* translate.c should have made BXNS UNDEF unless we're secure */ + assert(env->v7m.secure); + + if (!(dest & 1)) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + } + switch_v7m_security_state(env, dest & 1); + env->thumb = true; + env->regs[15] = dest & ~1; + arm_rebuild_hflags(env); +} + +void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) +{ + /* + * Handle v7M BLXNS: + * - bit 0 of the destination address is the target security state + */ + + /* At this point regs[15] is the address just after the BLXNS */ + uint32_t nextinst = env->regs[15] | 1; + uint32_t sp = env->regs[13] - 8; + uint32_t saved_psr; + + /* translate.c will have made BLXNS UNDEF unless we're secure */ + assert(env->v7m.secure); + + if (dest & 1) { + /* + * Target is Secure, so this is just a normal BLX, + * except that the low bit doesn't indicate Thumb/not. 
+ */ + env->regs[14] = nextinst; + env->thumb = true; + env->regs[15] = dest & ~1; + return; + } + + /* Target is non-secure: first push a stack frame */ + if (!QEMU_IS_ALIGNED(sp, 8)) { + qemu_log_mask(LOG_GUEST_ERROR, + "BLXNS with misaligned SP is UNPREDICTABLE\n"); + } + + if (sp < v7m_sp_limit(env)) { + raise_exception(env, EXCP_STKOF, 0, 1); + } + + saved_psr = env->v7m.exception; + if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK) { + saved_psr |= XPSR_SFPA; + } + + /* Note that these stores can throw exceptions on MPU faults */ + cpu_stl_data_ra(env, sp, nextinst, GETPC()); + cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC()); + + env->regs[13] = sp; + env->regs[14] = 0xfeffffff; + if (arm_v7m_is_handler_mode(env)) { + /* + * Write a dummy value to IPSR, to avoid leaking the current secure + * exception number to non-secure code. This is guaranteed not + * to cause write_v7m_exception() to actually change stacks. + */ + write_v7m_exception(env, 1); + } + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + switch_v7m_security_state(env, 0); + env->thumb = true; + env->regs[15] = dest; + arm_rebuild_hflags(env); +} + +static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode, + bool spsel) +{ + /* + * Return a pointer to the location where we currently store the + * stack pointer for the requested security state and thread mode. + * This pointer will become invalid if the CPU state is updated + * such that the stack pointers are switched around (eg changing + * the SPSEL control bit). + * Compare the v8M ARM ARM pseudocode LookUpSP_with_security_mode(). + * Unlike that pseudocode, we require the caller to pass us in the + * SPSEL control bit value; this is because we also use this + * function in handling of pushing of the callee-saves registers + * part of the v8M stack frame (pseudocode PushCalleeStack()), + * and in the tailchain codepath the SPSEL bit comes from the exception + * return magic LR value from the previous exception. The pseudocode + * opencodes the stack-selection in PushCalleeStack(), but we prefer + * to make this utility function generic enough to do the job. + */ + bool want_psp = threadmode && spsel; + + if (secure == env->v7m.secure) { + if (want_psp == v7m_using_psp(env)) { + return &env->regs[13]; + } else { + return &env->v7m.other_sp; + } + } else { + if (want_psp) { + return &env->v7m.other_ss_psp; + } else { + return &env->v7m.other_ss_msp; + } + } +} + +static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure, + uint32_t *pvec) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult result; + uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4; + uint32_t vector_entry; + MemTxAttrs attrs = {}; + ARMMMUIdx mmu_idx; + bool exc_secure; + + qemu_log_mask(CPU_LOG_INT, + "...loading from element %d of %s vector table at 0x%x\n", + exc, targets_secure ? "secure" : "non-secure", addr); + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true); + + /* + * We don't do a get_phys_addr() here because the rules for vector + * loads are special: they always use the default memory map, and + * the default memory map permits reads from all addresses. + * Since there's no easy way to pass through to pmsav8_mpu_lookup() + * that we want this special case which would always say "yes", + * we just do the SAU lookup here followed by a direct physical load. 
+ */ + attrs.secure = targets_secure; + attrs.user = false; + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + V8M_SAttributes sattrs = {}; + + v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, + targets_secure, &sattrs); + if (sattrs.ns) { + attrs.secure = false; + } else if (!targets_secure) { + /* + * NS access to S memory: the underlying exception which we escalate + * to HardFault is SecureFault, which always targets Secure. + */ + exc_secure = true; + goto load_fail; + } + } + + vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr, + attrs, &result); + if (result != MEMTX_OK) { + /* + * Underlying exception is BusFault: its target security state + * depends on BFHFNMINS. + */ + exc_secure = !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK); + goto load_fail; + } + *pvec = vector_entry; + qemu_log_mask(CPU_LOG_INT, "...loaded new PC 0x%x\n", *pvec); + return true; + +load_fail: + /* + * All vector table fetch fails are reported as HardFault, with + * HFSR.VECTTBL and .FORCED set. (FORCED is set because + * technically the underlying exception is a SecureFault or BusFault + * that is escalated to HardFault.) This is a terminal exception, + * so we will either take the HardFault immediately or else enter + * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()). + * The HardFault is Secure if BFHFNMINS is 0 (meaning that all HFs are + * secure); otherwise it targets the same security state as the + * underlying exception. + * In v8.1M HardFaults from vector table fetch fails don't set FORCED. + */ + if (!(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK)) { + exc_secure = true; + } + env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK; + if (!arm_feature(env, ARM_FEATURE_V8_1M)) { + env->v7m.hfsr |= R_V7M_HFSR_FORCED_MASK; + } + armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure); + return false; +} + +static uint32_t v7m_integrity_sig(CPUARMState *env, uint32_t lr) +{ + /* + * Return the integrity signature value for the callee-saves + * stack frame section. @lr is the exception return payload/LR value + * whose FType bit forms bit 0 of the signature if FP is present. + */ + uint32_t sig = 0xfefa125a; + + if (!cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) + || (lr & R_V7M_EXCRET_FTYPE_MASK)) { + sig |= 1; + } + return sig; +} + +static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain, + bool ignore_faults) +{ + /* + * For v8M, push the callee-saves register part of the stack frame. + * Compare the v8M pseudocode PushCalleeStack(). + * In the tailchaining case this may not be the current stack. + */ + CPUARMState *env = &cpu->env; + uint32_t *frame_sp_p; + uint32_t frameptr; + ARMMMUIdx mmu_idx; + bool stacked_ok; + uint32_t limit; + bool want_psp; + uint32_t sig; + StackingMode smode = ignore_faults ? 
STACK_IGNFAULTS : STACK_NORMAL; + + if (dotailchain) { + bool mode = lr & R_V7M_EXCRET_MODE_MASK; + bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) || + !mode; + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv); + frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode, + lr & R_V7M_EXCRET_SPSEL_MASK); + want_psp = mode && (lr & R_V7M_EXCRET_SPSEL_MASK); + if (want_psp) { + limit = env->v7m.psplim[M_REG_S]; + } else { + limit = env->v7m.msplim[M_REG_S]; + } + } else { + mmu_idx = arm_mmu_idx(env); + frame_sp_p = &env->regs[13]; + limit = v7m_sp_limit(env); + } + + frameptr = *frame_sp_p - 0x28; + if (frameptr < limit) { + /* + * Stack limit failure: set SP to the limit value, and generate + * STKOF UsageFault. Stack pushes below the limit must not be + * performed. It is IMPDEF whether pushes above the limit are + * performed; we choose not to. + */ + qemu_log_mask(CPU_LOG_INT, + "...STKOF during callee-saves register stacking\n"); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + *frame_sp_p = limit; + return true; + } + + /* + * Write as much of the stack frame as we can. A write failure may + * cause us to pend a derived exception. + */ + sig = v7m_integrity_sig(env, lr); + stacked_ok = + v7m_stack_write(cpu, frameptr, sig, mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, smode); + + /* Update SP regardless of whether any of the stack accesses failed. */ + *frame_sp_p = frameptr; + + return !stacked_ok; +} + +static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain, + bool ignore_stackfaults) +{ + /* + * Do the "take the exception" parts of exception entry, + * but not the pushing of state to the stack. This is + * similar to the pseudocode ExceptionTaken() function. + */ + CPUARMState *env = &cpu->env; + uint32_t addr; + bool targets_secure; + int exc; + bool push_failed = false; + + armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure); + qemu_log_mask(CPU_LOG_INT, "...taking pending %s exception %d\n", + targets_secure ? "secure" : "nonsecure", exc); + + if (dotailchain) { + /* Sanitize LR FType and PREFIX bits */ + if (!cpu_isar_feature(aa32_vfp_simd, cpu)) { + lr |= R_V7M_EXCRET_FTYPE_MASK; + } + lr = deposit32(lr, 24, 8, 0xff); + } + + if (arm_feature(env, ARM_FEATURE_V8)) { + if (arm_feature(env, ARM_FEATURE_M_SECURITY) && + (lr & R_V7M_EXCRET_S_MASK)) { + /* + * The background code (the owner of the registers in the + * exception frame) is Secure. This means it may either already + * have or now needs to push callee-saves registers. + */ + if (targets_secure) { + if (dotailchain && !(lr & R_V7M_EXCRET_ES_MASK)) { + /* + * We took an exception from Secure to NonSecure + * (which means the callee-saved registers got stacked) + * and are now tailchaining to a Secure exception. + * Clear DCRS so eventual return from this Secure + * exception unstacks the callee-saved registers. 
+ */ + lr &= ~R_V7M_EXCRET_DCRS_MASK; + } + } else { + /* + * We're going to a non-secure exception; push the + * callee-saves registers to the stack now, if they're + * not already saved. + */ + if (lr & R_V7M_EXCRET_DCRS_MASK && + !(dotailchain && !(lr & R_V7M_EXCRET_ES_MASK))) { + push_failed = v7m_push_callee_stack(cpu, lr, dotailchain, + ignore_stackfaults); + } + lr |= R_V7M_EXCRET_DCRS_MASK; + } + } + + lr &= ~R_V7M_EXCRET_ES_MASK; + if (targets_secure) { + lr |= R_V7M_EXCRET_ES_MASK; + } + lr &= ~R_V7M_EXCRET_SPSEL_MASK; + if (env->v7m.control[targets_secure] & R_V7M_CONTROL_SPSEL_MASK) { + lr |= R_V7M_EXCRET_SPSEL_MASK; + } + + /* + * Clear registers if necessary to prevent non-secure exception + * code being able to see register values from secure code. + * Where register values become architecturally UNKNOWN we leave + * them with their previous values. v8.1M is tighter than v8.0M + * here and always zeroes the caller-saved registers regardless + * of the security state the exception is targeting. + */ + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + if (!targets_secure || arm_feature(env, ARM_FEATURE_V8_1M)) { + /* + * Always clear the caller-saved registers (they have been + * pushed to the stack earlier in v7m_push_stack()). + * Clear callee-saved registers if the background code is + * Secure (in which case these regs were saved in + * v7m_push_callee_stack()). + */ + int i; + /* + * r4..r11 are callee-saves, zero only if background + * state was Secure (EXCRET.S == 1) and exception + * targets Non-secure state + */ + bool zero_callee_saves = !targets_secure && + (lr & R_V7M_EXCRET_S_MASK); + + for (i = 0; i < 13; i++) { + if (i < 4 || i > 11 || zero_callee_saves) { + env->regs[i] = 0; + } + } + /* Clear EAPSR */ + xpsr_write(env, 0, XPSR_NZCV | XPSR_Q | XPSR_GE | XPSR_IT); + } + } + } + + if (push_failed && !ignore_stackfaults) { + /* + * Derived exception on callee-saves register stacking: + * we might now want to take a different exception which + * targets a different security state, so try again from the top. + */ + qemu_log_mask(CPU_LOG_INT, + "...derived exception on callee-saves register stacking"); + v7m_exception_taken(cpu, lr, true, true); + return; + } + + if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) { + /* Vector load failed: derived exception */ + qemu_log_mask(CPU_LOG_INT, "...derived exception on vector table load"); + v7m_exception_taken(cpu, lr, true, true); + return; + } + + /* + * Now we've done everything that might cause a derived exception + * we can go ahead and activate whichever exception we're going to + * take (which might now be the derived exception). + */ + armv7m_nvic_acknowledge_irq(env->nvic); + + /* Switch to target security state -- must do this before writing SPSEL */ + switch_v7m_security_state(env, targets_secure); + write_v7m_control_spsel(env, 0); + arm_clear_exclusive(env); + /* Clear SFPA and FPCA (has no effect if no FPU) */ + env->v7m.control[M_REG_S] &= + ~(R_V7M_CONTROL_FPCA_MASK | R_V7M_CONTROL_SFPA_MASK); + /* Clear IT bits */ + env->condexec_bits = 0; + env->regs[14] = lr; + env->regs[15] = addr & 0xfffffffe; + env->thumb = addr & 1; + arm_rebuild_hflags(env); +} + +static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr, + bool apply_splim) +{ + /* + * Like the pseudocode UpdateFPCCR: save state in FPCAR and FPCCR + * that we will need later in order to do lazy FP reg stacking. 
+ */ + bool is_secure = env->v7m.secure; + NVICState *nvic = env->nvic; + /* + * Some bits are unbanked and live always in fpccr[M_REG_S]; some bits + * are banked and we want to update the bit in the bank for the + * current security state; and in one case we want to specifically + * update the NS banked version of a bit even if we are secure. + */ + uint32_t *fpccr_s = &env->v7m.fpccr[M_REG_S]; + uint32_t *fpccr_ns = &env->v7m.fpccr[M_REG_NS]; + uint32_t *fpccr = &env->v7m.fpccr[is_secure]; + bool hfrdy, bfrdy, mmrdy, ns_ufrdy, s_ufrdy, sfrdy, monrdy; + + env->v7m.fpcar[is_secure] = frameptr & ~0x7; + + if (apply_splim && arm_feature(env, ARM_FEATURE_V8)) { + bool splimviol; + uint32_t splim = v7m_sp_limit(env); + bool ign = armv7m_nvic_neg_prio_requested(nvic, is_secure) && + (env->v7m.ccr[is_secure] & R_V7M_CCR_STKOFHFNMIGN_MASK); + + splimviol = !ign && frameptr < splim; + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, SPLIMVIOL, splimviol); + } + + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, LSPACT, 1); + + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, S, is_secure); + + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, USER, arm_current_el(env) == 0); + + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, THREAD, + !arm_v7m_is_handler_mode(env)); + + hfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_HARD, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, HFRDY, hfrdy); + + bfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_BUS, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, BFRDY, bfrdy); + + mmrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_MEM, is_secure); + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, MMRDY, mmrdy); + + ns_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, false); + *fpccr_ns = FIELD_DP32(*fpccr_ns, V7M_FPCCR, UFRDY, ns_ufrdy); + + monrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_DEBUG, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, MONRDY, monrdy); + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + s_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, true); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, UFRDY, s_ufrdy); + + sfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_SECURE, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, SFRDY, sfrdy); + } +} + +void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) +{ + /* fptr is the value of Rn, the frame pointer we store the FP regs to */ + ARMCPU *cpu = env_archcpu(env); + bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK; + uintptr_t ra = GETPC(); + + assert(env->v7m.secure); + + if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { + return; + } + + /* Check access to the coprocessor is permitted */ + if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { + raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); + } + + if (lspact) { + /* LSPACT should not be active when there is active FP state */ + raise_exception_ra(env, EXCP_LSERR, 0, 1, GETPC()); + } + + if (fptr & 7) { + raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); + } + + /* + * Note that we do not use v7m_stack_write() here, because the + * accesses should not set the FSR bits for stacking errors if they + * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK + * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions + * and longjmp out. + */ + if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { + bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; + int i; + + for (i = 0; i < (ts ? 
32 : 16); i += 2) { + uint64_t dn = *aa32_vfp_dreg(env, i / 2); + uint32_t faddr = fptr + 4 * i; + uint32_t slo = extract64(dn, 0, 32); + uint32_t shi = extract64(dn, 32, 32); + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR */ + } + cpu_stl_data_ra(env, faddr, slo, ra); + cpu_stl_data_ra(env, faddr + 4, shi, ra); + } + cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra); + if (cpu_isar_feature(aa32_mve, cpu)) { + cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra); + } + + /* + * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to + * leave them unchanged, matching our choice in v7m_preserve_fp_state. + */ + if (ts) { + for (i = 0; i < 32; i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } else { + v7m_update_fpccr(env, fptr, false); + } + + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; +} + +void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) +{ + ARMCPU *cpu = env_archcpu(env); + uintptr_t ra = GETPC(); + + /* fptr is the value of Rn, the frame pointer we load the FP regs from */ + assert(env->v7m.secure); + + if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { + return; + } + + /* Check access to the coprocessor is permitted */ + if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { + raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); + } + + if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { + /* State in FP is still valid */ + env->v7m.fpccr[M_REG_S] &= ~R_V7M_FPCCR_LSPACT_MASK; + } else { + bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; + int i; + uint32_t fpscr; + + if (fptr & 7) { + raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); + } + + for (i = 0; i < (ts ? 32 : 16); i += 2) { + uint32_t slo, shi; + uint64_t dn; + uint32_t faddr = fptr + 4 * i; + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR and VPR */ + } + + slo = cpu_ldl_data_ra(env, faddr, ra); + shi = cpu_ldl_data_ra(env, faddr + 4, ra); + + dn = (uint64_t) shi << 32 | slo; + *aa32_vfp_dreg(env, i / 2) = dn; + } + fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra); + vfp_set_fpscr(env, fpscr); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra); + } + } + + env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK; +} + +static bool v7m_push_stack(ARMCPU *cpu) +{ + /* + * Do the "set up stack frame" part of exception entry, + * similar to pseudocode PushStack(). + * Return true if we generate a derived exception (and so + * should ignore further stack faults trying to process + * that derived exception.) 
+ */ + bool stacked_ok = true, limitviol = false; + CPUARMState *env = &cpu->env; + uint32_t xpsr = xpsr_read(env); + uint32_t frameptr = env->regs[13]; + ARMMMUIdx mmu_idx = arm_mmu_idx(env); + uint32_t framesize; + bool nsacr_cp10 = extract32(env->v7m.nsacr, 10, 1); + + if ((env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) && + (env->v7m.secure || nsacr_cp10)) { + if (env->v7m.secure && + env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK) { + framesize = 0xa8; + } else { + framesize = 0x68; + } + } else { + framesize = 0x20; + } + + /* Align stack pointer if the guest wants that */ + if ((frameptr & 4) && + (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) { + frameptr -= 4; + xpsr |= XPSR_SPREALIGN; + } + + xpsr &= ~XPSR_SFPA; + if (env->v7m.secure && + (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { + xpsr |= XPSR_SFPA; + } + + frameptr -= framesize; + + if (arm_feature(env, ARM_FEATURE_V8)) { + uint32_t limit = v7m_sp_limit(env); + + if (frameptr < limit) { + /* + * Stack limit failure: set SP to the limit value, and generate + * STKOF UsageFault. Stack pushes below the limit must not be + * performed. It is IMPDEF whether pushes above the limit are + * performed; we choose not to. + */ + qemu_log_mask(CPU_LOG_INT, + "...STKOF during stacking\n"); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + env->regs[13] = limit; + /* + * We won't try to perform any further memory accesses but + * we must continue through the following code to check for + * permission faults during FPU state preservation, and we + * must update FPCCR if lazy stacking is enabled. + */ + limitviol = true; + stacked_ok = false; + } + } + + /* + * Write as much of the stack frame as we can. If we fail a stack + * write this will result in a derived exception being pended + * (which may be taken in preference to the one we started with + * if it has higher priority). 
+ */ + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 4, env->regs[1], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 8, env->regs[2], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 12, env->regs[3], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 16, env->regs[12], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 20, env->regs[14], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 24, env->regs[15], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, STACK_NORMAL); + + if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) { + /* FPU is active, try to save its registers */ + bool fpccr_s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + bool lspact = env->v7m.fpccr[fpccr_s] & R_V7M_FPCCR_LSPACT_MASK; + + if (lspact && arm_feature(env, ARM_FEATURE_M_SECURITY)) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault because LSPACT and FPCA both set\n"); + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + } else if (!env->v7m.secure && !nsacr_cp10) { + qemu_log_mask(CPU_LOG_INT, + "...Secure UsageFault with CFSR.NOCP because " + "NSACR.CP10 prevents stacking FP regs\n"); + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; + } else { + if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { + /* Lazy stacking disabled, save registers now */ + int i; + bool cpacr_pass = v7m_cpacr_pass(env, env->v7m.secure, + arm_current_el(env) != 0); + + if (stacked_ok && !cpacr_pass) { + /* + * Take UsageFault if CPACR forbids access. The pseudocode + * here does a full CheckCPEnabled() but we know the NSACR + * check can never fail as we have already handled that. + */ + qemu_log_mask(CPU_LOG_INT, + "...UsageFault with CFSR.NOCP because " + "CPACR.CP10 prevents stacking FP regs\n"); + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_NOCP_MASK; + stacked_ok = false; + } + + for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { + uint64_t dn = *aa32_vfp_dreg(env, i / 2); + uint32_t faddr = frameptr + 0x20 + 4 * i; + uint32_t slo = extract64(dn, 0, 32); + uint32_t shi = extract64(dn, 32, 32); + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR and VPR */ + } + stacked_ok = stacked_ok && + v7m_stack_write(cpu, faddr, slo, + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, faddr + 4, shi, + mmu_idx, STACK_NORMAL); + } + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr + 0x60, + vfp_get_fpscr(env), mmu_idx, STACK_NORMAL); + if (cpu_isar_feature(aa32_mve, cpu)) { + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr + 0x64, + env->v7m.vpr, mmu_idx, STACK_NORMAL); + } + if (cpacr_pass) { + for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } else { + /* Lazy stacking enabled, save necessary info to stack later */ + v7m_update_fpccr(env, frameptr + 0x20, true); + } + } + } + + /* + * If we broke a stack limit then SP was already updated earlier; + * otherwise we update SP regardless of whether any of the stack + * accesses failed or we took some other kind of fault. 
+ */ + if (!limitviol) { + env->regs[13] = frameptr; + } + + return !stacked_ok; +} + +static void do_v7m_exception_exit(ARMCPU *cpu) +{ + CPUARMState *env = &cpu->env; + uint32_t excret; + uint32_t xpsr, xpsr_mask; + bool ufault = false; + bool sfault = false; + bool return_to_sp_process; + bool return_to_handler; + bool rettobase = false; + bool exc_secure = false; + bool return_to_secure; + bool ftype; + bool restore_s16_s31 = false; + + /* + * If we're not in Handler mode then jumps to magic exception-exit + * addresses don't have magic behaviour. However for the v8M + * security extensions the magic secure-function-return has to + * work in thread mode too, so to avoid doing an extra check in + * the generated code we allow exception-exit magic to also cause the + * internal exception and bring us here in thread mode. Correct code + * will never try to do this (the following insn fetch will always + * fault) so the overhead of having taken an unnecessary exception + * doesn't matter. + */ + if (!arm_v7m_is_handler_mode(env)) { + return; + } + + /* + * In the spec pseudocode ExceptionReturn() is called directly + * from BXWritePC() and gets the full target PC value including + * bit zero. In QEMU's implementation we treat it as a normal + * jump-to-register (which is then caught later on), and so split + * the target value up between env->regs[15] and env->thumb in + * gen_bx(). Reconstitute it. + */ + excret = env->regs[15]; + if (env->thumb) { + excret |= 1; + } + + qemu_log_mask(CPU_LOG_INT, "Exception return: magic PC %" PRIx32 + " previous exception %d\n", + excret, env->v7m.exception); + + if ((excret & R_V7M_EXCRET_RES1_MASK) != R_V7M_EXCRET_RES1_MASK) { + qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero high bits in exception " + "exit PC value 0x%" PRIx32 " are UNPREDICTABLE\n", + excret); + } + + ftype = excret & R_V7M_EXCRET_FTYPE_MASK; + + if (!ftype && !cpu_isar_feature(aa32_vfp_simd, cpu)) { + qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero FTYPE in exception " + "exit PC value 0x%" PRIx32 " is UNPREDICTABLE " + "if FPU not present\n", + excret); + ftype = true; + } + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + /* + * EXC_RETURN.ES validation check (R_SMFL). We must do this before + * we pick which FAULTMASK to clear. + */ + if (!env->v7m.secure && + ((excret & R_V7M_EXCRET_ES_MASK) || + !(excret & R_V7M_EXCRET_DCRS_MASK))) { + sfault = 1; + /* For all other purposes, treat ES as 0 (R_HXSR) */ + excret &= ~R_V7M_EXCRET_ES_MASK; + } + exc_secure = excret & R_V7M_EXCRET_ES_MASK; + } + + if (env->v7m.exception != ARMV7M_EXCP_NMI) { + /* + * Auto-clear FAULTMASK on return from other than NMI. + * If the security extension is implemented then this only + * happens if the raw execution priority is >= 0; the + * value of the ES bit in the exception return value indicates + * which security state's faultmask to clear. (v8M ARM ARM R_KBNF.) + */ + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + if (armv7m_nvic_raw_execution_priority(env->nvic) >= 0) { + env->v7m.faultmask[exc_secure] = 0; + } + } else { + env->v7m.faultmask[M_REG_NS] = 0; + } + } + + switch (armv7m_nvic_complete_irq(env->nvic, env->v7m.exception, + exc_secure)) { + case -1: + /* attempt to exit an exception that isn't active */ + ufault = true; + break; + case 0: + /* still an irq active now */ + break; + case 1: + /* + * We returned to base exception level, no nesting. + * (In the pseudocode this is written using "NestedActivation != 1" + * where we have 'rettobase == false'.) 
+ */ + rettobase = true; + break; + default: + g_assert_not_reached(); + } + + return_to_handler = !(excret & R_V7M_EXCRET_MODE_MASK); + return_to_sp_process = excret & R_V7M_EXCRET_SPSEL_MASK; + return_to_secure = arm_feature(env, ARM_FEATURE_M_SECURITY) && + (excret & R_V7M_EXCRET_S_MASK); + + if (arm_feature(env, ARM_FEATURE_V8)) { + if (!arm_feature(env, ARM_FEATURE_M_SECURITY)) { + /* + * UNPREDICTABLE if S == 1 or DCRS == 0 or ES == 1 (R_XLCP); + * we choose to take the UsageFault. + */ + if ((excret & R_V7M_EXCRET_S_MASK) || + (excret & R_V7M_EXCRET_ES_MASK) || + !(excret & R_V7M_EXCRET_DCRS_MASK)) { + ufault = true; + } + } + if (excret & R_V7M_EXCRET_RES0_MASK) { + ufault = true; + } + } else { + /* For v7M we only recognize certain combinations of the low bits */ + switch (excret & 0xf) { + case 1: /* Return to Handler */ + break; + case 13: /* Return to Thread using Process stack */ + case 9: /* Return to Thread using Main stack */ + /* + * We only need to check NONBASETHRDENA for v7M, because in + * v8M this bit does not exist (it is RES1). + */ + if (!rettobase && + !(env->v7m.ccr[env->v7m.secure] & + R_V7M_CCR_NONBASETHRDENA_MASK)) { + ufault = true; + } + break; + default: + ufault = true; + } + } + + /* + * Set CONTROL.SPSEL from excret.SPSEL. Since we're still in + * Handler mode (and will be until we write the new XPSR.Interrupt + * field) this does not switch around the current stack pointer. + * We must do this before we do any kind of tailchaining, including + * for the derived exceptions on integrity check failures, or we will + * give the guest an incorrect EXCRET.SPSEL value on exception entry. + */ + write_v7m_control_spsel_for_secstate(env, return_to_sp_process, exc_secure); + + /* + * Clear scratch FP values left in caller saved registers; this + * must happen before any kind of tail chaining. 
+ */ + if ((env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_CLRONRET_MASK) && + (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { + if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " + "stackframe: error during lazy state deactivation\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } else { + if (arm_feature(env, ARM_FEATURE_V8_1M)) { + /* v8.1M adds this NOCP check */ + bool nsacr_pass = exc_secure || + extract32(env->v7m.nsacr, 10, 1); + bool cpacr_pass = v7m_cpacr_pass(env, exc_secure, true); + if (!nsacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: NSACR prevents clearing FPU registers\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } else if (!cpacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + exc_secure); + env->v7m.cfsr[exc_secure] |= R_V7M_CFSR_NOCP_MASK; + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: CPACR prevents clearing FPU registers\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + } + /* Clear s0..s15, FPSCR and VPR */ + int i; + + for (i = 0; i < 16; i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } + + if (sfault) { + env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " + "stackframe: failed EXC_RETURN.ES validity check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + if (ufault) { + /* + * Bad exception return: instead of popping the exception + * stack, directly take a usage fault on the current stack. + */ + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: failed exception return integrity check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + /* + * Tailchaining: if there is currently a pending exception that + * is high enough priority to preempt execution at the level we're + * about to return to, then just directly take that exception now, + * avoiding an unstack-and-then-stack. Note that now we have + * deactivated the previous exception by calling armv7m_nvic_complete_irq() + * our current execution priority is already the execution priority we are + * returning to -- none of the state we would unstack or set based on + * the EXCRET value affects it. + */ + if (armv7m_nvic_can_take_pending_exception(env->nvic)) { + qemu_log_mask(CPU_LOG_INT, "...tailchaining to pending exception\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + switch_v7m_security_state(env, return_to_secure); + + { + /* + * The stack pointer we should be reading the exception frame from + * depends on bits in the magic exception return type value (and + * for v8M isn't necessarily the stack pointer we will eventually + * end up resuming execution with). Get a pointer to the location + * in the CPU state struct where the SP we need is currently being + * stored; we will use and modify it in place. 
+ * We use this limited C variable scope so we don't accidentally + * use 'frame_sp_p' after we do something that makes it invalid. + */ + bool spsel = env->v7m.control[return_to_secure] & R_V7M_CONTROL_SPSEL_MASK; + uint32_t *frame_sp_p = get_v7m_sp_ptr(env, + return_to_secure, + !return_to_handler, + spsel); + uint32_t frameptr = *frame_sp_p; + bool pop_ok = true; + ARMMMUIdx mmu_idx; + bool return_to_priv = return_to_handler || + !(env->v7m.control[return_to_secure] & R_V7M_CONTROL_NPRIV_MASK); + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure, + return_to_priv); + + if (!QEMU_IS_ALIGNED(frameptr, 8) && + arm_feature(env, ARM_FEATURE_V8)) { + qemu_log_mask(LOG_GUEST_ERROR, + "M profile exception return with non-8-aligned SP " + "for destination state is UNPREDICTABLE\n"); + } + + /* Do we need to pop callee-saved registers? */ + if (return_to_secure && + ((excret & R_V7M_EXCRET_ES_MASK) == 0 || + (excret & R_V7M_EXCRET_DCRS_MASK) == 0)) { + uint32_t actual_sig; + + pop_ok = v7m_stack_read(cpu, &actual_sig, frameptr, mmu_idx); + + if (pop_ok && v7m_integrity_sig(env, excret) != actual_sig) { + /* Take a SecureFault on the current stack */ + env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " + "stackframe: failed exception return integrity " + "signature check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) && + v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) && + v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) && + v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) && + v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) && + v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) && + v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx); + + frameptr += 0x28; + } + + /* Pop registers */ + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) && + v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) && + v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) && + v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) && + v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) && + v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) && + v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx); + + if (!pop_ok) { + /* + * v7m_stack_read() pended a fault, so take it (as a tail + * chained exception on the same stack frame) + */ + qemu_log_mask(CPU_LOG_INT, "...derived exception on unstacking\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + /* + * Returning from an exception with a PC with bit 0 set is defined + * behaviour on v8M (bit 0 is ignored), but for v7M it was specified + * to be UNPREDICTABLE. In practice actual v7M hardware seems to ignore + * the lsbit, and there are several RTOSes out there which incorrectly + * assume the r15 in the stack frame should be a Thumb-style "lsbit + * indicates ARM/Thumb" value, so ignore the bit on v7M as well, but + * complain about the badly behaved guest. 
+ */ + if (env->regs[15] & 1) { + env->regs[15] &= ~1U; + if (!arm_feature(env, ARM_FEATURE_V8)) { + qemu_log_mask(LOG_GUEST_ERROR, + "M profile return from interrupt with misaligned " + "PC is UNPREDICTABLE on v7M\n"); + } + } + + if (arm_feature(env, ARM_FEATURE_V8)) { + /* + * For v8M we have to check whether the xPSR exception field + * matches the EXCRET value for return to handler/thread + * before we commit to changing the SP and xPSR. + */ + bool will_be_handler = (xpsr & XPSR_EXCP) != 0; + if (return_to_handler != will_be_handler) { + /* + * Take an INVPC UsageFault on the current stack. + * By this point we will have switched to the security state + * for the background state, so this UsageFault will target + * that state. + */ + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: failed exception return integrity " + "check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + } + + if (!ftype) { + /* FP present and we need to handle it */ + if (!return_to_secure && + (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK)) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + qemu_log_mask(CPU_LOG_INT, + "...taking SecureFault on existing stackframe: " + "Secure LSPACT set but exception return is " + "not to secure state\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + restore_s16_s31 = return_to_secure && + (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); + + if (env->v7m.fpccr[return_to_secure] & R_V7M_FPCCR_LSPACT_MASK) { + /* State in FPU is still valid, just clear LSPACT */ + env->v7m.fpccr[return_to_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; + } else { + int i; + uint32_t fpscr; + bool cpacr_pass, nsacr_pass; + + cpacr_pass = v7m_cpacr_pass(env, return_to_secure, + return_to_priv); + nsacr_pass = return_to_secure || + extract32(env->v7m.nsacr, 10, 1); + + if (!cpacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + return_to_secure); + env->v7m.cfsr[return_to_secure] |= R_V7M_CFSR_NOCP_MASK; + qemu_log_mask(CPU_LOG_INT, + "...taking UsageFault on existing " + "stackframe: CPACR.CP10 prevents unstacking " + "FP regs\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } else if (!nsacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_INVPC_MASK; + qemu_log_mask(CPU_LOG_INT, + "...taking Secure UsageFault on existing " + "stackframe: NSACR.CP10 prevents unstacking " + "FP regs\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { + uint32_t slo, shi; + uint64_t dn; + uint32_t faddr = frameptr + 0x20 + 4 * i; + + if (i >= 16) { + faddr += 8; /* Skip the slot for the FPSCR and VPR */ + } + + pop_ok = pop_ok && + v7m_stack_read(cpu, &slo, faddr, mmu_idx) && + v7m_stack_read(cpu, &shi, faddr + 4, mmu_idx); + + if (!pop_ok) { + break; + } + + dn = (uint64_t)shi << 32 | slo; + *aa32_vfp_dreg(env, i / 2) = dn; + } + pop_ok = pop_ok && + v7m_stack_read(cpu, &fpscr, frameptr + 0x60, mmu_idx); + if (pop_ok) { + vfp_set_fpscr(env, fpscr); + } + if (cpu_isar_feature(aa32_mve, cpu)) { + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->v7m.vpr, + frameptr + 0x64, mmu_idx); + } + if (!pop_ok) { + /* + * These regs are 0 if security extension present; + * otherwise merely UNKNOWN. 
We zero always. + */ + for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } + } + env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], + V7M_CONTROL, FPCA, !ftype); + + /* Commit to consuming the stack frame */ + frameptr += 0x20; + if (!ftype) { + frameptr += 0x48; + if (restore_s16_s31) { + frameptr += 0x40; + } + } + /* + * Undo stack alignment (the SPREALIGN bit indicates that the original + * pre-exception SP was not 8-aligned and we added a padding word to + * align it, so we undo this by ORing in the bit that increases it + * from the current 8-aligned value to the 8-unaligned value. (Adding 4 + * would work too but a logical OR is how the pseudocode specifies it.) + */ + if (xpsr & XPSR_SPREALIGN) { + frameptr |= 4; + } + *frame_sp_p = frameptr; + } + + xpsr_mask = ~(XPSR_SPREALIGN | XPSR_SFPA); + if (!arm_feature(env, ARM_FEATURE_THUMB_DSP)) { + xpsr_mask &= ~XPSR_GE; + } + /* This xpsr_write() will invalidate frame_sp_p as it may switch stack */ + xpsr_write(env, xpsr, xpsr_mask); + + if (env->v7m.secure) { + bool sfpa = xpsr & XPSR_SFPA; + + env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], + V7M_CONTROL, SFPA, sfpa); + } + + /* + * The restored xPSR exception field will be zero if we're + * resuming in Thread mode. If that doesn't match what the + * exception return excret specified then this is a UsageFault. + * v7M requires we make this check here; v8M did it earlier. + */ + if (return_to_handler != arm_v7m_is_handler_mode(env)) { + /* + * Take an INVPC UsageFault by pushing the stack again; + * we know we're v7M so this is never a Secure UsageFault. + */ + bool ignore_stackfaults; + + assert(!arm_feature(env, ARM_FEATURE_V8)); + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + ignore_stackfaults = v7m_push_stack(cpu); + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: " + "failed exception return integrity check\n"); + v7m_exception_taken(cpu, excret, false, ignore_stackfaults); + return; + } + + /* Otherwise, we have a successful exception exit. */ + arm_clear_exclusive(env); + arm_rebuild_hflags(env); + qemu_log_mask(CPU_LOG_INT, "...successful exception return\n"); +} + +static bool do_v7m_function_return(ARMCPU *cpu) +{ + /* + * v8M security extensions magic function return. + * We may either: + * (1) throw an exception (longjump) + * (2) return true if we successfully handled the function return + * (3) return false if we failed a consistency check and have + * pended a UsageFault that needs to be taken now + * + * At this point the magic return value is split between env->regs[15] + * and env->thumb. We don't bother to reconstitute it because we don't + * need it (all values are handled the same way). + */ + CPUARMState *env = &cpu->env; + uint32_t newpc, newpsr, newpsr_exc; + + qemu_log_mask(CPU_LOG_INT, "...really v7M secure function return\n"); + + { + bool threadmode, spsel; + MemOpIdx oi; + ARMMMUIdx mmu_idx; + uint32_t *frame_sp_p; + uint32_t frameptr; + + /* Pull the return address and IPSR from the Secure stack */ + threadmode = !arm_v7m_is_handler_mode(env); + spsel = env->v7m.control[M_REG_S] & R_V7M_CONTROL_SPSEL_MASK; + + frame_sp_p = get_v7m_sp_ptr(env, true, threadmode, spsel); + frameptr = *frame_sp_p; + + /* + * These loads may throw an exception (for MPU faults). 
We want to + * do them as secure, so work out what MMU index that is. + */ + mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); + oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx)); + newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0); + newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0); + + /* Consistency checks on new IPSR */ + newpsr_exc = newpsr & XPSR_EXCP; + if (!((env->v7m.exception == 0 && newpsr_exc == 0) || + (env->v7m.exception == 1 && newpsr_exc != 0))) { + /* Pend the fault and tell our caller to take it */ + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + qemu_log_mask(CPU_LOG_INT, + "...taking INVPC UsageFault: " + "IPSR consistency check failed\n"); + return false; + } + + *frame_sp_p = frameptr + 8; + } + + /* This invalidates frame_sp_p */ + switch_v7m_security_state(env, true); + env->v7m.exception = newpsr_exc; + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + if (newpsr & XPSR_SFPA) { + env->v7m.control[M_REG_S] |= R_V7M_CONTROL_SFPA_MASK; + } + xpsr_write(env, 0, XPSR_IT); + env->thumb = newpc & 1; + env->regs[15] = newpc & ~1; + arm_rebuild_hflags(env); + + qemu_log_mask(CPU_LOG_INT, "...function return successful\n"); + return true; +} + +static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, bool secure, + uint32_t addr, uint16_t *insn) +{ + /* + * Load a 16-bit portion of a v7M instruction, returning true on success, + * or false on failure (in which case we will have pended the appropriate + * exception). + * We need to do the instruction fetch's MPU and SAU checks + * like this because there is no MMU index that would allow + * doing the load with a single function call. Instead we must + * first check that the security attributes permit the load + * and that they don't mismatch on the two halves of the instruction, + * and then we do the load as a secure load (ie using the security + * attributes of the address, not the CPU, as architecturally required). + */ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + V8M_SAttributes sattrs = {}; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + MemTxResult txres; + + v8m_security_lookup(env, addr, MMU_INST_FETCH, mmu_idx, secure, &sattrs); + if (!sattrs.nsc || sattrs.ns) { + /* + * This must be the second half of the insn, and it straddles a + * region boundary with the second half not being S&NSC. + */ + env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVEP\n"); + return false; + } + if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &res, &fi)) { + /* the MPU lookup failed */ + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure); + qemu_log_mask(CPU_LOG_INT, "...really MemManage with CFSR.IACCVIOL\n"); + return false; + } + *insn = address_space_lduw_le(arm_addressspace(cs, res.f.attrs), + res.f.phys_addr, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); + qemu_log_mask(CPU_LOG_INT, "...really BusFault with CFSR.IBUSERR\n"); + return false; + } + return true; +} + +static bool v7m_read_sg_stack_word(ARMCPU *cpu, ARMMMUIdx mmu_idx, + uint32_t addr, uint32_t *spdata) +{ + /* + * Read a word of data from the stack for the SG instruction, + * writing the value into *spdata. 
If the load succeeds, return + * true; otherwise pend an appropriate exception and return false. + * (We can't use data load helpers here that throw an exception + * because of the context we're called in, which is halfway through + * arm_v7m_cpu_do_interrupt().) + */ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult txres; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + uint32_t value; + + if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault during stack word read\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault during stack word read\n"); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_DACCVIOL_MASK | + R_V7M_CFSR_MMARVALID_MASK; + env->v7m.mmfar[M_REG_S] = addr; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, false); + } + return false; + } + value = address_space_ldl(arm_addressspace(cs, res.f.attrs), + res.f.phys_addr, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to read the data */ + qemu_log_mask(CPU_LOG_INT, + "...BusFault during stack word read\n"); + env->v7m.cfsr[M_REG_NS] |= + (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); + env->v7m.bfar = addr; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); + return false; + } + + *spdata = value; + return true; +} + +static bool v7m_handle_execute_nsc(ARMCPU *cpu) +{ + /* + * Check whether this attempt to execute code in a Secure & NS-Callable + * memory region is for an SG instruction; if so, then emulate the + * effect of the SG instruction and return true. Otherwise pend + * the correct kind of exception and return false. + */ + CPUARMState *env = &cpu->env; + ARMMMUIdx mmu_idx; + uint16_t insn; + + /* + * We should never get here unless get_phys_addr_pmsav8() caused + * an exception for NS executing in S&NSC memory. + */ + assert(!env->v7m.secure); + assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); + + /* We want to do the MPU lookup as secure; work out what mmu_idx that is */ + mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); + + if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15], &insn)) { + return false; + } + + if (!env->thumb) { + goto gen_invep; + } + + if (insn != 0xe97f) { + /* + * Not an SG instruction first half (we choose the IMPDEF + * early-SG-check option). + */ + goto gen_invep; + } + + if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15] + 2, &insn)) { + return false; + } + + if (insn != 0xe97f) { + /* + * Not an SG instruction second half (yes, both halves of the SG + * insn have the same hex value) + */ + goto gen_invep; + } + + /* + * OK, we have confirmed that we really have an SG instruction. + * We know we're NS in S memory so don't need to repeat those checks. + */ + qemu_log_mask(CPU_LOG_INT, "...really an SG instruction at 0x%08" PRIx32 + ", executing it\n", env->regs[15]); + + if (cpu_isar_feature(aa32_m_sec_state, cpu) && + !arm_v7m_is_handler_mode(env)) { + /* + * v8.1M exception stack frame integrity check. Note that we + * must perform the memory access even if CCR_S.TRD is zero + * and we aren't going to check what the data loaded is. + */ + uint32_t spdata, sp; + + /* + * We know we are currently NS, so the S stack pointers must be + * in other_ss_{psp,msp}, not in regs[13]/other_sp. 
+ */ + sp = v7m_using_psp(env) ? env->v7m.other_ss_psp : env->v7m.other_ss_msp; + if (!v7m_read_sg_stack_word(cpu, mmu_idx, sp, &spdata)) { + /* Stack access failed and an exception has been pended */ + return false; + } + + if (env->v7m.ccr[M_REG_S] & R_V7M_CCR_TRD_MASK) { + if (((spdata & ~1) == 0xfefa125a) || + !(env->v7m.control[M_REG_S] & 1)) { + goto gen_invep; + } + } + } + + env->regs[14] &= ~1; + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + switch_v7m_security_state(env, true); + xpsr_write(env, 0, XPSR_IT); + env->regs[15] += 4; + arm_rebuild_hflags(env); + return true; + +gen_invep: + env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVEP\n"); + return false; +} + +void arm_v7m_cpu_do_interrupt(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; + uint32_t lr; + bool ignore_stackfaults; + + arm_log_exception(cs); + + /* + * For exceptions we just mark as pending on the NVIC, and let that + * handle it. + */ + switch (cs->exception_index) { + case EXCP_UDEF: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNDEFINSTR_MASK; + break; + case EXCP_NOCP: + { + /* + * NOCP might be directed to something other than the current + * security state if this fault is because of NSACR; we indicate + * the target security state using exception.target_el. + */ + int target_secstate; + + if (env->exception.target_el == 3) { + target_secstate = M_REG_S; + } else { + target_secstate = env->v7m.secure; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, target_secstate); + env->v7m.cfsr[target_secstate] |= R_V7M_CFSR_NOCP_MASK; + break; + } + case EXCP_INVSTATE: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVSTATE_MASK; + break; + case EXCP_STKOF: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; + break; + case EXCP_LSERR: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + break; + case EXCP_UNALIGNED: + /* Unaligned faults reported by M-profile aware code */ + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; + break; + case EXCP_DIVBYZERO: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK; + break; + case EXCP_SWI: + /* The PC already points to the next instruction. */ + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure); + break; + case EXCP_PREFETCH_ABORT: + case EXCP_DATA_ABORT: + /* + * Note that for M profile we don't have a guest facing FSR, but + * the env->exception.fsr will be populated by the code that + * raises the fault, in the A profile short-descriptor format. + * + * Log the exception.vaddress now regardless of subtype, because + * logging below only logs it when it goes into a guest visible + * register. + */ + qemu_log_mask(CPU_LOG_INT, "...at fault address 0x%x\n", + (uint32_t)env->exception.vaddress); + switch (env->exception.fsr & 0xf) { + case M_FAKE_FSR_NSC_EXEC: + /* + * Exception generated when we try to execute code at an address + * which is marked as Secure & Non-Secure Callable and the CPU + * is in the Non-Secure state. 
The only instruction which can + * be executed like this is SG (and that only if both halves of + * the SG instruction have the same security attributes.) + * Everything else must generate an INVEP SecureFault, so we + * emulate the SG instruction here. + */ + if (v7m_handle_execute_nsc(cpu)) { + return; + } + break; + case M_FAKE_FSR_SFAULT: + /* + * Various flavours of SecureFault for attempts to execute or + * access data in the wrong security state. + */ + switch (cs->exception_index) { + case EXCP_PREFETCH_ABORT: + if (env->v7m.secure) { + env->v7m.sfsr |= R_V7M_SFSR_INVTRAN_MASK; + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVTRAN\n"); + } else { + env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVEP\n"); + } + break; + case EXCP_DATA_ABORT: + /* This must be an NS access to S memory */ + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.AUVIOL\n"); + break; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + break; + case 0x8: /* External Abort */ + switch (cs->exception_index) { + case EXCP_PREFETCH_ABORT: + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; + qemu_log_mask(CPU_LOG_INT, "...with CFSR.IBUSERR\n"); + break; + case EXCP_DATA_ABORT: + env->v7m.cfsr[M_REG_NS] |= + (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); + env->v7m.bfar = env->exception.vaddress; + qemu_log_mask(CPU_LOG_INT, + "...with CFSR.PRECISERR and BFAR 0x%x\n", + env->v7m.bfar); + break; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); + break; + case 0x1: /* Alignment fault reported by generic code */ + qemu_log_mask(CPU_LOG_INT, + "...really UsageFault with UFSR.UNALIGNED\n"); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + break; + default: + /* + * All other FSR values are either MPU faults or "can't happen + * for M profile" cases. + */ + switch (cs->exception_index) { + case EXCP_PREFETCH_ABORT: + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; + qemu_log_mask(CPU_LOG_INT, "...with CFSR.IACCVIOL\n"); + break; + case EXCP_DATA_ABORT: + env->v7m.cfsr[env->v7m.secure] |= + (R_V7M_CFSR_DACCVIOL_MASK | R_V7M_CFSR_MMARVALID_MASK); + env->v7m.mmfar[env->v7m.secure] = env->exception.vaddress; + qemu_log_mask(CPU_LOG_INT, + "...with CFSR.DACCVIOL and MMFAR 0x%x\n", + env->v7m.mmfar[env->v7m.secure]); + break; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, + env->v7m.secure); + break; + } + break; + case EXCP_SEMIHOST: + qemu_log_mask(CPU_LOG_INT, + "...handling as semihosting call 0x%x\n", + env->regs[0]); +#ifdef CONFIG_TCG + do_common_semihosting(cs); +#else + g_assert_not_reached(); +#endif + env->regs[15] += env->thumb ? 2 : 4; + return; + case EXCP_BKPT: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_DEBUG, false); + break; + case EXCP_IRQ: + break; + case EXCP_EXCEPTION_EXIT: + if (env->regs[15] < EXC_RETURN_MIN_MAGIC) { + /* Must be v8M security extension function return */ + assert(env->regs[15] >= FNC_RETURN_MIN_MAGIC); + assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); + if (do_v7m_function_return(cpu)) { + return; + } + } else { + do_v7m_exception_exit(cpu); + return; + } + break; + case EXCP_LAZYFP: + /* + * We already pended the specific exception in the NVIC in the + * v7m_preserve_fp_state() helper function. 
+ */ + break; + default: + cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index); + return; /* Never happens. Keep compiler happy. */ + } + + if (arm_feature(env, ARM_FEATURE_V8)) { + lr = R_V7M_EXCRET_RES1_MASK | + R_V7M_EXCRET_DCRS_MASK; + /* + * The S bit indicates whether we should return to Secure + * or NonSecure (ie our current state). + * The ES bit indicates whether we're taking this exception + * to Secure or NonSecure (ie our target state). We set it + * later, in v7m_exception_taken(). + * The SPSEL bit is also set in v7m_exception_taken() for v8M. + * This corresponds to the ARM ARM pseudocode for v8M setting + * some LR bits in PushStack() and some in ExceptionTaken(); + * the distinction matters for the tailchain cases where we + * can take an exception without pushing the stack. + */ + if (env->v7m.secure) { + lr |= R_V7M_EXCRET_S_MASK; + } + } else { + lr = R_V7M_EXCRET_RES1_MASK | + R_V7M_EXCRET_S_MASK | + R_V7M_EXCRET_DCRS_MASK | + R_V7M_EXCRET_ES_MASK; + if (env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK) { + lr |= R_V7M_EXCRET_SPSEL_MASK; + } + } + if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { + lr |= R_V7M_EXCRET_FTYPE_MASK; + } + if (!arm_v7m_is_handler_mode(env)) { + lr |= R_V7M_EXCRET_MODE_MASK; + } + + ignore_stackfaults = v7m_push_stack(cpu); + v7m_exception_taken(cpu, lr, false, ignore_stackfaults); +} + +uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) +{ + unsigned el = arm_current_el(env); + + /* First handle registers which unprivileged can read */ + switch (reg) { + case 0 ... 7: /* xPSR sub-fields */ + return v7m_mrs_xpsr(env, reg, el); + case 20: /* CONTROL */ + return v7m_mrs_control(env, env->v7m.secure); + case 0x94: /* CONTROL_NS */ + /* + * We have to handle this here because unprivileged Secure code + * can read the NS CONTROL register. + */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.control[M_REG_NS] | + (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK); + } + + if (el == 0) { + return 0; /* unprivileged reads others as zero */ + } + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + switch (reg) { + case 0x88: /* MSP_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.other_ss_msp; + case 0x89: /* PSP_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.other_ss_psp; + case 0x8a: /* MSPLIM_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.msplim[M_REG_NS]; + case 0x8b: /* PSPLIM_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.psplim[M_REG_NS]; + case 0x90: /* PRIMASK_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.primask[M_REG_NS]; + case 0x91: /* BASEPRI_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return 0; + } + return env->v7m.basepri[M_REG_NS]; + case 0x93: /* FAULTMASK_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return 0; + } + return env->v7m.faultmask[M_REG_NS]; + case 0x98: /* SP_NS */ + { + /* + * This gives the non-secure SP selected based on whether we're + * currently in handler mode or not, using the NS CONTROL.SPSEL. + */ + bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; + + if (!env->v7m.secure) { + return 0; + } + if (!arm_v7m_is_handler_mode(env) && spsel) { + return env->v7m.other_ss_psp; + } else { + return env->v7m.other_ss_msp; + } + } + default: + break; + } + } + + switch (reg) { + case 8: /* MSP */ + return v7m_using_psp(env) ? 
env->v7m.other_sp : env->regs[13]; + case 9: /* PSP */ + return v7m_using_psp(env) ? env->regs[13] : env->v7m.other_sp; + case 10: /* MSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + return env->v7m.msplim[env->v7m.secure]; + case 11: /* PSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + return env->v7m.psplim[env->v7m.secure]; + case 16: /* PRIMASK */ + return env->v7m.primask[env->v7m.secure]; + case 17: /* BASEPRI */ + case 18: /* BASEPRI_MAX */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + return env->v7m.basepri[env->v7m.secure]; + case 19: /* FAULTMASK */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + return env->v7m.faultmask[env->v7m.secure]; + default: + bad_reg: + qemu_log_mask(LOG_GUEST_ERROR, "Attempt to read unknown special" + " register %d\n", reg); + return 0; + } +} + +void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) +{ + /* + * We're passed bits [11..0] of the instruction; extract + * SYSm and the mask bits. + * Invalid combinations of SYSm and mask are UNPREDICTABLE; + * we choose to treat them as if the mask bits were valid. + * NB that the pseudocode 'mask' variable is bits [11..10], + * whereas ours is [11..8]. + */ + uint32_t mask = extract32(maskreg, 8, 4); + uint32_t reg = extract32(maskreg, 0, 8); + int cur_el = arm_current_el(env); + + if (cur_el == 0 && reg > 7 && reg != 20) { + /* + * only xPSR sub-fields and CONTROL.SFPA may be written by + * unprivileged code + */ + return; + } + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + switch (reg) { + case 0x88: /* MSP_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.other_ss_msp = val & ~3; + return; + case 0x89: /* PSP_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.other_ss_psp = val & ~3; + return; + case 0x8a: /* MSPLIM_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.msplim[M_REG_NS] = val & ~7; + return; + case 0x8b: /* PSPLIM_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.psplim[M_REG_NS] = val & ~7; + return; + case 0x90: /* PRIMASK_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.primask[M_REG_NS] = val & 1; + return; + case 0x91: /* BASEPRI_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return; + } + env->v7m.basepri[M_REG_NS] = val & 0xff; + return; + case 0x93: /* FAULTMASK_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return; + } + env->v7m.faultmask[M_REG_NS] = val & 1; + return; + case 0x94: /* CONTROL_NS */ + if (!env->v7m.secure) { + return; + } + write_v7m_control_spsel_for_secstate(env, + val & R_V7M_CONTROL_SPSEL_MASK, + M_REG_NS); + if (arm_feature(env, ARM_FEATURE_M_MAIN)) { + env->v7m.control[M_REG_NS] &= ~R_V7M_CONTROL_NPRIV_MASK; + env->v7m.control[M_REG_NS] |= val & R_V7M_CONTROL_NPRIV_MASK; + } + /* + * SFPA is RAZ/WI from NS. FPCA is RO if NSACR.CP10 == 0, + * RES0 if the FPU is not present, and is stored in the S bank + */ + if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) && + extract32(env->v7m.nsacr, 10, 1)) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; + env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; + } + return; + case 0x98: /* SP_NS */ + { + /* + * This gives the non-secure SP selected based on whether we're + * currently in handler mode or not, using the NS CONTROL.SPSEL. 
+ */ + bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; + bool is_psp = !arm_v7m_is_handler_mode(env) && spsel; + uint32_t limit; + + if (!env->v7m.secure) { + return; + } + + limit = is_psp ? env->v7m.psplim[false] : env->v7m.msplim[false]; + + val &= ~0x3; + + if (val < limit) { + raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); + } + + if (is_psp) { + env->v7m.other_ss_psp = val; + } else { + env->v7m.other_ss_msp = val; + } + return; + } + default: + break; + } + } + + switch (reg) { + case 0 ... 7: /* xPSR sub-fields */ + v7m_msr_xpsr(env, mask, reg, val); + break; + case 8: /* MSP */ + if (v7m_using_psp(env)) { + env->v7m.other_sp = val & ~3; + } else { + env->regs[13] = val & ~3; + } + break; + case 9: /* PSP */ + if (v7m_using_psp(env)) { + env->regs[13] = val & ~3; + } else { + env->v7m.other_sp = val & ~3; + } + break; + case 10: /* MSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + env->v7m.msplim[env->v7m.secure] = val & ~7; + break; + case 11: /* PSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + env->v7m.psplim[env->v7m.secure] = val & ~7; + break; + case 16: /* PRIMASK */ + env->v7m.primask[env->v7m.secure] = val & 1; + break; + case 17: /* BASEPRI */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + env->v7m.basepri[env->v7m.secure] = val & 0xff; + break; + case 18: /* BASEPRI_MAX */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + val &= 0xff; + if (val != 0 && (val < env->v7m.basepri[env->v7m.secure] + || env->v7m.basepri[env->v7m.secure] == 0)) { + env->v7m.basepri[env->v7m.secure] = val; + } + break; + case 19: /* FAULTMASK */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + env->v7m.faultmask[env->v7m.secure] = val & 1; + break; + case 20: /* CONTROL */ + /* + * Writing to the SPSEL bit only has an effect if we are in + * thread mode; other bits can be updated by any privileged code. + * write_v7m_control_spsel() deals with updating the SPSEL bit in + * env->v7m.control, so we only need update the others. + * For v7M, we must just ignore explicit writes to SPSEL in handler + * mode; for v8M the write is permitted but will have no effect. + * All these bits are writes-ignored from non-privileged code, + * except for SFPA. + */ + if (cur_el > 0 && (arm_feature(env, ARM_FEATURE_V8) || + !arm_v7m_is_handler_mode(env))) { + write_v7m_control_spsel(env, (val & R_V7M_CONTROL_SPSEL_MASK) != 0); + } + if (cur_el > 0 && arm_feature(env, ARM_FEATURE_M_MAIN)) { + env->v7m.control[env->v7m.secure] &= ~R_V7M_CONTROL_NPRIV_MASK; + env->v7m.control[env->v7m.secure] |= val & R_V7M_CONTROL_NPRIV_MASK; + } + if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) { + /* + * SFPA is RAZ/WI from NS or if no FPU. + * FPCA is RO if NSACR.CP10 == 0, RES0 if the FPU is not present. + * Both are stored in the S bank. + */ + if (env->v7m.secure) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_SFPA_MASK; + } + if (cur_el > 0 && + (env->v7m.secure || !arm_feature(env, ARM_FEATURE_M_SECURITY) || + extract32(env->v7m.nsacr, 10, 1))) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; + env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; + } + } + break; + default: + bad_reg: + qemu_log_mask(LOG_GUEST_ERROR, "Attempt to write unknown special" + " register %d\n", reg); + return; + } +} + +uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) +{ + /* Implement the TT instruction. 
op is bits [7:6] of the insn. */ + bool forceunpriv = op & 1; + bool alt = op & 2; + V8M_SAttributes sattrs = {}; + uint32_t tt_resp; + bool r, rw, nsr, nsrw, mrvalid; + ARMMMUIdx mmu_idx; + uint32_t mregion; + bool targetpriv; + bool targetsec = env->v7m.secure; + + /* + * Work out what the security state and privilege level we're + * interested in is... + */ + if (alt) { + targetsec = !targetsec; + } + + if (forceunpriv) { + targetpriv = false; + } else { + targetpriv = arm_v7m_is_handler_mode(env) || + !(env->v7m.control[targetsec] & R_V7M_CONTROL_NPRIV_MASK); + } + + /* ...and then figure out which MMU index this is */ + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targetsec, targetpriv); + + /* + * We know that the MPU and SAU don't care about the access type + * for our purposes beyond that we don't want to claim to be + * an insn fetch, so we arbitrarily call this a read. + */ + + /* + * MPU region info only available for privileged or if + * inspecting the other MPU state. + */ + if (arm_current_el(env) != 0 || alt) { + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + + /* We can ignore the return value as prot is always set */ + pmsav8_mpu_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, targetsec, + &res, &fi, &mregion); + if (mregion == -1) { + mrvalid = false; + mregion = 0; + } else { + mrvalid = true; + } + r = res.f.prot & PAGE_READ; + rw = res.f.prot & PAGE_WRITE; + } else { + r = false; + rw = false; + mrvalid = false; + mregion = 0; + } + + if (env->v7m.secure) { + v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, + targetsec, &sattrs); + nsr = sattrs.ns && r; + nsrw = sattrs.ns && rw; + } else { + sattrs.ns = true; + nsr = false; + nsrw = false; + } + + tt_resp = (sattrs.iregion << 24) | + (sattrs.irvalid << 23) | + ((!sattrs.ns) << 22) | + (nsrw << 21) | + (nsr << 20) | + (rw << 19) | + (r << 18) | + (sattrs.srvalid << 17) | + (mrvalid << 16) | + (sattrs.sregion << 8) | + mregion; + + return tt_resp; +} + +#endif /* !CONFIG_USER_ONLY */ diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index 044561b..1f27ba1 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -23,10 +23,23 @@ arm_ss.add(files( 'translate-mve.c', 'translate-neon.c', 'translate-vfp.c', + 'crypto_helper.c', + 'iwmmxt_helper.c', + 'm_helper.c', + 'mve_helper.c', + 'neon_helper.c', + 'op_helper.c', + 'tlb_helper.c', + 'vec_helper.c', )) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'translate-a64.c', 'translate-sve.c', 'translate-sme.c', + 'helper-a64.c', + 'mte_helper.c', + 'pauth_helper.c', + 'sme_helper.c', + 'sve_helper.c', )) diff --git a/target/arm/tcg/mte_helper.c b/target/arm/tcg/mte_helper.c new file mode 100644 index 0000000..98bcf59 --- /dev/null +++ b/target/arm/tcg/mte_helper.c @@ -0,0 +1,908 @@ +/* + * ARM v8.5-MemTag Operations + * + * Copyright (c) 2020 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/ram_addr.h" +#include "exec/cpu_ldst.h" +#include "exec/helper-proto.h" +#include "qapi/error.h" +#include "qemu/guest-random.h" + + +static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude) +{ + if (exclude == 0xffff) { + return 0; + } + if (offset == 0) { + while (exclude & (1 << tag)) { + tag = (tag + 1) & 15; + } + } else { + do { + do { + tag = (tag + 1) & 15; + } while (exclude & (1 << tag)); + } while (--offset > 0); + } + return tag; +} + +/** + * allocation_tag_mem: + * @env: the cpu environment + * @ptr_mmu_idx: the addressing regime to use for the virtual address + * @ptr: the virtual address for which to look up tag memory + * @ptr_access: the access to use for the virtual address + * @ptr_size: the number of bytes in the normal memory access + * @tag_access: the access to use for the tag memory + * @tag_size: the number of bytes in the tag memory access + * @ra: the return address for exception handling + * + * Our tag memory is formatted as a sequence of little-endian nibbles. + * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two + * tags, with the tag at [3:0] for the lower addr and the tag at [7:4] + * for the higher addr. + * + * Here, resolve the physical address from the virtual address, and return + * a pointer to the corresponding tag byte. Exit with exception if the + * virtual address is not accessible for @ptr_access. + * + * The @ptr_size and @tag_size values may not have an obvious relation + * due to the alignment of @ptr, and the number of tag checks required. + * + * If there is no tag storage corresponding to @ptr, return NULL. + */ +static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx, + uint64_t ptr, MMUAccessType ptr_access, + int ptr_size, MMUAccessType tag_access, + int tag_size, uintptr_t ra) +{ +#ifdef CONFIG_USER_ONLY + uint64_t clean_ptr = useronly_clean_ptr(ptr); + int flags = page_get_flags(clean_ptr); + uint8_t *tags; + uintptr_t index; + + if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE_ORG : PAGE_READ))) { + cpu_loop_exit_sigsegv(env_cpu(env), ptr, ptr_access, + !(flags & PAGE_VALID), ra); + } + + /* Require both MAP_ANON and PROT_MTE for the page. */ + if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) { + return NULL; + } + + tags = page_get_target_data(clean_ptr); + + index = extract32(ptr, LOG2_TAG_GRANULE + 1, + TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1); + return tags + index; +#else + CPUTLBEntryFull *full; + MemTxAttrs attrs; + int in_page, flags; + hwaddr ptr_paddr, tag_paddr, xlat; + MemoryRegion *mr; + ARMASIdx tag_asi; + AddressSpace *tag_as; + void *host; + + /* + * Probe the first byte of the virtual address. This raises an + * exception for inaccessible pages, and resolves the virtual address + * into the softmmu tlb. + * + * When RA == 0, this is for mte_probe. The page is expected to be + * valid. Indicate to probe_access_flags no-fault, then assert that + * we received a valid page. + */ + flags = probe_access_full(env, ptr, ptr_access, ptr_mmu_idx, + ra == 0, &host, &full, ra); + assert(!(flags & TLB_INVALID_MASK)); + + /* If the virtual page MemAttr != Tagged, access unchecked. */ + if (full->pte_attrs != 0xf0) { + return NULL; + } + + /* + * If not backed by host ram, there is no tag storage: access unchecked. + * This is probably a guest os bug though, so log it. 
+ */ + if (unlikely(flags & TLB_MMIO)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Page @ 0x%" PRIx64 " indicates Tagged Normal memory " + "but is not backed by host ram\n", ptr); + return NULL; + } + + /* + * Remember these values across the second lookup below, + * which may invalidate this pointer via tlb resize. + */ + ptr_paddr = full->phys_addr | (ptr & ~TARGET_PAGE_MASK); + attrs = full->attrs; + full = NULL; + + /* + * The Normal memory access can extend to the next page. E.g. a single + * 8-byte access to the last byte of a page will check only the last + * tag on the first page. + * Any page access exception has priority over tag check exception. + */ + in_page = -(ptr | TARGET_PAGE_MASK); + if (unlikely(ptr_size > in_page)) { + flags |= probe_access_full(env, ptr + in_page, ptr_access, + ptr_mmu_idx, ra == 0, &host, &full, ra); + assert(!(flags & TLB_INVALID_MASK)); + } + + /* Any debug exception has priority over a tag check exception. */ + if (unlikely(flags & TLB_WATCHPOINT)) { + int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE; + assert(ra != 0); + cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, attrs, wp, ra); + } + + /* Convert to the physical address in tag space. */ + tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1); + + /* Look up the address in tag space. */ + tag_asi = attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS; + tag_as = cpu_get_address_space(env_cpu(env), tag_asi); + mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL, + tag_access == MMU_DATA_STORE, attrs); + + /* + * Note that @mr will never be NULL. If there is nothing in the address + * space at @tag_paddr, the translation will return the unallocated memory + * region. For our purposes, the result must be ram. + */ + if (unlikely(!memory_region_is_ram(mr))) { + /* ??? Failure is a board configuration error. */ + qemu_log_mask(LOG_UNIMP, + "Tag Memory @ 0x%" HWADDR_PRIx " not found for " + "Normal Memory @ 0x%" HWADDR_PRIx "\n", + tag_paddr, ptr_paddr); + return NULL; + } + + /* + * Ensure the tag memory is dirty on write, for migration. + * Tag memory can never contain code or display memory (vga). + */ + if (tag_access == MMU_DATA_STORE) { + ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat; + cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION); + } + + return memory_region_get_ram_ptr(mr) + xlat; +#endif +} + +uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm) +{ + uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16); + int rrnd = extract32(env->cp15.gcr_el1, 16, 1); + int start = extract32(env->cp15.rgsr_el1, 0, 4); + int seed = extract32(env->cp15.rgsr_el1, 8, 16); + int offset, i, rtag; + + /* + * Our IMPDEF choice for GCR_EL1.RRND==1 is to continue to use the + * deterministic algorithm. Except that with RRND==1 the kernel is + * not required to have set RGSR_EL1.SEED != 0, which is required for + * the deterministic algorithm to function. So we force a non-zero + * SEED for that case. + */ + if (unlikely(seed == 0) && rrnd) { + do { + Error *err = NULL; + uint16_t two; + + if (qemu_guest_getrandom(&two, sizeof(two), &err) < 0) { + /* + * Failed, for unknown reasons in the crypto subsystem. + * Best we can do is log the reason and use a constant seed. 
+ */ + qemu_log_mask(LOG_UNIMP, "IRG: Crypto failure: %s\n", + error_get_pretty(err)); + error_free(err); + two = 1; + } + seed = two; + } while (seed == 0); + } + + /* RandomTag */ + for (i = offset = 0; i < 4; ++i) { + /* NextRandomTagBit */ + int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^ + extract32(seed, 2, 1) ^ extract32(seed, 0, 1)); + seed = (top << 15) | (seed >> 1); + offset |= top << i; + } + rtag = choose_nonexcluded_tag(start, offset, exclude); + env->cp15.rgsr_el1 = rtag | (seed << 8); + + return address_with_allocation_tag(rn, rtag); +} + +uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr, + int32_t offset, uint32_t tag_offset) +{ + int start_tag = allocation_tag_from_addr(ptr); + uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16); + int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude); + + return address_with_allocation_tag(ptr + offset, rtag); +} + +static int load_tag1(uint64_t ptr, uint8_t *mem) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + return extract32(*mem, ofs, 4); +} + +uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + int mmu_idx = cpu_mmu_index(env, false); + uint8_t *mem; + int rtag = 0; + + /* Trap if accessing an invalid page. */ + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1, + MMU_DATA_LOAD, 1, GETPC()); + + /* Load if page supports tags. */ + if (mem) { + rtag = load_tag1(ptr, mem); + } + + return address_with_allocation_tag(xt, rtag); +} + +static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra) +{ + if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) { + arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE, + cpu_mmu_index(env, false), ra); + g_assert_not_reached(); + } +} + +/* For use in a non-parallel context, store to the given nibble. */ +static void store_tag1(uint64_t ptr, uint8_t *mem, int tag) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + *mem = deposit32(*mem, ofs, 4, tag); +} + +/* For use in a parallel context, atomically store to the given nibble. */ +static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + uint8_t old = qatomic_read(mem); + + while (1) { + uint8_t new = deposit32(old, ofs, 4, tag); + uint8_t cmp = qatomic_cmpxchg(mem, old, new); + if (likely(cmp == old)) { + return; + } + old = cmp; + } +} + +typedef void stg_store1(uint64_t, uint8_t *, int); + +static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt, + uintptr_t ra, stg_store1 store1) +{ + int mmu_idx = cpu_mmu_index(env, false); + uint8_t *mem; + + check_tag_aligned(env, ptr, ra); + + /* Trap if accessing an invalid page. */ + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE, + MMU_DATA_STORE, 1, ra); + + /* Store if page supports tags. 
*/ + if (mem) { + store1(ptr, mem, allocation_tag_from_addr(xt)); + } +} + +void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_stg(env, ptr, xt, GETPC(), store_tag1); +} + +void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_stg(env, ptr, xt, GETPC(), store_tag1_parallel); +} + +void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + + check_tag_aligned(env, ptr, ra); + probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); +} + +static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt, + uintptr_t ra, stg_store1 store1) +{ + int mmu_idx = cpu_mmu_index(env, false); + int tag = allocation_tag_from_addr(xt); + uint8_t *mem1, *mem2; + + check_tag_aligned(env, ptr, ra); + + /* + * Trap if accessing an invalid page(s). + * This takes priority over !allocation_tag_access_enabled. + */ + if (ptr & TAG_GRANULE) { + /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */ + mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + TAG_GRANULE, MMU_DATA_STORE, 1, ra); + mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE, + MMU_DATA_STORE, TAG_GRANULE, + MMU_DATA_STORE, 1, ra); + + /* Store if page(s) support tags. */ + if (mem1) { + store1(TAG_GRANULE, mem1, tag); + } + if (mem2) { + store1(0, mem2, tag); + } + } else { + /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */ + mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra); + if (mem1) { + tag |= tag << 4; + qatomic_set(mem1, tag); + } + } +} + +void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_st2g(env, ptr, xt, GETPC(), store_tag1); +} + +void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel); +} + +void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + int in_page = -(ptr | TARGET_PAGE_MASK); + + check_tag_aligned(env, ptr, ra); + + if (likely(in_page >= 2 * TAG_GRANULE)) { + probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra); + } else { + probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); + probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra); + } +} + +#define LDGM_STGM_SIZE (4 << GMID_EL1_BS) + +uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + void *tag_mem; + + ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); + + /* Trap if accessing an invalid page. */ + tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, + LDGM_STGM_SIZE, MMU_DATA_LOAD, + LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); + + /* The tag is squashed to zero if the page does not support tags. */ + if (!tag_mem) { + return 0; + } + + QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); + /* + * We are loading 64-bits worth of tags. The ordering of elements + * within the word corresponds to a 64-bit little-endian operation. + */ + return ldq_le_p(tag_mem); +} + +void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + void *tag_mem; + + ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); + + /* Trap if accessing an invalid page. 
*/ + tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + LDGM_STGM_SIZE, MMU_DATA_LOAD, + LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); + + /* + * Tag store only happens if the page support tags, + * and if the OS has enabled access to the tags. + */ + if (!tag_mem) { + return; + } + + QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); + /* + * We are storing 64-bits worth of tags. The ordering of elements + * within the word corresponds to a 64-bit little-endian operation. + */ + stq_le_p(tag_mem, val); +} + +void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) +{ + uintptr_t ra = GETPC(); + int mmu_idx = cpu_mmu_index(env, false); + int log2_dcz_bytes, log2_tag_bytes; + intptr_t dcz_bytes, tag_bytes; + uint8_t *mem; + + /* + * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1, + * i.e. 32 bytes, which is an unreasonably small dcz anyway, + * to make sure that we can access one complete tag byte here. + */ + log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; + log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); + dcz_bytes = (intptr_t)1 << log2_dcz_bytes; + tag_bytes = (intptr_t)1 << log2_tag_bytes; + ptr &= -dcz_bytes; + + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes, + MMU_DATA_STORE, tag_bytes, ra); + if (mem) { + int tag_pair = (val & 0xf) * 0x11; + memset(mem, tag_pair, tag_bytes); + } +} + +static void mte_sync_check_fail(CPUARMState *env, uint32_t desc, + uint64_t dirty_ptr, uintptr_t ra) +{ + int is_write, syn; + + env->exception.vaddress = dirty_ptr; + + is_write = FIELD_EX32(desc, MTEDESC, WRITE); + syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write, + 0x11); + raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra); + g_assert_not_reached(); +} + +static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr, + uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el) +{ + int select; + + if (regime_has_2_ranges(arm_mmu_idx)) { + select = extract64(dirty_ptr, 55, 1); + } else { + select = 0; + } + env->cp15.tfsr_el[el] |= 1 << select; +#ifdef CONFIG_USER_ONLY + /* + * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, + * which then sends a SIGSEGV when the thread is next scheduled. + * This cpu will return to the main loop at the end of the TB, + * which is rather sooner than "normal". But the alternative + * is waiting until the next syscall. + */ + qemu_cpu_kick(env_cpu(env)); +#endif +} + +/* Record a tag check failure. */ +static void mte_check_fail(CPUARMState *env, uint32_t desc, + uint64_t dirty_ptr, uintptr_t ra) +{ + int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); + int el, reg_el, tcf; + uint64_t sctlr; + + reg_el = regime_el(env, arm_mmu_idx); + sctlr = env->cp15.sctlr_el[reg_el]; + + switch (arm_mmu_idx) { + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E20_0: + el = 0; + tcf = extract64(sctlr, 38, 2); + break; + default: + el = reg_el; + tcf = extract64(sctlr, 40, 2); + } + + switch (tcf) { + case 1: + /* Tag check fail causes a synchronous exception. */ + mte_sync_check_fail(env, desc, dirty_ptr, ra); + break; + + case 0: + /* + * Tag check fail does not affect the PE. + * We eliminate this case by not setting MTE_ACTIVE + * in tb_flags, so that we never make this runtime call. + */ + g_assert_not_reached(); + + case 2: + /* Tag check fail causes asynchronous flag set. 
*/ + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); + break; + + case 3: + /* + * Tag check fail causes asynchronous flag set for stores, or + * a synchronous exception for loads. + */ + if (FIELD_EX32(desc, MTEDESC, WRITE)) { + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); + } else { + mte_sync_check_fail(env, desc, dirty_ptr, ra); + } + break; + } +} + +/** + * checkN: + * @tag: tag memory to test + * @odd: true to begin testing at tags at odd nibble + * @cmp: the tag to compare against + * @count: number of tags to test + * + * Return the number of successful tests. + * Thus a return value < @count indicates a failure. + * + * A note about sizes: count is expected to be small. + * + * The most common use will be LDP/STP of two integer registers, + * which means 16 bytes of memory touching at most 2 tags, but + * often the access is aligned and thus just 1 tag. + * + * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory, + * touching at most 5 tags. SVE LDR/STR (vector) with the default + * vector length is also 64 bytes; the maximum architectural length + * is 256 bytes touching at most 9 tags. + * + * The loop below uses 7 logical operations and 1 memory operation + * per tag pair. An implementation that loads an aligned word and + * uses masking to ignore adjacent tags requires 18 logical operations + * and thus does not begin to pay off until 6 tags. + * Which, according to the survey above, is unlikely to be common. + */ +static int checkN(uint8_t *mem, int odd, int cmp, int count) +{ + int n = 0, diff; + + /* Replicate the test tag and compare. */ + cmp *= 0x11; + diff = *mem++ ^ cmp; + + if (odd) { + goto start_odd; + } + + while (1) { + /* Test even tag. */ + if (unlikely((diff) & 0x0f)) { + break; + } + if (++n == count) { + break; + } + + start_odd: + /* Test odd tag. */ + if (unlikely((diff) & 0xf0)) { + break; + } + if (++n == count) { + break; + } + + diff = *mem++ ^ cmp; + } + return n; +} + +/** + * mte_probe_int() - helper for mte_probe and mte_check + * @env: CPU environment + * @desc: MTEDESC descriptor + * @ptr: virtual address of the base of the access + * @fault: return virtual address of the first check failure + * + * Internal routine for both mte_probe and mte_check. + * Return zero on failure, filling in *fault. + * Return negative on trivial success for tbi disabled. + * Return positive on success with tbi enabled. + */ +static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr, + uintptr_t ra, uint64_t *fault) +{ + int mmu_idx, ptr_tag, bit55; + uint64_t ptr_last, prev_page, next_page; + uint64_t tag_first, tag_last; + uint64_t tag_byte_first, tag_byte_last; + uint32_t sizem1, tag_count, tag_size, n, c; + uint8_t *mem1, *mem2; + MMUAccessType type; + + bit55 = extract64(ptr, 55, 1); + *fault = ptr; + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ + if (unlikely(!tbi_check(desc, bit55))) { + return -1; + } + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + return 1; + } + + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD; + sizem1 = FIELD_EX32(desc, MTEDESC, SIZEM1); + + /* Find the addr of the end of the access */ + ptr_last = ptr + sizem1; + + /* Round the bounds to the tag granule, and compute the number of tags. 
*/ + tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE); + tag_last = QEMU_ALIGN_DOWN(ptr_last, TAG_GRANULE); + tag_count = ((tag_last - tag_first) / TAG_GRANULE) + 1; + + /* Round the bounds to twice the tag granule, and compute the bytes. */ + tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE); + tag_byte_last = QEMU_ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE); + + /* Locate the page boundaries. */ + prev_page = ptr & TARGET_PAGE_MASK; + next_page = prev_page + TARGET_PAGE_SIZE; + + if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) { + /* Memory access stays on one page. */ + tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1; + mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1, + MMU_DATA_LOAD, tag_size, ra); + if (!mem1) { + return 1; + } + /* Perform all of the comparisons. */ + n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count); + } else { + /* Memory access crosses to next page. */ + tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE); + mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr, + MMU_DATA_LOAD, tag_size, ra); + + tag_size = ((tag_byte_last - next_page) / (2 * TAG_GRANULE)) + 1; + mem2 = allocation_tag_mem(env, mmu_idx, next_page, type, + ptr_last - next_page + 1, + MMU_DATA_LOAD, tag_size, ra); + + /* + * Perform all of the comparisons. + * Note the possible but unlikely case of the operation spanning + * two pages that do not both have tagging enabled. + */ + n = c = (next_page - tag_first) / TAG_GRANULE; + if (mem1) { + n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c); + } + if (n == c) { + if (!mem2) { + return 1; + } + n += checkN(mem2, 0, ptr_tag, tag_count - c); + } + } + + if (likely(n == tag_count)) { + return 1; + } + + /* + * If we failed, we know which granule. For the first granule, the + * failure address is @ptr, the first byte accessed. Otherwise the + * failure address is the first byte of the nth granule. + */ + if (n > 0) { + *fault = tag_first + n * TAG_GRANULE; + } + return 0; +} + +uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra) +{ + uint64_t fault; + int ret = mte_probe_int(env, desc, ptr, ra, &fault); + + if (unlikely(ret == 0)) { + mte_check_fail(env, desc, fault, ra); + } else if (ret < 0) { + return ptr; + } + return useronly_clean_ptr(ptr); +} + +uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + return mte_check(env, desc, ptr, GETPC()); +} + +/* + * No-fault version of mte_check, to be used by SVE for MemSingleNF. + * Returns false if the access is Checked and the check failed. This + * is only intended to probe the tag -- the validity of the page must + * be checked beforehand. + */ +bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + uint64_t fault; + int ret = mte_probe_int(env, desc, ptr, 0, &fault); + + return ret != 0; +} + +/* + * Perform an MTE checked access for DC_ZVA. + */ +uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + uintptr_t ra = GETPC(); + int log2_dcz_bytes, log2_tag_bytes; + int mmu_idx, bit55; + intptr_t dcz_bytes, tag_bytes, i; + void *mem; + uint64_t ptr_tag, mem_tag, align_ptr; + + bit55 = extract64(ptr, 55, 1); + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ + if (unlikely(!tbi_check(desc, bit55))) { + return ptr; + } + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + goto done; + } + + /* + * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1, + * i.e. 
32 bytes, which is an unreasonably small dcz anyway, to make + * sure that we can access one complete tag byte here. + */ + log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; + log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); + dcz_bytes = (intptr_t)1 << log2_dcz_bytes; + tag_bytes = (intptr_t)1 << log2_tag_bytes; + align_ptr = ptr & -dcz_bytes; + + /* + * Trap if accessing an invalid page. DC_ZVA requires that we supply + * the original pointer for an invalid page. But watchpoints require + * that we probe the actual space. So do both. + */ + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + (void) probe_write(env, ptr, 1, mmu_idx, ra); + mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE, + dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra); + if (!mem) { + goto done; + } + + /* + * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus + * it is quite easy to perform all of the comparisons at once without + * any extra masking. + * + * The most common zva block size is 64; some of the thunderx cpus use + * a block size of 128. For user-only, aarch64_max_initfn will set the + * block size to 512. Fill out the other cases for future-proofing. + * + * In order to be able to find the first miscompare later, we want the + * tag bytes to be in little-endian order. + */ + switch (log2_tag_bytes) { + case 0: /* zva_blocksize 32 */ + mem_tag = *(uint8_t *)mem; + ptr_tag *= 0x11u; + break; + case 1: /* zva_blocksize 64 */ + mem_tag = cpu_to_le16(*(uint16_t *)mem); + ptr_tag *= 0x1111u; + break; + case 2: /* zva_blocksize 128 */ + mem_tag = cpu_to_le32(*(uint32_t *)mem); + ptr_tag *= 0x11111111u; + break; + case 3: /* zva_blocksize 256 */ + mem_tag = cpu_to_le64(*(uint64_t *)mem); + ptr_tag *= 0x1111111111111111ull; + break; + + default: /* zva_blocksize 512, 1024, 2048 */ + ptr_tag *= 0x1111111111111111ull; + i = 0; + do { + mem_tag = cpu_to_le64(*(uint64_t *)(mem + i)); + if (unlikely(mem_tag != ptr_tag)) { + goto fail; + } + i += 8; + align_ptr += 16 * TAG_GRANULE; + } while (i < tag_bytes); + goto done; + } + + if (likely(mem_tag == ptr_tag)) { + goto done; + } + + fail: + /* Locate the first nibble that differs. */ + i = ctz64(mem_tag ^ ptr_tag) >> 4; + mte_check_fail(env, desc, align_ptr + i * TAG_GRANULE, ra); + + done: + return useronly_clean_ptr(ptr); +} diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c new file mode 100644 index 0000000..403b345 --- /dev/null +++ b/target/arm/tcg/mve_helper.c @@ -0,0 +1,3450 @@ +/* + * M-profile MVE Operations + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "vec_internal.h" +#include "exec/helper-proto.h" +#include "exec/cpu_ldst.h" +#include "exec/exec-all.h" +#include "tcg/tcg.h" +#include "fpu/softfloat.h" + +static uint16_t mve_eci_mask(CPUARMState *env) +{ + /* + * Return the mask of which elements in the MVE vector correspond + * to beats being executed. The mask has 1 bits for executed lanes + * and 0 bits where ECI says this beat was already executed. + */ + int eci; + + if ((env->condexec_bits & 0xf) != 0) { + return 0xffff; + } + + eci = env->condexec_bits >> 4; + switch (eci) { + case ECI_NONE: + return 0xffff; + case ECI_A0: + return 0xfff0; + case ECI_A0A1: + return 0xff00; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return 0xf000; + default: + g_assert_not_reached(); + } +} + +static uint16_t mve_element_mask(CPUARMState *env) +{ + /* + * Return the mask of which elements in the MVE vector should be + * updated. This is a combination of multiple things: + * (1) by default, we update every lane in the vector + * (2) VPT predication stores its state in the VPR register; + * (3) low-overhead-branch tail predication will mask out part + * the vector on the final iteration of the loop + * (4) if EPSR.ECI is set then we must execute only some beats + * of the insn + * We combine all these into a 16-bit result with the same semantics + * as VPR.P0: 0 to mask the lane, 1 if it is active. + * 8-bit vector ops will look at all bits of the result; + * 16-bit ops will look at bits 0, 2, 4, ...; + * 32-bit ops will look at bits 0, 4, 8 and 12. + * Compare pseudocode GetCurInstrBeat(), though that only returns + * the 4-bit slice of the mask corresponding to a single beat. + */ + uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); + + if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) { + mask |= 0xff; + } + if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) { + mask |= 0xff00; + } + + if (env->v7m.ltpsize < 4 && + env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) { + /* + * Tail predication active, and this is the last loop iteration. + * The element size is (1 << ltpsize), and we only want to process + * loopcount elements, so we want to retain the least significant + * (loopcount * esize) predicate bits and zero out bits above that. + */ + int masklen = env->regs[14] << env->v7m.ltpsize; + assert(masklen <= 16); + uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; + mask &= ltpmask; + } + + /* + * ECI bits indicate which beats are already executed; + * we handle this by effectively predicating them out. + */ + mask &= mve_eci_mask(env); + return mask; +} + +static void mve_advance_vpt(CPUARMState *env) +{ + /* Advance the VPT and ECI state if necessary */ + uint32_t vpr = env->v7m.vpr; + unsigned mask01, mask23; + uint16_t inv_mask; + uint16_t eci_mask = mve_eci_mask(env); + + if ((env->condexec_bits & 0xf) == 0) { + env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ? 
+ (ECI_A0 << 4) : (ECI_NONE << 4); + } + + if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) { + /* VPT not enabled, nothing to do */ + return; + } + + /* Invert P0 bits if needed, but only for beats we actually executed */ + mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01); + mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23); + /* Start by assuming we invert all bits corresponding to executed beats */ + inv_mask = eci_mask; + if (mask01 <= 8) { + /* MASK01 says don't invert low half of P0 */ + inv_mask &= ~0xff; + } + if (mask23 <= 8) { + /* MASK23 says don't invert high half of P0 */ + inv_mask &= ~0xff00; + } + vpr ^= inv_mask; + /* Only update MASK01 if beat 1 executed */ + if (eci_mask & 0xf0) { + vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1); + } + /* Beat 3 always executes, so update MASK23 */ + vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1); + env->v7m.vpr = vpr; +} + +/* For loads, predicated lanes are zeroed instead of keeping their old values */ +#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned b, e; \ + /* \ + * R_SXTM allows the dest reg to become UNKNOWN for abandoned \ + * beats so we don't care if we update part of the dest and \ + * then take an exception. \ + */ \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (eci_mask & (1 << b)) { \ + d[H##ESIZE(e)] = (mask & (1 << b)) ? \ + cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned b, e; \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (mask & (1 << b)) { \ + cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +DO_VLDR(vldrb, 1, ldub, 1, uint8_t) +DO_VLDR(vldrh, 2, lduw, 2, uint16_t) +DO_VLDR(vldrw, 4, ldl, 4, uint32_t) + +DO_VSTR(vstrb, 1, stb, 1, uint8_t) +DO_VSTR(vstrh, 2, stw, 2, uint16_t) +DO_VSTR(vstrw, 4, stl, 4, uint32_t) + +DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) +DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) +DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) +DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) +DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) +DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) + +DO_VSTR(vstrb_h, 1, stb, 2, int16_t) +DO_VSTR(vstrb_w, 1, stb, 4, int32_t) +DO_VSTR(vstrh_w, 2, stw, 4, int32_t) + +#undef DO_VLDR +#undef DO_VSTR + +/* + * Gather loads/scatter stores. Here each element of Qm specifies + * an offset to use from the base register Rm. In the _os_ versions + * that offset is scaled by the element size. + * For loads, predicated lanes are zeroed instead of retaining + * their previous values. + */ +#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + TYPE *d = vd; \ + OFFTYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H##ESIZE(e)]); \ + d[H##ESIZE(e)] = (mask & 1) ? 
\ + cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ + if (WB) { \ + m[H##ESIZE(e)] = addr; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +/* We know here TYPE is unsigned so always the same as the offset type */ +#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + TYPE *d = vd; \ + TYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H##ESIZE(e)]); \ + if (mask & 1) { \ + cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + } \ + if (WB) { \ + m[H##ESIZE(e)] = addr; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +/* + * 64-bit accesses are slightly different: they are done as two 32-bit + * accesses, controlled by the predicate mask for the relevant beat, + * and with a single 32-bit offset in the first of the two Qm elements. + * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little). + * Address writeback happens on the odd beats and updates the address + * stored in the even-beat element. + */ +#define DO_VLDR64_SG(OP, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + uint32_t *d = vd; \ + uint32_t *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H4(e & ~1)]); \ + addr += 4 * (e & 1); \ + d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \ + if (WB && (e & 1)) { \ + m[H4(e & ~1)] = addr - 4; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSTR64_SG(OP, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + uint32_t *d = vd; \ + uint32_t *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H4(e & ~1)]); \ + addr += 4 * (e & 1); \ + if (mask & 1) { \ + cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \ + } \ + if (WB && (e & 1)) { \ + m[H4(e & ~1)] = addr - 4; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET)) +#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1)) +#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2)) +#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3)) + +DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false) + +DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false) + +DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false) 
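/*
 * Illustrative sketch only (a hypothetical standalone helper, not one of
 * the generated gather/scatter helpers): the "_os_" forms differ from the
 * plain forms only in that each element's offset is scaled by the element
 * size before being added to the base register -- ADDR_ADD_OSH shifts a
 * halfword offset left by 1, ADDR_ADD_OSW by 2, ADDR_ADD_OSD by 3.
 */
#include <stdint.h>

static inline uint32_t example_scaled_halfword_addr(uint32_t base,
                                                    uint16_t offset)
{
    /* Same arithmetic as ADDR_ADD_OSH(base, offset): base + (offset << 1) */
    return base + ((uint32_t)offset << 1);
}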
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false) +DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false) + +DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false) +DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false) + +DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false) +DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false) + +DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) +DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true) +DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) +DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) + +/* + * Deinterleaving loads/interleaving stores. + * + * For these helpers we are passed the index of the first Qreg + * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3) + * and the value of the base address register Rn. + * The helpers are specialized for pattern and element size, so + * for instance vld42h is VLD4 with pattern 2, element size MO_16. + * + * These insns are beatwise but not predicated, so we must honour ECI, + * but need not look at mve_element_mask(). + * + * The pseudocode implements these insns with multiple memory accesses + * of the element size, but rules R_VVVG and R_FXDM permit us to make + * one 32-bit memory access per beat. 
+ */ +#define DO_VLD4B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 4; e++, data >>= 8) { \ + uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ + qd[H1(off[beat])] = data; \ + } \ + } \ + } + +#define DO_VLD4H(OP, O1, O2) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O1, O2, O2 }; \ + uint32_t addr, data; \ + int y; /* y counts 0 2 0 2 */ \ + uint16_t *qd; \ + for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 8 + (beat & 1) * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ + qd[H2(off[beat])] = data; \ + data >>= 16; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ + qd[H2(off[beat])] = data; \ + } \ + } + +#define DO_VLD4W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + int y; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + y = (beat + (O1 & 2)) & 3; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ + qd[H4(off[beat] >> 2)] = data; \ + } \ + } + +DO_VLD4B(vld40b, 0, 1, 10, 11) +DO_VLD4B(vld41b, 2, 3, 12, 13) +DO_VLD4B(vld42b, 4, 5, 14, 15) +DO_VLD4B(vld43b, 6, 7, 8, 9) + +DO_VLD4H(vld40h, 0, 5) +DO_VLD4H(vld41h, 1, 6) +DO_VLD4H(vld42h, 2, 7) +DO_VLD4H(vld43h, 3, 4) + +DO_VLD4W(vld40w, 0, 1, 10, 11) +DO_VLD4W(vld41w, 2, 3, 12, 13) +DO_VLD4W(vld42w, 4, 5, 14, 15) +DO_VLD4W(vld43w, 6, 7, 8, 9) + +#define DO_VLD2B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint8_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 2; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 4; e++, data >>= 8) { \ + qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ + qd[H1(off[beat] + (e >> 1))] = data; \ + } \ + } \ + } + +#define DO_VLD2H(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + int e; \ + uint16_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 2; e++, data >>= 16) 
{ \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ + qd[H2(off[beat])] = data; \ + } \ + } \ + } + +#define DO_VLD2W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat]; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ + qd[H4(off[beat] >> 3)] = data; \ + } \ + } + +DO_VLD2B(vld20b, 0, 2, 12, 14) +DO_VLD2B(vld21b, 4, 6, 8, 10) + +DO_VLD2H(vld20h, 0, 1, 6, 7) +DO_VLD2H(vld21h, 2, 3, 4, 5) + +DO_VLD2W(vld20w, 0, 4, 24, 28) +DO_VLD2W(vld21w, 8, 12, 16, 20) + +#define DO_VST4B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = 0; \ + for (e = 3; e >= 0; e--) { \ + uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ + data = (data << 8) | qd[H1(off[beat])]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST4H(OP, O1, O2) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O1, O2, O2 }; \ + uint32_t addr, data; \ + int y; /* y counts 0 2 0 2 */ \ + uint16_t *qd; \ + for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 8 + (beat & 1) * 4; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ + data = qd[H2(off[beat])]; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ + data |= qd[H2(off[beat])] << 16; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST4W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + int y; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + y = (beat + (O1 & 2)) & 3; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ + data = qd[H4(off[beat] >> 2)]; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +DO_VST4B(vst40b, 0, 1, 10, 11) +DO_VST4B(vst41b, 2, 3, 12, 13) +DO_VST4B(vst42b, 4, 5, 14, 15) +DO_VST4B(vst43b, 6, 7, 8, 9) + +DO_VST4H(vst40h, 0, 5) +DO_VST4H(vst41h, 1, 6) +DO_VST4H(vst42h, 2, 7) +DO_VST4H(vst43h, 3, 4) + +DO_VST4W(vst40w, 0, 1, 10, 11) +DO_VST4W(vst41w, 2, 3, 12, 13) +DO_VST4W(vst42w, 4, 5, 14, 15) +DO_VST4W(vst43w, 6, 7, 8, 9) + +#define DO_VST2B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint8_t *qd; \ + for (beat = 0; beat 
< 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 2; \ + data = 0; \ + for (e = 3; e >= 0; e--) { \ + qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ + data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST2H(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + int e; \ + uint16_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = 0; \ + for (e = 1; e >= 0; e--) { \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ + data = (data << 16) | qd[H2(off[beat])]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST2W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat]; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ + data = qd[H4(off[beat] >> 3)]; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +DO_VST2B(vst20b, 0, 2, 12, 14) +DO_VST2B(vst21b, 4, 6, 8, 10) + +DO_VST2H(vst20h, 0, 1, 6, 7) +DO_VST2H(vst21h, 2, 3, 4, 5) + +DO_VST2W(vst20w, 0, 4, 24, 28) +DO_VST2W(vst21w, 8, 12, 16, 20) + +/* + * The mergemask(D, R, M) macro performs the operation "*D = R" but + * storing only the bytes which correspond to 1 bits in M, + * leaving other bytes in *D unchanged. We use _Generic + * to select the correct implementation based on the type of D. 
+ */ + +static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask) +{ + if (mask & 1) { + *d = r; + } +} + +static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask) +{ + mergemask_ub((uint8_t *)d, r, mask); +} + +static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask) +{ + uint16_t bmask = expand_pred_b(mask); + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask) +{ + mergemask_uh((uint16_t *)d, r, mask); +} + +static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask) +{ + uint32_t bmask = expand_pred_b(mask); + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask) +{ + mergemask_uw((uint32_t *)d, r, mask); +} + +static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask) +{ + uint64_t bmask = expand_pred_b(mask); + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) +{ + mergemask_uq((uint64_t *)d, r, mask); +} + +#define mergemask(D, R, M) \ + _Generic(D, \ + uint8_t *: mergemask_ub, \ + int8_t *: mergemask_sb, \ + uint16_t *: mergemask_uh, \ + int16_t *: mergemask_sh, \ + uint32_t *: mergemask_uw, \ + int32_t *: mergemask_sw, \ + uint64_t *: mergemask_uq, \ + int64_t *: mergemask_sq)(D, R, M) + +void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val) +{ + /* + * The generated code already replicated an 8 or 16 bit constant + * into the 32-bit value, so we only need to write the 32-bit + * value to all elements of the Qreg, allowing for predication. + */ + uint32_t *d = vd; + uint16_t mask = mve_element_mask(env); + unsigned e; + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + mergemask(&d[H4(e)], val, mask); + } + mve_advance_vpt(env); +} + +#define DO_1OP(OP, ESIZE, TYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_CLS_B(N) (clrsb32(N) - 24) +#define DO_CLS_H(N) (clrsb32(N) - 16) + +DO_1OP(vclsb, 1, int8_t, DO_CLS_B) +DO_1OP(vclsh, 2, int16_t, DO_CLS_H) +DO_1OP(vclsw, 4, int32_t, clrsb32) + +#define DO_CLZ_B(N) (clz32(N) - 24) +#define DO_CLZ_H(N) (clz32(N) - 16) + +DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) +DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) +DO_1OP(vclzw, 4, uint32_t, clz32) + +DO_1OP(vrev16b, 2, uint16_t, bswap16) +DO_1OP(vrev32b, 4, uint32_t, bswap32) +DO_1OP(vrev32h, 4, uint32_t, hswap32) +DO_1OP(vrev64b, 8, uint64_t, bswap64) +DO_1OP(vrev64h, 8, uint64_t, hswap64) +DO_1OP(vrev64w, 8, uint64_t, wswap64) + +#define DO_NOT(N) (~(N)) + +DO_1OP(vmvn, 8, uint64_t, DO_NOT) + +#define DO_ABS(N) ((N) < 0 ? 
-(N) : (N)) +#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff)) +#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff)) + +DO_1OP(vabsb, 1, int8_t, DO_ABS) +DO_1OP(vabsh, 2, int16_t, DO_ABS) +DO_1OP(vabsw, 4, int32_t, DO_ABS) + +/* We can do these 64 bits at a time */ +DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) +DO_1OP(vfabss, 8, uint64_t, DO_FABSS) + +#define DO_NEG(N) (-(N)) +#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000)) +#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000)) + +DO_1OP(vnegb, 1, int8_t, DO_NEG) +DO_1OP(vnegh, 2, int16_t, DO_NEG) +DO_1OP(vnegw, 4, int32_t, DO_NEG) + +/* We can do these 64 bits at a time */ +DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) +DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) + +/* + * 1 operand immediates: Vda is destination and possibly also one source. + * All these insns work at 64-bit widths. + */ +#define DO_1OP_IMM(OP, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \ + { \ + uint64_t *da = vda; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ + mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_MOVI(N, I) (I) +#define DO_ANDI(N, I) ((N) & (I)) +#define DO_ORRI(N, I) ((N) | (I)) + +DO_1OP_IMM(vmovi, DO_MOVI) +DO_1OP_IMM(vandi, DO_ANDI) +DO_1OP_IMM(vorri, DO_ORRI) + +#define DO_2OP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_U(OP, FN) \ + DO_2OP(OP##b, 1, uint8_t, FN) \ + DO_2OP(OP##h, 2, uint16_t, FN) \ + DO_2OP(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_S(OP, FN) \ + DO_2OP(OP##b, 1, int8_t, FN) \ + DO_2OP(OP##h, 2, int16_t, FN) \ + DO_2OP(OP##w, 4, int32_t, FN) + +/* + * "Long" operations where two half-sized inputs (taken from either the + * top or the bottom of the input vector) produce a double-width result. + * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output. 
+ */ +#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \ + m[H##ESIZE(le * 2 + TOP)]); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_SAT_U(OP, FN) \ + DO_2OP_SAT(OP##b, 1, uint8_t, FN) \ + DO_2OP_SAT(OP##h, 2, uint16_t, FN) \ + DO_2OP_SAT(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_SAT_S(OP, FN) \ + DO_2OP_SAT(OP##b, 1, int8_t, FN) \ + DO_2OP_SAT(OP##h, 2, int16_t, FN) \ + DO_2OP_SAT(OP##w, 4, int32_t, FN) + +#define DO_AND(N, M) ((N) & (M)) +#define DO_BIC(N, M) ((N) & ~(M)) +#define DO_ORR(N, M) ((N) | (M)) +#define DO_ORN(N, M) ((N) | ~(M)) +#define DO_EOR(N, M) ((N) ^ (M)) + +DO_2OP(vand, 8, uint64_t, DO_AND) +DO_2OP(vbic, 8, uint64_t, DO_BIC) +DO_2OP(vorr, 8, uint64_t, DO_ORR) +DO_2OP(vorn, 8, uint64_t, DO_ORN) +DO_2OP(veor, 8, uint64_t, DO_EOR) + +#define DO_ADD(N, M) ((N) + (M)) +#define DO_SUB(N, M) ((N) - (M)) +#define DO_MUL(N, M) ((N) * (M)) + +DO_2OP_U(vadd, DO_ADD) +DO_2OP_U(vsub, DO_SUB) +DO_2OP_U(vmul, DO_MUL) + +DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL) + +DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) + +/* + * Polynomial multiply. We can always do this generating 64 bits + * of the result at a time, so we don't need to use DO_2OP_L. + */ +#define VMULLPH_MASK 0x00ff00ff00ff00ffULL +#define VMULLPW_MASK 0x0000ffff0000ffffULL +#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) +#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) +#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) +#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) + +DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) +DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) +DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) +DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) + +/* + * Because the computation type is at least twice as large as required, + * these work for both signed and unsigned source types. 
+ */ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_w(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +static inline uint8_t do_rmulh_b(int32_t n, int32_t m) +{ + return (n * m + (1U << 7)) >> 8; +} + +static inline uint16_t do_rmulh_h(int32_t n, int32_t m) +{ + return (n * m + (1U << 15)) >> 16; +} + +static inline uint32_t do_rmulh_w(int64_t n, int64_t m) +{ + return (n * m + (1U << 31)) >> 32; +} + +DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) +DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) +DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) +DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) +DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) +DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) + +DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b) +DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h) +DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) +DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) +DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) +DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) + +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) + +DO_2OP_S(vmaxs, DO_MAX) +DO_2OP_U(vmaxu, DO_MAX) +DO_2OP_S(vmins, DO_MIN) +DO_2OP_U(vminu, DO_MIN) + +#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) + +DO_2OP_S(vabds, DO_ABD) +DO_2OP_U(vabdu, DO_ABD) + +static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n + m) >> 1; +} + +static inline int32_t do_vhadd_s(int32_t n, int32_t m) +{ + return ((int64_t)n + m) >> 1; +} + +static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n - m) >> 1; +} + +static inline int32_t do_vhsub_s(int32_t n, int32_t m) +{ + return ((int64_t)n - m) >> 1; +} + +DO_2OP_S(vhadds, do_vhadd_s) +DO_2OP_U(vhaddu, do_vhadd_u) +DO_2OP_S(vhsubs, do_vhsub_s) +DO_2OP_U(vhsubu, do_vhsub_u) + +#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) +#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) + +DO_2OP_S(vshls, DO_VSHLS) +DO_2OP_U(vshlu, DO_VSHLU) +DO_2OP_S(vrshls, DO_VRSHLS) +DO_2OP_U(vrshlu, DO_VRSHLU) + +#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1) +#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1) + +DO_2OP_S(vrhadds, DO_RHADD_S) +DO_2OP_U(vrhaddu, DO_RHADD_U) + +static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m, + uint32_t inv, uint32_t carry_in, bool update_flags) +{ + uint16_t mask = mve_element_mask(env); + unsigned e; + + /* If any additions trigger, we will update flags. */ + if (mask & 0x1111) { + update_flags = true; + } + + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + uint64_t r = carry_in; + r += n[H4(e)]; + r += m[H4(e)] ^ inv; + if (mask & 1) { + carry_in = r >> 32; + } + mergemask(&d[H4(e)], r, mask); + } + + if (update_flags) { + /* Store C, clear NZV. 
*/ + env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK; + env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C; + } + mve_advance_vpt(env); +} + +void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, 0, carry_in, false); +} + +void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, -1, carry_in, false); +} + + +void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, 0, 0, true); +} + +void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, -1, 1, true); +} + +#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE r[16 / ESIZE]; \ + /* Calculate all results first to avoid overwriting inputs */ \ + for (e = 0; e < 16 / ESIZE; e++) { \ + if (!(e & 1)) { \ + r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \ + } else { \ + r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \ + } \ + } \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], r[e], mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VCADD_ALL(OP, FN0, FN1) \ + DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \ + DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \ + DO_VCADD(OP##w, 4, int32_t, FN0, FN1) + +DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) +DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) +DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s) +DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s) + +static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) +{ + if (val > max) { + *s = true; + return max; + } else if (val < min) { + *s = true; + return min; + } + return val; +} + +#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s) +#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s) +#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s) + +#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s) +#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s) +#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s) + +#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s) +#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s) +#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s) + +#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s) +#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) +#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) + +/* + * For QDMULH and QRDMULH we simplify "double and shift by esize" into + * "shift by esize-1", adjusting the QRDMULH rounding constant to match. 
+ */ +#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ + INT32_MIN, INT32_MAX, s) + +#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ + INT32_MIN, INT32_MAX, s) + +DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W) + +DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) + +DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) + +/* + * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs() + * and friends wanting a uint32_t* sat and our needing a bool*. + */ +#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \ + ({ \ + uint32_t su32 = 0; \ + typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \ + if (su32) { \ + *satp = true; \ + } \ + r; \ + }) + +#define DO_SQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) +#define DO_UQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) +#define DO_SQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp) +#define DO_UQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp) +#define DO_SUQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp) + +DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) +DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) +DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) +DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) + +/* + * Multiply add dual returning high half + * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of + * whether to add the rounding constant, and the pointer to the + * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant", + * saturate to twice the input size and return the high half; or + * (A * B - C * D) etc for VQDMLSDH. 
+ */ +#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + if ((e & 1) == XCHG) { \ + TYPE r = FN(n[H##ESIZE(e)], \ + m[H##ESIZE(e - XCHG)], \ + n[H##ESIZE(e + (1 - 2 * XCHG))], \ + m[H##ESIZE(e + (1 - XCHG))], \ + ROUND, &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* + * Architecturally we should do the entire add, double, round + * and then check for saturation. We do three saturating adds, + * but we need to be careful about the order. If the first + * m1 + m2 saturates then it's impossible for the *2+rc to + * bring it back into the non-saturated range. However, if + * m1 + m2 is negative then it's possible that doing the doubling + * would take the intermediate result below INT64_MAX and the + * addition of the rounding constant then brings it back in range. + * So we add half the rounding constant before doubling rather + * than adding the rounding constant after the doubling. + */ + if (sadd64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* The same ordering issue as in do_vqdmladh_w applies here too */ + if (ssub64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) +DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w) +DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w) + +DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w) + +#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \ + mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* "accumulating" version where FN takes d as well as n and m */ +#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], \ + FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \ + mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op scalar helpers for all sizes */ +#define DO_2OP_SCALAR_U(OP, 
FN) \ + DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) +#define DO_2OP_SCALAR_S(OP, FN) \ + DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, int32_t, FN) + +#define DO_2OP_ACC_SCALAR_U(OP, FN) \ + DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN) \ + DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN) \ + DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN) + +DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) +DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) +DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) +DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s) +DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) +DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) +DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) + +DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) + +DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) + +static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat) +{ + int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c, + int round, bool *sat) +{ + int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c, + int round, bool *sat) +{ + /* + * Architecturally we should do the entire add, double, round + * and then check for saturation. We do three saturating adds, + * but we need to be careful about the order. If the first + * m1 + m2 saturates then it's impossible for the *2+rc to + * bring it back into the non-saturated range. However, if + * m1 + m2 is negative then it's possible that doing the doubling + * would take the intermediate result below INT64_MAX and the + * addition of the rounding constant then brings it back in range. + * So we add half the rounding constant and half the "c << esize" + * before doubling rather than adding the rounding constant after + * the doubling. + */ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c << 31; + int64_t r; + if (sadd64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +/* + * The *MLAH insns are vector * scalar + vector; + * the *MLASH insns are vector * vector + scalar + */ +#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S) +#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S) +#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S) +#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S) +#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S) +#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S) + +#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S) +#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S) +#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S) +#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S) +#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S) +#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S) + +DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B) +DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H) +DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W) + +DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B) +DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H) +DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W) + +/* Vector by scalar plus vector */ +#define DO_VMLA(D, N, M) ((N) * (M) + (D)) + +DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA) + +/* Vector by vector plus scalar */ +#define DO_VMLAS(D, N, M) ((N) * (D) + (M)) + +DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS) + +/* + * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the + * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. + * SATMASK specifies which bits of the predicate mask matter for determining + * whether to propagate a saturation indication into FPSCR.QC -- for + * the 16x16->32 case we must check only the bit corresponding to the T or B + * half that we used, but for the 32x32->64 case we propagate if the mask + * bit is set for either half. 
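+ * Concretely, SATMASK16B below is 1 (the predicate bit for byte 0, i.e. the
+ * bottom halfword), SATMASK16T is 1 << 2 (byte 2, the top halfword), and
+ * SATMASK32 is (1 << 4) | 1, so a saturation is reported if the predicate
+ * bit for either 32-bit half of the 64-bit result is set.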
+ */ +#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat) +{ + int64_t r = ((int64_t)n * m) * 2; + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat); +} + +static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat) +{ + /* The multiply can't overflow, but the doubling might */ + int64_t r = (int64_t)n * m; + if (r > INT64_MAX / 2) { + *sat = true; + return INT64_MAX; + } else if (r < INT64_MIN / 2) { + *sat = true; + return INT64_MIN; + } else { + return r * 2; + } +} + +#define SATMASK16B 1 +#define SATMASK16T (1 << 2) +#define SATMASK32 ((1 << 4) | 1) + +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16B) +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16T) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) + +/* + * Long saturating ops + */ +#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \ + LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \ + mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B) +DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) +DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T) +DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) + +static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit8(n); + if (m < 8) { + n >>= 8 - m; + } + return n; +} + +static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit16(n); + if (m < 16) { + n >>= 16 - m; + } + return n; +} + +static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit32(n); + if (m < 32) { + n >>= 32 - m; + } + return n; +} + +DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb) +DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh) +DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw) + +/* + * Multiply add long dual accumulate ops. 
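+ * EVENACC and ODDACC below give the operator applied at even and odd element
+ * positions (the VMLSLDAV forms subtract the odd-position products), and
+ * XCHG selects the exchanging forms, which pair each element of Qm with the
+ * other element of the adjacent even/odd pair in Qn.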
+ */ +#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=) +DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=) +DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=) +DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) + +DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) +DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) + +DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) +DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) +DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) +DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) + +/* + * Multiply add dual accumulate ops + */ +#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint32_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC) + +#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC) + +DO_DAV_S(vmladavs, false, +=, +=) +DO_DAV_U(vmladavu, false, +=, +=) +DO_DAV_S(vmlsdav, false, +=, -=) +DO_DAV_S(vmladavsx, true, +=, +=) +DO_DAV_S(vmlsdavx, true, +=, -=) + +/* + * Rounding multiply add long dual accumulate high. In the pseudocode + * this is implemented with a 72-bit internal accumulator value of which + * the top 64 bits are returned. We optimize this to avoid having to + * use 128-bit arithmetic -- we can do this because the 74-bit accumulator + * is squashed back into 64-bits after each beat. 
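+ * Concretely, each beat below adds (mul >> 8) + ((mul >> 7) & 1) to the
+ * running total, i.e. the product's contribution to the top 64 bits of the
+ * notional wide accumulator, rounded on the 8 bits discarded, so the total
+ * never needs more than 64 bits.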
+ */ +#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ + if (mask & 1) { \ + LTYPE mul; \ + if (e & 1) { \ + mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \ + if (SUB) { \ + mul = -mul; \ + } \ + } else { \ + mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \ + } \ + mul = (mul >> 8) + ((mul >> 7) & 1); \ + a += mul; \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false) +DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false) + +DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false) + +DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true) +DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true) + +/* Vector add across vector */ +#define DO_VADDV(OP, ESIZE, TYPE) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + ra += m[H##ESIZE(e)]; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +DO_VADDV(vaddvsb, 1, int8_t) +DO_VADDV(vaddvsh, 2, int16_t) +DO_VADDV(vaddvsw, 4, int32_t) +DO_VADDV(vaddvub, 1, uint8_t) +DO_VADDV(vaddvuh, 2, uint16_t) +DO_VADDV(vaddvuw, 4, uint32_t) + +/* + * Vector max/min across vector. Unlike VADDV, we must + * read ra as the element size, not its full width. + * We work with int64_t internally for simplicity. + */ +#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra_in) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + int64_t ra = (RATYPE)ra_in; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + ra = FN(ra, m[H##ESIZE(e)]); \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +#define DO_VMAXMINV_U(INSN, FN) \ + DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \ + DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \ + DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN) +#define DO_VMAXMINV_S(INSN, FN) \ + DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \ + DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \ + DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN) + +/* + * Helpers for max and min of absolute values across vector: + * note that we only take the absolute value of 'm', not 'n' + */ +static int64_t do_maxa(int64_t n, int64_t m) +{ + if (m < 0) { + m = -m; + } + return MAX(n, m); +} + +static int64_t do_mina(int64_t n, int64_t m) +{ + if (m < 0) { + m = -m; + } + return MIN(n, m); +} + +DO_VMAXMINV_S(vmaxvs, DO_MAX) +DO_VMAXMINV_U(vmaxvu, DO_MAX) +DO_VMAXMINV_S(vminvs, DO_MIN) +DO_VMAXMINV_U(vminvu, DO_MIN) +/* + * VMAXAV, VMINAV treat the general purpose input as unsigned + * and the vector elements as signed. 
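+ * For example, in the byte form (vmaxavb below) Ra = 200 and a vector
+ * element of -100 gives do_maxa(200, -100) = MAX(200, 100) = 200.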
+ */ +DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa) +DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa) +DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa) +DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina) +DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina) +DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina) + +#define DO_VABAV(OP, ESIZE, TYPE) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint32_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm, *n = vn; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + int64_t n0 = n[H##ESIZE(e)]; \ + int64_t m0 = m[H##ESIZE(e)]; \ + uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \ + ra += r; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } + +DO_VABAV(vabavsb, 1, int8_t) +DO_VABAV(vabavsh, 2, int16_t) +DO_VABAV(vabavsw, 4, int32_t) +DO_VABAV(vabavub, 1, uint8_t) +DO_VABAV(vabavuh, 2, uint16_t) +DO_VABAV(vabavuw, 4, uint32_t) + +#define DO_VADDLV(OP, TYPE, LTYPE) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint64_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ + if (mask & 1) { \ + ra += (LTYPE)m[H4(e)]; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +DO_VADDLV(vaddlv_s, int32_t, int64_t) +DO_VADDLV(vaddlv_u, uint32_t, uint64_t) + +/* Shifts by immediate */ +#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(m[H##ESIZE(e)], shift), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], \ + FN(m[H##ESIZE(e)], shift, &sat), mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op shift helpers for all sizes */ +#define DO_2SHIFT_U(OP, FN) \ + DO_2SHIFT(OP##b, 1, uint8_t, FN) \ + DO_2SHIFT(OP##h, 2, uint16_t, FN) \ + DO_2SHIFT(OP##w, 4, uint32_t, FN) +#define DO_2SHIFT_S(OP, FN) \ + DO_2SHIFT(OP##b, 1, int8_t, FN) \ + DO_2SHIFT(OP##h, 2, int16_t, FN) \ + DO_2SHIFT(OP##w, 4, int32_t, FN) + +#define DO_2SHIFT_SAT_U(OP, FN) \ + DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \ + DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \ + DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN) +#define DO_2SHIFT_SAT_S(OP, FN) \ + DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \ + DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \ + DO_2SHIFT_SAT(OP##w, 4, int32_t, FN) + +DO_2SHIFT_U(vshli_u, DO_VSHLU) +DO_2SHIFT_S(vshli_s, DO_VSHLS) +DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP) +DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP) +DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP) +DO_2SHIFT_U(vrshli_u, DO_VRSHLU) +DO_2SHIFT_S(vrshli_s, DO_VRSHLS) +DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP) +DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP) + +/* Shift-and-insert; we always work with 64 bits at a time */ +#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t 
shift) \ + { \ + uint64_t *d = vd, *m = vm; \ + uint16_t mask; \ + uint64_t shiftmask; \ + unsigned e; \ + if (shift == ESIZE * 8) { \ + /* \ + * Only VSRI can shift by <dt>
; it should mean "don't \ + * update the destination". The generic logic can't handle \ + * this because it would try to shift by an out-of-range \ + * amount, so special case it here. \ + */ \ + goto done; \ + } \ + assert(shift < ESIZE * 8); \ + mask = mve_element_mask(env); \ + /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \ + shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \ + for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ + uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \ + (d[H8(e)] & ~shiftmask); \ + mergemask(&d[H8(e)], r, mask); \ + } \ +done: \ + mve_advance_vpt(env); \ + } + +#define DO_SHL(N, SHIFT) ((N) << (SHIFT)) +#define DO_SHR(N, SHIFT) ((N) >> (SHIFT)) +#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT)) +#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT)) + +DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK) +DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK) +DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK) +DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK) +DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK) +DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK) + +/* + * Long shifts taking half-sized inputs from top or bottom of the input + * vector and producing a double-width result. ESIZE, TYPE are for + * the input, and LESIZE, LTYPE for the output. + * Unlike the normal shift helpers, we do not handle negative shift counts, + * because the long shift is strictly left-only. + */ +#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + LTYPE *d = vd; \ + TYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + assert(shift <= 16); \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSHLL_ALL(OP, TOP) \ + DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \ + DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \ + DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \ + DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \ + +DO_VSHLL_ALL(vshllb, false) +DO_VSHLL_ALL(vshllt, true) + +/* + * Narrowing right shifts, taking a double sized input, shifting it + * and putting the result in either the top or bottom half of the output. + * ESIZE, TYPE are the output, and LESIZE, LTYPE the input. 
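+ * For example, the byte-output form vshrnbb below with a shift of 3 shifts
+ * each 16-bit source element right by 3 and writes the low 8 bits of the
+ * result into the even (bottom) bytes of Qd; the odd bytes are untouched.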
+ */ +#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + TYPE r = FN(m[H##LESIZE(le)], shift); \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSHRN_ALL(OP, FN) \ + DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \ + DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \ + DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \ + DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN) + +static inline uint64_t do_urshr(uint64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else if (sh == 64) { + return x >> 63; + } else { + return 0; + } +} + +static inline int64_t do_srshr(int64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else { + /* Rounding the sign bit always produces 0. */ + return 0; + } +} + +DO_VSHRN_ALL(vshrn, DO_SHR) +DO_VSHRN_ALL(vrshrn, do_urshr) + +static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max, + bool *satp) +{ + if (val > max) { + *satp = true; + return max; + } else if (val < min) { + *satp = true; + return min; + } else { + return val; + } +} + +/* Saturating narrowing right shifts */ +#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + bool qc = false; \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ + DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) + +#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ + DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) + +#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ + DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) + +#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ + DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) + +#define DO_SHRN_SB(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP) +#define DO_SHRN_UB(N, M, SATP) \ + do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP) +#define DO_SHRUN_B(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP) + +#define DO_SHRN_SH(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP) +#define DO_SHRN_UH(N, M, SATP) \ + do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP) +#define DO_SHRUN_H(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP) + +#define DO_RSHRN_SB(N, M, SATP) \ + do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP) +#define DO_RSHRN_UB(N, M, SATP) \ + do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP) +#define DO_RSHRUN_B(N, M, SATP) \ + do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP) + +#define DO_RSHRN_SH(N, M, SATP) \ + 
do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP) +#define DO_RSHRN_UH(N, M, SATP) \ + do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP) +#define DO_RSHRUN_H(N, M, SATP) \ + do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP) + +DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB) +DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH) +DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB) +DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH) +DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B) +DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H) + +DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB) +DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH) +DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB) +DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH) +DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B) +DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H) + +#define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], \ + m[H##LESIZE(le)], mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t) +DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t) +DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t) +DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t) + +#define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + bool qc = false; \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + TYPE r = FN(m[H##LESIZE(le)], &sat); \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VMOVN_SAT_UB(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ + DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) + +#define DO_VMOVN_SAT_UH(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ + DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) + +#define DO_VMOVN_SAT_SB(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ + DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) + +#define DO_VMOVN_SAT_SH(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ + DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) + +#define DO_VQMOVN_SB(N, SATP) \ + do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP) +#define DO_VQMOVN_UB(N, SATP) \ + do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP) +#define DO_VQMOVUN_B(N, SATP) \ + do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP) + +#define DO_VQMOVN_SH(N, SATP) \ + do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP) +#define DO_VQMOVN_UH(N, SATP) \ + do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP) +#define DO_VQMOVUN_H(N, SATP) \ + do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP) + +DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB) +DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH) +DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB) +DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH) +DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B) +DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H) + +uint32_t HELPER(mve_vshlc)(CPUARMState *env, 
void *vd, uint32_t rdm, + uint32_t shift) +{ + uint32_t *d = vd; + uint16_t mask = mve_element_mask(env); + unsigned e; + uint32_t r; + + /* + * For each 32-bit element, we shift it left, bringing in the + * low 'shift' bits of rdm at the bottom. Bits shifted out at + * the top become the new rdm, if the predicate mask permits. + * The final rdm value is returned to update the register. + * shift == 0 here means "shift by 32 bits". + */ + if (shift == 0) { + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + r = rdm; + if (mask & 1) { + rdm = d[H4(e)]; + } + mergemask(&d[H4(e)], r, mask); + } + } else { + uint32_t shiftmask = MAKE_64BIT_MASK(0, shift); + + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + r = (d[H4(e)] << shift) | (rdm & shiftmask); + if (mask & 1) { + rdm = d[H4(e)] >> (32 - shift); + } + mergemask(&d[H4(e)], r, mask); + } + } + mve_advance_vpt(env); + return rdm; +} + +uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl_d(n, -(int8_t)shift, false, NULL); +} + +uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl_d(n, (int8_t)shift, false, NULL); +} + +uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl_d(n, (int8_t)shift, false, &env->QF); +} + +uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl_d(n, (int8_t)shift, false, &env->QF); +} + +uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF); +} + +uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl_d(n, (int8_t)shift, true, &env->QF); +} + +/* Operate on 64-bit values, but saturate at 48 bits */ +static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + int64_t val, extval; + + if (shift <= -48) { + /* Rounding the sign bit always produces 0. */ + if (round) { + return 0; + } + return src >> 63; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + val = (src >> 1) + (src & 1); + } else { + val = src >> -shift; + } + extval = sextract64(val, 0, 48); + if (!sat || val == extval) { + return extval; + } + } else if (shift < 48) { + int64_t extval = sextract64(src << shift, 0, 48); + if (!sat || src == (extval >> shift)) { + return extval; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return src >= 0 ? 
MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17); +} + +/* Operate on 64-bit values, but saturate at 48 bits */ +static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + uint64_t val, extval; + + if (shift <= -(48 + round)) { + return 0; + } else if (shift < 0) { + if (round) { + val = src >> (-shift - 1); + val = (val >> 1) + (val & 1); + } else { + val = src >> -shift; + } + extval = extract64(val, 0, 48); + if (!sat || val == extval) { + return extval; + } + } else if (shift < 48) { + uint64_t extval = extract64(src << shift, 0, 48); + if (!sat || src == (extval >> shift)) { + return extval; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return MAKE_64BIT_MASK(0, 48); +} + +uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF); +} + +uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF); +} + +uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); +} + +uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); +} + +uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF); +} + +uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF); +} + +#define DO_VIDUP(OP, ESIZE, TYPE, FN) \ + uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ + uint32_t offset, uint32_t imm) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], offset, mask); \ + offset = FN(offset, imm); \ + } \ + mve_advance_vpt(env); \ + return offset; \ + } + +#define DO_VIWDUP(OP, ESIZE, TYPE, FN) \ + uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ + uint32_t offset, uint32_t wrap, \ + uint32_t imm) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], offset, mask); \ + offset = FN(offset, wrap, imm); \ + } \ + mve_advance_vpt(env); \ + return offset; \ + } + +#define DO_VIDUP_ALL(OP, FN) \ + DO_VIDUP(OP##b, 1, int8_t, FN) \ + DO_VIDUP(OP##h, 2, int16_t, FN) \ + DO_VIDUP(OP##w, 4, int32_t, FN) + +#define DO_VIWDUP_ALL(OP, FN) \ + DO_VIWDUP(OP##b, 1, int8_t, FN) \ + DO_VIWDUP(OP##h, 2, int16_t, FN) \ + DO_VIWDUP(OP##w, 4, int32_t, FN) + +static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) +{ + offset += imm; + if (offset == wrap) { + offset = 0; + } + return offset; +} + +static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) +{ + if (offset == 0) { + offset = wrap; + } + offset -= imm; + return offset; +} + +DO_VIDUP_ALL(vidup, DO_ADD) +DO_VIWDUP_ALL(viwdup, do_add_wrap) +DO_VIWDUP_ALL(vdwdup, do_sub_wrap) + +/* + * Vector comparison. + * P0 bits for non-executed beats (where eci_mask is 0) are unchanged. + * P0 bits for predicated lanes in executed beats (where mask is 0) are 0. + * P0 bits otherwise are updated with the results of the comparisons. + * We must also keep unchanged the MASK fields at the top of v7m.vpr. 
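+ * The eci_mask-limited update below gives us that for free: eci_mask is only
+ * 16 bits wide, so only P0 (the low 16 bits of VPR) can change, and P0 bits
+ * outside eci_mask keep their previous value.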
+ */ +#define DO_VCMP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ + { \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++) { \ + bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + emask <<= ESIZE; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + uint32_t rm) \ + { \ + TYPE *n = vn; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++) { \ + bool r = FN(n[H##ESIZE(e)], (TYPE)rm); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + emask <<= ESIZE; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_S(OP, FN) \ + DO_VCMP(OP##b, 1, int8_t, FN) \ + DO_VCMP(OP##h, 2, int16_t, FN) \ + DO_VCMP(OP##w, 4, int32_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN) + +#define DO_VCMP_U(OP, FN) \ + DO_VCMP(OP##b, 1, uint8_t, FN) \ + DO_VCMP(OP##h, 2, uint16_t, FN) \ + DO_VCMP(OP##w, 4, uint32_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN) + +#define DO_EQ(N, M) ((N) == (M)) +#define DO_NE(N, M) ((N) != (M)) +#define DO_EQ(N, M) ((N) == (M)) +#define DO_EQ(N, M) ((N) == (M)) +#define DO_GE(N, M) ((N) >= (M)) +#define DO_LT(N, M) ((N) < (M)) +#define DO_GT(N, M) ((N) > (M)) +#define DO_LE(N, M) ((N) <= (M)) + +DO_VCMP_U(vcmpeq, DO_EQ) +DO_VCMP_U(vcmpne, DO_NE) +DO_VCMP_U(vcmpcs, DO_GE) +DO_VCMP_U(vcmphi, DO_GT) +DO_VCMP_S(vcmpge, DO_GE) +DO_VCMP_S(vcmplt, DO_LT) +DO_VCMP_S(vcmpgt, DO_GT) +DO_VCMP_S(vcmple, DO_LE) + +void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + /* + * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n] + * but note that whether bytes are written to Qd is still subject + * to (all forms of) predication in the usual way. + */ + uint64_t *d = vd, *n = vn, *m = vm; + uint16_t mask = mve_element_mask(env); + uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); + unsigned e; + for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) { + uint64_t r = m[H8(e)]; + mergemask(&r, n[H8(e)], p0); + mergemask(&d[H8(e)], r, mask); + } + mve_advance_vpt(env); +} + +void HELPER(mve_vpnot)(CPUARMState *env) +{ + /* + * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged. + * P0 bits for predicated lanes in executed bits (where mask is 0) are 0. + * P0 bits otherwise are inverted. + * (This is the same logic as VCMP.) + * This insn is itself subject to predication and to beat-wise execution, + * and after it executes VPT state advances in the usual way. 
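+ * In the common case of no predication and no ECI continuation (mask and
+ * eci_mask both 0xffff) this reduces to inverting P0, the low 16 bits of
+ * VPR, while the MASK fields above it are left alone.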
+ */ + uint16_t mask = mve_element_mask(env); + uint16_t eci_mask = mve_eci_mask(env); + uint16_t beatpred = ~env->v7m.vpr & mask; + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask); + mve_advance_vpt(env); +} + +/* + * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed, + * otherwise set according to value of Rn. The calculation of + * newmask here works in the same way as the calculation of the + * ltpmask in mve_element_mask(), but we have pre-calculated + * the masklen in the generated code. + */ +void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen) +{ + uint16_t mask = mve_element_mask(env); + uint16_t eci_mask = mve_eci_mask(env); + uint16_t newmask; + + assert(masklen <= 16); + newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; + newmask &= mask; + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask); + mve_advance_vpt(env); +} + +#define DO_1OP_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VQABS_B(N, SATP) \ + do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP) +#define DO_VQABS_H(N, SATP) \ + do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP) +#define DO_VQABS_W(N, SATP) \ + do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP) + +#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP) +#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP) +#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP) + +DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B) +DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H) +DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W) + +DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B) +DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H) +DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W) + +/* + * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its + * absolute value; we then do an unsigned comparison. + */ +#define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + UTYPE *d = vd; \ + STYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + UTYPE r = DO_ABS(m[H##ESIZE(e)]); \ + r = FN(d[H##ESIZE(e)], r); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX) +DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX) +DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX) +DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN) +DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN) +DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN) + +/* + * 2-operand floating point. Note that if an element is partially + * predicated we must do the FP operation to update the non-predicated + * bytes, but we must be careful to avoid updating the FP exception + * state unless byte 0 of the element was unpredicated. 
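+ * That is what the scratch_fpst copy below is for: when predicate bit 0 of
+ * an element is clear, the operation runs on a local copy of the
+ * float_status, so the unpredicated bytes still get their result but any
+ * exception flags raised are discarded.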
+ */ +#define DO_2OP_FP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_FP_ALL(OP, FN) \ + DO_2OP_FP(OP##h, 2, float16, float16_##FN) \ + DO_2OP_FP(OP##s, 4, float32, float32_##FN) + +DO_2OP_FP_ALL(vfadd, add) +DO_2OP_FP_ALL(vfsub, sub) +DO_2OP_FP_ALL(vfmul, mul) + +static inline float16 float16_abd(float16 a, float16 b, float_status *s) +{ + return float16_abs(float16_sub(a, b, s)); +} + +static inline float32 float32_abd(float32 a, float32 b, float_status *s) +{ + return float32_abs(float32_sub(a, b, s)); +} + +DO_2OP_FP_ALL(vfabd, abd) +DO_2OP_FP_ALL(vmaxnm, maxnum) +DO_2OP_FP_ALL(vminnm, minnum) + +static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s) +{ + return float16_maxnum(float16_abs(a), float16_abs(b), s); +} + +static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s) +{ + return float32_maxnum(float32_abs(a), float32_abs(b), s); +} + +static inline float16 float16_minnuma(float16 a, float16 b, float_status *s) +{ + return float16_minnum(float16_abs(a), float16_abs(b), s); +} + +static inline float32 float32_minnuma(float32 a, float32 b, float_status *s) +{ + return float32_minnum(float32_abs(a), float32_abs(b), s); +} + +DO_2OP_FP_ALL(vmaxnma, maxnuma) +DO_2OP_FP_ALL(vminnma, minnuma) + +#define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r[16 / ESIZE]; \ + uint16_t tm, mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + /* Calculate all results first to avoid overwriting inputs */ \ + for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) { \ + if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + r[e] = 0; \ + continue; \ + } \ + fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(tm & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + if (!(e & 1)) { \ + r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \ + } else { \ + r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \ + } \ + } \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], r[e], mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add) +DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add) +DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub) +DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub) + +#define DO_VFMA(OP, ESIZE, TYPE, CHS) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = n[H##ESIZE(e)]; \ + if (CHS) { \ + r = TYPE##_chs(r); \ + } \ + r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \ + 0, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VFMA(vfmah, 2, float16, false) +DO_VFMA(vfmas, 4, float32, false) +DO_VFMA(vfmsh, 2, float16, true) +DO_VFMA(vfmss, 4, float32, true) + +#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r0, r1, e1, e2, e3, e4; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst0, *fpst1; \ + float_status scratch_fpst; \ + /* We loop through pairs of elements at a time */ \ + for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) { \ + continue; \ + } \ + fpst0 = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + fpst1 = fpst0; \ + if (!(mask & 1)) { \ + scratch_fpst = *fpst0; \ + fpst0 = &scratch_fpst; \ + } \ + if (!(mask & (1 << ESIZE))) { \ + scratch_fpst = *fpst1; \ + fpst1 = &scratch_fpst; \ + } \ + switch (ROT) { \ + case 0: \ + e1 = m[H##ESIZE(e)]; \ + e2 = n[H##ESIZE(e)]; \ + e3 = m[H##ESIZE(e + 1)]; \ + e4 = n[H##ESIZE(e)]; \ + break; \ + case 1: \ + e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ + e2 = n[H##ESIZE(e + 1)]; \ + e3 = m[H##ESIZE(e)]; \ + e4 = n[H##ESIZE(e + 1)]; \ + break; \ + case 2: \ + e1 = TYPE##_chs(m[H##ESIZE(e)]); \ + e2 = n[H##ESIZE(e)]; \ + e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ + e4 = n[H##ESIZE(e)]; \ + break; \ + case 3: \ + e1 = m[H##ESIZE(e + 1)]; \ + e2 = n[H##ESIZE(e + 1)]; \ + e3 = TYPE##_chs(m[H##ESIZE(e)]); \ + e4 = n[H##ESIZE(e + 1)]; \ + break; \ + default: \ + g_assert_not_reached(); \ + } \ + r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0); \ + r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1); \ + mergemask(&d[H##ESIZE(e)], r0, mask); \ + mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S) +#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S) + +#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S) +#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S) + +DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH) +DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS) +DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH) +DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS) +DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH) +DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS) +DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH) +DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS) + +DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH) +DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS) +DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH) +DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS) +DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH) +DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS) +DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH) +DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS) + +#define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE r, m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_FP_SCALAR_ALL(OP, FN) \ + DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN) \ + DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN) + +DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add) +DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub) +DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul) + +#define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE r, m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +/* VFMAS is vector * vector + scalar, so swap op2 and op3 */ +#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S) +#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S) + +/* VFMA is vector * scalar + vector */ +DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd) +DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd) +DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH) +DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS) + +/* Floating point max/min across vector. */ +#define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra_in) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + TYPE ra = (TYPE)ra_in; \ + float_status *fpst = (ESIZE == 2) ? 
\ + &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + TYPE v = m[H##ESIZE(e)]; \ + if (TYPE##_is_signaling_nan(ra, fpst)) { \ + ra = TYPE##_silence_nan(ra, fpst); \ + float_raise(float_flag_invalid, fpst); \ + } \ + if (TYPE##_is_signaling_nan(v, fpst)) { \ + v = TYPE##_silence_nan(v, fpst); \ + float_raise(float_flag_invalid, fpst); \ + } \ + if (ABS) { \ + v = TYPE##_abs(v); \ + } \ + ra = FN(ra, v, fpst); \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +#define NOP(X) (X) + +DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum) +DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum) +DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum) +DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum) +DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum) +DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum) +DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum) +DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum) + +/* FP compares; note that all comparisons signal InvalidOp for QNaNs */ +#define DO_VCMP_FP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ + { \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + bool r; \ + for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ + if ((mask & emask) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & (1 << (e * ESIZE)))) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + uint32_t rm) \ + { \ + TYPE *n = vn; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + bool r; \ + for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ + if ((mask & emask) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & (1 << (e * ESIZE)))) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN) \ + DO_VCMP_FP(VOP, ESIZE, TYPE, FN) \ + DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN) + +/* + * Some care is needed here to get the correct result for the unordered case. 
+ * Architecturally EQ, GE and GT are defined to be false for unordered, but + * the NE, LT and LE comparisons are defined as simple logical inverses of + * EQ, GE and GT and so they must return true for unordered. The softfloat + * comparison functions float*_{eq,le,lt} all return false for unordered. + */ +#define DO_GE16(X, Y, S) float16_le(Y, X, S) +#define DO_GE32(X, Y, S) float32_le(Y, X, S) +#define DO_GT16(X, Y, S) float16_lt(Y, X, S) +#define DO_GT32(X, Y, S) float32_lt(Y, X, S) + +DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq) +DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq) + +DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq) +DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq) + +DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16) +DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32) + +DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16) +DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32) + +DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16) +DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32) + +DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16) +DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32) + +#define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm, \ + uint32_t shift) \ + { \ + TYPE *d = vd, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(m[H##ESIZE(e)], shift, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh) +DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh) +DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero) +DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero) +DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos) +DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos) +DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero) +DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero) + +/* VCVT with specified rmode */ +#define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vm, uint32_t rmode) \ + { \ + TYPE *d = vd, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + float_status *base_fpst = (ESIZE == 2) ? 
\ + &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + uint32_t prev_rmode = get_float_rounding_mode(base_fpst); \ + set_float_rounding_mode(rmode, base_fpst); \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = base_fpst; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(m[H##ESIZE(e)], 0, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + set_float_rounding_mode(prev_rmode, base_fpst); \ + mve_advance_vpt(env); \ + } + +DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh) +DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh) +DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls) +DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls) + +#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S) +#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S) + +DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H) +DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S) + +/* + * VCVT between halfprec and singleprec. As usual for halfprec + * conversions, FZ16 is ignored and AHP is observed. + */ +static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top) +{ + uint16_t *d = vd; + uint32_t *m = vm; + uint16_t r; + uint16_t mask = mve_element_mask(env); + bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); + unsigned e; + float_status *fpst; + float_status scratch_fpst; + float_status *base_fpst = &env->vfp.standard_fp_status; + bool old_fz = get_flush_to_zero(base_fpst); + set_flush_to_zero(false, base_fpst); + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { + continue; + } + fpst = base_fpst; + if (!(mask & 1)) { + /* We need the result but without updating flags */ + scratch_fpst = *fpst; + fpst = &scratch_fpst; + } + r = float32_to_float16(m[H4(e)], ieee, fpst); + mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2)); + } + set_flush_to_zero(old_fz, base_fpst); + mve_advance_vpt(env); +} + +static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top) +{ + uint32_t *d = vd; + uint16_t *m = vm; + uint32_t r; + uint16_t mask = mve_element_mask(env); + bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); + unsigned e; + float_status *fpst; + float_status scratch_fpst; + float_status *base_fpst = &env->vfp.standard_fp_status; + bool old_fiz = get_flush_inputs_to_zero(base_fpst); + set_flush_inputs_to_zero(false, base_fpst); + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { + continue; + } + fpst = base_fpst; + if (!(mask & (1 << (top * 2)))) { + /* We need the result but without updating flags */ + scratch_fpst = *fpst; + fpst = &scratch_fpst; + } + r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst); + mergemask(&d[H4(e)], r, mask); + } + set_flush_inputs_to_zero(old_fiz, base_fpst); + mve_advance_vpt(env); +} + +void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_sh(env, vd, vm, 0); +} +void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_sh(env, vd, vm, 1); +} +void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_hs(env, vd, vm, 0); +} +void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_hs(env, vd, vm, 1); +} + +#define DO_1OP_FP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + TYPE r; \ + uint16_t mask = 
mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(m[H##ESIZE(e)], fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int) +DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int) diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c new file mode 100644 index 0000000..bc6c4a5 --- /dev/null +++ b/target/arm/tcg/neon_helper.c @@ -0,0 +1,1740 @@ +/* + * ARM NEON vector operations. + * + * Copyright (c) 2007, 2008 CodeSourcery. + * Written by Paul Brook + * + * This code is licensed under the GNU GPL v2. + */ +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/helper-proto.h" +#include "fpu/softfloat.h" +#include "vec_internal.h" + +#define SIGNBIT (uint32_t)0x80000000 +#define SIGNBIT64 ((uint64_t)1 << 63) + +#define SET_QC() env->vfp.qc[0] = 1 + +#define NEON_TYPE1(name, type) \ +typedef struct \ +{ \ + type v1; \ +} neon_##name; +#if HOST_BIG_ENDIAN +#define NEON_TYPE2(name, type) \ +typedef struct \ +{ \ + type v2; \ + type v1; \ +} neon_##name; +#define NEON_TYPE4(name, type) \ +typedef struct \ +{ \ + type v4; \ + type v3; \ + type v2; \ + type v1; \ +} neon_##name; +#else +#define NEON_TYPE2(name, type) \ +typedef struct \ +{ \ + type v1; \ + type v2; \ +} neon_##name; +#define NEON_TYPE4(name, type) \ +typedef struct \ +{ \ + type v1; \ + type v2; \ + type v3; \ + type v4; \ +} neon_##name; +#endif + +NEON_TYPE4(s8, int8_t) +NEON_TYPE4(u8, uint8_t) +NEON_TYPE2(s16, int16_t) +NEON_TYPE2(u16, uint16_t) +NEON_TYPE1(s32, int32_t) +NEON_TYPE1(u32, uint32_t) +#undef NEON_TYPE4 +#undef NEON_TYPE2 +#undef NEON_TYPE1 + +/* Copy from a uint32_t to a vector structure type. */ +#define NEON_UNPACK(vtype, dest, val) do { \ + union { \ + vtype v; \ + uint32_t i; \ + } conv_u; \ + conv_u.i = (val); \ + dest = conv_u.v; \ + } while(0) + +/* Copy from a vector structure type to a uint32_t. */ +#define NEON_PACK(vtype, dest, val) do { \ + union { \ + vtype v; \ + uint32_t i; \ + } conv_u; \ + conv_u.v = (val); \ + dest = conv_u.i; \ + } while(0) + +#define NEON_DO1 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); +#define NEON_DO2 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); +#define NEON_DO4 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); + +#define NEON_VOP_BODY(vtype, n) \ +{ \ + uint32_t res; \ + vtype vsrc1; \ + vtype vsrc2; \ + vtype vdest; \ + NEON_UNPACK(vtype, vsrc1, arg1); \ + NEON_UNPACK(vtype, vsrc2, arg2); \ + NEON_DO##n; \ + NEON_PACK(vtype, res, vdest); \ + return res; \ +} + +#define NEON_VOP(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ +NEON_VOP_BODY(vtype, n) + +#define NEON_VOP_ENV(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ +NEON_VOP_BODY(vtype, n) + +/* Pairwise operations. */ +/* For 32-bit elements each segment only contains a single element, so + the elementwise and pairwise operations are the same. 
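+   e.g. NEON_PDO2 below computes dest.v1 from (src1.v1, src1.v2) and dest.v2
+   from (src2.v1, src2.v2), so each result element combines an adjacent pair
+   from a single source operand.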
*/ +#define NEON_PDO2 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); +#define NEON_PDO4 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ + +#define NEON_POP(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ +{ \ + uint32_t res; \ + vtype vsrc1; \ + vtype vsrc2; \ + vtype vdest; \ + NEON_UNPACK(vtype, vsrc1, arg1); \ + NEON_UNPACK(vtype, vsrc2, arg2); \ + NEON_PDO##n; \ + NEON_PACK(vtype, res, vdest); \ + return res; \ +} + +/* Unary operators. */ +#define NEON_VOP1(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ +{ \ + vtype vsrc1; \ + vtype vdest; \ + NEON_UNPACK(vtype, vsrc1, arg); \ + NEON_DO##n; \ + NEON_PACK(vtype, arg, vdest); \ + return arg; \ +} + + +#define NEON_USAT(dest, src1, src2, type) do { \ + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + dest = ~0; \ + } else { \ + dest = tmp; \ + }} while(0) +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) +NEON_VOP_ENV(qadd_u8, neon_u8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) +NEON_VOP_ENV(qadd_u16, neon_u16, 2) +#undef NEON_FN +#undef NEON_USAT + +uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (res < a) { + SET_QC(); + res = ~0; + } + return res; +} + +uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + res = src1 + src2; + if (res < src1) { + SET_QC(); + res = ~(uint64_t)0; + } + return res; +} + +#define NEON_SSAT(dest, src1, src2, type) do { \ + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + if (src2 > 0) { \ + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ + } else { \ + tmp = 1 << (sizeof(type) * 8 - 1); \ + } \ + } \ + dest = tmp; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) +NEON_VOP_ENV(qadd_s8, neon_s8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) +NEON_VOP_ENV(qadd_s16, neon_s16, 2) +#undef NEON_FN +#undef NEON_SSAT + +uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { + SET_QC(); + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + res = src1 + src2; + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { + SET_QC(); + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; + } + return res; +} + +/* Unsigned saturating accumulate of signed value + * + * Op1/Rn is treated as signed + * Op2/Rd is treated as unsigned + * + * Explicit casting is used to ensure the correct sign extension of + * inputs. The result is treated as a unsigned value and saturated as such. + * + * We use a macro for the 8/16 bit cases which expects signed integers of va, + * vb, and vr for interim calculation and an unsigned 32 bit result value r. 
+ */ + +#define USATACC(bits, shift) \ + do { \ + va = sextract32(a, shift, bits); \ + vb = extract32(b, shift, bits); \ + vr = va + vb; \ + if (vr > UINT##bits##_MAX) { \ + SET_QC(); \ + vr = UINT##bits##_MAX; \ + } else if (vr < 0) { \ + SET_QC(); \ + vr = 0; \ + } \ + r = deposit32(r, shift, bits, vr); \ + } while (0) + +uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int16_t va, vb, vr; + uint32_t r = 0; + + USATACC(8, 0); + USATACC(8, 8); + USATACC(8, 16); + USATACC(8, 24); + return r; +} + +uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int32_t va, vb, vr; + uint64_t r = 0; + + USATACC(16, 0); + USATACC(16, 16); + return r; +} + +#undef USATACC + +uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int64_t va = (int32_t)a; + int64_t vb = (uint32_t)b; + int64_t vr = va + vb; + if (vr > UINT32_MAX) { + SET_QC(); + vr = UINT32_MAX; + } else if (vr < 0) { + SET_QC(); + vr = 0; + } + return vr; +} + +uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint64_t res; + res = a + b; + /* We only need to look at the pattern of SIGN bits to detect + * +ve/-ve saturation + */ + if (~a & b & ~res & SIGNBIT64) { + SET_QC(); + res = UINT64_MAX; + } else if (a & ~b & res & SIGNBIT64) { + SET_QC(); + res = 0; + } + return res; +} + +/* Signed saturating accumulate of unsigned value + * + * Op1/Rn is treated as unsigned + * Op2/Rd is treated as signed + * + * The result is treated as a signed value and saturated as such + * + * We use a macro for the 8/16 bit cases which expects signed integers of va, + * vb, and vr for interim calculation and an unsigned 32 bit result value r. + */ + +#define SSATACC(bits, shift) \ + do { \ + va = extract32(a, shift, bits); \ + vb = sextract32(b, shift, bits); \ + vr = va + vb; \ + if (vr > INT##bits##_MAX) { \ + SET_QC(); \ + vr = INT##bits##_MAX; \ + } else if (vr < INT##bits##_MIN) { \ + SET_QC(); \ + vr = INT##bits##_MIN; \ + } \ + r = deposit32(r, shift, bits, vr); \ + } while (0) + +uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int16_t va, vb, vr; + uint32_t r = 0; + + SSATACC(8, 0); + SSATACC(8, 8); + SSATACC(8, 16); + SSATACC(8, 24); + return r; +} + +uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int32_t va, vb, vr; + uint32_t r = 0; + + SSATACC(16, 0); + SSATACC(16, 16); + + return r; +} + +#undef SSATACC + +uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int64_t res; + int64_t op1 = (uint32_t)a; + int64_t op2 = (int32_t)b; + res = op1 + op2; + if (res > INT32_MAX) { + SET_QC(); + res = INT32_MAX; + } else if (res < INT32_MIN) { + SET_QC(); + res = INT32_MIN; + } + return res; +} + +uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint64_t res; + res = a + b; + /* We only need to look at the pattern of SIGN bits to detect an overflow */ + if (((a & res) + | (~b & res) + | (a & ~b)) & SIGNBIT64) { + SET_QC(); + res = INT64_MAX; + } + return res; +} + + +#define NEON_USAT(dest, src1, src2, type) do { \ + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + dest = 0; \ + } else { \ + dest = tmp; \ + }} while(0) +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) +NEON_VOP_ENV(qsub_u8, neon_u8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) +NEON_VOP_ENV(qsub_u16, neon_u16, 2) +#undef NEON_FN +#undef NEON_USAT + 
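+/*
+ * As an illustration of the NEON_USAT subtract expansion above (lane
+ * values assumed for this note): in qsub_u8 each byte lane is computed
+ * as src1 - src2 in 32-bit arithmetic, so 0x10 - 0x20 fails the
+ * truncation check, saturates to 0x00 and sets QC, while 0x80 - 0x7f
+ * gives 0x01 with QC untouched. The 32- and 64-bit helpers below cannot
+ * use the compute-wider-then-truncate trick, so they detect the
+ * wrap-around directly.
+ */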
+uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (res > a) { + SET_QC(); + res = 0; + } + return res; +} + +uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + if (src1 < src2) { + SET_QC(); + res = 0; + } else { + res = src1 - src2; + } + return res; +} + +#define NEON_SSAT(dest, src1, src2, type) do { \ + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + if (src2 < 0) { \ + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ + } else { \ + tmp = 1 << (sizeof(type) * 8 - 1); \ + } \ + } \ + dest = tmp; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) +NEON_VOP_ENV(qsub_s8, neon_s8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) +NEON_VOP_ENV(qsub_s16, neon_s16, 2) +#undef NEON_FN +#undef NEON_SSAT + +uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { + SET_QC(); + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + res = src1 - src2; + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { + SET_QC(); + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; + } + return res; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 +NEON_VOP(hadd_s8, neon_s8, 4) +NEON_VOP(hadd_u8, neon_u8, 4) +NEON_VOP(hadd_s16, neon_s16, 2) +NEON_VOP(hadd_u16, neon_u16, 2) +#undef NEON_FN + +int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) +{ + int32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if (src1 & src2 & 1) + dest++; + return dest; +} + +uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) +{ + uint32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if (src1 & src2 & 1) + dest++; + return dest; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 +NEON_VOP(rhadd_s8, neon_s8, 4) +NEON_VOP(rhadd_u8, neon_u8, 4) +NEON_VOP(rhadd_s16, neon_s16, 2) +NEON_VOP(rhadd_u16, neon_u16, 2) +#undef NEON_FN + +int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) +{ + int32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if ((src1 | src2) & 1) + dest++; + return dest; +} + +uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) +{ + uint32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if ((src1 | src2) & 1) + dest++; + return dest; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 +NEON_VOP(hsub_s8, neon_s8, 4) +NEON_VOP(hsub_u8, neon_u8, 4) +NEON_VOP(hsub_s16, neon_s16, 2) +NEON_VOP(hsub_u16, neon_u16, 2) +#undef NEON_FN + +int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) +{ + int32_t dest; + + dest = (src1 >> 1) - (src2 >> 1); + if ((~src1) & src2 & 1) + dest--; + return dest; +} + +uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) +{ + uint32_t dest; + + dest = (src1 >> 1) - (src2 >> 1); + if ((~src1) & src2 & 1) + dest--; + return dest; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 +NEON_POP(pmin_s8, neon_s8, 4) +NEON_POP(pmin_u8, neon_u8, 4) +NEON_POP(pmin_s16, neon_s16, 2) +NEON_POP(pmin_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? 
src1 : src2 +NEON_POP(pmax_s8, neon_s8, 4) +NEON_POP(pmax_u8, neon_u8, 4) +NEON_POP(pmax_s16, neon_s16, 2) +NEON_POP(pmax_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) +NEON_VOP(shl_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) +NEON_VOP(shl_s16, neon_s16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) +NEON_VOP(rshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) +NEON_VOP(rshl_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); +} + +uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, true, NULL); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) +NEON_VOP(rshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) +NEON_VOP(rshl_u16, neon_u16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); +} + +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, true, NULL); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_u16, neon_u16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); +} + +uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); +} + +uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshlu_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshlu_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); +} + +uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 
+NEON_VOP_ENV(qrshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) +NEON_VOP_ENV(qrshl_u16, neon_u16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); +} + +uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) +NEON_VOP_ENV(qrshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) +NEON_VOP_ENV(qrshl_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); +} + +uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); +} + +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) +{ + uint32_t mask; + mask = (a ^ b) & 0x80808080u; + a &= ~0x80808080u; + b &= ~0x80808080u; + return (a + b) ^ mask; +} + +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) +{ + uint32_t mask; + mask = (a ^ b) & 0x80008000u; + a &= ~0x80008000u; + b &= ~0x80008000u; + return (a + b) ^ mask; +} + +#define NEON_FN(dest, src1, src2) dest = src1 + src2 +NEON_POP(padd_u8, neon_u8, 4) +NEON_POP(padd_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = src1 - src2 +NEON_VOP(sub_u8, neon_u8, 4) +NEON_VOP(sub_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = src1 * src2 +NEON_VOP(mul_u8, neon_u8, 4) +NEON_VOP(mul_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 +NEON_VOP(tst_u8, neon_u8, 4) +NEON_VOP(tst_u16, neon_u16, 2) +NEON_VOP(tst_u32, neon_u32, 1) +#undef NEON_FN + +/* Count Leading Sign/Zero Bits. */ +static inline int do_clz8(uint8_t x) +{ + int n; + for (n = 8; x; n--) + x >>= 1; + return n; +} + +static inline int do_clz16(uint16_t x) +{ + int n; + for (n = 16; x; n--) + x >>= 1; + return n; +} + +#define NEON_FN(dest, src, dummy) dest = do_clz8(src) +NEON_VOP1(clz_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src, dummy) dest = do_clz16(src) +NEON_VOP1(clz_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 +NEON_VOP1(cls_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 +NEON_VOP1(cls_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_cls_s32)(uint32_t x) +{ + int count; + if ((int32_t)x < 0) + x = ~x; + for (count = 32; x; count--) + x = x >> 1; + return count - 1; +} + +/* Bit count. 
*/ +uint32_t HELPER(neon_cnt_u8)(uint32_t x) +{ + x = (x & 0x55555555) + ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); + return x; +} + +/* Reverse bits in each 8 bit word */ +uint32_t HELPER(neon_rbit_u8)(uint32_t x) +{ + x = ((x & 0xf0f0f0f0) >> 4) + | ((x & 0x0f0f0f0f) << 4); + x = ((x & 0x88888888) >> 3) + | ((x & 0x44444444) >> 1) + | ((x & 0x22222222) << 1) + | ((x & 0x11111111) << 3); + return x; +} + +#define NEON_QDMULH16(dest, src1, src2, round) do { \ + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ + SET_QC(); \ + tmp = (tmp >> 31) ^ ~SIGNBIT; \ + } else { \ + tmp <<= 1; \ + } \ + if (round) { \ + int32_t old = tmp; \ + tmp += 1 << 15; \ + if ((int32_t)tmp < old) { \ + SET_QC(); \ + tmp = SIGNBIT - 1; \ + } \ + } \ + dest = tmp >> 16; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) +#undef NEON_FN +#undef NEON_QDMULH16 + +#define NEON_QDMULH32(dest, src1, src2, round) do { \ + uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ + if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ + SET_QC(); \ + tmp = (tmp >> 63) ^ ~SIGNBIT64; \ + } else { \ + tmp <<= 1; \ + } \ + if (round) { \ + int64_t old = tmp; \ + tmp += (int64_t)1 << 31; \ + if ((int64_t)tmp < old) { \ + SET_QC(); \ + tmp = SIGNBIT64 - 1; \ + } \ + } \ + dest = tmp >> 32; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) +NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) +NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) +#undef NEON_FN +#undef NEON_QDMULH32 + +uint32_t HELPER(neon_narrow_u8)(uint64_t x) +{ + return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) + | ((x >> 24) & 0xff000000u); +} + +uint32_t HELPER(neon_narrow_u16)(uint64_t x) +{ + return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); +} + +uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) +{ + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); +} + +uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) +{ + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); +} + +uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) +{ + x &= 0xff80ff80ff80ff80ull; + x += 0x0080008000800080ull; + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); +} + +uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) +{ + x &= 0xffff8000ffff8000ull; + x += 0x0000800000008000ull; + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); +} + +uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) +{ + uint16_t s; + uint8_t d; + uint32_t res = 0; +#define SAT8(n) \ + s = x >> n; \ + if (s & 0x8000) { \ + SET_QC(); \ + } else { \ + if (s > 0xff) { \ + d = 0xff; \ + SET_QC(); \ + } else { \ + d = s; \ + } \ + res |= (uint32_t)d << (n / 2); \ + } + + SAT8(0); + SAT8(16); + SAT8(32); + SAT8(48); +#undef SAT8 + return res; +} + +uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) +{ + uint16_t s; + uint8_t d; + uint32_t res = 0; +#define SAT8(n) \ + s = x >> n; \ + if (s > 0xff) { \ + d = 0xff; \ + SET_QC(); \ + } else { \ + d = s; \ + } \ + res |= (uint32_t)d << (n / 2); + + SAT8(0); + SAT8(16); + SAT8(32); + SAT8(48); +#undef SAT8 
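+    /*
+     * Worked example (input value assumed for this note): for
+     * x = 0x012300ff80000042 the four 16-bit lanes narrow, low to high,
+     * to 0x42, 0xff (saturated from 0x8000), 0xff and 0xff (saturated
+     * from 0x0123), so res is 0xffffff42 and QC is set by the two
+     * out-of-range lanes.
+     */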
+ return res; +} + +uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) +{ + int16_t s; + uint8_t d; + uint32_t res = 0; +#define SAT8(n) \ + s = x >> n; \ + if (s != (int8_t)s) { \ + d = (s >> 15) ^ 0x7f; \ + SET_QC(); \ + } else { \ + d = s; \ + } \ + res |= (uint32_t)d << (n / 2); + + SAT8(0); + SAT8(16); + SAT8(32); + SAT8(48); +#undef SAT8 + return res; +} + +uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) +{ + uint32_t high; + uint32_t low; + low = x; + if (low & 0x80000000) { + low = 0; + SET_QC(); + } else if (low > 0xffff) { + low = 0xffff; + SET_QC(); + } + high = x >> 32; + if (high & 0x80000000) { + high = 0; + SET_QC(); + } else if (high > 0xffff) { + high = 0xffff; + SET_QC(); + } + return low | (high << 16); +} + +uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) +{ + uint32_t high; + uint32_t low; + low = x; + if (low > 0xffff) { + low = 0xffff; + SET_QC(); + } + high = x >> 32; + if (high > 0xffff) { + high = 0xffff; + SET_QC(); + } + return low | (high << 16); +} + +uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) +{ + int32_t low; + int32_t high; + low = x; + if (low != (int16_t)low) { + low = (low >> 31) ^ 0x7fff; + SET_QC(); + } + high = x >> 32; + if (high != (int16_t)high) { + high = (high >> 31) ^ 0x7fff; + SET_QC(); + } + return (uint16_t)low | (high << 16); +} + +uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) +{ + if (x & 0x8000000000000000ull) { + SET_QC(); + return 0; + } + if (x > 0xffffffffu) { + SET_QC(); + return 0xffffffffu; + } + return x; +} + +uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) +{ + if (x > 0xffffffffu) { + SET_QC(); + return 0xffffffffu; + } + return x; +} + +uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) +{ + if ((int64_t)x != (int32_t)x) { + SET_QC(); + return ((int64_t)x >> 63) ^ 0x7fffffff; + } + return x; +} + +uint64_t HELPER(neon_widen_u8)(uint32_t x) +{ + uint64_t tmp; + uint64_t ret; + ret = (uint8_t)x; + tmp = (uint8_t)(x >> 8); + ret |= tmp << 16; + tmp = (uint8_t)(x >> 16); + ret |= tmp << 32; + tmp = (uint8_t)(x >> 24); + ret |= tmp << 48; + return ret; +} + +uint64_t HELPER(neon_widen_s8)(uint32_t x) +{ + uint64_t tmp; + uint64_t ret; + ret = (uint16_t)(int8_t)x; + tmp = (uint16_t)(int8_t)(x >> 8); + ret |= tmp << 16; + tmp = (uint16_t)(int8_t)(x >> 16); + ret |= tmp << 32; + tmp = (uint16_t)(int8_t)(x >> 24); + ret |= tmp << 48; + return ret; +} + +uint64_t HELPER(neon_widen_u16)(uint32_t x) +{ + uint64_t high = (uint16_t)(x >> 16); + return ((uint16_t)x) | (high << 32); +} + +uint64_t HELPER(neon_widen_s16)(uint32_t x) +{ + uint64_t high = (int16_t)(x >> 16); + return ((uint32_t)(int16_t)x) | (high << 32); +} + +uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ b) & 0x8000800080008000ull; + a &= ~0x8000800080008000ull; + b &= ~0x8000800080008000ull; + return (a + b) ^ mask; +} + +uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ b) & 0x8000000080000000ull; + a &= ~0x8000000080000000ull; + b &= ~0x8000000080000000ull; + return (a + b) ^ mask; +} + +uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) +{ + uint64_t tmp; + uint64_t tmp2; + + tmp = a & 0x0000ffff0000ffffull; + tmp += (a >> 16) & 0x0000ffff0000ffffull; + tmp2 = b & 0xffff0000ffff0000ull; + tmp2 += (b << 16) & 0xffff0000ffff0000ull; + return ( tmp & 0xffff) + | ((tmp >> 16) & 0xffff0000ull) + | ((tmp2 << 16) & 0xffff00000000ull) + | ( tmp2 & 
0xffff000000000000ull); +} + +uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) +{ + uint32_t low = a + (a >> 32); + uint32_t high = b + (b >> 32); + return low + ((uint64_t)high << 32); +} + +uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ ~b) & 0x8000800080008000ull; + a |= 0x8000800080008000ull; + b &= ~0x8000800080008000ull; + return (a - b) ^ mask; +} + +uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ ~b) & 0x8000000080000000ull; + a |= 0x8000000080000000ull; + b &= ~0x8000000080000000ull; + return (a - b) ^ mask; +} + +uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint32_t x, y; + uint32_t low, high; + + x = a; + y = b; + low = x + y; + if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { + SET_QC(); + low = ((int32_t)x >> 31) ^ ~SIGNBIT; + } + x = a >> 32; + y = b >> 32; + high = x + y; + if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { + SET_QC(); + high = ((int32_t)x >> 31) ^ ~SIGNBIT; + } + return low | ((uint64_t)high << 32); +} + +uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint64_t result; + + result = a + b; + if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { + SET_QC(); + result = ((int64_t)a >> 63) ^ ~SIGNBIT64; + } + return result; +} + +/* We have to do the arithmetic in a larger type than + * the input type, because for example with a signed 32 bit + * op the absolute difference can overflow a signed 32 bit value. + */ +#define DO_ABD(dest, x, y, intype, arithtype) do { \ + arithtype tmp_x = (intype)(x); \ + arithtype tmp_y = (intype)(y); \ + dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ + } while(0) + +uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, uint8_t, uint32_t); + DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); + result |= tmp << 16; + DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); + result |= tmp << 32; + DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, int8_t, int32_t); + DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); + result |= tmp << 16; + DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); + result |= tmp << 32; + DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, uint16_t, uint32_t); + DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, int16_t, int32_t); + DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) +{ + uint64_t result; + DO_ABD(result, a, b, uint32_t, uint64_t); + return result; +} + +uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) +{ + uint64_t result; + DO_ABD(result, a, b, int32_t, int64_t); + return result; +} +#undef DO_ABD + +/* Widening multiply. Named type is the source type. 
*/ +#define DO_MULL(dest, x, y, type1, type2) do { \ + type1 tmp_x = x; \ + type1 tmp_y = y; \ + dest = (type2)((type2)tmp_x * (type2)tmp_y); \ + } while(0) + +uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, uint8_t, uint16_t); + DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); + result |= tmp << 16; + DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); + result |= tmp << 32; + DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, int8_t, uint16_t); + DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); + result |= tmp << 16; + DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); + result |= tmp << 32; + DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, uint16_t, uint32_t); + DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, int16_t, uint32_t); + DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_negl_u16)(uint64_t x) +{ + uint16_t tmp; + uint64_t result; + result = (uint16_t)-x; + tmp = -(x >> 16); + result |= (uint64_t)tmp << 16; + tmp = -(x >> 32); + result |= (uint64_t)tmp << 32; + tmp = -(x >> 48); + result |= (uint64_t)tmp << 48; + return result; +} + +uint64_t HELPER(neon_negl_u32)(uint64_t x) +{ + uint32_t low = -x; + uint32_t high = -(x >> 32); + return low | ((uint64_t)high << 32); +} + +/* Saturating sign manipulation. */ +/* ??? 
Make these use NEON_VOP1 */ +#define DO_QABS8(x) do { \ + if (x == (int8_t)0x80) { \ + x = 0x7f; \ + SET_QC(); \ + } else if (x < 0) { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) +{ + neon_s8 vec; + NEON_UNPACK(neon_s8, vec, x); + DO_QABS8(vec.v1); + DO_QABS8(vec.v2); + DO_QABS8(vec.v3); + DO_QABS8(vec.v4); + NEON_PACK(neon_s8, x, vec); + return x; +} +#undef DO_QABS8 + +#define DO_QNEG8(x) do { \ + if (x == (int8_t)0x80) { \ + x = 0x7f; \ + SET_QC(); \ + } else { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) +{ + neon_s8 vec; + NEON_UNPACK(neon_s8, vec, x); + DO_QNEG8(vec.v1); + DO_QNEG8(vec.v2); + DO_QNEG8(vec.v3); + DO_QNEG8(vec.v4); + NEON_PACK(neon_s8, x, vec); + return x; +} +#undef DO_QNEG8 + +#define DO_QABS16(x) do { \ + if (x == (int16_t)0x8000) { \ + x = 0x7fff; \ + SET_QC(); \ + } else if (x < 0) { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) +{ + neon_s16 vec; + NEON_UNPACK(neon_s16, vec, x); + DO_QABS16(vec.v1); + DO_QABS16(vec.v2); + NEON_PACK(neon_s16, x, vec); + return x; +} +#undef DO_QABS16 + +#define DO_QNEG16(x) do { \ + if (x == (int16_t)0x8000) { \ + x = 0x7fff; \ + SET_QC(); \ + } else { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) +{ + neon_s16 vec; + NEON_UNPACK(neon_s16, vec, x); + DO_QNEG16(vec.v1); + DO_QNEG16(vec.v2); + NEON_PACK(neon_s16, x, vec); + return x; +} +#undef DO_QNEG16 + +uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) +{ + if (x == SIGNBIT) { + SET_QC(); + x = ~SIGNBIT; + } else if ((int32_t)x < 0) { + x = -x; + } + return x; +} + +uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) +{ + if (x == SIGNBIT) { + SET_QC(); + x = ~SIGNBIT; + } else { + x = -x; + } + return x; +} + +uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) +{ + if (x == SIGNBIT64) { + SET_QC(); + x = ~SIGNBIT64; + } else if ((int64_t)x < 0) { + x = -x; + } + return x; +} + +uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) +{ + if (x == SIGNBIT64) { + SET_QC(); + x = ~SIGNBIT64; + } else { + x = -x; + } + return x; +} + +/* NEON Float helpers. */ + +/* Floating point comparisons produce an integer result. + * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. + * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
+ */ +uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); +} + +uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float32_le(make_float32(b), make_float32(a), fpst); +} + +uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float32_lt(make_float32(b), make_float32(a), fpst); +} + +uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float32 f0 = float32_abs(make_float32(a)); + float32 f1 = float32_abs(make_float32(b)); + return -float32_le(f1, f0, fpst); +} + +uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float32 f0 = float32_abs(make_float32(a)); + float32 f1 = float32_abs(make_float32(b)); + return -float32_lt(f1, f0, fpst); +} + +uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float64 f0 = float64_abs(make_float64(a)); + float64 f1 = float64_abs(make_float64(b)); + return -float64_le(f1, f0, fpst); +} + +uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float64 f0 = float64_abs(make_float64(a)); + float64 f1 = float64_abs(make_float64(b)); + return -float64_lt(f1, f0, fpst); +} + +#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) + +void HELPER(neon_qunzip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) + | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) + | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) + | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); + uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) + | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) + | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); + uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) + | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) + | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); + uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) + | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) + | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) + | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qunzip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) + | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); + uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); + uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); + uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) + | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qunzip32)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); + uint64_t d1 = 
ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); + uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); + uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_unzip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) + | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) + | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); + uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) + | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) + | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); + + rm[0] = m0; + rd[0] = d0; +} + +void HELPER(neon_unzip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) + | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); + uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) + | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); + + rm[0] = m0; + rd[0] = d0; +} + +void HELPER(neon_qzip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) + | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) + | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) + | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); + uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) + | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) + | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); + uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) + | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) + | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); + uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) + | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) + | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) + | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qzip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) + | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); + uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); + uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); + uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) + | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qzip32)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); + uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); + uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); + uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_zip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + 
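+    /*
+     * The interleave below (byte lane names assumed for this note): with
+     * zd holding d0..d7 and zm holding m0..m7, low byte first, the Vd
+     * word becomes d0 m0 d1 m1 d2 m2 d3 m3 and the Vm word becomes
+     * d4 m4 d5 m5 d6 m6 d7 m7, i.e. the VZIP.8 element ordering.
+     */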
uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) + | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) + | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); + uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) + | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) + | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); + + rm[0] = m0; + rd[0] = d0; +} + +void HELPER(neon_zip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) + | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); + uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) + | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); + + rm[0] = m0; + rd[0] = d0; +} diff --git a/target/arm/tcg/op_helper.c b/target/arm/tcg/op_helper.c new file mode 100644 index 0000000..3baf800 --- /dev/null +++ b/target/arm/tcg/op_helper.c @@ -0,0 +1,1069 @@ +/* + * ARM helper routines + * + * Copyright (c) 2005-2007 CodeSourcery, LLC + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "cpregs.h" + +#define SIGNBIT (uint32_t)0x80000000 +#define SIGNBIT64 ((uint64_t)1 << 63) + +int exception_target_el(CPUARMState *env) +{ + int target_el = MAX(1, arm_current_el(env)); + + /* + * No such thing as secure EL1 if EL3 is aarch32, + * so update the target EL to EL3 in this case. + */ + if (arm_is_secure(env) && !arm_el_is_aa64(env, 3) && target_el == 1) { + target_el = 3; + } + + return target_el; +} + +void raise_exception(CPUARMState *env, uint32_t excp, + uint32_t syndrome, uint32_t target_el) +{ + CPUState *cs = env_cpu(env); + + if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { + /* + * Redirect NS EL1 exceptions to NS EL2. These are reported with + * their original syndrome register value, with the exception of + * SIMD/FP access traps, which are reported as uncategorized + * (see DDI0478C.a D1.10.4) + */ + target_el = 2; + if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) { + syndrome = syn_uncategorized(); + } + } + + assert(!excp_is_internal(excp)); + cs->exception_index = excp; + env->exception.syndrome = syndrome; + env->exception.target_el = target_el; + cpu_loop_exit(cs); +} + +void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, + uint32_t target_el, uintptr_t ra) +{ + CPUState *cs = env_cpu(env); + + /* + * restore_state_to_opc() will set env->exception.syndrome, so + * we must restore CPU state here before setting the syndrome + * the caller passed us, and cannot use cpu_loop_exit_restore(). 
+ */ + cpu_restore_state(cs, ra); + raise_exception(env, excp, syndrome, target_el); +} + +uint64_t HELPER(neon_tbl)(CPUARMState *env, uint32_t desc, + uint64_t ireg, uint64_t def) +{ + uint64_t tmp, val = 0; + uint32_t maxindex = ((desc & 3) + 1) * 8; + uint32_t base_reg = desc >> 2; + uint32_t shift, index, reg; + + for (shift = 0; shift < 64; shift += 8) { + index = (ireg >> shift) & 0xff; + if (index < maxindex) { + reg = base_reg + (index >> 3); + tmp = *aa32_vfp_dreg(env, reg); + tmp = ((tmp >> ((index & 7) << 3)) & 0xff) << shift; + } else { + tmp = def & (0xffull << shift); + } + val |= tmp; + } + return val; +} + +void HELPER(v8m_stackcheck)(CPUARMState *env, uint32_t newvalue) +{ + /* + * Perform the v8M stack limit check for SP updates from translated code, + * raising an exception if the limit is breached. + */ + if (newvalue < v7m_sp_limit(env)) { + /* + * Stack limit exceptions are a rare case, so rather than syncing + * PC/condbits before the call, we use raise_exception_ra() so + * that cpu_restore_state() will sort them out. + */ + raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); + } +} + +uint32_t HELPER(add_setq)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) + env->QF = 1; + return res; +} + +uint32_t HELPER(add_saturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { + env->QF = 1; + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint32_t HELPER(sub_saturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { + env->QF = 1; + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint32_t HELPER(add_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (res < a) { + env->QF = 1; + res = ~0; + } + return res; +} + +uint32_t HELPER(sub_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (res > a) { + env->QF = 1; + res = 0; + } + return res; +} + +/* Signed saturation. */ +static inline uint32_t do_ssat(CPUARMState *env, int32_t val, int shift) +{ + int32_t top; + uint32_t mask; + + top = val >> shift; + mask = (1u << shift) - 1; + if (top > 0) { + env->QF = 1; + return mask; + } else if (top < -1) { + env->QF = 1; + return ~mask; + } + return val; +} + +/* Unsigned saturation. */ +static inline uint32_t do_usat(CPUARMState *env, int32_t val, int shift) +{ + uint32_t max; + + max = (1u << shift) - 1; + if (val < 0) { + env->QF = 1; + return 0; + } else if (val > max) { + env->QF = 1; + return max; + } + return val; +} + +/* Signed saturate. */ +uint32_t HELPER(ssat)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + return do_ssat(env, x, shift); +} + +/* Dual halfword signed saturate. */ +uint32_t HELPER(ssat16)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + uint32_t res; + + res = (uint16_t)do_ssat(env, (int16_t)x, shift); + res |= do_ssat(env, ((int32_t)x) >> 16, shift) << 16; + return res; +} + +/* Unsigned saturate. */ +uint32_t HELPER(usat)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + return do_usat(env, x, shift); +} + +/* Dual halfword unsigned saturate. 
*/ +uint32_t HELPER(usat16)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + uint32_t res; + + res = (uint16_t)do_usat(env, (int16_t)x, shift); + res |= do_usat(env, ((int32_t)x) >> 16, shift) << 16; + return res; +} + +void HELPER(setend)(CPUARMState *env) +{ + env->uncached_cpsr ^= CPSR_E; + arm_rebuild_hflags(env); +} + +void HELPER(check_bxj_trap)(CPUARMState *env, uint32_t rm) +{ + /* + * Only called if in NS EL0 or EL1 for a BXJ for a v7A CPU; + * check if HSTR.TJDBX means we need to trap to EL2. + */ + if (env->cp15.hstr_el2 & HSTR_TJDBX) { + /* + * We know the condition code check passed, so take the IMPDEF + * choice to always report CV=1 COND 0xe + */ + uint32_t syn = syn_bxjtrap(1, 0xe, rm); + raise_exception_ra(env, EXCP_HYP_TRAP, syn, 2, GETPC()); + } +} + +#ifndef CONFIG_USER_ONLY +/* Function checks whether WFx (WFI/WFE) instructions are set up to be trapped. + * The function returns the target EL (1-3) if the instruction is to be trapped; + * otherwise it returns 0 indicating it is not trapped. + */ +static inline int check_wfx_trap(CPUARMState *env, bool is_wfe) +{ + int cur_el = arm_current_el(env); + uint64_t mask; + + if (arm_feature(env, ARM_FEATURE_M)) { + /* M profile cores can never trap WFI/WFE. */ + return 0; + } + + /* If we are currently in EL0 then we need to check if SCTLR is set up for + * WFx instructions being trapped to EL1. These trap bits don't exist in v7. + */ + if (cur_el < 1 && arm_feature(env, ARM_FEATURE_V8)) { + int target_el; + + mask = is_wfe ? SCTLR_nTWE : SCTLR_nTWI; + if (arm_is_secure_below_el3(env) && !arm_el_is_aa64(env, 3)) { + /* Secure EL0 and Secure PL1 is at EL3 */ + target_el = 3; + } else { + target_el = 1; + } + + if (!(env->cp15.sctlr_el[target_el] & mask)) { + return target_el; + } + } + + /* We are not trapping to EL1; trap to EL2 if HCR_EL2 requires it + * No need for ARM_FEATURE check as if HCR_EL2 doesn't exist the + * bits will be zero indicating no trap. + */ + if (cur_el < 2) { + mask = is_wfe ? HCR_TWE : HCR_TWI; + if (arm_hcr_el2_eff(env) & mask) { + return 2; + } + } + + /* We are not trapping to EL1 or EL2; trap to EL3 if SCR_EL3 requires it */ + if (cur_el < 3) { + mask = (is_wfe) ? SCR_TWE : SCR_TWI; + if (env->cp15.scr_el3 & mask) { + return 3; + } + } + + return 0; +} +#endif + +void HELPER(wfi)(CPUARMState *env, uint32_t insn_len) +{ +#ifdef CONFIG_USER_ONLY + /* + * WFI in the user-mode emulator is technically permitted but not + * something any real-world code would do. AArch64 Linux kernels + * trap it via SCTRL_EL1.nTWI and make it an (expensive) NOP; + * AArch32 kernels don't trap it so it will delay a bit. + * For QEMU, make it NOP here, because trying to raise EXCP_HLT + * would trigger an abort. + */ + return; +#else + CPUState *cs = env_cpu(env); + int target_el = check_wfx_trap(env, false); + + if (cpu_has_work(cs)) { + /* Don't bother to go into our "low power state" if + * we would just wake up immediately. + */ + return; + } + + if (target_el) { + if (env->aarch64) { + env->pc -= insn_len; + } else { + env->regs[15] -= insn_len; + } + + raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2), + target_el); + } + + cs->exception_index = EXCP_HLT; + cs->halted = 1; + cpu_loop_exit(cs); +#endif +} + +void HELPER(wfe)(CPUARMState *env) +{ + /* This is a hint instruction that is semantically different + * from YIELD even though we currently implement it identically. + * Don't actually halt the CPU, just yield back to top + * level loop. 
This is not going into a "low power state" + * (ie halting until some event occurs), so we never take + * a configurable trap to a different exception level. + */ + HELPER(yield)(env); +} + +void HELPER(yield)(CPUARMState *env) +{ + CPUState *cs = env_cpu(env); + + /* This is a non-trappable hint instruction that generally indicates + * that the guest is currently busy-looping. Yield control back to the + * top level loop so that a more deserving VCPU has a chance to run. + */ + cs->exception_index = EXCP_YIELD; + cpu_loop_exit(cs); +} + +/* Raise an internal-to-QEMU exception. This is limited to only + * those EXCP values which are special cases for QEMU to interrupt + * execution and not to be used for exceptions which are passed to + * the guest (those must all have syndrome information and thus should + * use exception_with_syndrome*). + */ +void HELPER(exception_internal)(CPUARMState *env, uint32_t excp) +{ + CPUState *cs = env_cpu(env); + + assert(excp_is_internal(excp)); + cs->exception_index = excp; + cpu_loop_exit(cs); +} + +/* Raise an exception with the specified syndrome register value */ +void HELPER(exception_with_syndrome_el)(CPUARMState *env, uint32_t excp, + uint32_t syndrome, uint32_t target_el) +{ + raise_exception(env, excp, syndrome, target_el); +} + +/* + * Raise an exception with the specified syndrome register value + * to the default target el. + */ +void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp, + uint32_t syndrome) +{ + raise_exception(env, excp, syndrome, exception_target_el(env)); +} + +uint32_t HELPER(cpsr_read)(CPUARMState *env) +{ + return cpsr_read(env) & ~CPSR_EXEC; +} + +void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask) +{ + cpsr_write(env, val, mask, CPSRWriteByInstr); + /* TODO: Not all cpsr bits are relevant to hflags. */ + arm_rebuild_hflags(env); +} + +/* Write the CPSR for a 32-bit exception return */ +void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val) +{ + uint32_t mask; + + qemu_mutex_lock_iothread(); + arm_call_pre_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); + + mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar); + cpsr_write(env, val, mask, CPSRWriteExceptionReturn); + + /* Generated code has already stored the new PC value, but + * without masking out its low bits, because which bits need + * masking depends on whether we're returning to Thumb or ARM + * state. Do the masking now. + */ + env->regs[15] &= (env->thumb ? ~1 : ~3); + arm_rebuild_hflags(env); + + qemu_mutex_lock_iothread(); + arm_call_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); +} + +/* Access to user mode registers from privileged modes. 
*/ +uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno) +{ + uint32_t val; + + if (regno == 13) { + val = env->banked_r13[BANK_USRSYS]; + } else if (regno == 14) { + val = env->banked_r14[BANK_USRSYS]; + } else if (regno >= 8 + && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { + val = env->usr_regs[regno - 8]; + } else { + val = env->regs[regno]; + } + return val; +} + +void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val) +{ + if (regno == 13) { + env->banked_r13[BANK_USRSYS] = val; + } else if (regno == 14) { + env->banked_r14[BANK_USRSYS] = val; + } else if (regno >= 8 + && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { + env->usr_regs[regno - 8] = val; + } else { + env->regs[regno] = val; + } +} + +void HELPER(set_r13_banked)(CPUARMState *env, uint32_t mode, uint32_t val) +{ + if ((env->uncached_cpsr & CPSR_M) == mode) { + env->regs[13] = val; + } else { + env->banked_r13[bank_number(mode)] = val; + } +} + +uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode) +{ + if ((env->uncached_cpsr & CPSR_M) == ARM_CPU_MODE_SYS) { + /* SRS instruction is UNPREDICTABLE from System mode; we UNDEF. + * Other UNPREDICTABLE and UNDEF cases were caught at translate time. + */ + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } + + if ((env->uncached_cpsr & CPSR_M) == mode) { + return env->regs[13]; + } else { + return env->banked_r13[bank_number(mode)]; + } +} + +static void msr_mrs_banked_exc_checks(CPUARMState *env, uint32_t tgtmode, + uint32_t regno) +{ + /* Raise an exception if the requested access is one of the UNPREDICTABLE + * cases; otherwise return. This broadly corresponds to the pseudocode + * BankedRegisterAccessValid() and SPSRAccessValid(), + * except that we have already handled some cases at translate time. + */ + int curmode = env->uncached_cpsr & CPSR_M; + + if (regno == 17) { + /* ELR_Hyp: a special case because access from tgtmode is OK */ + if (curmode != ARM_CPU_MODE_HYP && curmode != ARM_CPU_MODE_MON) { + goto undef; + } + return; + } + + if (curmode == tgtmode) { + goto undef; + } + + if (tgtmode == ARM_CPU_MODE_USR) { + switch (regno) { + case 8 ... 12: + if (curmode != ARM_CPU_MODE_FIQ) { + goto undef; + } + break; + case 13: + if (curmode == ARM_CPU_MODE_SYS) { + goto undef; + } + break; + case 14: + if (curmode == ARM_CPU_MODE_HYP || curmode == ARM_CPU_MODE_SYS) { + goto undef; + } + break; + default: + break; + } + } + + if (tgtmode == ARM_CPU_MODE_HYP) { + /* SPSR_Hyp, r13_hyp: accessible from Monitor mode only */ + if (curmode != ARM_CPU_MODE_MON) { + goto undef; + } + } + + return; + +undef: + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); +} + +void HELPER(msr_banked)(CPUARMState *env, uint32_t value, uint32_t tgtmode, + uint32_t regno) +{ + msr_mrs_banked_exc_checks(env, tgtmode, regno); + + switch (regno) { + case 16: /* SPSRs */ + env->banked_spsr[bank_number(tgtmode)] = value; + break; + case 17: /* ELR_Hyp */ + env->elr_el[2] = value; + break; + case 13: + env->banked_r13[bank_number(tgtmode)] = value; + break; + case 14: + env->banked_r14[r14_bank_number(tgtmode)] = value; + break; + case 8 ... 
12: + switch (tgtmode) { + case ARM_CPU_MODE_USR: + env->usr_regs[regno - 8] = value; + break; + case ARM_CPU_MODE_FIQ: + env->fiq_regs[regno - 8] = value; + break; + default: + g_assert_not_reached(); + } + break; + default: + g_assert_not_reached(); + } +} + +uint32_t HELPER(mrs_banked)(CPUARMState *env, uint32_t tgtmode, uint32_t regno) +{ + msr_mrs_banked_exc_checks(env, tgtmode, regno); + + switch (regno) { + case 16: /* SPSRs */ + return env->banked_spsr[bank_number(tgtmode)]; + case 17: /* ELR_Hyp */ + return env->elr_el[2]; + case 13: + return env->banked_r13[bank_number(tgtmode)]; + case 14: + return env->banked_r14[r14_bank_number(tgtmode)]; + case 8 ... 12: + switch (tgtmode) { + case ARM_CPU_MODE_USR: + return env->usr_regs[regno - 8]; + case ARM_CPU_MODE_FIQ: + return env->fiq_regs[regno - 8]; + default: + g_assert_not_reached(); + } + default: + g_assert_not_reached(); + } +} + +const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key, + uint32_t syndrome, uint32_t isread) +{ + ARMCPU *cpu = env_archcpu(env); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); + CPAccessResult res = CP_ACCESS_OK; + int target_el; + + assert(ri != NULL); + + if (arm_feature(env, ARM_FEATURE_XSCALE) && ri->cp < 14 + && extract32(env->cp15.c15_cpar, ri->cp, 1) == 0) { + res = CP_ACCESS_TRAP; + goto fail; + } + + if (ri->accessfn) { + res = ri->accessfn(env, ri, isread); + } + + /* + * If the access function indicates a trap from EL0 to EL1 then + * that always takes priority over the HSTR_EL2 trap. (If it indicates + * a trap to EL3, then the HSTR_EL2 trap takes priority; if it indicates + * a trap to EL2, then the syndrome is the same either way so we don't + * care whether technically the architecture says that HSTR_EL2 trap or + * the other trap takes priority. So we take the "check HSTR_EL2" path + * for all of those cases.) + */ + if (res != CP_ACCESS_OK && ((res & CP_ACCESS_EL_MASK) == 0) && + arm_current_el(env) == 0) { + goto fail; + } + + /* + * HSTR_EL2 traps from EL1 are checked earlier, in generated code; + * we only need to check here for traps from EL0. + */ + if (!is_a64(env) && arm_current_el(env) == 0 && ri->cp == 15 && + arm_is_el2_enabled(env) && + (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { + uint32_t mask = 1 << ri->crn; + + if (ri->type & ARM_CP_64BIT) { + mask = 1 << ri->crm; + } + + /* T4 and T14 are RES0 */ + mask &= ~((1 << 4) | (1 << 14)); + + if (env->cp15.hstr_el2 & mask) { + res = CP_ACCESS_TRAP_EL2; + goto fail; + } + } + + /* + * Fine-grained traps also are lower priority than undef-to-EL1, + * higher priority than trap-to-EL3, and we don't care about priority + * order with other EL2 traps because the syndrome value is the same. 
+ */ + if (arm_fgt_active(env, arm_current_el(env))) { + uint64_t trapword = 0; + unsigned int idx = FIELD_EX32(ri->fgt, FGT, IDX); + unsigned int bitpos = FIELD_EX32(ri->fgt, FGT, BITPOS); + bool rev = FIELD_EX32(ri->fgt, FGT, REV); + bool trapbit; + + if (ri->fgt & FGT_EXEC) { + assert(idx < ARRAY_SIZE(env->cp15.fgt_exec)); + trapword = env->cp15.fgt_exec[idx]; + } else if (isread && (ri->fgt & FGT_R)) { + assert(idx < ARRAY_SIZE(env->cp15.fgt_read)); + trapword = env->cp15.fgt_read[idx]; + } else if (!isread && (ri->fgt & FGT_W)) { + assert(idx < ARRAY_SIZE(env->cp15.fgt_write)); + trapword = env->cp15.fgt_write[idx]; + } + + trapbit = extract64(trapword, bitpos, 1); + if (trapbit != rev) { + res = CP_ACCESS_TRAP_EL2; + goto fail; + } + } + + if (likely(res == CP_ACCESS_OK)) { + return ri; + } + + fail: + switch (res & ~CP_ACCESS_EL_MASK) { + case CP_ACCESS_TRAP: + break; + case CP_ACCESS_TRAP_UNCATEGORIZED: + /* Only CP_ACCESS_TRAP traps are direct to a specified EL */ + assert((res & CP_ACCESS_EL_MASK) == 0); + if (cpu_isar_feature(aa64_ids, cpu) && isread && + arm_cpreg_in_idspace(ri)) { + /* + * FEAT_IDST says this should be reported as EC_SYSTEMREGISTERTRAP, + * not EC_UNCATEGORIZED + */ + break; + } + syndrome = syn_uncategorized(); + break; + default: + g_assert_not_reached(); + } + + target_el = res & CP_ACCESS_EL_MASK; + switch (target_el) { + case 0: + target_el = exception_target_el(env); + break; + case 2: + assert(arm_current_el(env) != 3); + assert(arm_is_el2_enabled(env)); + break; + case 3: + assert(arm_feature(env, ARM_FEATURE_EL3)); + break; + default: + /* No "direct" traps to EL1 */ + g_assert_not_reached(); + } + + raise_exception(env, EXCP_UDEF, syndrome, target_el); +} + +const void *HELPER(lookup_cp_reg)(CPUARMState *env, uint32_t key) +{ + ARMCPU *cpu = env_archcpu(env); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); + + assert(ri != NULL); + return ri; +} + +void HELPER(set_cp_reg)(CPUARMState *env, const void *rip, uint32_t value) +{ + const ARMCPRegInfo *ri = rip; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + ri->writefn(env, ri, value); + qemu_mutex_unlock_iothread(); + } else { + ri->writefn(env, ri, value); + } +} + +uint32_t HELPER(get_cp_reg)(CPUARMState *env, const void *rip) +{ + const ARMCPRegInfo *ri = rip; + uint32_t res; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + res = ri->readfn(env, ri); + qemu_mutex_unlock_iothread(); + } else { + res = ri->readfn(env, ri); + } + + return res; +} + +void HELPER(set_cp_reg64)(CPUARMState *env, const void *rip, uint64_t value) +{ + const ARMCPRegInfo *ri = rip; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + ri->writefn(env, ri, value); + qemu_mutex_unlock_iothread(); + } else { + ri->writefn(env, ri, value); + } +} + +uint64_t HELPER(get_cp_reg64)(CPUARMState *env, const void *rip) +{ + const ARMCPRegInfo *ri = rip; + uint64_t res; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + res = ri->readfn(env, ri); + qemu_mutex_unlock_iothread(); + } else { + res = ri->readfn(env, ri); + } + + return res; +} + +void HELPER(pre_hvc)(CPUARMState *env) +{ + ARMCPU *cpu = env_archcpu(env); + int cur_el = arm_current_el(env); + /* FIXME: Use actual secure state. */ + bool secure = false; + bool undef; + + if (arm_is_psci_call(cpu, EXCP_HVC)) { + /* If PSCI is enabled and this looks like a valid PSCI call then + * that overrides the architecturally mandated HVC behaviour. 
+ */ + return; + } + + if (!arm_feature(env, ARM_FEATURE_EL2)) { + /* If EL2 doesn't exist, HVC always UNDEFs */ + undef = true; + } else if (arm_feature(env, ARM_FEATURE_EL3)) { + /* EL3.HCE has priority over EL2.HCD. */ + undef = !(env->cp15.scr_el3 & SCR_HCE); + } else { + undef = env->cp15.hcr_el2 & HCR_HCD; + } + + /* In ARMv7 and ARMv8/AArch32, HVC is undef in secure state. + * For ARMv8/AArch64, HVC is allowed in EL3. + * Note that we've already trapped HVC from EL0 at translation + * time. + */ + if (secure && (!is_a64(env) || cur_el == 1)) { + undef = true; + } + + if (undef) { + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } +} + +void HELPER(pre_smc)(CPUARMState *env, uint32_t syndrome) +{ + ARMCPU *cpu = env_archcpu(env); + int cur_el = arm_current_el(env); + bool secure = arm_is_secure(env); + bool smd_flag = env->cp15.scr_el3 & SCR_SMD; + + /* + * SMC behaviour is summarized in the following table. + * This helper handles the "Trap to EL2" and "Undef insn" cases. + * The "Trap to EL3" and "PSCI call" cases are handled in the exception + * helper. + * + * -> ARM_FEATURE_EL3 and !SMD + * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 + * + * Conduit SMC, valid call Trap to EL2 PSCI Call + * Conduit SMC, inval call Trap to EL2 Trap to EL3 + * Conduit not SMC Trap to EL2 Trap to EL3 + * + * + * -> ARM_FEATURE_EL3 and SMD + * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 + * + * Conduit SMC, valid call Trap to EL2 PSCI Call + * Conduit SMC, inval call Trap to EL2 Undef insn + * Conduit not SMC Trap to EL2 Undef insn + * + * + * -> !ARM_FEATURE_EL3 + * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 + * + * Conduit SMC, valid call Trap to EL2 PSCI Call + * Conduit SMC, inval call Trap to EL2 Undef insn + * Conduit not SMC Undef insn Undef insn + */ + + /* On ARMv8 with EL3 AArch64, SMD applies to both S and NS state. + * On ARMv8 with EL3 AArch32, or ARMv7 with the Virtualization + * extensions, SMD only applies to NS state. + * On ARMv7 without the Virtualization extensions, the SMD bit + * doesn't exist, but we forbid the guest to set it to 1 in scr_write(), + * so we need not special case this here. + */ + bool smd = arm_feature(env, ARM_FEATURE_AARCH64) ? smd_flag + : smd_flag && !secure; + + if (!arm_feature(env, ARM_FEATURE_EL3) && + cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) { + /* If we have no EL3 then SMC always UNDEFs and can't be + * trapped to EL2. PSCI-via-SMC is a sort of ersatz EL3 + * firmware within QEMU, and we want an EL2 guest to be able + * to forbid its EL1 from making PSCI calls into QEMU's + * "firmware" via HCR.TSC, so for these purposes treat + * PSCI-via-SMC as implying an EL3. + * This handles the very last line of the previous table. + */ + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } + + if (cur_el == 1 && (arm_hcr_el2_eff(env) & HCR_TSC)) { + /* In NS EL1, HCR controlled routing to EL2 has priority over SMD. + * We also want an EL2 guest to be able to forbid its EL1 from + * making PSCI calls into QEMU's "firmware" via HCR.TSC. + * This handles all the "Trap to EL2" cases of the previous table. + */ + raise_exception(env, EXCP_HYP_TRAP, syndrome, 2); + } + + /* Catch the two remaining "Undef insn" cases of the previous table: + * - PSCI conduit is SMC but we don't have a valid PCSI call, + * - We don't have EL3 or SMD is set. 
+ */ + if (!arm_is_psci_call(cpu, EXCP_SMC) && + (smd || !arm_feature(env, ARM_FEATURE_EL3))) { + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } +} + +/* ??? Flag setting arithmetic is awkward because we need to do comparisons. + The only way to do that in TCG is a conditional branch, which clobbers + all our temporaries. For now implement these as helper functions. */ + +/* Similarly for variable shift instructions. */ + +uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift = i & 0xff; + if (shift >= 32) { + if (shift == 32) + env->CF = x & 1; + else + env->CF = 0; + return 0; + } else if (shift != 0) { + env->CF = (x >> (32 - shift)) & 1; + return x << shift; + } + return x; +} + +uint32_t HELPER(shr_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift = i & 0xff; + if (shift >= 32) { + if (shift == 32) + env->CF = (x >> 31) & 1; + else + env->CF = 0; + return 0; + } else if (shift != 0) { + env->CF = (x >> (shift - 1)) & 1; + return x >> shift; + } + return x; +} + +uint32_t HELPER(sar_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift = i & 0xff; + if (shift >= 32) { + env->CF = (x >> 31) & 1; + return (int32_t)x >> 31; + } else if (shift != 0) { + env->CF = (x >> (shift - 1)) & 1; + return (int32_t)x >> shift; + } + return x; +} + +uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift1, shift; + shift1 = i & 0xff; + shift = shift1 & 0x1f; + if (shift == 0) { + if (shift1 != 0) + env->CF = (x >> 31) & 1; + return x; + } else { + env->CF = (x >> (shift - 1)) & 1; + return ((uint32_t)x >> shift) | (x << (32 - shift)); + } +} + +void HELPER(probe_access)(CPUARMState *env, target_ulong ptr, + uint32_t access_type, uint32_t mmu_idx, + uint32_t size) +{ + uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE); + uintptr_t ra = GETPC(); + + if (likely(size <= in_page)) { + probe_access(env, ptr, size, access_type, mmu_idx, ra); + } else { + probe_access(env, ptr, in_page, access_type, mmu_idx, ra); + probe_access(env, ptr + in_page, size - in_page, + access_type, mmu_idx, ra); + } +} + +/* + * This function corresponds to AArch64.vESBOperation(). + * Note that the AArch32 version is not functionally different. + */ +void HELPER(vesb)(CPUARMState *env) +{ + /* + * The EL2Enabled() check is done inside arm_hcr_el2_eff, + * and will return HCR_EL2.VSE == 0, so nothing happens. + */ + uint64_t hcr = arm_hcr_el2_eff(env); + bool enabled = !(hcr & HCR_TGE) && (hcr & HCR_AMO); + bool pending = enabled && (hcr & HCR_VSE); + bool masked = (env->daif & PSTATE_A); + + /* If VSE pending and masked, defer the exception. */ + if (pending && masked) { + uint32_t syndrome; + + if (arm_el_is_aa64(env, 1)) { + /* Copy across IDS and ISS from VSESR. */ + syndrome = env->cp15.vsesr_el2 & 0x1ffffff; + } else { + ARMMMUFaultInfo fi = { .type = ARMFault_AsyncExternal }; + + if (extended_addresses_enabled(env)) { + syndrome = arm_fi_to_lfsc(&fi); + } else { + syndrome = arm_fi_to_sfsc(&fi); + } + /* Copy across AET and ExT from VSESR. */ + syndrome |= env->cp15.vsesr_el2 & 0xd000; + } + + /* Set VDISR_EL2.A along with the syndrome. 
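+ * VDISR_EL2.A is bit 31, hence the (1u << 31) below.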
*/ + env->cp15.vdisr_el2 = syndrome | (1u << 31); + + /* Clear pending virtual SError */ + env->cp15.hcr_el2 &= ~HCR_VSE; + cpu_reset_interrupt(env_cpu(env), CPU_INTERRUPT_VSERR); + } +} diff --git a/target/arm/tcg/pauth_helper.c b/target/arm/tcg/pauth_helper.c new file mode 100644 index 0000000..d0483bf --- /dev/null +++ b/target/arm/tcg/pauth_helper.c @@ -0,0 +1,515 @@ +/* + * ARM v8.3-PAuth Operations + * + * Copyright (c) 2019 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "qemu/xxhash.h" + + +static uint64_t pac_cell_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= extract64(i, 52, 4); + o |= extract64(i, 24, 4) << 4; + o |= extract64(i, 44, 4) << 8; + o |= extract64(i, 0, 4) << 12; + + o |= extract64(i, 28, 4) << 16; + o |= extract64(i, 48, 4) << 20; + o |= extract64(i, 4, 4) << 24; + o |= extract64(i, 40, 4) << 28; + + o |= extract64(i, 32, 4) << 32; + o |= extract64(i, 12, 4) << 36; + o |= extract64(i, 56, 4) << 40; + o |= extract64(i, 20, 4) << 44; + + o |= extract64(i, 8, 4) << 48; + o |= extract64(i, 36, 4) << 52; + o |= extract64(i, 16, 4) << 56; + o |= extract64(i, 60, 4) << 60; + + return o; +} + +static uint64_t pac_cell_inv_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= extract64(i, 12, 4); + o |= extract64(i, 24, 4) << 4; + o |= extract64(i, 48, 4) << 8; + o |= extract64(i, 36, 4) << 12; + + o |= extract64(i, 56, 4) << 16; + o |= extract64(i, 44, 4) << 20; + o |= extract64(i, 4, 4) << 24; + o |= extract64(i, 16, 4) << 28; + + o |= i & MAKE_64BIT_MASK(32, 4); + o |= extract64(i, 52, 4) << 36; + o |= extract64(i, 28, 4) << 40; + o |= extract64(i, 8, 4) << 44; + + o |= extract64(i, 20, 4) << 48; + o |= extract64(i, 0, 4) << 52; + o |= extract64(i, 40, 4) << 56; + o |= i & MAKE_64BIT_MASK(60, 4); + + return o; +} + +static uint64_t pac_sub(uint64_t i) +{ + static const uint8_t sub[16] = { + 0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe, + 0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa, + }; + uint64_t o = 0; + int b; + + for (b = 0; b < 64; b += 4) { + o |= (uint64_t)sub[(i >> b) & 0xf] << b; + } + return o; +} + +static uint64_t pac_inv_sub(uint64_t i) +{ + static const uint8_t inv_sub[16] = { + 0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9, + 0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3, + }; + uint64_t o = 0; + int b; + + for (b = 0; b < 64; b += 4) { + o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; + } + return o; +} + +static int rot_cell(int cell, int n) +{ + /* 4-bit rotate left by n. 
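+ * For example, rot_cell(0b1001, 1) == 0b0011.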
*/ + cell |= cell << 4; + return extract32(cell, 4 - n, 4); +} + +static uint64_t pac_mult(uint64_t i) +{ + uint64_t o = 0; + int b; + + for (b = 0; b < 4 * 4; b += 4) { + int i0, i4, i8, ic, t0, t1, t2, t3; + + i0 = extract64(i, b, 4); + i4 = extract64(i, b + 4 * 4, 4); + i8 = extract64(i, b + 8 * 4, 4); + ic = extract64(i, b + 12 * 4, 4); + + t0 = rot_cell(i8, 1) ^ rot_cell(i4, 2) ^ rot_cell(i0, 1); + t1 = rot_cell(ic, 1) ^ rot_cell(i4, 1) ^ rot_cell(i0, 2); + t2 = rot_cell(ic, 2) ^ rot_cell(i8, 1) ^ rot_cell(i0, 1); + t3 = rot_cell(ic, 1) ^ rot_cell(i8, 2) ^ rot_cell(i4, 1); + + o |= (uint64_t)t3 << b; + o |= (uint64_t)t2 << (b + 4 * 4); + o |= (uint64_t)t1 << (b + 8 * 4); + o |= (uint64_t)t0 << (b + 12 * 4); + } + return o; +} + +static uint64_t tweak_cell_rot(uint64_t cell) +{ + return (cell >> 1) | (((cell ^ (cell >> 1)) & 1) << 3); +} + +static uint64_t tweak_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= extract64(i, 16, 4) << 0; + o |= extract64(i, 20, 4) << 4; + o |= tweak_cell_rot(extract64(i, 24, 4)) << 8; + o |= extract64(i, 28, 4) << 12; + + o |= tweak_cell_rot(extract64(i, 44, 4)) << 16; + o |= extract64(i, 8, 4) << 20; + o |= extract64(i, 12, 4) << 24; + o |= tweak_cell_rot(extract64(i, 32, 4)) << 28; + + o |= extract64(i, 48, 4) << 32; + o |= extract64(i, 52, 4) << 36; + o |= extract64(i, 56, 4) << 40; + o |= tweak_cell_rot(extract64(i, 60, 4)) << 44; + + o |= tweak_cell_rot(extract64(i, 0, 4)) << 48; + o |= extract64(i, 4, 4) << 52; + o |= tweak_cell_rot(extract64(i, 40, 4)) << 56; + o |= tweak_cell_rot(extract64(i, 36, 4)) << 60; + + return o; +} + +static uint64_t tweak_cell_inv_rot(uint64_t cell) +{ + return ((cell << 1) & 0xf) | ((cell & 1) ^ (cell >> 3)); +} + +static uint64_t tweak_inv_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= tweak_cell_inv_rot(extract64(i, 48, 4)); + o |= extract64(i, 52, 4) << 4; + o |= extract64(i, 20, 4) << 8; + o |= extract64(i, 24, 4) << 12; + + o |= extract64(i, 0, 4) << 16; + o |= extract64(i, 4, 4) << 20; + o |= tweak_cell_inv_rot(extract64(i, 8, 4)) << 24; + o |= extract64(i, 12, 4) << 28; + + o |= tweak_cell_inv_rot(extract64(i, 28, 4)) << 32; + o |= tweak_cell_inv_rot(extract64(i, 60, 4)) << 36; + o |= tweak_cell_inv_rot(extract64(i, 56, 4)) << 40; + o |= tweak_cell_inv_rot(extract64(i, 16, 4)) << 44; + + o |= extract64(i, 32, 4) << 48; + o |= extract64(i, 36, 4) << 52; + o |= extract64(i, 40, 4) << 56; + o |= tweak_cell_inv_rot(extract64(i, 44, 4)) << 60; + + return o; +} + +static uint64_t pauth_computepac_architected(uint64_t data, uint64_t modifier, + ARMPACKey key) +{ + static const uint64_t RC[5] = { + 0x0000000000000000ull, + 0x13198A2E03707344ull, + 0xA4093822299F31D0ull, + 0x082EFA98EC4E6C89ull, + 0x452821E638D01377ull, + }; + const uint64_t alpha = 0xC0AC29B7C97C50DDull; + /* + * Note that in the ARM pseudocode, key0 contains bits <127:64> + * and key1 contains bits <63:0> of the 128-bit key. 
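+ * The first loop below runs the five forward rounds, the straight-line
+ * code in the middle corresponds to the central part of the cipher, and
+ * the second loop runs the rounds in reverse, as in the ComputePAC
+ * pseudocode.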
+ */ + uint64_t key0 = key.hi, key1 = key.lo; + uint64_t workingval, runningmod, roundkey, modk0; + int i; + + modk0 = (key0 << 63) | ((key0 >> 1) ^ (key0 >> 63)); + runningmod = modifier; + workingval = data ^ key0; + + for (i = 0; i <= 4; ++i) { + roundkey = key1 ^ runningmod; + workingval ^= roundkey; + workingval ^= RC[i]; + if (i > 0) { + workingval = pac_cell_shuffle(workingval); + workingval = pac_mult(workingval); + } + workingval = pac_sub(workingval); + runningmod = tweak_shuffle(runningmod); + } + roundkey = modk0 ^ runningmod; + workingval ^= roundkey; + workingval = pac_cell_shuffle(workingval); + workingval = pac_mult(workingval); + workingval = pac_sub(workingval); + workingval = pac_cell_shuffle(workingval); + workingval = pac_mult(workingval); + workingval ^= key1; + workingval = pac_cell_inv_shuffle(workingval); + workingval = pac_inv_sub(workingval); + workingval = pac_mult(workingval); + workingval = pac_cell_inv_shuffle(workingval); + workingval ^= key0; + workingval ^= runningmod; + for (i = 0; i <= 4; ++i) { + workingval = pac_inv_sub(workingval); + if (i < 4) { + workingval = pac_mult(workingval); + workingval = pac_cell_inv_shuffle(workingval); + } + runningmod = tweak_inv_shuffle(runningmod); + roundkey = key1 ^ runningmod; + workingval ^= RC[4 - i]; + workingval ^= roundkey; + workingval ^= alpha; + } + workingval ^= modk0; + + return workingval; +} + +static uint64_t pauth_computepac_impdef(uint64_t data, uint64_t modifier, + ARMPACKey key) +{ + return qemu_xxhash64_4(data, modifier, key.lo, key.hi); +} + +static uint64_t pauth_computepac(CPUARMState *env, uint64_t data, + uint64_t modifier, ARMPACKey key) +{ + if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) { + return pauth_computepac_architected(data, modifier, key); + } else { + return pauth_computepac_impdef(data, modifier, key); + } +} + +static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier, + ARMPACKey *key, bool data) +{ + ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + uint64_t pac, ext_ptr, ext, test; + int bot_bit, top_bit; + + /* If tagged pointers are in use, use ptr<55>, otherwise ptr<63>. */ + if (param.tbi) { + ext = sextract64(ptr, 55, 1); + } else { + ext = sextract64(ptr, 63, 1); + } + + /* Build a pointer with known good extension bits. */ + top_bit = 64 - 8 * param.tbi; + bot_bit = 64 - param.tsz; + ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext); + + pac = pauth_computepac(env, ext_ptr, modifier, *key); + + /* + * Check if the ptr has good extension bits and corrupt the + * pointer authentication code if not. + */ + test = sextract64(ptr, bot_bit, top_bit - bot_bit); + if (test != 0 && test != -1) { + /* + * Note that our top_bit is one greater than the pseudocode's + * version, hence "- 2" here. + */ + pac ^= MAKE_64BIT_MASK(top_bit - 2, 1); + } + + /* + * Preserve the determination between upper and lower at bit 55, + * and insert pointer authentication code. + */ + if (param.tbi) { + ptr &= ~MAKE_64BIT_MASK(bot_bit, 55 - bot_bit + 1); + pac &= MAKE_64BIT_MASK(bot_bit, 54 - bot_bit + 1); + } else { + ptr &= MAKE_64BIT_MASK(0, bot_bit); + pac &= ~(MAKE_64BIT_MASK(55, 1) | MAKE_64BIT_MASK(0, bot_bit)); + } + ext &= MAKE_64BIT_MASK(55, 1); + return pac | ext | ptr; +} + +static uint64_t pauth_original_ptr(uint64_t ptr, ARMVAParameters param) +{ + /* Note that bit 55 is used whether or not the regime has 2 ranges. 
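+ * The PAC field is overwritten with copies of bit 55, which yields the
+ * pointer as it was before the PAC was inserted.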
*/ + uint64_t extfield = sextract64(ptr, 55, 1); + int bot_pac_bit = 64 - param.tsz; + int top_pac_bit = 64 - 8 * param.tbi; + + return deposit64(ptr, bot_pac_bit, top_pac_bit - bot_pac_bit, extfield); +} + +static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier, + ARMPACKey *key, bool data, int keynumber) +{ + ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + int bot_bit, top_bit; + uint64_t pac, orig_ptr, test; + + orig_ptr = pauth_original_ptr(ptr, param); + pac = pauth_computepac(env, orig_ptr, modifier, *key); + bot_bit = 64 - param.tsz; + top_bit = 64 - 8 * param.tbi; + + test = (pac ^ ptr) & ~MAKE_64BIT_MASK(55, 1); + if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) { + int error_code = (keynumber << 1) | (keynumber ^ 1); + if (param.tbi) { + return deposit64(orig_ptr, 53, 2, error_code); + } else { + return deposit64(orig_ptr, 61, 2, error_code); + } + } + return orig_ptr; +} + +static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data) +{ + ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + + return pauth_original_ptr(ptr, param); +} + +static G_NORETURN +void pauth_trap(CPUARMState *env, int target_el, uintptr_t ra) +{ + raise_exception_ra(env, EXCP_UDEF, syn_pactrap(), target_el, ra); +} + +static void pauth_check_trap(CPUARMState *env, int el, uintptr_t ra) +{ + if (el < 2 && arm_is_el2_enabled(env)) { + uint64_t hcr = arm_hcr_el2_eff(env); + bool trap = !(hcr & HCR_API); + if (el == 0) { + /* Trap only applies to EL1&0 regime. */ + trap &= (hcr & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE); + } + /* FIXME: ARMv8.3-NV: HCR_NV trap takes precedence for ERETA[AB]. 
*/ + if (trap) { + pauth_trap(env, 2, ra); + } + } + if (el < 3 && arm_feature(env, ARM_FEATURE_EL3)) { + if (!(env->cp15.scr_el3 & SCR_API)) { + pauth_trap(env, 3, ra); + } + } +} + +static bool pauth_key_enabled(CPUARMState *env, int el, uint32_t bit) +{ + return (arm_sctlr(env, el) & bit) != 0; +} + +uint64_t HELPER(pacia)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apia, false); +} + +uint64_t HELPER(pacib)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apib, false); +} + +uint64_t HELPER(pacda)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apda, true); +} + +uint64_t HELPER(pacdb)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apdb, true); +} + +uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y) +{ + uint64_t pac; + + pauth_check_trap(env, arm_current_el(env), GETPC()); + pac = pauth_computepac(env, x, y, env->keys.apga); + + return pac & 0xffffffff00000000ull; +} + +uint64_t HELPER(autia)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apia, false, 0); +} + +uint64_t HELPER(autib)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apib, false, 1); +} + +uint64_t HELPER(autda)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apda, true, 0); +} + +uint64_t HELPER(autdb)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apdb, true, 1); +} + +uint64_t HELPER(xpaci)(CPUARMState *env, uint64_t a) +{ + return pauth_strip(env, a, false); +} + +uint64_t HELPER(xpacd)(CPUARMState *env, uint64_t a) +{ + return pauth_strip(env, a, true); +} diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c new file mode 100644 index 0000000..1e67fca --- /dev/null +++ b/target/arm/tcg/sme_helper.c @@ -0,0 +1,1168 @@ +/* + * ARM SME Operations + * + * Copyright (c) 2022 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "tcg/tcg-gvec-desc.h" +#include "exec/helper-proto.h" +#include "exec/cpu_ldst.h" +#include "exec/exec-all.h" +#include "qemu/int128.h" +#include "fpu/softfloat.h" +#include "vec_internal.h" +#include "sve_ldst_internal.h" + +void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask) +{ + aarch64_set_svcr(env, val, mask); +} + +void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl) +{ + uint32_t i; + + /* + * Special case clearing the entire ZA space. + * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any + * parts of the ZA storage outside of SVL. + */ + if (imm == 0xff) { + memset(env->zarray, 0, sizeof(env->zarray)); + return; + } + + /* + * Recall that ZAnH.D[m] is spread across ZA[n+8*m], + * so each row is discontiguous within ZA[]. + */ + for (i = 0; i < svl; i++) { + if (imm & (1 << (i % 8))) { + memset(&env->zarray[i], 0, svl); + } + } +} + + +/* + * When considering the ZA storage as an array of elements of + * type T, the index within that array of the Nth element of + * a vertical slice of a tile can be calculated like this, + * regardless of the size of type T. This is because the tiles + * are interleaved, so if type T is size N bytes then row 1 of + * the tile is N rows away from row 0. The division by N to + * convert a byte offset into an array index and the multiplication + * by N to convert from vslice-index-within-the-tile to + * the index within the ZA storage cancel out. + */ +#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg)) + +/* + * When doing byte arithmetic on the ZA storage, the element + * byteoff bytes away in a tile vertical slice is always this + * many bytes away in the ZA storage, regardless of the + * size of the tile element, assuming that byteoff is a multiple + * of the element size. Again this is because of the interleaving + * of the tiles. For instance if we have 1 byte per element then + * each row of the ZA storage has one byte of the vslice data, + * and (counting from 0) byte 8 goes in row 8 of the storage + * at offset (8 * row-size-in-bytes). + * If we have 8 bytes per element then each row of the ZA storage + * has 8 bytes of the data, but there are 8 interleaved tiles and + * so byte 8 of the data goes into row 1 of the tile, + * which is again row 8 of the storage, so the offset is still + * (8 * row-size-in-bytes). Similarly for other element sizes. + */ +#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg)) + + +/* + * Move Zreg vector to ZArray column. 
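+ * SVE predicates allocate one bit per byte of the vector, hence the
+ * "pg >>= sizeof(TYPE)" stepping in the expansion below.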
+ */ +#define DO_MOVA_C(NAME, TYPE, H) \ +void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \ +{ \ + int i, oprsz = simd_oprsz(desc); \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \ + } \ + i += sizeof(TYPE); \ + pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +DO_MOVA_C(sme_mova_cz_b, uint8_t, H1) +DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2) +DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4) + +void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 8; + uint8_t *pg = vg; + uint64_t *n = vn; + uint64_t *a = za; + + for (i = 0; i < oprsz; i++) { + if (pg[H1(i)] & 1) { + a[tile_vslice_index(i)] = n[i]; + } + } +} + +void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 16; + uint16_t *pg = vg; + Int128 *n = vn; + Int128 *a = za; + + /* + * Int128 is used here simply to copy 16 bytes, and to simplify + * the address arithmetic. + */ + for (i = 0; i < oprsz; i++) { + if (pg[H2(i)] & 1) { + a[tile_vslice_index(i)] = n[i]; + } + } +} + +#undef DO_MOVA_C + +/* + * Move ZArray column to Zreg vector. + */ +#define DO_MOVA_Z(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \ +{ \ + int i, oprsz = simd_oprsz(desc); \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \ + } \ + i += sizeof(TYPE); \ + pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1) +DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2) +DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4) + +void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 8; + uint8_t *pg = vg; + uint64_t *d = vd; + uint64_t *a = za; + + for (i = 0; i < oprsz; i++) { + if (pg[H1(i)] & 1) { + d[i] = a[tile_vslice_index(i)]; + } + } +} + +void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 16; + uint16_t *pg = vg; + Int128 *d = vd; + Int128 *a = za; + + /* + * Int128 is used here simply to copy 16 bytes, and to simplify + * the address arithmetic. + */ + for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) { + if (pg[H2(i)] & 1) { + d[i] = a[tile_vslice_index(i)]; + } + } +} + +#undef DO_MOVA_Z + +/* + * Clear elements in a tile slice comprising len bytes. 
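+ * clear_horizontal is a plain memset because horizontal slices are
+ * contiguous in the ZA storage; the vertical variants step by
+ * sizeof(ARMVectorReg) between elements, one flavour per element size.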
+ */ + +typedef void ClearFn(void *ptr, size_t off, size_t len); + +static void clear_horizontal(void *ptr, size_t off, size_t len) +{ + memset(ptr + off, 0, len); +} + +static void clear_vertical_b(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; ++i) { + *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_h(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 2) { + *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_s(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 4) { + *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_d(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 8) { + *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_q(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 16) { + memset(vptr + tile_vslice_offset(i + off), 0, 16); + } +} + +/* + * Copy elements from an array into a tile slice comprising len bytes. + */ + +typedef void CopyFn(void *dst, const void *src, size_t len); + +static void copy_horizontal(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void copy_vertical_b(void *vdst, const void *vsrc, size_t len) +{ + const uint8_t *src = vsrc; + uint8_t *dst = vdst; + size_t i; + + for (i = 0; i < len; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_h(void *vdst, const void *vsrc, size_t len) +{ + const uint16_t *src = vsrc; + uint16_t *dst = vdst; + size_t i; + + for (i = 0; i < len / 2; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_s(void *vdst, const void *vsrc, size_t len) +{ + const uint32_t *src = vsrc; + uint32_t *dst = vdst; + size_t i; + + for (i = 0; i < len / 4; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_d(void *vdst, const void *vsrc, size_t len) +{ + const uint64_t *src = vsrc; + uint64_t *dst = vdst; + size_t i; + + for (i = 0; i < len / 8; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_q(void *vdst, const void *vsrc, size_t len) +{ + for (size_t i = 0; i < len; i += 16) { + memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16); + } +} + +/* + * Host and TLB primitives for vertical tile slice addressing. + */ + +#define DO_LD(NAME, TYPE, HOST, TLB) \ +static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + TYPE val = HOST(host); \ + *(TYPE *)(za + tile_vslice_offset(off)) = val; \ +} \ +static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ + intptr_t off, target_ulong addr, uintptr_t ra) \ +{ \ + TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \ + *(TYPE *)(za + tile_vslice_offset(off)) = val; \ +} + +#define DO_ST(NAME, TYPE, HOST, TLB) \ +static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ + HOST(host, val); \ +} \ +static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ + intptr_t off, target_ulong addr, uintptr_t ra) \ +{ \ + TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ + TLB(env, useronly_clean_ptr(addr), val, ra); \ +} + +/* + * The ARMVectorReg elements are stored in host-endian 64-bit units. + * For 128-bit quantities, the sequence defined by the Elem[] pseudocode + * corresponds to storing the two 64-bit pieces in little-endian order. 
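+ * The BE parameter below therefore only selects which half goes first;
+ * byte-swapping within each half is handled by the HOST/TLB accessors.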
+ */ +#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \ +static inline void HNAME##_host(void *za, intptr_t off, void *host) \ +{ \ + uint64_t val0 = HOST(host), val1 = HOST(host + 8); \ + uint64_t *ptr = za + off; \ + ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ +} \ +static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + HNAME##_host(za, tile_vslice_offset(off), host); \ +} \ +static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \ + uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \ + uint64_t *ptr = za + off; \ + ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ +} \ +static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ +} + +#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \ +static inline void HNAME##_host(void *za, intptr_t off, void *host) \ +{ \ + uint64_t *ptr = za + off; \ + HOST(host, ptr[BE]); \ + HOST(host + 1, ptr[!BE]); \ +} \ +static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + HNAME##_host(za, tile_vslice_offset(off), host); \ +} \ +static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + uint64_t *ptr = za + off; \ + TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \ + TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \ +} \ +static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ +} + +DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra) +DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra) +DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra) +DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra) +DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra) +DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra) +DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra) + +DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra) +DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra) + +DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra) +DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra) +DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra) +DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra) +DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra) +DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra) +DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra) + +DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra) +DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra) + +#undef DO_LD +#undef DO_ST +#undef DO_LDQ +#undef DO_STQ + +/* + * Common helper for all contiguous predicated loads. + */ + +static inline QEMU_ALWAYS_INLINE +void sme_ld1(CPUARMState *env, void *za, uint64_t *vg, + const target_ulong addr, uint32_t desc, const uintptr_t ra, + const int esz, uint32_t mtedesc, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn, + ClearFn *clr_fn, + CopyFn *cpy_fn) +{ + const intptr_t reg_max = simd_oprsz(desc); + const intptr_t esize = 1 << esz; + intptr_t reg_off, reg_last; + SVEContLdSt info; + void *host; + int flags; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { + /* The entire predicate was false; no load occurs. 
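+ * The destination tile slice must still be cleared, since these
+ * loads zero the inactive ZA elements.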
*/ + clr_fn(za, 0, reg_max); + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, + BP_MEM_READ, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, + mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + ARMVectorReg scratch = { }; + + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + tlb_fn(env, &scratch, reg_off, addr + reg_off, ra); + } + reg_off += esize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + + cpy_fn(za, &scratch, reg_max); + return; +#endif + } + + /* The entire operation is in RAM, on valid pages. */ + + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + if (!vertical) { + memset(za, 0, reg_max); + } else if (reg_off) { + clr_fn(za, 0, reg_off); + } + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } else if (vertical) { + clr_fn(za, reg_off, esize); + } + reg_off += esize; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + reg_off = info.reg_off_split; + if (unlikely(reg_off >= 0)) { + tlb_fn(env, za, reg_off, addr + reg_off, ra); + } + + reg_off = info.reg_off_first[1]; + if (unlikely(reg_off >= 0)) { + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } else if (vertical) { + clr_fn(za, reg_off, esize); + } + reg_off += esize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, + target_ulong addr, uint32_t desc, uintptr_t ra, + const int esz, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn, + ClearFn *clr_fn, + CopyFn *cpy_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
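+ * If TBI is disabled for this address, or if TCMA allows unchecked
+ * accesses for this tag, drop mtedesc so that no per-access tag
+ * checks are performed.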
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical, + host_fn, tlb_fn, clr_fn, cpy_fn); +} + +#define DO_LD(L, END, ESZ) \ +void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ + sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ + clear_horizontal, copy_horizontal); \ +} \ +void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ + sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ + clear_vertical_##L, copy_vertical_##L); \ +} \ +void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ + sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ + clear_horizontal, copy_horizontal); \ +} \ +void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ + sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ + clear_vertical_##L, copy_vertical_##L); \ +} + +DO_LD(b, , MO_8) +DO_LD(h, _be, MO_16) +DO_LD(h, _le, MO_16) +DO_LD(s, _be, MO_32) +DO_LD(s, _le, MO_32) +DO_LD(d, _be, MO_64) +DO_LD(d, _le, MO_64) +DO_LD(q, _be, MO_128) +DO_LD(q, _le, MO_128) + +#undef DO_LD + +/* + * Common helper for all contiguous predicated stores. + */ + +static inline QEMU_ALWAYS_INLINE +void sme_st1(CPUARMState *env, void *za, uint64_t *vg, + const target_ulong addr, uint32_t desc, const uintptr_t ra, + const int esz, uint32_t mtedesc, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const intptr_t reg_max = simd_oprsz(desc); + const intptr_t esize = 1 << esz; + intptr_t reg_off, reg_last; + SVEContLdSt info; + void *host; + int flags; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { + /* The entire predicate was false; no store occurs. */ + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, + BP_MEM_WRITE, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, + mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. We cannot avoid + * this fault and will leave with the store incomplete. 
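+ * Unlike the load path there is no scratch buffer; active elements
+ * are stored directly via the TLB functions.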
+ */ + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + tlb_fn(env, za, reg_off, addr + reg_off, ra); + } + reg_off += esize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + return; +#endif + } + + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } + reg_off += 1 << esz; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + reg_off = info.reg_off_split; + if (unlikely(reg_off >= 0)) { + tlb_fn(env, za, reg_off, addr + reg_off, ra); + } + + reg_off = info.reg_off_first[1]; + if (unlikely(reg_off >= 0)) { + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } + reg_off += 1 << esz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, + uint32_t desc, uintptr_t ra, int esz, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
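+ * (Same TBI/TCMA suppression as in sme_ld1_mte above.)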
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc, + vertical, host_fn, tlb_fn); +} + +#define DO_ST(L, END, ESZ) \ +void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ + sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ +} \ +void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ + sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ +} \ +void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ + sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ +} \ +void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ + sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ +} + +DO_ST(b, , MO_8) +DO_ST(h, _be, MO_16) +DO_ST(h, _le, MO_16) +DO_ST(s, _be, MO_32) +DO_ST(s, _le, MO_32) +DO_ST(d, _be, MO_64) +DO_ST(d, _le, MO_64) +DO_ST(q, _be, MO_128) +DO_ST(q, _le, MO_128) + +#undef DO_ST + +void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 4; + uint64_t *pn = vpn, *pm = vpm; + uint32_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ) { + uint64_t pa = pn[row >> 4]; + do { + if (pa & 1) { + for (col = 0; col < oprsz; ) { + uint64_t pb = pm[col >> 4]; + do { + if (pb & 1) { + zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)]; + } + pb >>= 4; + } while (++col & 15); + } + } + pa >>= 4; + } while (++row & 15); + } +} + +void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + uint8_t *pn = vpn, *pm = vpm; + uint64_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ++row) { + if (pn[H1(row)] & 1) { + for (col = 0; col < oprsz; ++col) { + if (pm[H1(col)] & 1) { + zda[tile_vslice_index(row) + col] += zn[col]; + } + } + } + } +} + +void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 4; + uint64_t *pn = vpn, *pm = vpm; + uint32_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ) { + uint64_t pa = pn[row >> 4]; + do { + if (pa & 1) { + uint32_t zn_row = zn[H4(row)]; + for (col = 0; col < oprsz; ) { + uint64_t pb = pm[col >> 4]; + do { + if (pb & 1) { + zda[tile_vslice_index(row) + H4(col)] += zn_row; + } + pb >>= 4; + } while (++col & 15); + } + } + pa >>= 4; + } while (++row & 15); + } +} + +void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + uint8_t *pn = vpn, *pm = vpm; + uint64_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ++row) { + if (pn[H1(row)] & 1) { + uint64_t zn_row = zn[row]; + for (col = 0; col < oprsz; ++col) { + if (pm[H1(col)] & 1) { + zda[tile_vslice_index(row) + col] += zn_row; + } + } + } + } +} + +void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, void *vst, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + uint32_t neg = simd_data(desc) << 31; + 
uint16_t *pn = vpn, *pm = vpm; + float_status fpst; + + /* + * Make a copy of float_status because this operation does not + * update the cumulative fp exception status. It also produces + * default nans. + */ + fpst = *(float_status *)vst; + set_default_nan_mode(true, &fpst); + + for (row = 0; row < oprsz; ) { + uint16_t pa = pn[H2(row >> 4)]; + do { + if (pa & 1) { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg; + + for (col = 0; col < oprsz; ) { + uint16_t pb = pm[H2(col >> 4)]; + do { + if (pb & 1) { + uint32_t *a = vza_row + H1_4(col); + uint32_t *m = vzm + H1_4(col); + *a = float32_muladd(n, *m, *a, 0, vst); + } + col += 4; + pb >>= 4; + } while (col & 15); + } + } + row += 4; + pa >>= 4; + } while (row & 15); + } +} + +void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, void *vst, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + uint64_t neg = (uint64_t)simd_data(desc) << 63; + uint64_t *za = vza, *zn = vzn, *zm = vzm; + uint8_t *pn = vpn, *pm = vpm; + float_status fpst = *(float_status *)vst; + + set_default_nan_mode(true, &fpst); + + for (row = 0; row < oprsz; ++row) { + if (pn[H1(row)] & 1) { + uint64_t *za_row = &za[tile_vslice_index(row)]; + uint64_t n = zn[row] ^ neg; + + for (col = 0; col < oprsz; ++col) { + if (pm[H1(col)] & 1) { + uint64_t *a = &za_row[col]; + *a = float64_muladd(n, zm[col], *a, 0, &fpst); + } + } + } + } +} + +/* + * Alter PAIR as needed for controlling predicates being false, + * and for NEG on an enabled row element. + */ +static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg) +{ + /* + * The pseudocode uses a conditional negate after the conditional zero. + * It is simpler here to unconditionally negate before conditional zero. + */ + pair ^= neg; + if (!(pg & 1)) { + pair &= 0xffff0000u; + } + if (!(pg & 4)) { + pair &= 0x0000ffffu; + } + return pair; +} + +static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, + float_status *s_std, float_status *s_odd) +{ + float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std); + float64 e1c = float16_to_float64(e1 >> 16, true, s_std); + float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std); + float64 e2c = float16_to_float64(e2 >> 16, true, s_std); + float64 t64; + float32 t32; + + /* + * The ARM pseudocode function FPDot performs both multiplies + * and the add with a single rounding operation. Emulate this + * by performing the first multiply in round-to-odd, then doing + * the second multiply as fused multiply-add, and rounding to + * float32 all in one step. + */ + t64 = float64_mul(e1r, e2r, s_odd); + t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std); + + /* This conversion is exact, because we've already rounded. */ + t32 = float64_to_float32(t64, s_std); + + /* The final accumulation step is not fused. */ + return float32_add(sum, t32, s_std); +} + +void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, void *vst, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + uint32_t neg = simd_data(desc) * 0x80008000u; + uint16_t *pn = vpn, *pm = vpm; + float_status fpst_odd, fpst_std; + + /* + * Make a copy of float_status because this operation does not + * update the cumulative fp exception status. It also produces + * default nans. Make a second copy with round-to-odd -- see above. 
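+ * fpst_odd is used only for the first of the two multiplies inside
+ * f16_dotadd(); the fused multiply-add and the final accumulation
+ * use fpst_std.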
+ */ + fpst_std = *(float_status *)vst; + set_default_nan_mode(true, &fpst_std); + fpst_odd = fpst_std; + set_float_rounding_mode(float_round_to_odd, &fpst_odd); + + for (row = 0; row < oprsz; ) { + uint16_t prow = pn[H2(row >> 4)]; + do { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)); + + n = f16mop_adj_pair(n, prow, neg); + + for (col = 0; col < oprsz; ) { + uint16_t pcol = pm[H2(col >> 4)]; + do { + if (prow & pcol & 0b0101) { + uint32_t *a = vza_row + H1_4(col); + uint32_t m = *(uint32_t *)(vzm + H1_4(col)); + + m = f16mop_adj_pair(m, pcol, 0); + *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd); + + col += 4; + pcol >>= 4; + } + } while (col & 15); + } + row += 4; + prow >>= 4; + } while (row & 15); + } +} + +void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + uint32_t neg = simd_data(desc) * 0x80008000u; + uint16_t *pn = vpn, *pm = vpm; + + for (row = 0; row < oprsz; ) { + uint16_t prow = pn[H2(row >> 4)]; + do { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)); + + n = f16mop_adj_pair(n, prow, neg); + + for (col = 0; col < oprsz; ) { + uint16_t pcol = pm[H2(col >> 4)]; + do { + if (prow & pcol & 0b0101) { + uint32_t *a = vza_row + H1_4(col); + uint32_t m = *(uint32_t *)(vzm + H1_4(col)); + + m = f16mop_adj_pair(m, pcol, 0); + *a = bfdotadd(*a, n, m); + + col += 4; + pcol >>= 4; + } + } while (col & 15); + } + row += 4; + prow >>= 4; + } while (row & 15); + } +} + +typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool); + +static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm, + uint8_t *pn, uint8_t *pm, + uint32_t desc, IMOPFn *fn) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + bool neg = simd_data(desc); + + for (row = 0; row < oprsz; ++row) { + uint8_t pa = pn[H1(row)]; + uint64_t *za_row = &za[tile_vslice_index(row)]; + uint64_t n = zn[row]; + + for (col = 0; col < oprsz; ++col) { + uint8_t pb = pm[H1(col)]; + uint64_t *a = &za_row[col]; + + *a = fn(n, zm[col], *a, pa & pb, neg); + } + } +} + +#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \ +static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ +{ \ + uint32_t sum0 = 0, sum1 = 0; \ + /* Apply P to N as a mask, making the inactive elements 0. */ \ + n &= expand_pred_b(p); \ + sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \ + sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \ + sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \ + sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \ + if (neg) { \ + sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \ + } else { \ + sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \ + } \ + return ((uint64_t)sum1 << 32) | sum0; \ +} + +#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \ +static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ +{ \ + uint64_t sum = 0; \ + /* Apply P to N as a mask, making the inactive elements 0. */ \ + n &= expand_pred_h(p); \ + sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + return neg ? 
a - sum : a + sum; \ +} + +DEF_IMOP_32(smopa_s, int8_t, int8_t) +DEF_IMOP_32(umopa_s, uint8_t, uint8_t) +DEF_IMOP_32(sumopa_s, int8_t, uint8_t) +DEF_IMOP_32(usmopa_s, uint8_t, int8_t) + +DEF_IMOP_64(smopa_d, int16_t, int16_t) +DEF_IMOP_64(umopa_d, uint16_t, uint16_t) +DEF_IMOP_64(sumopa_d, int16_t, uint16_t) +DEF_IMOP_64(usmopa_d, uint16_t, int16_t) + +#define DEF_IMOPH(NAME) \ + void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \ + void *vpm, uint32_t desc) \ + { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); } + +DEF_IMOPH(smopa_s) +DEF_IMOPH(umopa_s) +DEF_IMOPH(sumopa_s) +DEF_IMOPH(usmopa_s) +DEF_IMOPH(smopa_d) +DEF_IMOPH(umopa_d) +DEF_IMOPH(sumopa_d) +DEF_IMOPH(usmopa_d) diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c new file mode 100644 index 0000000..521fc9b --- /dev/null +++ b/target/arm/tcg/sve_helper.c @@ -0,0 +1,7483 @@ +/* + * ARM SVE Operations + * + * Copyright (c) 2018 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "fpu/softfloat.h" +#include "tcg/tcg.h" +#include "vec_internal.h" +#include "sve_ldst_internal.h" + + +/* Return a value for NZCV as per the ARM PredTest pseudofunction. + * + * The return value has bit 31 set if N is set, bit 1 set if Z is clear, + * and bit 0 set if C is set. Compare the definitions of these variables + * within CPUARMState. + */ + +/* For no G bits set, NZCV = C. */ +#define PREDTEST_INIT 1 + +/* This is an iterative function, called for each Pd and Pg word + * moving forward. + */ +static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) +{ + if (likely(g)) { + /* Compute N from first D & G. + Use bit 2 to signal first G bit seen. */ + if (!(flags & 4)) { + flags |= ((d & (g & -g)) != 0) << 31; + flags |= 4; + } + + /* Accumulate Z from each D & G. */ + flags |= ((d & g) != 0) << 1; + + /* Compute C from last !(D & G). Replace previous. */ + flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); + } + return flags; +} + +/* This is an iterative function, called for each Pd and Pg word + * moving backward. + */ +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) +{ + if (likely(g)) { + /* Compute C from first (i.e last) !(D & G). + Use bit 2 to signal first G bit seen. */ + if (!(flags & 4)) { + flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ + flags |= (d & pow2floor(g)) == 0; + } + + /* Accumulate Z from each D & G. */ + flags |= ((d & g) != 0) << 1; + + /* Compute N from last (i.e first) D & G. Replace previous. */ + flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); + } + return flags; +} + +/* The same for a single word predicate. 
*/ +uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) +{ + return iter_predtest_fwd(d, g, PREDTEST_INIT); +} + +/* The same for a multi-word predicate. */ +uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) +{ + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg; + uintptr_t i = 0; + + do { + flags = iter_predtest_fwd(d[i], g[i], flags); + } while (++i < words); + + return flags; +} + +/* Similarly for single word elements. */ +static inline uint64_t expand_pred_s(uint8_t byte) +{ + static const uint64_t word[] = { + [0x01] = 0x00000000ffffffffull, + [0x10] = 0xffffffff00000000ull, + [0x11] = 0xffffffffffffffffull, + }; + return word[byte & 0x11]; +} + +#define LOGICAL_PPPP(NAME, FUNC) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + uintptr_t opr_sz = simd_oprsz(desc); \ + uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ + uintptr_t i; \ + for (i = 0; i < opr_sz / 8; ++i) { \ + d[i] = FUNC(n[i], m[i], g[i]); \ + } \ +} + +#define DO_AND(N, M, G) (((N) & (M)) & (G)) +#define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) +#define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) +#define DO_ORR(N, M, G) (((N) | (M)) & (G)) +#define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) +#define DO_NOR(N, M, G) (~((N) | (M)) & (G)) +#define DO_NAND(N, M, G) (~((N) & (M)) & (G)) +#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) + +LOGICAL_PPPP(sve_and_pppp, DO_AND) +LOGICAL_PPPP(sve_bic_pppp, DO_BIC) +LOGICAL_PPPP(sve_eor_pppp, DO_EOR) +LOGICAL_PPPP(sve_sel_pppp, DO_SEL) +LOGICAL_PPPP(sve_orr_pppp, DO_ORR) +LOGICAL_PPPP(sve_orn_pppp, DO_ORN) +LOGICAL_PPPP(sve_nor_pppp, DO_NOR) +LOGICAL_PPPP(sve_nand_pppp, DO_NAND) + +#undef DO_AND +#undef DO_BIC +#undef DO_EOR +#undef DO_ORR +#undef DO_ORN +#undef DO_NOR +#undef DO_NAND +#undef DO_SEL +#undef LOGICAL_PPPP + +/* Fully general three-operand expander, controlled by a predicate. + * This is complicated by the host-endian storage of the register file. + */ +/* ??? I don't expect the compiler could ever vectorize this itself. + * With some tables we can convert bit masks to byte masks, and with + * extra care wrt byte/word ordering we could use gcc generic vectors + * and do 16 bytes at a time. + */ +#define DO_ZPZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i], mm = m[i]; \ + d[i] = OP(nn, mm); \ + } \ + } \ +} + +#define DO_AND(N, M) (N & M) +#define DO_EOR(N, M) (N ^ M) +#define DO_ORR(N, M) (N | M) +#define DO_BIC(N, M) (N & ~M) +#define DO_ADD(N, M) (N + M) +#define DO_SUB(N, M) (N - M) +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) +#define DO_ABD(N, M) ((N) >= (M) ? 
(N) - (M) : (M) - (N)) +#define DO_MUL(N, M) (N * M) + + +/* + * We must avoid the C undefined behaviour cases: division by + * zero and signed division of INT_MIN by -1. Both of these + * have architecturally defined required results for Arm. + * We special case all signed divisions by -1 to avoid having + * to deduce the minimum integer for the type involved. + */ +#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) +#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) + +DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) +DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) +DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) +DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) + +DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) +DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) +DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) +DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) + +DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) +DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) +DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) +DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) + +DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) +DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) +DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) +DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) + +DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) +DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) +DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) +DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) + +DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) +DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) +DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) +DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) + +DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) +DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) +DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) +DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) + +DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) +DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) +DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) +DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) + +DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) +DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) +DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) +DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) + +DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) +DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) +DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) +DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) + +DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) +DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) +DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) +DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) + +DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) +DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) +DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) +DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) + +/* Because the computation type is at least twice as large as required, + these work for both signed and unsigned source types. 
*/ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_s(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) +{ + uint64_t lo, hi; + muls64(&lo, &hi, n, m); + return hi; +} + +static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) +{ + uint64_t lo, hi; + mulu64(&lo, &hi, n, m); + return hi; +} + +DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) +DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) +DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) +DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) + +DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) +DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) +DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) +DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) + +DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) +DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) +DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) +DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) + +DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) +DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) + +DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) +DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) + +/* Note that all bits of the shift are significant + and not modulo the element size. */ +#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) +#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) +#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) + +DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) +DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) +DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) + +DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) +DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) +DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) + +DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) +DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) +DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) + +DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) +DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) +DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) + +static inline uint16_t do_sadalp_h(int16_t n, int16_t m) +{ + int8_t n1 = n, n2 = n >> 8; + return m + n1 + n2; +} + +static inline uint32_t do_sadalp_s(int32_t n, int32_t m) +{ + int16_t n1 = n, n2 = n >> 16; + return m + n1 + n2; +} + +static inline uint64_t do_sadalp_d(int64_t n, int64_t m) +{ + int32_t n1 = n, n2 = n >> 32; + return m + n1 + n2; +} + +DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) +DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) +DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) + +static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) +{ + uint8_t n1 = n, n2 = n >> 8; + return m + n1 + n2; +} + +static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) +{ + uint16_t n1 = n, n2 = n >> 16; + return m + n1 + n2; +} + +static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) +{ + uint32_t n1 = n, n2 = n >> 32; + return m + n1 + n2; +} + +DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) +DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) +DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) + +#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL) +#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) +#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) +#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) + +DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) +DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, 
H1_2, do_srshl_h) +DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) +DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) + +#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) +#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) +#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) +#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) + +DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) +DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) +DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) +DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) + +/* + * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. + * We pass in a pointer to a dummy saturation field to trigger + * the saturating arithmetic but discard the information about + * whether it has occurred. + */ +#define do_sqshl_b(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) +#define do_sqshl_h(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) +#define do_sqshl_s(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) +#define do_sqshl_d(n, m) \ + ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) + +DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) +DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) +DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) +DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) + +#define do_uqshl_b(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) +#define do_uqshl_h(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) +#define do_uqshl_s(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) +#define do_uqshl_d(n, m) \ + ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) + +DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) +DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) +DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) +DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) + +#define do_sqrshl_b(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) +#define do_sqrshl_h(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) +#define do_sqrshl_s(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) +#define do_sqrshl_d(n, m) \ + ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) + +DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) +DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) +DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) +DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) + +#undef do_sqrshl_d + +#define do_uqrshl_b(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) +#define do_uqrshl_h(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) +#define do_uqrshl_s(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) +#define do_uqrshl_d(n, m) \ + ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); }) + +DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) +DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) +DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) +DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) + +#undef do_uqrshl_d + +#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) +#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) + +DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) +DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 
+DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) +DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) + +DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) +DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) +DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) +DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) + +#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) +#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) + +DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) +DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) +DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) +DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) + +DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) +DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) +DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) +DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) + +#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) +#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) + +DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) +DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) +DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) +DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) + +DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) +DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) +DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) +DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) + +static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) +{ + return val >= max ? max : val <= min ? min : val; +} + +#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) +#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) +#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) + +static inline int64_t do_sqadd_d(int64_t n, int64_t m) +{ + int64_t r = n + m; + if (((r ^ n) & ~(n ^ m)) < 0) { + /* Signed overflow. */ + return r < 0 ? INT64_MAX : INT64_MIN; + } + return r; +} + +DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) +DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) +DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) +DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) + +#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) +#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) +#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) + +static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) +{ + uint64_t r = n + m; + return r < n ? UINT64_MAX : r; +} + +DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) +DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) +DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) +DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) + +#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) +#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) +#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) + +static inline int64_t do_sqsub_d(int64_t n, int64_t m) +{ + int64_t r = n - m; + if (((r ^ n) & (n ^ m)) < 0) { + /* Signed overflow. */ + return r < 0 ? 
INT64_MAX : INT64_MIN; + } + return r; +} + +DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) +DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) +DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) +DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) + +#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) +#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) +#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) + +static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) +{ + return n > m ? n - m : 0; +} + +DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) +DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) +DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) +DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) + +#define DO_SUQADD_B(n, m) \ + do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) +#define DO_SUQADD_H(n, m) \ + do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) +#define DO_SUQADD_S(n, m) \ + do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) + +static inline int64_t do_suqadd_d(int64_t n, uint64_t m) +{ + uint64_t r = n + m; + + if (n < 0) { + /* Note that m - abs(n) cannot underflow. */ + if (r > INT64_MAX) { + /* Result is either very large positive or negative. */ + if (m > -n) { + /* m > abs(n), so r is a very large positive. */ + return INT64_MAX; + } + /* Result is negative. */ + } + } else { + /* Both inputs are positive: check for overflow. */ + if (r < m || r > INT64_MAX) { + return INT64_MAX; + } + } + return r; +} + +DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) +DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) +DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) +DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) + +#define DO_USQADD_B(n, m) \ + do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) +#define DO_USQADD_H(n, m) \ + do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) +#define DO_USQADD_S(n, m) \ + do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) + +static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) +{ + uint64_t r = n + m; + + if (m < 0) { + return n < -m ? 0 : r; + } + return r < n ? UINT64_MAX : r; +} + +DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) +DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) +DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) +DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) + +#undef DO_ZPZZ +#undef DO_ZPZZ_D + +/* + * Three operand expander, operating on element pairs. + * If the slot I is even, the elements from from VN {I, I+1}. + * If the slot I is odd, the elements from from VM {I-1, I}. + * Load all of the input elements in each pair before overwriting output. + */ +#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPE n0 = *(TYPE *)(vn + H(i)); \ + TYPE m0 = *(TYPE *)(vm + H(i)); \ + TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(n0, n1); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(m0, m1); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. 
*/ +#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 2) { \ + TYPE n0 = n[i], n1 = n[i + 1]; \ + TYPE m0 = m[i], m1 = m[i + 1]; \ + if (pg[H1(i)] & 1) { \ + d[i] = OP(n0, n1); \ + } \ + if (pg[H1(i + 1)] & 1) { \ + d[i + 1] = OP(m0, m1); \ + } \ + } \ +} + +DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) +DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) +DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) +DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) + +DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) +DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) +DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) +DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) + +DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) +DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) +DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) +DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) + +DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) +DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) +DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) +DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) + +DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) +DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) +DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) +DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) + +#undef DO_ZPZZ_PAIR +#undef DO_ZPZZ_PAIR_D + +#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPE n0 = *(TYPE *)(vn + H(i)); \ + TYPE m0 = *(TYPE *)(vm + H(i)); \ + TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) + +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) + +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) + +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) + +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) + +#undef DO_ZPZZ_PAIR_FP + +/* Three-operand expander, controlled by a predicate, in which the + * third operand is "wide". 
That is, for D = N op M, the same 64-bit + * value of M is used with all of the narrower values of N. + */ +#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ + TYPEW mm = *(TYPEW *)(vm + i); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 7); \ + } \ +} + +DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) +DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) +DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) + +DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) +DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) +DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) + +DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) +DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) +DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) + +#undef DO_ZPZW + +/* Fully general two-operand expander, controlled by a predicate. + */ +#define DO_ZPZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i]; \ + d[i] = OP(nn); \ + } \ + } \ +} + +#define DO_CLS_B(N) (clrsb32(N) - 24) +#define DO_CLS_H(N) (clrsb32(N) - 16) + +DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) +DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) +DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) +DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) + +#define DO_CLZ_B(N) (clz32(N) - 24) +#define DO_CLZ_H(N) (clz32(N) - 16) + +DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) +DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) +DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) +DO_ZPZ_D(sve_clz_d, uint64_t, clz64) + +DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) +DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) +DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) +DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) + +#define DO_CNOT(N) (N == 0) + +DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) +DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) +DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) +DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) + +#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) + +DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) +DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) +DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) + +#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) + +DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) +DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) +DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) + +#define DO_NOT(N) (~N) + +DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) +DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) +DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) +DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) + +#define DO_SXTB(N) ((int8_t)N) +#define DO_SXTH(N) ((int16_t)N) +#define DO_SXTS(N) ((int32_t)N) +#define 
DO_UXTB(N) ((uint8_t)N) +#define DO_UXTH(N) ((uint16_t)N) +#define DO_UXTS(N) ((uint32_t)N) + +DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) +DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) +DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) +DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) +DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) +DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) + +DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) +DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) +DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) +DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) +DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) +DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) + +#define DO_ABS(N) (N < 0 ? -N : N) + +DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) +DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) +DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) +DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) + +#define DO_NEG(N) (-N) + +DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) +DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) +DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) +DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) + +DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) +DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) +DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) + +DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) +DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) + +DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) + +void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 2) { + if (pg[H1(i)] & 1) { + uint64_t n0 = n[i + 0]; + uint64_t n1 = n[i + 1]; + d[i + 0] = n1; + d[i + 1] = n0; + } + } +} + +DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) +DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) +DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) +DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) + +#define DO_SQABS(X) \ + ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ + x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) + +DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) +DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) +DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) +DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) + +#define DO_SQNEG(X) \ + ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ + x_ == min_ ? -min_ - 1 : -x_; }) + +DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) +DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) +DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) +DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) + +DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) +DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) + +/* Three-operand expander, unpredicated, in which the third operand is "wide". 
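+ *
+ * As with DO_ZPZW above, the same 64-bit element of Zm is applied to all
+ * of the narrower elements of Zn sharing that 64-bit slot.  For example,
+ * sve_lsl_zzw_b shifts each of the eight bytes in a slot left by that
+ * slot's 64-bit count, with counts of 8 or more producing 0 via DO_LSL.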
+ */ +#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + TYPEW mm = *(TYPEW *)(vm + i); \ + do { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + i += sizeof(TYPE); \ + } while (i & 7); \ + } \ +} + +DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) +DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) +DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) + +DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) +DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) +DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) + +DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) +DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) +DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) + +#undef DO_ZZW + +#undef DO_CLS_B +#undef DO_CLS_H +#undef DO_CLZ_B +#undef DO_CLZ_H +#undef DO_CNOT +#undef DO_FABS +#undef DO_FNEG +#undef DO_ABS +#undef DO_NEG +#undef DO_ZPZ +#undef DO_ZPZ_D + +/* + * Three-operand expander, unpredicated, in which the two inputs are + * selected from the top or bottom half of the wide column. + */ +#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ + } \ +} + +DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) +DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) +DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) + +DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) +DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) +DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) +DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) +DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) + +DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) +DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) +DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) +DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) +DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) + +DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) +DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) +DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) + +/* Note that the multiply cannot overflow, but the doubling can. 
*/ +static inline int16_t do_sqdmull_h(int16_t n, int16_t m) +{ + int16_t val = n * m; + return DO_SQADD_H(val, val); +} + +static inline int32_t do_sqdmull_s(int32_t n, int32_t m) +{ + int32_t val = n * m; + return DO_SQADD_S(val, val); +} + +static inline int64_t do_sqdmull_d(int64_t n, int64_t m) +{ + int64_t val = n * m; + return do_sqadd_d(val, val); +} + +DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) +DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) +DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) + +#undef DO_ZZZ_TB + +#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ + } \ +} + +DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) +DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) +DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) + +DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) +DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) +DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) + +#undef DO_ZZZ_WTB + +#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ + intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ + for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ + TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ + *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ + } \ +} + +DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) +DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) +DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) +DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) + +#undef DO_ZZZ_NTB + +#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ + TYPEW aa = *(TYPEW *)(va + HW(i)); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ + } \ +} + +DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) +DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) +DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) +DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) +DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) +DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 
+DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) + +DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) +DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) +DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) + +#define DO_NMUL(N, M) -(N * M) + +DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) +DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) +DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) + +DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) +DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) +DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) + +#undef DO_ZZZW_ACC + +#define DO_XTNB(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ + *(TYPE *)(vd + i) = nn; \ + } \ +} + +#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + *(TYPEN *)(vd + i + odd) = OP(nn); \ + } \ +} + +#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) +#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) +#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) + +DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) +DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) +DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) + +DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) +DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) +DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) + +#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) +#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) +#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) + +DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) +DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) +DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) + +DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) +DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) +DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) + +DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) +DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) +DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) + +DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) +DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) +DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) + +#undef DO_XTNB +#undef DO_XTNT + +void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); + uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t *a = va, *n = vn; + uint64_t *d = vd, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint32_t e1 = a[2 * i + H4(0)]; + uint32_t e2 = n[2 * i + sel] ^ inv; + uint64_t c = extract64(m[i], 32, 1); + /* Compute and store the entire 33-bit result at once. 
*/ + d[i] = c + e1 + e2; + } +} + +void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int sel = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint64_t *d = vd, *a = va, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; i += 2) { + Int128 e1 = int128_make64(a[i]); + Int128 e2 = int128_make64(n[i + sel] ^ inv); + Int128 c = int128_make64(m[i + 1] & 1); + Int128 r = int128_add(int128_add(e1, e2), c); + d[i + 0] = int128_getlo(r); + d[i + 1] = int128_gethi(r); + } +} + +#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ + TYPEW aa = *(TYPEW *)(va + HW(i)); \ + *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ + } \ +} + +DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, + do_sqdmull_h, DO_SQADD_H) +DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, + do_sqdmull_s, DO_SQADD_S) +DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, + do_sqdmull_d, do_sqadd_d) + +DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, + do_sqdmull_h, DO_SQSUB_H) +DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, + do_sqdmull_s, DO_SQSUB_S) +DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, + do_sqdmull_d, do_sqsub_d) + +#undef DO_SQDMLAL + +#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ + int rot = simd_data(desc); \ + int sel_a = rot & 1, sel_b = sel_a ^ 1; \ + bool sub_r = rot == 1 || rot == 2; \ + bool sub_i = rot >= 2; \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + for (i = 0; i < opr_sz; i += 2) { \ + TYPE elt1_a = n[H(i + sel_a)]; \ + TYPE elt2_a = m[H(i + sel_a)]; \ + TYPE elt2_b = m[H(i + sel_b)]; \ + d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ + d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ + } \ +} + +#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) + +DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) + +#define DO_SQRDMLAH_B(N, M, A, S) \ + do_sqrdmlah_b(N, M, A, S, true) +#define DO_SQRDMLAH_H(N, M, A, S) \ + ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) +#define DO_SQRDMLAH_S(N, M, A, S) \ + ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) +#define DO_SQRDMLAH_D(N, M, A, S) \ + do_sqrdmlah_d(N, M, A, S, true) + +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) + +#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ + int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ + int sel_a = rot & 1, sel_b = sel_a ^ 1; \ + bool sub_r = rot == 1 || rot == 2; \ + bool sub_i = rot >= 2; \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ + TYPE elt2_a = m[H(i + idx + sel_a)]; \ + TYPE elt2_b = m[H(i + idx + sel_b)]; \ + for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ + TYPE elt1_a = n[H(i + j + sel_a)]; \ + d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ + d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ + } \ + } \ +} + +DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) +DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) + +DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) +DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) + +#undef DO_CMLA +#undef DO_CMLA_FUNC +#undef DO_CMLA_IDX_FUNC +#undef DO_SQRDMLAH_B +#undef DO_SQRDMLAH_H +#undef DO_SQRDMLAH_S +#undef DO_SQRDMLAH_D + +/* Note N and M are 4 elements bundled into one unit. */ +static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, + int sel_a, int sel_b, int sub_i) +{ + for (int i = 0; i <= 1; i++) { + int32_t elt1_r = (int8_t)(n >> (16 * i)); + int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); + int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); + int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); + + a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; + } + return a; +} + +static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, + int sel_a, int sel_b, int sub_i) +{ + for (int i = 0; i <= 1; i++) { + int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); + int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); + int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); + int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); + + a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; + } + return a; +} + +void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int rot = simd_data(desc); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); + uint32_t *d = vd, *n = vn, *m = vm, *a = va; + + for (int e = 0; e < opr_sz / 4; e++) { + d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); + } +} + +void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int rot = simd_data(desc); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? -1 : 1); + uint64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (int e = 0; e < opr_sz / 8; e++) { + d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); + } +} + +void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int rot = extract32(desc, SIMD_DATA_SHIFT, 2); + int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? -1 : 1); + uint32_t *d = vd, *n = vn, *m = vm, *a = va; + + for (int seg = 0; seg < opr_sz / 4; seg += 4) { + uint32_t seg_m = m[seg + idx]; + for (int e = 0; e < 4; e++) { + d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], + sel_a, sel_b, sub_i); + } + } +} + +void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int seg, opr_sz = simd_oprsz(desc); + int rot = extract32(desc, SIMD_DATA_SHIFT, 2); + int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? -1 : 1); + uint64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (seg = 0; seg < opr_sz / 8; seg += 2) { + uint64_t seg_m = m[seg + idx]; + for (int e = 0; e < 2; e++) { + d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], + sel_a, sel_b, sub_i); + } + } +} + +#define DO_ZZXZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ + intptr_t i, j, idx = simd_data(desc); \ + TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[i]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = OP(n[i + j], mm, a[i + j]); \ + } \ + } \ +} + +#define DO_SQRDMLAH_H(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) +#define DO_SQRDMLAH_S(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) +#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) + +DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) +DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) +DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) + +#define DO_SQRDMLSH_H(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) +#define DO_SQRDMLSH_S(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) +#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) + +DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) +DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) +DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) + +#undef DO_ZZXZ + +#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ + for (i = 0; i < oprsz; i += 16) { \ + TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ + for (j = 0; 
j < 16; j += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ + TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ + *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ + } \ + } \ +} + +#define DO_MLA(N, M, A) (A + N * M) + +DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) +DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) +DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) +DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) + +#define DO_MLS(N, M, A) (A - N * M) + +DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) +DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) +DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) +DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) + +#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) +#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) + +DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) +DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) + +#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) +#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) + +DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) +DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) + +#undef DO_MLA +#undef DO_MLS +#undef DO_ZZXW + +#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ + for (i = 0; i < oprsz; i += 16) { \ + TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ + for (j = 0; j < 16; j += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ + *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ + } \ + } \ +} + +DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) +DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) + +DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) +DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) + +DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) +DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) + +#undef DO_ZZX + +#define DO_BITPERM(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + TYPE mm = *(TYPE *)(vm + i); \ + *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ + } \ +} + +static uint64_t bitextract(uint64_t data, uint64_t mask, int n) +{ + uint64_t res = 0; + int db, rb = 0; + + for (db = 0; db < n; ++db) { + if ((mask >> db) & 1) { + res |= ((data >> db) & 1) << rb; + ++rb; + } + } + return res; +} + +DO_BITPERM(sve2_bext_b, uint8_t, bitextract) +DO_BITPERM(sve2_bext_h, uint16_t, bitextract) +DO_BITPERM(sve2_bext_s, uint32_t, bitextract) +DO_BITPERM(sve2_bext_d, uint64_t, bitextract) + +static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) +{ + uint64_t res = 0; + int rb, db = 0; + + for (rb = 0; rb < n; ++rb) { + if ((mask >> rb) & 1) { + res |= ((data >> db) & 1) << rb; + ++db; + } + } + return res; +} + +DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) +DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) +DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) +DO_BITPERM(sve2_bdep_d, uint64_t, 
bitdeposit) + +static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) +{ + uint64_t resm = 0, resu = 0; + int db, rbm = 0, rbu = 0; + + for (db = 0; db < n; ++db) { + uint64_t val = (data >> db) & 1; + if ((mask >> db) & 1) { + resm |= val << rbm++; + } else { + resu |= val << rbu++; + } + } + + return resm | (resu << rbm); +} + +DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) +DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) +DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) +DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) + +#undef DO_BITPERM + +#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sub_r = simd_data(desc); \ + if (sub_r) { \ + for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ + TYPE acc_r = *(TYPE *)(vn + H(i)); \ + TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE el2_r = *(TYPE *)(vm + H(i)); \ + TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + acc_r = ADD_OP(acc_r, el2_i); \ + acc_i = SUB_OP(acc_i, el2_r); \ + *(TYPE *)(vd + H(i)) = acc_r; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ + } \ + } else { \ + for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ + TYPE acc_r = *(TYPE *)(vn + H(i)); \ + TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE el2_r = *(TYPE *)(vm + H(i)); \ + TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + acc_r = SUB_OP(acc_r, el2_i); \ + acc_i = ADD_OP(acc_i, el2_r); \ + *(TYPE *)(vd + H(i)) = acc_r; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ + } \ + } \ +} + +DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) + +DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) +DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) +DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) +DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) + +#undef DO_CADD + +#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ + int shift = simd_data(desc) >> 1; \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ + *(TYPEW *)(vd + HW(i)) = nn << shift; \ + } \ +} + +DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) +DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) +DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) + +DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) +DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) +DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) + +#undef DO_ZZI_SHLL + +/* Two-operand reduction expander, controlled by a predicate. + * The difference between TYPERED and TYPERET has to do with + * sign-extension. E.g. for SMAX, TYPERED must be signed, + * but TYPERET must be unsigned so that e.g. a 32-bit value + * is not sign-extended to the ABI uint64_t return type. + */ +/* ??? If we were to vectorize this by hand the reduction ordering + * would change. For integer operands, this is perfectly fine. 
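+ * All of the integer OPs used below (ORR, EOR, AND, ADD, MAX, MIN) are
+ * associative and commutative, so any evaluation order yields the same
+ * result.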
+ */ +#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ +uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPERED ret = INIT; \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ + ret = OP(ret, nn); \ + } \ + i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ + } while (i & 15); \ + } \ + return (TYPERET)ret; \ +} + +#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ +uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPEE *n = vn; \ + uint8_t *pg = vg; \ + TYPER ret = INIT; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPEE nn = n[i]; \ + ret = OP(ret, nn); \ + } \ + } \ + return ret; \ +} + +DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) +DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) +DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) +DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) + +DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) +DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) +DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) +DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) + +DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) +DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) +DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) +DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) + +DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) +DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) +DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) + +DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) +DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) +DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) +DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) + +DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) +DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) +DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) +DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) + +DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) +DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) +DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) +DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) + +DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) +DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) +DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) +DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) + +DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) +DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) +DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) +DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) + +#undef DO_VPZ +#undef DO_VPZ_D + +/* Two vector operand, one scalar operand, unpredicated. 
*/ +#define DO_ZZI(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ + TYPE s = s64, *d = vd, *n = vn; \ + for (i = 0; i < opr_sz; ++i) { \ + d[i] = OP(n[i], s); \ + } \ +} + +#define DO_SUBR(X, Y) (Y - X) + +DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) +DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) +DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) +DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) + +DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) +DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) +DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) +DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) + +DO_ZZI(sve_smini_b, int8_t, DO_MIN) +DO_ZZI(sve_smini_h, int16_t, DO_MIN) +DO_ZZI(sve_smini_s, int32_t, DO_MIN) +DO_ZZI(sve_smini_d, int64_t, DO_MIN) + +DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) +DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) +DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) +DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) + +DO_ZZI(sve_umini_b, uint8_t, DO_MIN) +DO_ZZI(sve_umini_h, uint16_t, DO_MIN) +DO_ZZI(sve_umini_s, uint32_t, DO_MIN) +DO_ZZI(sve_umini_d, uint64_t, DO_MIN) + +#undef DO_ZZI + +#undef DO_AND +#undef DO_ORR +#undef DO_EOR +#undef DO_BIC +#undef DO_ADD +#undef DO_SUB +#undef DO_MAX +#undef DO_MIN +#undef DO_ABD +#undef DO_MUL +#undef DO_DIV +#undef DO_ASR +#undef DO_LSR +#undef DO_LSL +#undef DO_SUBR + +/* Similar to the ARM LastActiveElement pseudocode function, except the + result is multiplied by the element size. This includes the not found + indication; e.g. not found for esz=3 is -8. */ +static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) +{ + uint64_t mask = pred_esz_masks[esz]; + intptr_t i = words; + + do { + uint64_t this_g = g[--i] & mask; + if (this_g) { + return i * 64 + (63 - clz64(this_g)); + } + } while (i > 0); + return (intptr_t)-1 << esz; +} + +uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg; + intptr_t i = 0; + + do { + uint64_t this_d = d[i]; + uint64_t this_g = g[i]; + + if (this_g) { + if (!(flags & 4)) { + /* Set in D the first bit of G. */ + this_d |= this_g & -this_g; + d[i] = this_d; + } + flags = iter_predtest_fwd(this_d, this_g, flags); + } + } while (++i < words); + + return flags; +} + +uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg, esz_mask; + intptr_t i, next; + + next = last_active_element(vd, words, esz) + (1 << esz); + esz_mask = pred_esz_masks[esz]; + + /* Similar to the pseudocode for pnext, but scaled by ESZ + so that we find the correct bit. */ + if (next < words * 64) { + uint64_t mask = -1; + + if (next & 63) { + mask = ~((1ull << (next & 63)) - 1); + next &= -64; + } + do { + uint64_t this_g = g[next / 64] & esz_mask & mask; + if (this_g != 0) { + next = (next & -64) + ctz64(this_g); + break; + } + next += 64; + mask = -1; + } while (next < words * 64); + } + + i = 0; + do { + uint64_t this_d = 0; + if (i == next / 64) { + this_d = 1ull << (next & 63); + } + d[i] = this_d; + flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); + } while (++i < words); + + return flags; +} + +/* + * Copy Zn into Zd, and store zero into inactive elements. + * If inv, store zeros into the active elements. 
+ */ +void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t inv = -(uint64_t)(simd_data(desc) & 1); + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); + } +} + +void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t inv = -(uint64_t)(simd_data(desc) & 1); + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); + } +} + +void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t inv = -(uint64_t)(simd_data(desc) & 1); + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); + } +} + +void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + uint8_t inv = simd_data(desc); + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); + } +} + +/* Three-operand expander, immediate operand, controlled by a predicate. + */ +#define DO_ZPZI(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPE imm = simd_data(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, imm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZI_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn; \ + TYPE imm = simd_data(desc); \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i]; \ + d[i] = OP(nn, imm); \ + } \ + } \ +} + +#define DO_SHR(N, M) (N >> M) +#define DO_SHL(N, M) (N << M) + +/* Arithmetic shift right for division. This rounds negative numbers + toward zero as per signed division. Therefore before shifting, + when N is negative, add 2**M-1. */ +#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) + +static inline uint64_t do_urshr(uint64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else if (sh == 64) { + return x >> 63; + } else { + return 0; + } +} + +static inline int64_t do_srshr(int64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else { + /* Rounding the sign bit always produces 0. 
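+         * The truncated quotient is 0 or -1 and the rounding increment
+         * equals the sign bit, so their sum is always 0.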
*/ + return 0; + } +} + +DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) +DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) +DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) +DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) + +DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) +DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) +DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) +DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) + +DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) +DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) +DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) +DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) + +DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) +DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) +DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) +DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) + +/* SVE2 bitwise shift by immediate */ +DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) +DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) +DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) +DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) + +DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) +DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) +DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) +DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) + +DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) +DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) +DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) +DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) + +DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) +DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) +DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) +DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) + +#define do_suqrshl_b(n, m) \ + ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) +#define do_suqrshl_h(n, m) \ + ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) +#define do_suqrshl_s(n, m) \ + ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) +#define do_suqrshl_d(n, m) \ + ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) + +DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) +DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) +DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) +DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) + +#undef DO_ASRD +#undef DO_ZPZI +#undef DO_ZPZI_D + +#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + i); \ + *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ + } \ +} + +#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ + } \ +} + +DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) +DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) +DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) + +DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) +DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) +DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) + +DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) +DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) +DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) + +DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 
+DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) +DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) + +#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) +#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) +#define DO_SQSHRUN_D(x, sh) \ + do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX) + +DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) +DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) +DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) + +DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) +DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) +DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) + +#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) +#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) +#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) + +DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) +DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) +DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) + +DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) +DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) +DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) + +#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) +#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) +#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) + +DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) +DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) +DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) + +DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) +DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) +DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) + +#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) +#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) +#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) + +DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) +DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) +DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) + +DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) +DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) +DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) + +#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) +#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) +#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) + +DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) +DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) +DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) + +DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) +DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) +DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) + +#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) +#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) +#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) + +DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) +DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) +DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) + 
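For reference, the per-lane arithmetic behind the UQRSHRN{B,T} expansions in this hunk can be exercised in isolation. The following is a minimal standalone sketch, not part of the patch itself; the helper name urshr and the sample operand are illustrative and simply mirror do_urshr and DO_UQRSHRN_H above.

#include <stdint.h>
#include <stdio.h>

/* Rounding right shift, same rule as do_urshr: add back the last bit shifted out. */
static uint64_t urshr(uint64_t x, unsigned sh)
{
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    }
    return sh == 64 ? x >> 63 : 0;
}

int main(void)
{
    uint16_t wide = 0x01ff;     /* one 16-bit source element */
    unsigned shift = 4;

    /* Round, shift, then saturate to the narrow element's maximum. */
    uint64_t rounded = urshr(wide, shift);       /* 0x1f + 1 = 0x20 */
    uint8_t narrow = rounded < UINT8_MAX ? rounded : UINT8_MAX;

    printf("uqrshrn(0x%04x, #%u) = 0x%02x\n", (unsigned)wide, shift, (unsigned)narrow);
    return 0;
}

In the macros themselves, the B form stores the narrowed value across the whole wide element, so the odd narrow lane is zeroed, while the T form writes only the odd narrow lane and leaves the even lane of the destination untouched.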
+DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) +DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) +DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) + +#undef DO_SHRNB +#undef DO_SHRNT + +#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + i); \ + TYPEW mm = *(TYPEW *)(vm + i); \ + *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ + } \ +} + +#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + TYPEW mm = *(TYPEW *)(vm + HW(i)); \ + *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ + } \ +} + +#define DO_ADDHN(N, M, SH) ((N + M) >> SH) +#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) +#define DO_SUBHN(N, M, SH) ((N - M) >> SH) +#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) + +DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) +DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) +DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) + +DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) +DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) +DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) + +DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) +DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) +DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) + +DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) +DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) +DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) + +DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) +DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) +DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) + +DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) +DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) +DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) + +DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) +DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) +DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) + +DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) +DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) +DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) + +#undef DO_RSUBHN +#undef DO_SUBHN +#undef DO_RADDHN +#undef DO_ADDHN + +#undef DO_BINOPNB + +/* Fully general four-operand expander, controlled by a predicate. 
+ */ +#define DO_ZPZZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ + void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + TYPE aa = *(TYPE *)(va + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZZZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ + void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *a = va, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE aa = a[i], nn = n[i], mm = m[i]; \ + d[i] = OP(aa, nn, mm); \ + } \ + } \ +} + +#define DO_MLA(A, N, M) (A + N * M) +#define DO_MLS(A, N, M) (A - N * M) + +DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) +DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) + +DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) +DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) + +DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) +DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) + +DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) +DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) + +#undef DO_MLA +#undef DO_MLS +#undef DO_ZPZZZ +#undef DO_ZPZZZ_D + +void HELPER(sve_index_b)(void *vd, uint32_t start, + uint32_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint8_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[H1(i)] = start + i * incr; + } +} + +void HELPER(sve_index_h)(void *vd, uint32_t start, + uint32_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 2; + uint16_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[H2(i)] = start + i * incr; + } +} + +void HELPER(sve_index_s)(void *vd, uint32_t start, + uint32_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[H4(i)] = start + i * incr; + } +} + +void HELPER(sve_index_d)(void *vd, uint64_t start, + uint64_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[i] = start + i * incr; + } +} + +void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t sh = simd_data(desc); + uint32_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + (m[i] << sh); + } +} + +void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t sh = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + (m[i] << sh); + } +} + +void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t sh = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); + } +} + +void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t sh = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); + } +} + +void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) +{ + /* These 
constants are cut-and-paste directly from the ARM pseudocode. */ + static const uint16_t coeff[] = { + 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, + 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, + 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, + 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / 2; + uint16_t *d = vd, *n = vn; + + for (i = 0; i < opr_sz; i++) { + uint16_t nn = n[i]; + intptr_t idx = extract32(nn, 0, 5); + uint16_t exp = extract32(nn, 5, 5); + d[i] = coeff[idx] | (exp << 10); + } +} + +void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) +{ + /* These constants are cut-and-paste directly from the ARM pseudocode. */ + static const uint32_t coeff[] = { + 0x000000, 0x0164d2, 0x02cd87, 0x043a29, + 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, + 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, + 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, + 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, + 0x1ef532, 0x20b051, 0x227043, 0x243516, + 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, + 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, + 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, + 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, + 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, + 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, + 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, + 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, + 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, + 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn; + + for (i = 0; i < opr_sz; i++) { + uint32_t nn = n[i]; + intptr_t idx = extract32(nn, 0, 6); + uint32_t exp = extract32(nn, 6, 8); + d[i] = coeff[idx] | (exp << 23); + } +} + +void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) +{ + /* These constants are cut-and-paste directly from the ARM pseudocode. 
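+     * Each entry is the 52-bit fraction field of 2**(i/64); the loop
+     * below ORs the exponent field taken from the source element on top.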
*/ + static const uint64_t coeff[] = { + 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, + 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, + 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, + 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, + 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, + 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, + 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, + 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, + 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, + 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, + 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, + 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, + 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, + 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, + 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, + 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, + 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, + 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, + 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, + 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, + 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, + 0xFA7C1819E90D8ull, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + + for (i = 0; i < opr_sz; i++) { + uint64_t nn = n[i]; + intptr_t idx = extract32(nn, 0, 6); + uint64_t exp = extract32(nn, 6, 11); + d[i] = coeff[idx] | (exp << 52); + } +} + +void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 2; + uint16_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + uint16_t nn = n[i]; + uint16_t mm = m[i]; + if (mm & 1) { + nn = float16_one; + } + d[i] = nn ^ (mm & 2) << 14; + } +} + +void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + uint32_t nn = n[i]; + uint32_t mm = m[i]; + if (mm & 1) { + nn = float32_one; + } + d[i] = nn ^ (mm & 2) << 30; + } +} + +void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t mm = m[i]; + if (mm & 1) { + nn = float64_one; + } + d[i] = nn ^ (mm & 2) << 62; + } +} + +/* + * Signed saturating addition with scalar operand. 
+ */ + +void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int8_t)) { + *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); + } +} + +void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int16_t)) { + *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); + } +} + +void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int32_t)) { + *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); + } +} + +void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int64_t)) { + *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); + } +} + +/* + * Unsigned saturating addition with scalar operand. + */ + +void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); + } +} + +void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { + *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); + } +} + +void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); + } +} + +void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); + } +} + +void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); + } +} + +/* Two operand predicated copy immediate with merge. All valid immediates + * can fit within 17 signed bits in the simd_data field. 
+ */ +void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + mm = dup_const(MO_8, mm); + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t pp = expand_pred_b(pg[H1(i)]); + d[i] = (mm & pp) | (nn & ~pp); + } +} + +void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + mm = dup_const(MO_16, mm); + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t pp = expand_pred_h(pg[H1(i)]); + d[i] = (mm & pp) | (nn & ~pp); + } +} + +void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + mm = dup_const(MO_32, mm); + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t pp = expand_pred_s(pg[H1(i)]); + d[i] = (mm & pp) | (nn & ~pp); + } +} + +void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + d[i] = (pg[H1(i)] & 1 ? mm : nn); + } +} + +void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + val = dup_const(MO_8, val); + for (i = 0; i < opr_sz; i += 1) { + d[i] = val & expand_pred_b(pg[H1(i)]); + } +} + +void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + val = dup_const(MO_16, val); + for (i = 0; i < opr_sz; i += 1) { + d[i] = val & expand_pred_h(pg[H1(i)]); + } +} + +void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + val = dup_const(MO_32, val); + for (i = 0; i < opr_sz; i += 1) { + d[i] = val & expand_pred_s(pg[H1(i)]); + } +} + +void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = (pg[H1(i)] & 1 ? val : 0); + } +} + +/* Big-endian hosts need to frob the byte indices. If the copy + * happens to be 8-byte aligned, then no frobbing necessary. 
+ */ +static void swap_memmove(void *vd, void *vs, size_t n) +{ + uintptr_t d = (uintptr_t)vd; + uintptr_t s = (uintptr_t)vs; + uintptr_t o = (d | s | n) & 7; + size_t i; + +#if !HOST_BIG_ENDIAN + o = 0; +#endif + switch (o) { + case 0: + memmove(vd, vs, n); + break; + + case 4: + if (d < s || d >= s + n) { + for (i = 0; i < n; i += 4) { + *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); + } + } else { + for (i = n; i > 0; ) { + i -= 4; + *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); + } + } + break; + + case 2: + case 6: + if (d < s || d >= s + n) { + for (i = 0; i < n; i += 2) { + *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); + } + } else { + for (i = n; i > 0; ) { + i -= 2; + *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); + } + } + break; + + default: + if (d < s || d >= s + n) { + for (i = 0; i < n; i++) { + *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); + } + } else { + for (i = n; i > 0; ) { + i -= 1; + *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); + } + } + break; + } +} + +/* Similarly for memset of 0. */ +static void swap_memzero(void *vd, size_t n) +{ + uintptr_t d = (uintptr_t)vd; + uintptr_t o = (d | n) & 7; + size_t i; + + /* Usually, the first bit of a predicate is set, so N is 0. */ + if (likely(n == 0)) { + return; + } + +#if !HOST_BIG_ENDIAN + o = 0; +#endif + switch (o) { + case 0: + memset(vd, 0, n); + break; + + case 4: + for (i = 0; i < n; i += 4) { + *(uint32_t *)H1_4(d + i) = 0; + } + break; + + case 2: + case 6: + for (i = 0; i < n; i += 2) { + *(uint16_t *)H1_2(d + i) = 0; + } + break; + + default: + for (i = 0; i < n; i++) { + *(uint8_t *)H1(d + i) = 0; + } + break; + } +} + +void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t opr_sz = simd_oprsz(desc); + size_t n_ofs = simd_data(desc); + size_t n_siz = opr_sz - n_ofs; + + if (vd != vm) { + swap_memmove(vd, vn + n_ofs, n_siz); + swap_memmove(vd + n_siz, vm, n_ofs); + } else if (vd != vn) { + swap_memmove(vd + n_siz, vd, n_ofs); + swap_memmove(vd, vn + n_ofs, n_siz); + } else { + /* vd == vn == vm. Need temp space. 
*/ + ARMVectorReg tmp; + swap_memmove(&tmp, vm, n_ofs); + swap_memmove(vd, vd + n_ofs, n_siz); + memcpy(vd + n_siz, &tmp, n_ofs); + } +} + +#define DO_INSR(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ + *(TYPE *)(vd + H(0)) = val; \ +} + +DO_INSR(sve_insr_b, uint8_t, H1) +DO_INSR(sve_insr_h, uint16_t, H1_2) +DO_INSR(sve_insr_s, uint32_t, H1_4) +DO_INSR(sve_insr_d, uint64_t, H1_8) + +#undef DO_INSR + +void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = bswap64(b); + *(uint64_t *)(vd + j) = bswap64(f); + } +} + +void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = hswap64(b); + *(uint64_t *)(vd + j) = hswap64(f); + } +} + +void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = rol64(b, 32); + *(uint64_t *)(vd + j) = rol64(f, 32); + } +} + +void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = b; + *(uint64_t *)(vd + j) = f; + } +} + +typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); + +static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, + bool is_tbx, tb_impl_fn *fn) +{ + ARMVectorReg scratch; + uintptr_t oprsz = simd_oprsz(desc); + + if (unlikely(vd == vn)) { + vn = memcpy(&scratch, vn, oprsz); + } + + fn(vd, vn, NULL, vm, oprsz, is_tbx); +} + +static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, + uint32_t desc, bool is_tbx, tb_impl_fn *fn) +{ + ARMVectorReg scratch; + uintptr_t oprsz = simd_oprsz(desc); + + if (unlikely(vd == vn0)) { + vn0 = memcpy(&scratch, vn0, oprsz); + if (vd == vn1) { + vn1 = vn0; + } + } else if (unlikely(vd == vn1)) { + vn1 = memcpy(&scratch, vn1, oprsz); + } + + fn(vd, vn0, vn1, vm, oprsz, is_tbx); +} + +#define DO_TB(SUFF, TYPE, H) \ +static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ + void *vm, uintptr_t oprsz, bool is_tbx) \ +{ \ + TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ + uintptr_t i, nelem = oprsz / sizeof(TYPE); \ + for (i = 0; i < nelem; ++i) { \ + TYPE index = indexes[H1(i)], val = 0; \ + if (index < nelem) { \ + val = tbl0[H(index)]; \ + } else { \ + index -= nelem; \ + if (tbl1 && index < nelem) { \ + val = tbl1[H(index)]; \ + } else if (is_tbx) { \ + continue; \ + } \ + } \ + d[H(i)] = val; \ + } \ +} \ +void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ +} \ +void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ + void *vm, uint32_t desc) \ +{ \ + do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ +} \ +void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, 
uint32_t desc) \ +{ \ + do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ +} + +DO_TB(b, uint8_t, H1) +DO_TB(h, uint16_t, H2) +DO_TB(s, uint32_t, H4) +DO_TB(d, uint64_t, H8) + +#undef DO_TB + +#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd; \ + TYPES *n = vn; \ + ARMVectorReg tmp; \ + if (unlikely(vn - vd < opr_sz)) { \ + n = memcpy(&tmp, n, opr_sz / 2); \ + } \ + for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ + d[HD(i)] = n[HS(i)]; \ + } \ +} + +DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) +DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) +DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) + +DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) +DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) +DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) + +#undef DO_UNPK + +/* Mask of bits included in the even numbered predicates of width esz. + * We also use this for expand_bits/compress_bits, and so extend the + * same pattern out to 16-bit units. + */ +static const uint64_t even_bit_esz_masks[5] = { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0f0f0f0f0f0f0f0full, + 0x00ff00ff00ff00ffull, + 0x0000ffff0000ffffull, +}; + +/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_shuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits. + */ +static uint64_t expand_bits(uint64_t x, int n) +{ + int i; + + x &= 0xffffffffu; + for (i = 4; i >= n; i--) { + int sh = 1 << i; + x = ((x << sh) | x) & even_bit_esz_masks[i]; + } + return x; +} + +/* Compress units of 2**(N+1) bits to units of 2**N bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_unshuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. + */ +static uint64_t compress_bits(uint64_t x, int n) +{ + int i; + + for (i = n; i <= 4; i++) { + int sh = 1 << i; + x &= even_bit_esz_masks[i]; + x = (x >> sh) | x; + } + return x & 0xffffffffu; +} + +void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); + int esize = 1 << esz; + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + uint64_t mm = *(uint64_t *)vm; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + mm = extract64(mm, high * half, half); + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[0] = nn | (mm << esize); + } else { + ARMPredicateReg tmp; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. 
*/ + if (vd == vn) { + vn = memcpy(&tmp, vn, oprsz); + if (vd == vm) { + vm = vn; + } + } else if (vd == vm) { + vm = memcpy(&tmp, vm, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((oprsz & 7) == 0) { + uint32_t *n = vn, *m = vm; + high >>= 2; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[H4(high + i)]; + uint64_t mm = m[H4(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[i] = nn | (mm << esize); + } + } else { + uint8_t *n = vn, *m = vm; + uint16_t *d16 = vd; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + uint16_t mm = m[H1(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d16[H2(i)] = nn | (mm << esize); + } + } + } +} + +void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t l, h; + intptr_t i; + + if (oprsz <= 8) { + l = compress_bits(n[0] >> odd, esz); + h = compress_bits(m[0] >> odd, esz); + d[0] = l | (h << (4 * oprsz)); + } else { + ARMPredicateReg tmp_m; + intptr_t oprsz_16 = oprsz / 16; + + if ((vm - vd) < (uintptr_t)oprsz) { + m = memcpy(&tmp_m, vm, oprsz); + } + + for (i = 0; i < oprsz_16; i++) { + l = n[2 * i + 0]; + h = n[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[i] = l | (h << 32); + } + + /* + * For VL which is not a multiple of 512, the results from M do not + * align nicely with the uint64_t for D. Put the aligned results + * from M into TMP_M and then copy it into place afterward. + */ + if (oprsz & 15) { + int final_shift = (oprsz & 15) * 2; + + l = n[2 * i + 0]; + h = n[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[i] = l | (h << final_shift); + + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + tmp_m.p[i] = l | (h << 32); + } + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + tmp_m.p[i] = l | (h << final_shift); + + swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); + } else { + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[oprsz_16 + i] = l | (h << 32); + } + } + } +} + +void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t mask; + int shr, shl; + intptr_t i; + + shl = 1 << esz; + shr = 0; + mask = even_bit_esz_masks[esz]; + if (odd) { + mask <<= shl; + shr = shl; + shl = 0; + } + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = (n[i] & mask) >> shr; + uint64_t mm = (m[i] & mask) << shl; + d[i] = nn + mm; + } +} + +/* Reverse units of 2**N bits. 
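+ * E.g. n == 0 reverses every bit individually, while n == 3 in
+ * reverse_bits_64 below reduces to a plain byte swap, since the loop
+ * never runs after the initial bswap64.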
*/ +static uint64_t reverse_bits_64(uint64_t x, int n) +{ + int i, sh; + + x = bswap64(x); + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + uint64_t mask = even_bit_esz_masks[i]; + x = ((x & mask) << sh) | ((x >> sh) & mask); + } + return x; +} + +static uint8_t reverse_bits_8(uint8_t x, int n) +{ + static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; + int i, sh; + + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); + } + return x; +} + +void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + intptr_t i, oprsz_2 = oprsz / 2; + + if (oprsz <= 8) { + uint64_t l = *(uint64_t *)vn; + l = reverse_bits_64(l << (64 - 8 * oprsz), esz); + *(uint64_t *)vd = l; + } else if ((oprsz & 15) == 0) { + for (i = 0; i < oprsz_2; i += 8) { + intptr_t ih = oprsz - 8 - i; + uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); + uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); + *(uint64_t *)(vd + i) = h; + *(uint64_t *)(vd + ih) = l; + } + } else { + for (i = 0; i < oprsz_2; i += 1) { + intptr_t il = H1(i); + intptr_t ih = H1(oprsz - 1 - i); + uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); + uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); + *(uint8_t *)(vd + il) = h; + *(uint8_t *)(vd + ih) = l; + } + } +} + +void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + nn = expand_bits(nn, 0); + d[0] = nn; + } else { + ARMPredicateReg tmp_n; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. */ + if ((vn - vd) < (uintptr_t)oprsz) { + vn = memcpy(&tmp_n, vn, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((oprsz & 7) == 0) { + uint32_t *n = vn; + high >>= 2; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[H4(high + i)]; + d[i] = expand_bits(nn, 0); + } + } else { + uint16_t *d16 = vd; + uint8_t *n = vn; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + d16[H2(i)] = expand_bits(nn, 0); + } + } + } +} + +#define DO_ZIP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i, oprsz_2 = oprsz / 2; \ + ARMVectorReg tmp_n, tmp_m; \ + /* We produce output faster than we consume input. \ + Therefore we must be mindful of possible overlap. 
*/ \ + if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ + vn = memcpy(&tmp_n, vn, oprsz); \ + } \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ + *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ + *(TYPE *)(vm + odd_ofs + H(i)); \ + } \ + if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ + memset(vd + oprsz - 16, 0, 16); \ + } \ +} + +DO_ZIP(sve_zip_b, uint8_t, H1) +DO_ZIP(sve_zip_h, uint16_t, H1_2) +DO_ZIP(sve_zip_s, uint32_t, H1_4) +DO_ZIP(sve_zip_d, uint64_t, H1_8) +DO_ZIP(sve2_zip_q, Int128, ) + +#define DO_UZP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i, p; \ + ARMVectorReg tmp_m; \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz); \ + } \ + i = 0, p = odd_ofs; \ + do { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ + i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ + } while (p < oprsz); \ + p -= oprsz; \ + do { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ + i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ + } while (p < oprsz); \ + tcg_debug_assert(i == oprsz); \ +} + +DO_UZP(sve_uzp_b, uint8_t, H1) +DO_UZP(sve_uzp_h, uint16_t, H1_2) +DO_UZP(sve_uzp_s, uint32_t, H1_4) +DO_UZP(sve_uzp_d, uint64_t, H1_8) +DO_UZP(sve2_uzp_q, Int128, ) + +#define DO_TRN(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i; \ + for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ + TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ + TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ + *(TYPE *)(vd + H(i + 0)) = ae; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ + } \ + if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ + memset(vd + oprsz - 16, 0, 16); \ + } \ +} + +DO_TRN(sve_trn_b, uint8_t, H1) +DO_TRN(sve_trn_h, uint16_t, H1_2) +DO_TRN(sve_trn_s, uint32_t, H1_4) +DO_TRN(sve_trn_d, uint64_t, H1_8) +DO_TRN(sve2_trn_q, Int128, ) + +#undef DO_ZIP +#undef DO_UZP +#undef DO_TRN + +void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { + d[H4(j)] = n[H4(i)]; + j++; + } + } + for (; j < opr_sz; j++) { + d[H4(j)] = 0; + } +} + +void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + d[j] = n[i]; + j++; + } + } + for (; j < opr_sz; j++) { + d[j] = 0; + } +} + +/* Similar to the ARM LastActiveElement pseudocode function, except the + * result is multiplied by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. 
+ */ +int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + + return last_active_element(vg, words, esz); +} + +void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) +{ + intptr_t opr_sz = simd_oprsz(desc) / 8; + int esz = simd_data(desc); + uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; + intptr_t i, first_i, last_i; + ARMVectorReg tmp; + + first_i = last_i = 0; + first_g = last_g = 0; + + /* Find the extent of the active elements within VG. */ + for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { + pg = *(uint64_t *)(vg + i) & mask; + if (pg) { + if (last_g == 0) { + last_g = pg; + last_i = i; + } + first_g = pg; + first_i = i; + } + } + + len = 0; + if (first_g != 0) { + first_i = first_i * 8 + ctz64(first_g); + last_i = last_i * 8 + 63 - clz64(last_g); + len = last_i - first_i + (1 << esz); + if (vd == vm) { + vm = memcpy(&tmp, vm, opr_sz * 8); + } + swap_memmove(vd, vn + first_i, len); + } + swap_memmove(vd + len, vm, opr_sz * 8 - len); +} + +void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_b(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_h(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_s(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + d[i] = (pg[H1(i)] & 1 ? nn : mm); + } +} + +void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 16; + Int128 *d = vd, *n = vn, *m = vm; + uint16_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = (pg[H2(i)] & 1 ? n : m)[i]; + } +} + +/* Two operand comparison controlled by a predicate. + * ??? It is very tempting to want to be able to expand this inline + * with x86 instructions, e.g. + * + * vcmpeqw zm, zn, %ymm0 + * vpmovmskb %ymm0, %eax + * and $0x5555, %eax + * and pg, %eax + * + * or even aarch64, e.g. + * + * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 + * cmeq v0.8h, zn, zm + * and v0.8h, v0.8h, mask + * addv h0, v0.8h + * and v0.8b, pg + * + * However, coming up with an abstraction that allows vector inputs and + * a scalar output, and also handles the byte-ordering of sub-uint64_t + * scalar outputs, is tricky. 
+ */ +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) + +DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) +DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) +DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) +DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) + +DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) +DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) +DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) +DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) + +DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) +DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) +DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) +DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) + +DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) +DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) +DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) +DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) + +DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) +DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) +DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) +DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) + +DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) +DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) +DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) +DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) + +#undef DO_CMP_PPZZ_B +#undef DO_CMP_PPZZ_H +#undef DO_CMP_PPZZ_S +#undef DO_CMP_PPZZ_D +#undef DO_CMP_PPZZ + +/* Similar, but the second source is "wide". 
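+ * Each narrow element of the first source within an 8-byte chunk is
+ * compared against the one 64-bit element of the second source that
+ * occupies the same chunk.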
*/ +#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + TYPEW mm = *(TYPEW *)(vm + i - 8); \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 7); \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) + +DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) +DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) +DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) + +DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) +DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) +DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) + +DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) +DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) +DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) + +DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) +DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) +DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) + +DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) +DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) +DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) + +DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) +DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) +DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) + +DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) +DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) +DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) + +DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) +DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) +DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) + +DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) +DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) +DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) + +DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) +DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) +DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) + +#undef DO_CMP_PPZW_B +#undef DO_CMP_PPZW_H +#undef DO_CMP_PPZW_S +#undef DO_CMP_PPZW + +/* Similar, but the second source is immediate. 
*/ +#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + TYPE mm = simd_data(desc); \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) + +DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) +DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) +DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) +DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) + +DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) +DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) +DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) +DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) + +DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) +DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) +DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) +DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) + +DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) +DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) +DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) +DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) + +DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) +DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) +DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) +DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) + +DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) +DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) +DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) +DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) + +DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) +DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) +DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) +DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) + +DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) +DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) +DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) +DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) + +DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) +DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) +DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) +DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) + +DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) +DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) +DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) +DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) + +#undef DO_CMP_PPZI_B +#undef DO_CMP_PPZI_H +#undef DO_CMP_PPZI_S +#undef DO_CMP_PPZI_D +#undef DO_CMP_PPZI + +/* Similar to the ARM LastActive pseudocode function. */ +static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) +{ + intptr_t i; + + for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { + uint64_t pg = *(uint64_t *)(vg + i); + if (pg) { + return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; + } + } + return 0; +} + +/* Compute a mask into RETB that is true for all G, up to and including + * (if after) or excluding (if !after) the first G & N. + * Return true if BRK found. 
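+ * For example (illustration only): with G = 0b1111, N = 0b0100 and no
+ * prior break, the first G & N bit is bit 2, so "after" gives
+ * RETB = 0b0111, "!after" gives RETB = 0b0011, and true is returned.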
+ */ +static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, + bool brk, bool after) +{ + uint64_t b; + + if (brk) { + b = 0; + } else if ((g & n) == 0) { + /* For all G, no N are set; break not found. */ + b = g; + } else { + /* Break somewhere in N. Locate it. */ + b = g & n; /* guard true, pred true */ + b = b & -b; /* first such */ + if (after) { + b = b | (b - 1); /* break after same */ + } else { + b = b - 1; /* break before same */ + } + brk = true; + } + + *retb = b; + return brk; +} + +/* Compute a zeroing BRK. */ +static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_b & this_g; + } +} + +/* Likewise, but also compute flags. */ +static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + uint32_t flags = PREDTEST_INIT; + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_d, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_d = this_b & this_g; + flags = iter_predtest_fwd(this_d, this_g, flags); + } + return flags; +} + +/* Compute a merging BRK. */ +static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = (this_b & this_g) | (d[i] & ~this_g); + } +} + +/* Likewise, but also compute flags. */ +static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + uint32_t flags = PREDTEST_INIT; + bool brk = false; + intptr_t i; + + for (i = 0; i < oprsz / 8; ++i) { + uint64_t this_b, this_d = d[i], this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); + flags = iter_predtest_fwd(this_d, this_g, flags); + } + return flags; +} + +static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) +{ + /* It is quicker to zero the whole predicate than loop on OPRSZ. + * The compiler should turn this into 4 64-bit integer stores. 
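+ * (sizeof(ARMPredicateReg) is 256 bits, the predicate size for the
+ * maximum 2048-bit vector length, hence four stores.)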
+ */ + memset(d, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; +} + +void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, true); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, true); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, false); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, false); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_z(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_z(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_z(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_z(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_m(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_m(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_m(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_m(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (!last_active_pred(vn, vg, oprsz)) { + do_zero(vd, oprsz); + } +} + +/* As if PredTest(Ones(PL), D, esz). 
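+ * That is, return the predicate-test flags for D against an all-true
+ * governing predicate of element size ESZ; used by BRKNS and the
+ * WHILE helpers below.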
*/ +static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, + uint64_t esz_mask) +{ + uint32_t flags = PREDTEST_INIT; + intptr_t i; + + for (i = 0; i < oprsz / 8; i++) { + flags = iter_predtest_fwd(d->p[i], esz_mask, flags); + } + if (oprsz & 7) { + uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); + flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); + } + return flags; +} + +uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + return predtest_ones(vd, oprsz, -1); + } else { + return do_zero(vd, oprsz); + } +} + +uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; + intptr_t i; + + for (i = 0; i < words; ++i) { + uint64_t t = n[i] & g[i] & mask; + sum += ctpop64(t); + } + return sum; +} + +uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + uint32_t flags; + intptr_t i; + + /* Begin with a zero predicate register. */ + flags = do_zero(d, oprsz); + if (count == 0) { + return flags; + } + + /* Set all of the requested bits. */ + for (i = 0; i < count / 64; ++i) { + d->p[i] = esz_mask; + } + if (count & 63) { + d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + } + + return predtest_ones(d, oprsz, esz_mask); +} + +uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + intptr_t i, invcount, oprbits; + uint64_t bits; + + if (count == 0) { + return do_zero(d, oprsz); + } + + oprbits = oprsz * 8; + tcg_debug_assert(count <= oprbits); + + bits = esz_mask; + if (oprbits & 63) { + bits &= MAKE_64BIT_MASK(0, oprbits & 63); + } + + invcount = oprbits - count; + for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { + d->p[i] = bits; + bits = esz_mask; + } + + d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); + + while (--i >= 0) { + d->p[i] = 0; + } + + return predtest_ones(d, oprsz, esz_mask); +} + +/* Recursive reduction on a function; + * C.f. the ARM ARM function ReducePredicated. + * + * While it would be possible to write this without the DATA temporary, + * it is much simpler to process the predicate register this way. + * The recursion is bounded to depth 7 (128 fp16 elements), so there's + * little to gain with a more complex non-recursive form. 
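+ *
+ * E.g. for four elements a,b,c,d the result is FUNC(FUNC(a,b), FUNC(c,d));
+ * inactive lanes and the tail between oprsz and maxsz are seeded with
+ * IDENT so that they do not affect the reduction.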
+ */ +#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ +static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ +{ \ + if (n == 1) { \ + return *data; \ + } else { \ + uintptr_t half = n / 2; \ + TYPE lo = NAME##_reduce(data, status, half); \ + TYPE hi = NAME##_reduce(data + half, status, half); \ + return TYPE##_##FUNC(lo, hi, status); \ + } \ +} \ +uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ +{ \ + uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ + TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ + for (; i < maxsz; i += sizeof(TYPE)) { \ + *(TYPE *)((void *)data + i) = IDENT; \ + } \ + return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ +} + +DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) +DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) +DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) + +/* Identity is floatN_default_nan, without the function call. */ +DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) +DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) +DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) +DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) +DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) +DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) +DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) + +DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) +DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) +DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) + +#undef DO_REDUCE + +uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc); + float16 result = nn; + + do { + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + float16 mm = *(float16 *)(vm + H1_2(i)); + result = float16_add(result, mm, status); + } + i += sizeof(float16), pg >>= sizeof(float16); + } while (i & 15); + } while (i < opr_sz); + + return result; +} + +uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc); + float32 result = nn; + + do { + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + float32 mm = *(float32 *)(vm + H1_2(i)); + result = float32_add(result, mm, status); + } + i += sizeof(float32), pg >>= sizeof(float32); + } while (i & 15); + } while (i < opr_sz); + + return result; +} + +uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; + uint64_t *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + nn = float64_add(nn, m[i], status); + } + } + + return nn; +} + +/* Fully general three-operand expander, controlled by a predicate, + * With the extra float_status parameter. 
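+ *
+ * E.g. DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) produces
+ * HELPER(sve_fadd_h)(vd, vn, vm, vg, status, desc), which walks the
+ * predicate 64 bits at a time from the top of the vector and applies
+ * float16_add only to the active half-word elements.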
+ */ +#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) +DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) +DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) + +DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) +DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) +DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) + +DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) +DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) +DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) + +DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) +DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) +DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) + +DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) +DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) +DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) + +DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) +DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) +DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) + +DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) +DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) +DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) + +DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) + +static inline float16 abd_h(float16 a, float16 b, float_status *s) +{ + return float16_abs(float16_sub(a, b, s)); +} + +static inline float32 abd_s(float32 a, float32 b, float_status *s) +{ + return float32_abs(float32_sub(a, b, s)); +} + +static inline float64 abd_d(float64 a, float64 b, float_status *s) +{ + return float64_abs(float64_sub(a, b, s)); +} + +DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) +DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) +DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) + +static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) +{ + int b_int = MIN(MAX(b, INT_MIN), INT_MAX); + return float64_scalbn(a, b_int, s); +} + +DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) +DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) +DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) + +DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) +DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) +DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) + +#undef DO_ZPZZ_FP + +/* Three-operand expander, with one scalar operand, controlled by + * a predicate, with the extra float_status parameter. 
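+ *
+ * The scalar arrives in a uint64_t and is truncated to TYPE, so e.g.
+ * sve_fadds_h adds the same float16 value to every active element.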
+ */ +#define DO_ZPZS_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + TYPE mm = scalar; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) +DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) +DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) + +DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) +DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) +DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) + +DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) +DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) +DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) + +static inline float16 subr_h(float16 a, float16 b, float_status *s) +{ + return float16_sub(b, a, s); +} + +static inline float32 subr_s(float32 a, float32 b, float_status *s) +{ + return float32_sub(b, a, s); +} + +static inline float64 subr_d(float64 a, float64 b, float_status *s) +{ + return float64_sub(b, a, s); +} + +DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) +DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) +DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) + +DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) +DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) +DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) + +DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) +DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) +DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) + +DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) +DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) +DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) + +DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) +DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) +DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) + +/* Fully general two-operand expander, controlled by a predicate, + * With the extra float_status parameter. + */ +#define DO_ZPZ_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore + * FZ16. When converting from fp16, this affects flushing input denormals; + * when converting to fp16, this affects flushing output denormals. 
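+ *
+ * Hence the helpers below temporarily clear flush_inputs_to_zero (when
+ * reading fp16) or flush_to_zero (when writing fp16) around the
+ * conversion and then restore the saved setting.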
+ */ +static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) +{ + bool save = get_flush_inputs_to_zero(fpst); + float32 ret; + + set_flush_inputs_to_zero(false, fpst); + ret = float16_to_float32(f, true, fpst); + set_flush_inputs_to_zero(save, fpst); + return ret; +} + +static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) +{ + bool save = get_flush_inputs_to_zero(fpst); + float64 ret; + + set_flush_inputs_to_zero(false, fpst); + ret = float16_to_float64(f, true, fpst); + set_flush_inputs_to_zero(save, fpst); + return ret; +} + +static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) +{ + bool save = get_flush_to_zero(fpst); + float16 ret; + + set_flush_to_zero(false, fpst); + ret = float32_to_float16(f, true, fpst); + set_flush_to_zero(save, fpst); + return ret; +} + +static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) +{ + bool save = get_flush_to_zero(fpst); + float16 ret; + + set_flush_to_zero(false, fpst); + ret = float64_to_float16(f, true, fpst); + set_flush_to_zero(save, fpst); + return ret; +} + +static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_int16_round_to_zero(f, s); +} + +static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_int64_round_to_zero(f, s); +} + +static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) +{ + if (float32_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float32_to_int64_round_to_zero(f, s); +} + +static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) +{ + if (float64_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float64_to_int64_round_to_zero(f, s); +} + +static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_uint16_round_to_zero(f, s); +} + +static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_uint64_round_to_zero(f, s); +} + +static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) +{ + if (float32_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float32_to_uint64_round_to_zero(f, s); +} + +static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) +{ + if (float64_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float64_to_uint64_round_to_zero(f, s); +} + +DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) +DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) +DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) +DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) +DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) +DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) +DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) + +DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) +DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) +DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) +DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 
+DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) +DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) + +DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) +DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) +DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) +DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) +DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) + +DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) +DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) +DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) + +DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) +DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) +DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) + +DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) +DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) +DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) + +DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) +DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) +DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) + +DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) +DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) +DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) +DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) +DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) +DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) +DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) + +DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) +DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) +DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) +DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) +DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) +DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) +DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) + +static int16_t do_float16_logb_as_int(float16 a, float_status *s) +{ + /* Extract frac to the top of the uint32_t. */ + uint32_t frac = (uint32_t)a << (16 + 6); + int16_t exp = extract32(a, 10, 5); + + if (unlikely(exp == 0)) { + if (frac != 0) { + if (!get_flush_inputs_to_zero(s)) { + /* denormal: bias - fractional_zeros */ + return -15 - clz32(frac); + } + /* flush to zero */ + float_raise(float_flag_input_denormal, s); + } + } else if (unlikely(exp == 0x1f)) { + if (frac == 0) { + return INT16_MAX; /* infinity */ + } + } else { + /* normal: exp - bias */ + return exp - 15; + } + /* nan or zero */ + float_raise(float_flag_invalid, s); + return INT16_MIN; +} + +static int32_t do_float32_logb_as_int(float32 a, float_status *s) +{ + /* Extract frac to the top of the uint32_t. 
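+ * Shifting out the sign and 8 exponent bits leaves the 23 fraction bits
+ * left-aligned, so clz32(frac) below counts the leading fraction zeros
+ * needed for the denormal case.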
*/ + uint32_t frac = a << 9; + int32_t exp = extract32(a, 23, 8); + + if (unlikely(exp == 0)) { + if (frac != 0) { + if (!get_flush_inputs_to_zero(s)) { + /* denormal: bias - fractional_zeros */ + return -127 - clz32(frac); + } + /* flush to zero */ + float_raise(float_flag_input_denormal, s); + } + } else if (unlikely(exp == 0xff)) { + if (frac == 0) { + return INT32_MAX; /* infinity */ + } + } else { + /* normal: exp - bias */ + return exp - 127; + } + /* nan or zero */ + float_raise(float_flag_invalid, s); + return INT32_MIN; +} + +static int64_t do_float64_logb_as_int(float64 a, float_status *s) +{ + /* Extract frac to the top of the uint64_t. */ + uint64_t frac = a << 12; + int64_t exp = extract64(a, 52, 11); + + if (unlikely(exp == 0)) { + if (frac != 0) { + if (!get_flush_inputs_to_zero(s)) { + /* denormal: bias - fractional_zeros */ + return -1023 - clz64(frac); + } + /* flush to zero */ + float_raise(float_flag_input_denormal, s); + } + } else if (unlikely(exp == 0x7ff)) { + if (frac == 0) { + return INT64_MAX; /* infinity */ + } + } else { + /* normal: exp - bias */ + return exp - 1023; + } + /* nan or zero */ + float_raise(float_flag_invalid, s); + return INT64_MIN; +} + +DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) +DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) +DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) + +#undef DO_ZPZ_FP + +static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint16_t neg1, uint16_t neg3) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 2; + if (likely((pg >> (i & 63)) & 1)) { + float16 e1, e2, e3, r; + + e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; + e2 = *(uint16_t *)(vm + H1_2(i)); + e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; + r = float16_muladd(e1, e2, e3, 0, status); + *(uint16_t *)(vd + H1_2(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); +} + +void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); +} + +void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); +} + +static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint32_t neg1, uint32_t neg3) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 4; + if (likely((pg >> (i & 63)) & 1)) { + float32 e1, e2, e3, r; + + e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; + e2 = *(uint32_t *)(vm + H1_4(i)); + e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; + r = float32_muladd(e1, e2, e3, 0, status); + *(uint32_t *)(vd + H1_4(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + 
void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); +} + +void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); +} + +void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); +} + +static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint64_t neg1, uint64_t neg3) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 8; + if (likely((pg >> (i & 63)) & 1)) { + float64 e1, e2, e3, r; + + e1 = *(uint64_t *)(vn + i) ^ neg1; + e2 = *(uint64_t *)(vm + i); + e3 = *(uint64_t *)(va + i) ^ neg3; + r = float64_muladd(e1, e2, e3, 0, status); + *(uint64_t *)(vd + i) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); +} + +void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); +} + +void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); +} + +/* Two operand floating-point comparison controlled by a predicate. + * Unlike the integer version, we are not allowed to optimistically + * compare operands, since the comparison may have side effects wrt + * the FPSR. 
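+ *
+ * In particular, a signalling comparison of a NaN held in an inactive
+ * lane would wrongly raise Invalid Operation, so OP is evaluated only
+ * for active elements.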
+ */ +#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ + uint64_t *d = vd, *g = vg; \ + do { \ + uint64_t out = 0, pg = g[j]; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= OP(TYPE, nn, mm, status); \ + } \ + } while (i & 63); \ + d[j--] = out; \ + } while (i > 0); \ +} + +#define DO_FPCMP_PPZZ_H(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) +#define DO_FPCMP_PPZZ_S(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) +#define DO_FPCMP_PPZZ_D(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) + +#define DO_FPCMP_PPZZ_ALL(NAME, OP) \ + DO_FPCMP_PPZZ_H(NAME, OP) \ + DO_FPCMP_PPZZ_S(NAME, OP) \ + DO_FPCMP_PPZZ_D(NAME, OP) + +#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 +#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 +#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 +#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 +#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 +#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 +#define DO_FCMUO(TYPE, X, Y, ST) \ + TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered +#define DO_FACGE(TYPE, X, Y, ST) \ + TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 +#define DO_FACGT(TYPE, X, Y, ST) \ + TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 + +DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) +DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) +DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) +DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) +DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) +DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) +DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) + +#undef DO_FPCMP_PPZZ_ALL +#undef DO_FPCMP_PPZZ_D +#undef DO_FPCMP_PPZZ_S +#undef DO_FPCMP_PPZZ_H +#undef DO_FPCMP_PPZZ + +/* One operand floating-point comparison against zero, controlled + * by a predicate. + */ +#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ + uint64_t *d = vd, *g = vg; \ + do { \ + uint64_t out = 0, pg = g[j]; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + if ((pg >> (i & 63)) & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= OP(TYPE, nn, 0, status); \ + } \ + } while (i & 63); \ + d[j--] = out; \ + } while (i > 0); \ +} + +#define DO_FPCMP_PPZ0_H(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) +#define DO_FPCMP_PPZ0_S(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) +#define DO_FPCMP_PPZ0_D(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) + +#define DO_FPCMP_PPZ0_ALL(NAME, OP) \ + DO_FPCMP_PPZ0_H(NAME, OP) \ + DO_FPCMP_PPZ0_S(NAME, OP) \ + DO_FPCMP_PPZ0_D(NAME, OP) + +DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) +DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) +DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) +DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) +DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) +DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) + +/* FP Trig Multiply-Add. 
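+ *
+ * Each element computes d = n * |m| + coeff[x], where x is the
+ * immediate from simd_data(desc), biased by 8 when m is negative so
+ * that the second half of the coefficient table is selected.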
*/ + +void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float16 coeff[16] = { + 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); + intptr_t x = simd_data(desc); + float16 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float16 mm = m[i]; + intptr_t xx = x; + if (float16_is_neg(mm)) { + mm = float16_abs(mm); + xx += 8; + } + d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float32 coeff[16] = { + 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, + 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, + 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, + 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); + intptr_t x = simd_data(desc); + float32 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float32 mm = m[i]; + intptr_t xx = x; + if (float32_is_neg(mm)) { + mm = float32_abs(mm); + xx += 8; + } + d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float64 coeff[16] = { + 0x3ff0000000000000ull, 0xbfc5555555555543ull, + 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, + 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, + 0x3de5d8408868552full, 0x0000000000000000ull, + 0x3ff0000000000000ull, 0xbfe0000000000000ull, + 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, + 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, + 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); + intptr_t x = simd_data(desc); + float64 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float64 mm = m[i]; + intptr_t xx = x; + if (float64_is_neg(mm)) { + mm = float64_abs(mm); + xx += 8; + } + d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +/* + * FP Complex Add + */ + +void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float16 neg_imag = float16_set_sign(0, simd_data(desc)); + float16 neg_real = float16_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float16 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float16); + i -= 2 * sizeof(float16); + + e0 = *(float16 *)(vn + H1_2(i)); + e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float16 *)(vn + H1_2(j)); + e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float32 neg_imag = float32_set_sign(0, simd_data(desc)); + float32 neg_real = float32_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float32 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. 
*/ + j = i - sizeof(float32); + i -= 2 * sizeof(float32); + + e0 = *(float32 *)(vn + H1_2(i)); + e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float32 *)(vn + H1_2(j)); + e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float64 neg_imag = float64_set_sign(0, simd_data(desc)); + float64 neg_real = float64_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float64 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float64); + i -= 2 * sizeof(float64); + + e0 = *(float64 *)(vn + H1_2(i)); + e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float64 *)(vn + H1_2(j)); + e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +/* + * FP Complex Multiply + */ + +void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rot = simd_data(desc); + bool flip = rot & 1; + float16 neg_imag, neg_real; + uint64_t *g = vg; + + neg_imag = float16_set_sign(0, (rot & 2) != 0); + neg_real = float16_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float16 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float16); + i -= 2 * sizeof(float16); + + nr = *(float16 *)(vn + H1_2(i)); + ni = *(float16 *)(vn + H1_2(j)); + mr = *(float16 *)(vm + H1_2(i)); + mi = *(float16 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float16 *)(va + H1_2(i)); + d = float16_muladd(e2, e1, d, 0, status); + *(float16 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float16 *)(va + H1_2(j)); + d = float16_muladd(e4, e3, d, 0, status); + *(float16 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rot = simd_data(desc); + bool flip = rot & 1; + float32 neg_imag, neg_real; + uint64_t *g = vg; + + neg_imag = float32_set_sign(0, (rot & 2) != 0); + neg_real = float32_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float32 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float32); + i -= 2 * sizeof(float32); + + nr = *(float32 *)(vn + H1_2(i)); + ni = *(float32 *)(vn + H1_2(j)); + mr = *(float32 *)(vm + H1_2(i)); + mi = *(float32 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? 
mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float32 *)(va + H1_2(i)); + d = float32_muladd(e2, e1, d, 0, status); + *(float32 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float32 *)(va + H1_2(j)); + d = float32_muladd(e4, e3, d, 0, status); + *(float32 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rot = simd_data(desc); + bool flip = rot & 1; + float64 neg_imag, neg_real; + uint64_t *g = vg; + + neg_imag = float64_set_sign(0, (rot & 2) != 0); + neg_real = float64_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float64 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float64); + i -= 2 * sizeof(float64); + + nr = *(float64 *)(vn + H1_2(i)); + ni = *(float64 *)(vn + H1_2(j)); + mr = *(float64 *)(vm + H1_2(i)); + mi = *(float64 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float64 *)(va + H1_2(i)); + d = float64_muladd(e2, e1, d, 0, status); + *(float64 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float64 *)(va + H1_2(j)); + d = float64_muladd(e4, e3, d, 0, status); + *(float64 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +/* + * Load contiguous data, protected by a governing predicate. + */ + +/* + * Skip through a sequence of inactive elements in the guarding predicate @vg, + * beginning at @reg_off bounded by @reg_max. Return the offset of the active + * element >= @reg_off, or @reg_max if there were no active elements at all. + */ +static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, + intptr_t reg_max, int esz) +{ + uint64_t pg_mask = pred_esz_masks[esz]; + uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); + + /* In normal usage, the first element is active. */ + if (likely(pg & 1)) { + return reg_off; + } + + if (pg == 0) { + reg_off &= -64; + do { + reg_off += 64; + if (unlikely(reg_off >= reg_max)) { + /* The entire predicate was false. */ + return reg_max; + } + pg = vg[reg_off >> 6] & pg_mask; + } while (pg == 0); + } + reg_off += ctz64(pg); + + /* We should never see an out of range predicate bit set. */ + tcg_debug_assert(reg_off < reg_max); + return reg_off; +} + +/* + * Resolve the guest virtual address to info->host and info->flags. + * If @nofault, return false if the page is invalid, otherwise + * exit via page fault exception. + */ + +bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, + target_ulong addr, int mem_off, MMUAccessType access_type, + int mmu_idx, uintptr_t retaddr) +{ + int flags; + + addr += mem_off; + + /* + * User-only currently always issues with TBI. See the comment + * above useronly_clean_ptr. Usually we clean this top byte away + * during translation, but we can't do that for e.g. vector + imm + * addressing modes. + * + * We currently always enable TBI for user-only, and do not provide + * a way to turn it off. So clean the pointer unconditionally here, + * rather than look it up here, or pass it down from above. 
+ */ + addr = useronly_clean_ptr(addr); + +#ifdef CONFIG_USER_ONLY + flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, + &info->host, retaddr); +#else + CPUTLBEntryFull *full; + flags = probe_access_full(env, addr, access_type, mmu_idx, nofault, + &info->host, &full, retaddr); +#endif + info->flags = flags; + + if (flags & TLB_INVALID_MASK) { + g_assert(nofault); + return false; + } + +#ifdef CONFIG_USER_ONLY + memset(&info->attrs, 0, sizeof(info->attrs)); + /* Require both ANON and MTE; see allocation_tag_mem(). */ + info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); +#else + info->attrs = full->attrs; + info->tagged = full->pte_attrs == 0xf0; +#endif + + /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ + info->host -= mem_off; + return true; +} + +/* + * Find first active element on each page, and a loose bound for the + * final element on each page. Identify any single element that spans + * the page boundary. Return true if there are any active elements. + */ +bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, + intptr_t reg_max, int esz, int msize) +{ + const int esize = 1 << esz; + const uint64_t pg_mask = pred_esz_masks[esz]; + intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; + intptr_t mem_off_last, mem_off_split; + intptr_t page_split, elt_split; + intptr_t i; + + /* Set all of the element indices to -1, and the TLB data to 0. */ + memset(info, -1, offsetof(SVEContLdSt, page)); + memset(info->page, 0, sizeof(info->page)); + + /* Gross scan over the entire predicate to find bounds. */ + i = 0; + do { + uint64_t pg = vg[i] & pg_mask; + if (pg) { + reg_off_last = i * 64 + 63 - clz64(pg); + if (reg_off_first < 0) { + reg_off_first = i * 64 + ctz64(pg); + } + } + } while (++i * 64 < reg_max); + + if (unlikely(reg_off_first < 0)) { + /* No active elements, no pages touched. */ + return false; + } + tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); + + info->reg_off_first[0] = reg_off_first; + info->mem_off_first[0] = (reg_off_first >> esz) * msize; + mem_off_last = (reg_off_last >> esz) * msize; + + page_split = -(addr | TARGET_PAGE_MASK); + if (likely(mem_off_last + msize <= page_split)) { + /* The entire operation fits within a single page. */ + info->reg_off_last[0] = reg_off_last; + return true; + } + + info->page_split = page_split; + elt_split = page_split / msize; + reg_off_split = elt_split << esz; + mem_off_split = elt_split * msize; + + /* + * This is the last full element on the first page, but it is not + * necessarily active. If there is no full element, i.e. the first + * active element is the one that's split, this value remains -1. + * It is useful as iteration bounds. + */ + if (elt_split != 0) { + info->reg_off_last[0] = reg_off_split - esize; + } + + /* Determine if an unaligned element spans the pages. */ + if (page_split % msize != 0) { + /* It is helpful to know if the split element is active. */ + if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { + info->reg_off_split = reg_off_split; + info->mem_off_split = mem_off_split; + + if (reg_off_split == reg_off_last) { + /* The page crossing element is last. */ + return true; + } + } + reg_off_split += esize; + mem_off_split += msize; + } + + /* + * We do want the first active element on the second page, because + * this may affect the address reported in an exception. 
+ */ + reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); + tcg_debug_assert(reg_off_split <= reg_off_last); + info->reg_off_first[1] = reg_off_split; + info->mem_off_first[1] = (reg_off_split >> esz) * msize; + info->reg_off_last[1] = reg_off_last; + return true; +} + +/* + * Resolve the guest virtual addresses to info->page[]. + * Control the generation of page faults with @fault. Return false if + * there is no work to do, which can only happen with @fault == FAULT_NO. + */ +bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, + CPUARMState *env, target_ulong addr, + MMUAccessType access_type, uintptr_t retaddr) +{ + int mmu_idx = cpu_mmu_index(env, false); + int mem_off = info->mem_off_first[0]; + bool nofault = fault == FAULT_NO; + bool have_work = true; + + if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, + access_type, mmu_idx, retaddr)) { + /* No work to be done. */ + return false; + } + + if (likely(info->page_split < 0)) { + /* The entire operation was on the one page. */ + return true; + } + + /* + * If the second page is invalid, then we want the fault address to be + * the first byte on that page which is accessed. + */ + if (info->mem_off_split >= 0) { + /* + * There is an element split across the pages. The fault address + * should be the first byte of the second page. + */ + mem_off = info->page_split; + /* + * If the split element is also the first active element + * of the vector, then: For first-fault we should continue + * to generate faults for the second page. For no-fault, + * we have work only if the second page is valid. + */ + if (info->mem_off_first[0] < info->mem_off_split) { + nofault = FAULT_FIRST; + have_work = false; + } + } else { + /* + * There is no element split across the pages. The fault address + * should be the first active element on the second page. + */ + mem_off = info->mem_off_first[1]; + /* + * There must have been one active element on the first page, + * so we're out of first-fault territory. + */ + nofault = fault != FAULT_ALL; + } + + have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, + access_type, mmu_idx, retaddr); + return have_work; +} + +#ifndef CONFIG_USER_ONLY +void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, + int esize, int msize, int wp_access, + uintptr_t retaddr) +{ + intptr_t mem_off, reg_off, reg_last; + int flags0 = info->page[0].flags; + int flags1 = info->page[1].flags; + + if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { + return; + } + + /* Indicate that watchpoints are handled. 
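+ * Clearing TLB_WATCHPOINT from the cached flags tells the callers that
+ * the checks performed here already cover any watchpoints, so they need
+ * not take a slow path for that reason alone.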
*/ + info->page[0].flags = flags0 & ~TLB_WATCHPOINT; + info->page[1].flags = flags1 & ~TLB_WATCHPOINT; + + if (flags0 & TLB_WATCHPOINT) { + mem_off = info->mem_off_first[0]; + reg_off = info->reg_off_first[0]; + reg_last = info->reg_off_last[0]; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + cpu_check_watchpoint(env_cpu(env), addr + mem_off, + msize, info->page[0].attrs, + wp_access, retaddr); + } + reg_off += esize; + mem_off += msize; + } while (reg_off <= reg_last && (reg_off & 63)); + } + } + + mem_off = info->mem_off_split; + if (mem_off >= 0) { + cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, + info->page[0].attrs, wp_access, retaddr); + } + + mem_off = info->mem_off_first[1]; + if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { + reg_off = info->reg_off_first[1]; + reg_last = info->reg_off_last[1]; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + cpu_check_watchpoint(env_cpu(env), addr + mem_off, + msize, info->page[1].attrs, + wp_access, retaddr); + } + reg_off += esize; + mem_off += msize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} +#endif + +void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, int esize, + int msize, uint32_t mtedesc, uintptr_t ra) +{ + intptr_t mem_off, reg_off, reg_last; + + /* Process the page only if MemAttr == Tagged. */ + if (info->page[0].tagged) { + mem_off = info->mem_off_first[0]; + reg_off = info->reg_off_first[0]; + reg_last = info->reg_off_split; + if (reg_last < 0) { + reg_last = info->reg_off_last[0]; + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + mte_check(env, mtedesc, addr, ra); + } + reg_off += esize; + mem_off += msize; + } while (reg_off <= reg_last && (reg_off & 63)); + } while (reg_off <= reg_last); + } + + mem_off = info->mem_off_first[1]; + if (mem_off >= 0 && info->page[1].tagged) { + reg_off = info->reg_off_first[1]; + reg_last = info->reg_off_last[1]; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + mte_check(env, mtedesc, addr, ra); + } + reg_off += esize; + mem_off += msize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +/* + * Common helper for all contiguous 1,2,3,4-register predicated stores. + */ +static inline QEMU_ALWAYS_INLINE +void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const int N, uint32_t mtedesc, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned rd = simd_data(desc); + const intptr_t reg_max = simd_oprsz(desc); + intptr_t reg_off, reg_last, mem_off; + SVEContLdSt info; + void *host; + int flags, i; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { + /* The entire predicate was false; no load occurs. */ + for (i = 0; i < N; ++i) { + memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); + } + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, + BP_MEM_READ, retaddr); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. 
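+ * A zero mtedesc (either because MTE is not active or because the
+ * gross TBI/TCMA suppression below cleared it) therefore skips the
+ * per-element tag checks entirely.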
+ */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, + mtedesc, retaddr); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + ARMVectorReg scratch[4] = { }; + + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + tlb_fn(env, &scratch[i], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + + for (i = 0; i < N; ++i) { + memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); + } + return; +#endif + } + + /* The entire operation is in RAM, on valid pages. */ + + for (i = 0; i < N; ++i) { + memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); + } + + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + mem_off = info.mem_off_split; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_split; + for (i = 0; i < N; ++i) { + tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + + mem_off = info.mem_off_first[1]; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_first[1]; + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t ra, + const int esz, const int msz, const int N, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
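+ * If TBI is disabled for this half of the address space, or the
+ * logical tag lies in the TCMA-ignored range, zero mtedesc so that
+ * the per-element checks in sve_ldN_r are skipped.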
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); +} + +#define DO_LD1_1(NAME, ESZ) \ +void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ + sve_##NAME##_host, sve_##NAME##_tlb); \ +} \ +void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ + sve_##NAME##_host, sve_##NAME##_tlb); \ +} + +#define DO_LD1_2(NAME, ESZ, MSZ) \ +void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} \ +void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} + +DO_LD1_1(ld1bb, MO_8) +DO_LD1_1(ld1bhu, MO_16) +DO_LD1_1(ld1bhs, MO_16) +DO_LD1_1(ld1bsu, MO_32) +DO_LD1_1(ld1bss, MO_32) +DO_LD1_1(ld1bdu, MO_64) +DO_LD1_1(ld1bds, MO_64) + +DO_LD1_2(ld1hh, MO_16, MO_16) +DO_LD1_2(ld1hsu, MO_32, MO_16) +DO_LD1_2(ld1hss, MO_32, MO_16) +DO_LD1_2(ld1hdu, MO_64, MO_16) +DO_LD1_2(ld1hds, MO_64, MO_16) + +DO_LD1_2(ld1ss, MO_32, MO_32) +DO_LD1_2(ld1sdu, MO_64, MO_32) +DO_LD1_2(ld1sds, MO_64, MO_32) + +DO_LD1_2(ld1dd, MO_64, MO_64) + +#undef DO_LD1_1 +#undef DO_LD1_2 + +#define DO_LDN_1(N) \ +void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ + sve_ld1bb_host, sve_ld1bb_tlb); \ +} \ +void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ + sve_ld1bb_host, sve_ld1bb_tlb); \ +} + +#define DO_LDN_2(N, SUFF, ESZ) \ +void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ + sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ + sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ + sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ + sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ +} + +DO_LDN_1(2) +DO_LDN_1(3) +DO_LDN_1(4) + +DO_LDN_2(2, hh, MO_16) +DO_LDN_2(3, 
hh, MO_16) +DO_LDN_2(4, hh, MO_16) + +DO_LDN_2(2, ss, MO_32) +DO_LDN_2(3, ss, MO_32) +DO_LDN_2(4, ss, MO_32) + +DO_LDN_2(2, dd, MO_64) +DO_LDN_2(3, dd, MO_64) +DO_LDN_2(4, dd, MO_64) + +#undef DO_LDN_1 +#undef DO_LDN_2 + +/* + * Load contiguous data, first-fault and no-fault. + * + * For user-only, one could argue that we should hold the mmap_lock during + * the operation so that there is no race between page_check_range and the + * load operation. However, unmapping pages out from under a running thread + * is extraordinarily unlikely. This theoretical race condition also affects + * linux-user/ in its get_user/put_user macros. + * + * TODO: Construct some helpers, written in assembly, that interact with + * host_signal_handler to produce memory ops which can properly report errors + * without racing. + */ + +/* Fault on byte I. All bits in FFR from I are cleared. The vector + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE + * option, which leaves subsequent data unchanged. + */ +static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) +{ + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; + + if (i & 63) { + ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); + i = ROUND_UP(i, 64); + } + for (; i < oprsz; i += 64) { + ffr[i / 64] = 0; + } +} + +/* + * Common helper for all contiguous no-fault and first-fault loads. + */ +static inline QEMU_ALWAYS_INLINE +void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, + uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, + const int esz, const int msz, const SVEContFault fault, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned rd = simd_data(desc); + void *vd = &env->vfp.zregs[rd]; + const intptr_t reg_max = simd_oprsz(desc); + intptr_t reg_off, mem_off, reg_last; + SVEContLdSt info; + int flags; + void *host; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { + /* The entire predicate was false; no load occurs. */ + memset(vd, 0, reg_max); + return; + } + reg_off = info.reg_off_first[0]; + + /* Probe the page(s). */ + if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { + /* Fault on first element. */ + tcg_debug_assert(fault == FAULT_NO); + memset(vd, 0, reg_max); + goto do_fault; + } + + mem_off = info.mem_off_first[0]; + flags = info.page[0].flags; + + /* + * Disable MTE checking if the Tagged bit is not set. Since TBI must + * be set within MTEDESC for MTE, !mtedesc => !mte_active. + */ + if (!info.page[0].tagged) { + mtedesc = 0; + } + + if (fault == FAULT_FIRST) { + /* Trapping mte check for the first-fault element. */ + if (mtedesc) { + mte_check(env, mtedesc, addr + mem_off, retaddr); + } + + /* + * Special handling of the first active element, + * if it crosses a page boundary or is MMIO. + */ + bool is_split = mem_off == info.mem_off_split; + if (unlikely(flags != 0) || unlikely(is_split)) { + /* + * Use the slow path for cross-page handling. + * Might trap for MMIO or watchpoints. + */ + tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); + + /* After any fault, zero the other elements. */ + swap_memzero(vd, reg_off); + reg_off += 1 << esz; + mem_off += 1 << msz; + swap_memzero(vd + reg_off, reg_max - reg_off); + + if (is_split) { + goto second_page; + } + } else { + memset(vd, 0, reg_max); + } + } else { + memset(vd, 0, reg_max); + if (unlikely(mem_off == info.mem_off_split)) { + /* The first active element crosses a page boundary. 
*/ + flags |= info.page[1].flags; + if (unlikely(flags & TLB_MMIO)) { + /* Some page is MMIO, see below. */ + goto do_fault; + } + if (unlikely(flags & TLB_WATCHPOINT) && + (cpu_watchpoint_address_matches + (env_cpu(env), addr + mem_off, 1 << msz) + & BP_MEM_READ)) { + /* Watchpoint hit, see below. */ + goto do_fault; + } + if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { + goto do_fault; + } + /* + * Use the slow path for cross-page handling. + * This is RAM, without a watchpoint, and will not trap. + */ + tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); + goto second_page; + } + } + + /* + * From this point on, all memory operations are MemSingleNF. + * + * Per the MemSingleNF pseudocode, a no-fault load from Device memory + * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. + * + * Unfortunately we do not have access to the memory attributes from the + * PTE to tell Device memory from Normal memory. So we make a mostly + * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. + * This gives the right answer for the common cases of "Normal memory, + * backed by host RAM" and "Device memory, backed by MMIO". + * The architecture allows us to suppress an NF load and return + * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner + * case of "Normal memory, backed by MMIO" is permitted. The case we + * get wrong is "Device memory, backed by host RAM", for which we + * should return (UNKNOWN, FAULT) but do not. + * + * Similarly, CPU_BP breakpoints would raise exceptions, and so + * return (UNKNOWN, FAULT). For simplicity, we consider gdb and + * architectural breakpoints the same. + */ + if (unlikely(flags & TLB_MMIO)) { + goto do_fault; + } + + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + do { + uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); + do { + if ((pg >> (reg_off & 63)) & 1) { + if (unlikely(flags & TLB_WATCHPOINT) && + (cpu_watchpoint_address_matches + (env_cpu(env), addr + mem_off, 1 << msz) + & BP_MEM_READ)) { + goto do_fault; + } + if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { + goto do_fault; + } + host_fn(vd, reg_off, host + mem_off); + } + reg_off += 1 << esz; + mem_off += 1 << msz; + } while (reg_off <= reg_last && (reg_off & 63)); + } while (reg_off <= reg_last); + + /* + * MemSingleNF is allowed to fail for any reason. We have special + * code above to handle the first element crossing a page boundary. + * As an implementation choice, decline to handle a cross-page element + * in any other position. + */ + reg_off = info.reg_off_split; + if (reg_off >= 0) { + goto do_fault; + } + + second_page: + reg_off = info.reg_off_first[1]; + if (likely(reg_off < 0)) { + /* No active elements on the second page. All done. */ + return; + } + + /* + * MemSingleNF is allowed to fail for any reason. As an implementation + * choice, decline to handle elements on the second page. This should + * be low frequency as the guest walks through memory -- the next + * iteration of the guest's loop should be aligned on the page boundary, + * and then all following iterations will stay aligned. 
+ */ + + do_fault: + record_fault(env, reg_off, reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const SVEContFault fault, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. */ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, + esz, msz, fault, host_fn, tlb_fn); +} + +#define DO_LDFF1_LDNF1_1(PART, ESZ) \ +void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} + +#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ +void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ 
+void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} + +DO_LDFF1_LDNF1_1(bb, MO_8) +DO_LDFF1_LDNF1_1(bhu, MO_16) +DO_LDFF1_LDNF1_1(bhs, MO_16) +DO_LDFF1_LDNF1_1(bsu, MO_32) +DO_LDFF1_LDNF1_1(bss, MO_32) +DO_LDFF1_LDNF1_1(bdu, MO_64) +DO_LDFF1_LDNF1_1(bds, MO_64) + +DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) +DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) +DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) +DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) +DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) + +DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) +DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) +DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) + +DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) + +#undef DO_LDFF1_LDNF1_1 +#undef DO_LDFF1_LDNF1_2 + +/* + * Common helper for all contiguous 1,2,3,4-register predicated stores. + */ + +static inline QEMU_ALWAYS_INLINE +void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const int N, uint32_t mtedesc, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned rd = simd_data(desc); + const intptr_t reg_max = simd_oprsz(desc); + intptr_t reg_off, reg_last, mem_off; + SVEContLdSt info; + void *host; + int i, flags; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { + /* The entire predicate was false; no store occurs. */ + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, + BP_MEM_WRITE, retaddr); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, + mtedesc, retaddr); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. We cannot avoid + * this fault and will leave with the store incomplete. + */ + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + return; +#endif + } + + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. 
+ * But we know this is RAM and cannot trap. + */ + mem_off = info.mem_off_split; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_split; + for (i = 0; i < N; ++i) { + tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + + mem_off = info.mem_off_first[1]; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_first[1]; + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t ra, + const int esz, const int msz, const int N, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. */ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); +} + +#define DO_STN_1(N, NAME, ESZ) \ +void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ + sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ + sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ +} + +#define DO_STN_2(N, NAME, ESZ, MSZ) \ +void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ + sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ + sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ + sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ + sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ +} + +DO_STN_1(1, bb, MO_8) +DO_STN_1(1, bh, MO_16) +DO_STN_1(1, bs, MO_32) +DO_STN_1(1, bd, MO_64) +DO_STN_1(2, bb, MO_8) +DO_STN_1(3, bb, MO_8) +DO_STN_1(4, bb, MO_8) + +DO_STN_2(1, hh, MO_16, MO_16) +DO_STN_2(1, hs, MO_32, MO_16) +DO_STN_2(1, hd, MO_64, MO_16) +DO_STN_2(2, hh, MO_16, MO_16) +DO_STN_2(3, hh, MO_16, MO_16) +DO_STN_2(4, hh, MO_16, MO_16) + +DO_STN_2(1, ss, MO_32, MO_32) +DO_STN_2(1, sd, MO_64, MO_32) +DO_STN_2(2, ss, MO_32, MO_32) +DO_STN_2(3, ss, MO_32, MO_32) +DO_STN_2(4, ss, MO_32, MO_32) + +DO_STN_2(1, dd, MO_64, MO_64) +DO_STN_2(2, dd, MO_64, MO_64) +DO_STN_2(3, dd, MO_64, MO_64) +DO_STN_2(4, dd, 
MO_64, MO_64) + +#undef DO_STN_1 +#undef DO_STN_2 + +/* + * Loads with a vector index. + */ + +/* + * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. + */ +typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); + +static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) +{ + return *(uint32_t *)(reg + H1_4(reg_ofs)); +} + +static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) +{ + return *(int32_t *)(reg + H1_4(reg_ofs)); +} + +static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) +{ + return (uint32_t)*(uint64_t *)(reg + reg_ofs); +} + +static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) +{ + return (int32_t)*(uint64_t *)(reg + reg_ofs); +} + +static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) +{ + return *(uint64_t *)(reg + reg_ofs); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + uint32_t mtedesc, int esize, int msize, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const int mmu_idx = cpu_mmu_index(env, false); + const intptr_t reg_max = simd_oprsz(desc); + const int scale = simd_data(desc); + ARMVectorReg scratch; + intptr_t reg_off; + SVEHostPage info, info2; + + memset(&scratch, 0, reg_max); + reg_off = 0; + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if (likely(pg & 1)) { + target_ulong addr = base + (off_fn(vm, reg_off) << scale); + target_ulong in_page = -(addr | TARGET_PAGE_MASK); + + sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, + mmu_idx, retaddr); + + if (likely(in_page >= msize)) { + if (unlikely(info.flags & TLB_WATCHPOINT)) { + cpu_check_watchpoint(env_cpu(env), addr, msize, + info.attrs, BP_MEM_READ, retaddr); + } + if (mtedesc && info.tagged) { + mte_check(env, mtedesc, addr, retaddr); + } + if (unlikely(info.flags & TLB_MMIO)) { + tlb_fn(env, &scratch, reg_off, addr, retaddr); + } else { + host_fn(&scratch, reg_off, info.host); + } + } else { + /* Element crosses the page boundary. */ + sve_probe_page(&info2, false, env, addr + in_page, 0, + MMU_DATA_LOAD, mmu_idx, retaddr); + if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { + cpu_check_watchpoint(env_cpu(env), addr, + msize, info.attrs, + BP_MEM_READ, retaddr); + } + if (mtedesc && info.tagged) { + mte_check(env, mtedesc, addr, retaddr); + } + tlb_fn(env, &scratch, reg_off, addr, retaddr); + } + } + reg_off += esize; + pg >>= esize; + } while (reg_off & 63); + } while (reg_off < reg_max); + + /* Wait until all exceptions have been raised to write back. */ + memcpy(vd, &scratch, reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + int esize, int msize, zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
+ */ + sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esize, msize, off_fn, host_fn, tlb_fn); +} + +#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +DO_LD1_ZPZ_S(bsu, zsu, MO_8) +DO_LD1_ZPZ_S(bsu, zss, MO_8) +DO_LD1_ZPZ_D(bdu, zsu, MO_8) +DO_LD1_ZPZ_D(bdu, zss, MO_8) +DO_LD1_ZPZ_D(bdu, zd, MO_8) + +DO_LD1_ZPZ_S(bss, zsu, MO_8) +DO_LD1_ZPZ_S(bss, zss, MO_8) +DO_LD1_ZPZ_D(bds, zsu, MO_8) +DO_LD1_ZPZ_D(bds, zss, MO_8) +DO_LD1_ZPZ_D(bds, zd, MO_8) + +DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) +DO_LD1_ZPZ_S(hsu_le, zss, MO_16) +DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) +DO_LD1_ZPZ_D(hdu_le, zss, MO_16) +DO_LD1_ZPZ_D(hdu_le, zd, MO_16) + +DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) +DO_LD1_ZPZ_S(hsu_be, zss, MO_16) +DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) +DO_LD1_ZPZ_D(hdu_be, zss, MO_16) +DO_LD1_ZPZ_D(hdu_be, zd, MO_16) + +DO_LD1_ZPZ_S(hss_le, zsu, MO_16) +DO_LD1_ZPZ_S(hss_le, zss, MO_16) +DO_LD1_ZPZ_D(hds_le, zsu, MO_16) +DO_LD1_ZPZ_D(hds_le, zss, MO_16) +DO_LD1_ZPZ_D(hds_le, zd, MO_16) + +DO_LD1_ZPZ_S(hss_be, zsu, MO_16) +DO_LD1_ZPZ_S(hss_be, zss, MO_16) +DO_LD1_ZPZ_D(hds_be, zsu, MO_16) +DO_LD1_ZPZ_D(hds_be, zss, MO_16) +DO_LD1_ZPZ_D(hds_be, zd, MO_16) + +DO_LD1_ZPZ_S(ss_le, zsu, MO_32) +DO_LD1_ZPZ_S(ss_le, zss, MO_32) +DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) +DO_LD1_ZPZ_D(sdu_le, zss, MO_32) +DO_LD1_ZPZ_D(sdu_le, zd, MO_32) + +DO_LD1_ZPZ_S(ss_be, zsu, MO_32) +DO_LD1_ZPZ_S(ss_be, zss, MO_32) +DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) +DO_LD1_ZPZ_D(sdu_be, zss, MO_32) +DO_LD1_ZPZ_D(sdu_be, zd, MO_32) + +DO_LD1_ZPZ_D(sds_le, zsu, MO_32) +DO_LD1_ZPZ_D(sds_le, zss, MO_32) +DO_LD1_ZPZ_D(sds_le, zd, MO_32) + +DO_LD1_ZPZ_D(sds_be, zsu, MO_32) +DO_LD1_ZPZ_D(sds_be, zss, MO_32) +DO_LD1_ZPZ_D(sds_be, zd, MO_32) + +DO_LD1_ZPZ_D(dd_le, zsu, MO_64) +DO_LD1_ZPZ_D(dd_le, zss, MO_64) +DO_LD1_ZPZ_D(dd_le, zd, MO_64) + +DO_LD1_ZPZ_D(dd_be, zsu, MO_64) +DO_LD1_ZPZ_D(dd_be, zss, MO_64) +DO_LD1_ZPZ_D(dd_be, zd, MO_64) + +#undef DO_LD1_ZPZ_S +#undef DO_LD1_ZPZ_D + +/* First fault loads with a vector index. */ + +/* + * Common helpers for all gather first-faulting loads. 
+ */ + +static inline QEMU_ALWAYS_INLINE +void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + uint32_t mtedesc, const int esz, const int msz, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const int mmu_idx = cpu_mmu_index(env, false); + const intptr_t reg_max = simd_oprsz(desc); + const int scale = simd_data(desc); + const int esize = 1 << esz; + const int msize = 1 << msz; + intptr_t reg_off; + SVEHostPage info; + target_ulong addr, in_page; + + /* Skip to the first true predicate. */ + reg_off = find_next_active(vg, 0, reg_max, esz); + if (unlikely(reg_off >= reg_max)) { + /* The entire predicate was false; no load occurs. */ + memset(vd, 0, reg_max); + return; + } + + /* + * Probe the first element, allowing faults. + */ + addr = base + (off_fn(vm, reg_off) << scale); + if (mtedesc) { + mte_check(env, mtedesc, addr, retaddr); + } + tlb_fn(env, vd, reg_off, addr, retaddr); + + /* After any fault, zero the other elements. */ + swap_memzero(vd, reg_off); + reg_off += esize; + swap_memzero(vd + reg_off, reg_max - reg_off); + + /* + * Probe the remaining elements, not allowing faults. + */ + while (reg_off < reg_max) { + uint64_t pg = vg[reg_off >> 6]; + do { + if (likely((pg >> (reg_off & 63)) & 1)) { + addr = base + (off_fn(vm, reg_off) << scale); + in_page = -(addr | TARGET_PAGE_MASK); + + if (unlikely(in_page < msize)) { + /* Stop if the element crosses a page boundary. */ + goto fault; + } + + sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, + mmu_idx, retaddr); + if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { + goto fault; + } + if (unlikely(info.flags & TLB_WATCHPOINT) && + (cpu_watchpoint_address_matches + (env_cpu(env), addr, msize) & BP_MEM_READ)) { + goto fault; + } + if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { + goto fault; + } + + host_fn(vd, reg_off, info.host); + } + reg_off += esize; + } while (reg_off & 63); + } + return; + + fault: + record_fault(env, reg_off, reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + const int esz, const int msz, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
+ */ + sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esz, msz, off_fn, host_fn, tlb_fn); +} + +#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_ldff##MEM##_##OFS) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ldff##MEM##_##OFS##_mte) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_ldff##MEM##_##OFS) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ldff##MEM##_##OFS##_mte) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) +DO_LDFF1_ZPZ_S(bsu, zss, MO_8) +DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) +DO_LDFF1_ZPZ_D(bdu, zss, MO_8) +DO_LDFF1_ZPZ_D(bdu, zd, MO_8) + +DO_LDFF1_ZPZ_S(bss, zsu, MO_8) +DO_LDFF1_ZPZ_S(bss, zss, MO_8) +DO_LDFF1_ZPZ_D(bds, zsu, MO_8) +DO_LDFF1_ZPZ_D(bds, zss, MO_8) +DO_LDFF1_ZPZ_D(bds, zd, MO_8) + +DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) +DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) +DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) + +DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) +DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) +DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) + +DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) +DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) +DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) + +DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) +DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) +DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) + +DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) +DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) +DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) + +DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) +DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) +DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) + +DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) +DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) +DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) + +DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) +DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) +DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) + +DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) +DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) +DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) + +DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) +DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) +DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) + +/* Stores with a vector index. 
*/ + +static inline QEMU_ALWAYS_INLINE +void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + uint32_t mtedesc, int esize, int msize, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const int mmu_idx = cpu_mmu_index(env, false); + const intptr_t reg_max = simd_oprsz(desc); + const int scale = simd_data(desc); + void *host[ARM_MAX_VQ * 4]; + intptr_t reg_off, i; + SVEHostPage info, info2; + + /* + * Probe all of the elements for host addresses and flags. + */ + i = reg_off = 0; + do { + uint64_t pg = vg[reg_off >> 6]; + do { + target_ulong addr = base + (off_fn(vm, reg_off) << scale); + target_ulong in_page = -(addr | TARGET_PAGE_MASK); + + host[i] = NULL; + if (likely((pg >> (reg_off & 63)) & 1)) { + if (likely(in_page >= msize)) { + sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, + mmu_idx, retaddr); + if (!(info.flags & TLB_MMIO)) { + host[i] = info.host; + } + } else { + /* + * Element crosses the page boundary. + * Probe both pages, but do not record the host address, + * so that we use the slow path. + */ + sve_probe_page(&info, false, env, addr, 0, + MMU_DATA_STORE, mmu_idx, retaddr); + sve_probe_page(&info2, false, env, addr + in_page, 0, + MMU_DATA_STORE, mmu_idx, retaddr); + info.flags |= info2.flags; + } + + if (unlikely(info.flags & TLB_WATCHPOINT)) { + cpu_check_watchpoint(env_cpu(env), addr, msize, + info.attrs, BP_MEM_WRITE, retaddr); + } + + if (mtedesc && info.tagged) { + mte_check(env, mtedesc, addr, retaddr); + } + } + i += 1; + reg_off += esize; + } while (reg_off & 63); + } while (reg_off < reg_max); + + /* + * Now that we have recognized all exceptions except SyncExternal + * (from TLB_MMIO), which we cannot avoid, perform all of the stores. + * + * Note for the common case of an element in RAM, not crossing a page + * boundary, we have stored the host address in host[]. This doubles + * as a first-level check against the predicate, since only enabled + * elements have non-null host addresses. + */ + i = reg_off = 0; + do { + void *h = host[i]; + if (likely(h != NULL)) { + host_fn(vd, reg_off, h); + } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { + target_ulong addr = base + (off_fn(vm, reg_off) << scale); + tlb_fn(env, vd, reg_off, addr, retaddr); + } + i += 1; + reg_off += esize; + } while (reg_off < reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + int esize, int msize, zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
+ */ + sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esize, msize, off_fn, host_fn, tlb_fn); +} + +#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ + off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} + +#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} + +DO_ST1_ZPZ_S(bs, zsu, MO_8) +DO_ST1_ZPZ_S(hs_le, zsu, MO_16) +DO_ST1_ZPZ_S(hs_be, zsu, MO_16) +DO_ST1_ZPZ_S(ss_le, zsu, MO_32) +DO_ST1_ZPZ_S(ss_be, zsu, MO_32) + +DO_ST1_ZPZ_S(bs, zss, MO_8) +DO_ST1_ZPZ_S(hs_le, zss, MO_16) +DO_ST1_ZPZ_S(hs_be, zss, MO_16) +DO_ST1_ZPZ_S(ss_le, zss, MO_32) +DO_ST1_ZPZ_S(ss_be, zss, MO_32) + +DO_ST1_ZPZ_D(bd, zsu, MO_8) +DO_ST1_ZPZ_D(hd_le, zsu, MO_16) +DO_ST1_ZPZ_D(hd_be, zsu, MO_16) +DO_ST1_ZPZ_D(sd_le, zsu, MO_32) +DO_ST1_ZPZ_D(sd_be, zsu, MO_32) +DO_ST1_ZPZ_D(dd_le, zsu, MO_64) +DO_ST1_ZPZ_D(dd_be, zsu, MO_64) + +DO_ST1_ZPZ_D(bd, zss, MO_8) +DO_ST1_ZPZ_D(hd_le, zss, MO_16) +DO_ST1_ZPZ_D(hd_be, zss, MO_16) +DO_ST1_ZPZ_D(sd_le, zss, MO_32) +DO_ST1_ZPZ_D(sd_be, zss, MO_32) +DO_ST1_ZPZ_D(dd_le, zss, MO_64) +DO_ST1_ZPZ_D(dd_be, zss, MO_64) + +DO_ST1_ZPZ_D(bd, zd, MO_8) +DO_ST1_ZPZ_D(hd_le, zd, MO_16) +DO_ST1_ZPZ_D(hd_be, zd, MO_16) +DO_ST1_ZPZ_D(sd_le, zd, MO_32) +DO_ST1_ZPZ_D(sd_be, zd, MO_32) +DO_ST1_ZPZ_D(dd_le, zd, MO_64) +DO_ST1_ZPZ_D(dd_be, zd, MO_64) + +#undef DO_ST1_ZPZ_S +#undef DO_ST1_ZPZ_D + +void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = n[i] ^ m[i] ^ k[i]; + } +} + +void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = n[i] ^ (m[i] & ~k[i]); + } +} + +void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); + } +} + +void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); + } +} + +void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ~((n[i] & k[i]) | 
(m[i] & ~k[i])); + } +} + +/* + * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. + * See hasless(v,1) from + * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord + */ +static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) +{ + int bits = 8 << esz; + uint64_t ones = dup_const(esz, 1); + uint64_t signs = ones << (bits - 1); + uint64_t cmp0, cmp1; + + cmp1 = dup_const(esz, n); + cmp0 = cmp1 ^ m0; + cmp1 = cmp1 ^ m1; + cmp0 = (cmp0 - ones) & ~cmp0; + cmp1 = (cmp1 - ones) & ~cmp1; + return (cmp0 | cmp1) & signs; +} + +static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, + uint32_t desc, int esz, bool nmatch) +{ + uint16_t esz_mask = pred_esz_masks[esz]; + intptr_t opr_sz = simd_oprsz(desc); + uint32_t flags = PREDTEST_INIT; + intptr_t i, j, k; + + for (i = 0; i < opr_sz; i += 16) { + uint64_t m0 = *(uint64_t *)(vm + i); + uint64_t m1 = *(uint64_t *)(vm + i + 8); + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; + uint16_t out = 0; + + for (j = 0; j < 16; j += 8) { + uint64_t n = *(uint64_t *)(vn + i + j); + + for (k = 0; k < 8; k += 1 << esz) { + if (pg & (1 << (j + k))) { + bool o = do_match2(n >> (k * 8), m0, m1, esz); + out |= (o ^ nmatch) << (j + k); + } + } + } + *(uint16_t *)(vd + H1_2(i >> 3)) = out; + flags = iter_predtest_fwd(out, pg, flags); + } + return flags; +} + +#define DO_PPZZ_MATCH(NAME, ESZ, INV) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ +} + +DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) +DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) + +DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) +DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) + +#undef DO_PPZZ_MATCH + +void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, + uint32_t desc) +{ + ARMVectorReg scratch; + intptr_t i, j; + intptr_t opr_sz = simd_oprsz(desc); + uint32_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + if (d == n) { + n = memcpy(&scratch, n, opr_sz); + if (d == m) { + m = n; + } + } else if (d == m) { + m = memcpy(&scratch, m, opr_sz); + } + + for (i = 0; i < opr_sz; i += 4) { + uint64_t count = 0; + uint8_t pred; + + pred = pg[H1(i >> 3)] >> (i & 7); + if (pred & 1) { + uint32_t nn = n[H4(i >> 2)]; + + for (j = 0; j <= i; j += 4) { + pred = pg[H1(j >> 3)] >> (j & 7); + if ((pred & 1) && nn == m[H4(j >> 2)]) { + ++count; + } + } + } + d[H4(i >> 2)] = count; + } +} + +void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, + uint32_t desc) +{ + ARMVectorReg scratch; + intptr_t i, j; + intptr_t opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + if (d == n) { + n = memcpy(&scratch, n, opr_sz); + if (d == m) { + m = n; + } + } else if (d == m) { + m = memcpy(&scratch, m, opr_sz); + } + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t count = 0; + if (pg[H1(i)] & 1) { + uint64_t nn = n[i]; + for (j = 0; j <= i; ++j) { + if ((pg[H1(j)] & 1) && nn == m[j]) { + ++count; + } + } + } + d[i] = count; + } +} + +/* + * Returns the number of bytes in m0 and m1 that match n. + * Unlike do_match2 we don't just need true/false, we need an exact count. + * This requires two extra logical operations. 
+ */ +static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) +{ + const uint64_t mask = dup_const(MO_8, 0x7f); + uint64_t cmp0, cmp1; + + cmp1 = dup_const(MO_8, n); + cmp0 = cmp1 ^ m0; + cmp1 = cmp1 ^ m1; + + /* + * 1: clear msb of each byte to avoid carry to next byte (& mask) + * 2: carry in to msb if byte != 0 (+ mask) + * 3: set msb if cmp has msb set (| cmp) + * 4: set ~msb to ignore them (| mask) + * We now have 0xff for byte != 0 or 0x7f for byte == 0. + * 5: invert, resulting in 0x80 if and only if byte == 0. + */ + cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); + cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); + + /* + * Combine the two compares in a way that the bits do + * not overlap, and so preserves the count of set bits. + * If the host has an efficient instruction for ctpop, + * then ctpop(x) + ctpop(y) has the same number of + * operations as ctpop(x | (y >> 1)). If the host does + * not have an efficient ctpop, then we only want to + * use it once. + */ + return ctpop64(cmp0 | (cmp1 >> 1)); +} + +void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j; + intptr_t opr_sz = simd_oprsz(desc); + + for (i = 0; i < opr_sz; i += 16) { + uint64_t n0 = *(uint64_t *)(vn + i); + uint64_t m0 = *(uint64_t *)(vm + i); + uint64_t n1 = *(uint64_t *)(vn + i + 8); + uint64_t m1 = *(uint64_t *)(vm + i + 8); + uint64_t out0 = 0; + uint64_t out1 = 0; + + for (j = 0; j < 64; j += 8) { + uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); + uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); + out0 |= cnt0 << j; + out1 |= cnt1 << j; + } + + *(uint64_t *)(vd + i) = out0; + *(uint64_t *)(vd + i + 8) = out1; + } +} + +void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + int shr = simd_data(desc); + int shl = 8 - shr; + uint64_t mask = dup_const(MO_8, 0xff >> shr); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + uint64_t t = n[i] ^ m[i]; + d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); + } +} + +void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + int shr = simd_data(desc); + int shl = 16 - shr; + uint64_t mask = dup_const(MO_16, 0xffff >> shr); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + uint64_t t = n[i] ^ m[i]; + d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); + } +} + +void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + int shr = simd_data(desc); + uint32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ror32(n[i] ^ m[i], shr); + } +} + +void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, + void *status, uint32_t desc) +{ + intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); + + for (s = 0; s < opr_sz; ++s) { + float32 *n = vn + s * sizeof(float32) * 4; + float32 *m = vm + s * sizeof(float32) * 4; + float32 *a = va + s * sizeof(float32) * 4; + float32 *d = vd + s * sizeof(float32) * 4; + float32 n00 = n[H4(0)], n01 = n[H4(1)]; + float32 n10 = n[H4(2)], n11 = n[H4(3)]; + float32 m00 = m[H4(0)], m01 = m[H4(1)]; + float32 m10 = m[H4(2)], m11 = m[H4(3)]; + float32 p0, p1; + + /* i = 0, j = 0 */ + p0 = float32_mul(n00, m00, status); + p1 = float32_mul(n01, m01, status); + d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); + + /* i = 0, j = 1 */ + p0 = float32_mul(n00, m10, status); + p1 = float32_mul(n01, m11, status); + d[H4(1)] 
= float32_add(a[H4(1)], float32_add(p0, p1, status), status); + + /* i = 1, j = 0 */ + p0 = float32_mul(n10, m00, status); + p1 = float32_mul(n11, m01, status); + d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); + + /* i = 1, j = 1 */ + p0 = float32_mul(n10, m10, status); + p1 = float32_mul(n11, m11, status); + d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); + } +} + +void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, + void *status, uint32_t desc) +{ + intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); + + for (s = 0; s < opr_sz; ++s) { + float64 *n = vn + s * sizeof(float64) * 4; + float64 *m = vm + s * sizeof(float64) * 4; + float64 *a = va + s * sizeof(float64) * 4; + float64 *d = vd + s * sizeof(float64) * 4; + float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; + float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; + float64 p0, p1; + + /* i = 0, j = 0 */ + p0 = float64_mul(n00, m00, status); + p1 = float64_mul(n01, m01, status); + d[0] = float64_add(a[0], float64_add(p0, p1, status), status); + + /* i = 0, j = 1 */ + p0 = float64_mul(n00, m10, status); + p1 = float64_mul(n01, m11, status); + d[1] = float64_add(a[1], float64_add(p0, p1, status), status); + + /* i = 1, j = 0 */ + p0 = float64_mul(n10, m00, status); + p1 = float64_mul(n11, m01, status); + d[2] = float64_add(a[2], float64_add(p0, p1, status), status); + + /* i = 1, j = 1 */ + p0 = float64_mul(n10, m10, status); + p1 = float64_mul(n11, m11, status); + d[3] = float64_add(a[3], float64_add(p0, p1, status), status); + } +} + +#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPEW); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) +DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) +DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) + +#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPEW); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) +DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) + +#undef DO_FCVTLT +#undef DO_FCVTNT diff --git a/target/arm/tcg/tlb_helper.c b/target/arm/tcg/tlb_helper.c new file mode 100644 index 0000000..60abcbe --- /dev/null +++ b/target/arm/tcg/tlb_helper.c @@ -0,0 +1,287 @@ +/* + * ARM TLB (Translation lookaside buffer) helpers. + * + * This code is licensed under the GNU GPL v2 or later. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/helper-proto.h" + + +/* Return true if the translation regime is using LPAE format page tables */ +bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + int el = regime_el(env, mmu_idx); + if (el == 2 || arm_el_is_aa64(env, el)) { + return true; + } + if (arm_feature(env, ARM_FEATURE_PMSA) && + arm_feature(env, ARM_FEATURE_V8)) { + return true; + } + if (arm_feature(env, ARM_FEATURE_LPAE) + && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) { + return true; + } + return false; +} + +/* + * Returns true if the stage 1 translation regime is using LPAE format page + * tables. Used when raising alignment exceptions, whose FSR changes depending + * on whether the long or short descriptor format is in use. + */ +bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + mmu_idx = stage_1_mmu_idx(mmu_idx); + return regime_using_lpae_format(env, mmu_idx); +} + +static inline uint32_t merge_syn_data_abort(uint32_t template_syn, + unsigned int target_el, + bool same_el, bool ea, + bool s1ptw, bool is_write, + int fsc) +{ + uint32_t syn; + + /* + * ISV is only set for data aborts routed to EL2 and + * never for stage-1 page table walks faulting on stage 2. + * + * Furthermore, ISV is only set for certain kinds of load/stores. + * If the template syndrome does not have ISV set, we should leave + * it cleared. + * + * See ARMv8 specs, D7-1974: + * ISS encoding for an exception from a Data Abort, the + * ISV field. + */ + if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) { + syn = syn_data_abort_no_iss(same_el, 0, + ea, 0, s1ptw, is_write, fsc); + } else { + /* + * Fields: IL, ISV, SAS, SSE, SRT, SF and AR come from the template + * syndrome created at translation time. + * Now we create the runtime syndrome with the remaining fields. + */ + syn = syn_data_abort_with_iss(same_el, + 0, 0, 0, 0, 0, + ea, 0, s1ptw, is_write, fsc, + true); + /* Merge the runtime syndrome with the template syndrome. */ + syn |= template_syn; + } + return syn; +} + +static uint32_t compute_fsr_fsc(CPUARMState *env, ARMMMUFaultInfo *fi, + int target_el, int mmu_idx, uint32_t *ret_fsc) +{ + ARMMMUIdx arm_mmu_idx = core_to_arm_mmu_idx(env, mmu_idx); + uint32_t fsr, fsc; + + if (target_el == 2 || arm_el_is_aa64(env, target_el) || + arm_s1_regime_using_lpae_format(env, arm_mmu_idx)) { + /* + * LPAE format fault status register : bottom 6 bits are + * status code in the same form as needed for syndrome + */ + fsr = arm_fi_to_lfsc(fi); + fsc = extract32(fsr, 0, 6); + } else { + fsr = arm_fi_to_sfsc(fi); + /* + * Short format FSR : this fault will never actually be reported + * to an EL that uses a syndrome register. Use a (currently) + * reserved FSR code in case the constructed syndrome does leak + * into the guest somehow. 
+ */ + fsc = 0x3f; + } + + *ret_fsc = fsc; + return fsr; +} + +static G_NORETURN +void arm_deliver_fault(ARMCPU *cpu, vaddr addr, + MMUAccessType access_type, + int mmu_idx, ARMMMUFaultInfo *fi) +{ + CPUARMState *env = &cpu->env; + int target_el; + bool same_el; + uint32_t syn, exc, fsr, fsc; + + target_el = exception_target_el(env); + if (fi->stage2) { + target_el = 2; + env->cp15.hpfar_el2 = extract64(fi->s2addr, 12, 47) << 4; + if (arm_is_secure_below_el3(env) && fi->s1ns) { + env->cp15.hpfar_el2 |= HPFAR_NS; + } + } + same_el = (arm_current_el(env) == target_el); + + fsr = compute_fsr_fsc(env, fi, target_el, mmu_idx, &fsc); + + if (access_type == MMU_INST_FETCH) { + syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc); + exc = EXCP_PREFETCH_ABORT; + } else { + syn = merge_syn_data_abort(env->exception.syndrome, target_el, + same_el, fi->ea, fi->s1ptw, + access_type == MMU_DATA_STORE, + fsc); + if (access_type == MMU_DATA_STORE + && arm_feature(env, ARM_FEATURE_V6)) { + fsr |= (1 << 11); + } + exc = EXCP_DATA_ABORT; + } + + env->exception.vaddress = addr; + env->exception.fsr = fsr; + raise_exception(env, exc, syn, target_el); +} + +/* Raise a data fault alignment exception for the specified virtual address */ +void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, + MMUAccessType access_type, + int mmu_idx, uintptr_t retaddr) +{ + ARMCPU *cpu = ARM_CPU(cs); + ARMMMUFaultInfo fi = {}; + + /* now we have a real cpu fault */ + cpu_restore_state(cs, retaddr); + + fi.type = ARMFault_Alignment; + arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi); +} + +void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc) +{ + ARMMMUFaultInfo fi = { .type = ARMFault_Alignment }; + int target_el = exception_target_el(env); + int mmu_idx = cpu_mmu_index(env, true); + uint32_t fsc; + + env->exception.vaddress = pc; + + /* + * Note that the fsc is not applicable to this exception, + * since any syndrome is pcalignment not insn_abort. + */ + env->exception.fsr = compute_fsr_fsc(env, &fi, target_el, mmu_idx, &fsc); + raise_exception(env, EXCP_PREFETCH_ABORT, syn_pcalignment(), target_el); +} + +#if !defined(CONFIG_USER_ONLY) + +/* + * arm_cpu_do_transaction_failed: handle a memory system error response + * (eg "no device/memory present at address") by raising an external abort + * exception + */ +void arm_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, + vaddr addr, unsigned size, + MMUAccessType access_type, + int mmu_idx, MemTxAttrs attrs, + MemTxResult response, uintptr_t retaddr) +{ + ARMCPU *cpu = ARM_CPU(cs); + ARMMMUFaultInfo fi = {}; + + /* now we have a real cpu fault */ + cpu_restore_state(cs, retaddr); + + fi.ea = arm_extabort_type(response); + fi.type = ARMFault_SyncExternal; + arm_deliver_fault(cpu, addr, access_type, mmu_idx, &fi); +} + +bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, + MMUAccessType access_type, int mmu_idx, + bool probe, uintptr_t retaddr) +{ + ARMCPU *cpu = ARM_CPU(cs); + GetPhysAddrResult res = {}; + ARMMMUFaultInfo local_fi, *fi; + int ret; + + /* + * Allow S1_ptw_translate to see any fault generated here. + * Since this may recurse, read and clear. + */ + fi = cpu->env.tlb_fi; + if (fi) { + cpu->env.tlb_fi = NULL; + } else { + fi = memset(&local_fi, 0, sizeof(local_fi)); + } + + /* + * Walk the page table and (if the mapping exists) add the page + * to the TLB. On success, return true. Otherwise, if probing, + * return false. Otherwise populate fsr with ARM DFSR/IFSR fault + * register format, and signal the fault. 
+ */ + ret = get_phys_addr(&cpu->env, address, access_type, + core_to_arm_mmu_idx(&cpu->env, mmu_idx), + &res, fi); + if (likely(!ret)) { + /* + * Map a single [sub]page. Regions smaller than our declared + * target page size are handled specially, so for those we + * pass in the exact addresses. + */ + if (res.f.lg_page_size >= TARGET_PAGE_BITS) { + res.f.phys_addr &= TARGET_PAGE_MASK; + address &= TARGET_PAGE_MASK; + } + + res.f.pte_attrs = res.cacheattrs.attrs; + res.f.shareability = res.cacheattrs.shareability; + + tlb_set_page_full(cs, mmu_idx, address, &res.f); + return true; + } else if (probe) { + return false; + } else { + /* now we have a real cpu fault */ + cpu_restore_state(cs, retaddr); + arm_deliver_fault(cpu, address, access_type, mmu_idx, fi); + } +} +#else +void arm_cpu_record_sigsegv(CPUState *cs, vaddr addr, + MMUAccessType access_type, + bool maperr, uintptr_t ra) +{ + ARMMMUFaultInfo fi = { + .type = maperr ? ARMFault_Translation : ARMFault_Permission, + .level = 3, + }; + ARMCPU *cpu = ARM_CPU(cs); + + /* + * We report both ESR and FAR to signal handlers. + * For now, it's easiest to deliver the fault normally. + */ + cpu_restore_state(cs, ra); + arm_deliver_fault(cpu, addr, access_type, MMU_USER_IDX, &fi); +} + +void arm_cpu_record_sigbus(CPUState *cs, vaddr addr, + MMUAccessType access_type, uintptr_t ra) +{ + arm_cpu_do_unaligned_access(cs, addr, access_type, MMU_USER_IDX, ra); +} +#endif /* !defined(CONFIG_USER_ONLY) */ diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c new file mode 100644 index 0000000..f59d3b2 --- /dev/null +++ b/target/arm/tcg/vec_helper.c @@ -0,0 +1,2716 @@ +/* + * ARM AdvSIMD / SVE Vector Operations + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "fpu/softfloat.h" +#include "qemu/int128.h" +#include "vec_internal.h" + +/* + * Data for expanding active predicate bits to bytes, for byte elements. 
+ * + * for (i = 0; i < 256; ++i) { + * unsigned long m = 0; + * for (j = 0; j < 8; j++) { + * if ((i >> j) & 1) { + * m |= 0xfful << (j << 3); + * } + * } + * printf("0x%016lx,\n", m); + * } + */ +const uint64_t expand_pred_b_data[256] = { + 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, + 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, + 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, + 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, + 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, + 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, + 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, + 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, + 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, + 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, + 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, + 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, + 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, + 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, + 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, + 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, + 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, + 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, + 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, + 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, + 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, + 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, + 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, + 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, + 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, + 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, + 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, + 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, + 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, + 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, + 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, + 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, + 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, + 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, + 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, + 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, + 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, + 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, + 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, + 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, + 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, + 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, + 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, + 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, + 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, + 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, + 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, + 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, + 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, + 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, + 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, + 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, + 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, + 0xff0000ffffffffff, 
0xff00ff0000000000, 0xff00ff00000000ff, + 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, + 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, + 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, + 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, + 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, + 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, + 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, + 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, + 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, + 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, + 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, + 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, + 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, + 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, + 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, + 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, + 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, + 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, + 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, + 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, + 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, + 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, + 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, + 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, + 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, + 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, + 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, + 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, + 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, + 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, + 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, + 0xffffffffffffffff, +}; + +/* + * Similarly for half-word elements. + * for (i = 0; i < 256; ++i) { + * unsigned long m = 0; + * if (i & 0xaa) { + * continue; + * } + * for (j = 0; j < 8; j += 2) { + * if ((i >> j) & 1) { + * m |= 0xfffful << (j << 3); + * } + * } + * printf("[0x%x] = 0x%016lx,\n", i, m); + * } + */ +const uint64_t expand_pred_h_data[0x55 + 1] = { + [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, + [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, + [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, + [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, + [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, + [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, + [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, + [0x55] = 0xffffffffffffffff, +}; + +/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ +int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, + bool neg, bool round) +{ + /* + * Simplify: + * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 + * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 + */ + int32_t ret = (int32_t)src1 * src2; + if (neg) { + ret = -ret; + } + ret += ((int32_t)src3 << 7) + (round << 6); + ret >>= 7; + + if (ret != (int8_t)ret) { + ret = (ret < 0 ? 
INT8_MIN : INT8_MAX); + } + return ret; +} + +void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); + } +} + +void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); + } +} + +void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); + } +} + +void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ +int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, + bool neg, bool round, uint32_t *sat) +{ + /* Simplify similarly to do_sqrdmlah_b above. */ + int32_t ret = (int32_t)src1 * src2; + if (neg) { + ret = -ret; + } + ret += ((int32_t)src3 << 15) + (round << 14); + ret >>= 15; + + if (ret != (int16_t)ret) { + *sat = 1; + ret = (ret < 0 ? INT16_MIN : INT16_MAX); + } + return ret; +} + +uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, + uint32_t src2, uint32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); + uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, + false, true, sat); + return deposit32(e1, 16, 16, e2); +} + +void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int16_t *d = vd; + int16_t *n = vn; + int16_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, + uint32_t src2, uint32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); + uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, + true, true, sat); + return deposit32(e1, 16, 16, e2); +} + +void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int16_t *d = vd; + int16_t *n = vn; + int16_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); + } + clear_tail(d, opr_sz, 
simd_maxsz(desc)); +} + +void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); + } +} + +void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); + } +} + +void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 2; i += 16 / 2) { + int16_t mm = m[i]; + for (j = 0; j < 16 / 2; ++j) { + d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); + } + } +} + +void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 2; i += 16 / 2) { + int16_t mm = m[i]; + for (j = 0; j < 16 / 2; ++j) { + d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); + } + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ +int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, + bool neg, bool round, uint32_t *sat) +{ + /* Simplify similarly to do_sqrdmlah_b above. */ + int64_t ret = (int64_t)src1 * src2; + if (neg) { + ret = -ret; + } + ret += ((int64_t)src3 << 31) + (round << 30); + ret >>= 31; + + if (ret != (int32_t)ret) { + *sat = 1; + ret = (ret < 0 ? 
INT32_MIN : INT32_MAX); + } + return ret; +} + +uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, + int32_t src2, int32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + return do_sqrdmlah_s(src1, src2, src3, false, true, sat); +} + +void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int32_t *d = vd; + int32_t *n = vn; + int32_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, + int32_t src2, int32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + return do_sqrdmlah_s(src1, src2, src3, true, true, sat); +} + +void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int32_t *d = vd; + int32_t *n = vn; + int32_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); + } +} + +void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); + } +} + +void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 4; i += 16 / 4) { + int32_t mm = m[i]; + for (j = 0; j < 16 / 4; ++j) { + d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); + } + } +} + +void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int32_t *d = 
vd, *n = vn, *m = (int32_t *)vm + H4(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 4; i += 16 / 4) { + int32_t mm = m[i]; + for (j = 0; j < 16 / 4; ++j) { + d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); + } + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ +static int64_t do_sat128_d(Int128 r) +{ + int64_t ls = int128_getlo(r); + int64_t hs = int128_gethi(r); + + if (unlikely(hs != (ls >> 63))) { + return hs < 0 ? INT64_MIN : INT64_MAX; + } + return ls; +} + +int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) +{ + uint64_t l, h; + Int128 r, t; + + /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ + muls64(&l, &h, m, n); + r = int128_make128(l, h); + if (neg) { + r = int128_neg(r); + } + if (a) { + t = int128_exts64(a); + t = int128_lshift(t, 63); + r = int128_add(r, t); + } + if (round) { + t = int128_exts64(1ll << 62); + r = int128_add(r, t); + } + r = int128_rshift(r, 63); + + return do_sat128_d(r); +} + +void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); + } +} + +void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); + } +} + +void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); + } +} + +void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); + } +} + +void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; + + for (i = 0; i < opr_sz / 8; i += 16 / 8) { + int64_t mm = m[i]; + for (j = 0; j < 16 / 8; ++j) { + d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); + } + } +} + +void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; + + for (i = 0; i < opr_sz / 8; i += 16 / 8) { + int64_t mm = m[i]; + for (j = 0; j < 16 / 8; ++j) { + d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); + } + } +} + +/* Integer 8 and 16-bit dot-product. + * + * Note that for the loops herein, host endianness does not matter + * with respect to the ordering of data within the quad-width lanes. + * All elements are treated equally, no matter where they are. 
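+ *
+ * As a sketch of what the DO_DOT expansion below computes: each widened
+ * destination lane is d[i] = a[i] + n[4*i]*m[4*i] + ... + n[4*i+3]*m[4*i+3].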
+ */ + +#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd, *a = va; \ + TYPEN *n = vn; \ + TYPEM *m = vm; \ + for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ + d[i] = (a[i] + \ + (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ + (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ + (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ + (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) +DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) +DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) +DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) +DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) + +#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i = 0, opr_sz = simd_oprsz(desc); \ + intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ + intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ + intptr_t index = simd_data(desc); \ + TYPED *d = vd, *a = va; \ + TYPEN *n = vn; \ + TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ + do { \ + TYPED m0 = m_indexed[i * 4 + 0]; \ + TYPED m1 = m_indexed[i * 4 + 1]; \ + TYPED m2 = m_indexed[i * 4 + 2]; \ + TYPED m3 = m_indexed[i * 4 + 3]; \ + do { \ + d[i] = (a[i] + \ + n[i * 4 + 0] * m0 + \ + n[i * 4 + 1] * m1 + \ + n[i * 4 + 2] * m2 + \ + n[i * 4 + 3] * m3); \ + } while (++i < segend); \ + segend = i + 4; \ + } while (i < opr_sz_n); \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) +DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) +DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) +DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) +DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) +DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) + +void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd; + float16 *n = vn; + float16 *m = vm; + float_status *fpst = vfpst; + uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e0 = n[H2(i)]; + float16 e1 = m[H2(i + 1)] ^ neg_imag; + float16 e2 = n[H2(i + 1)]; + float16 e3 = m[H2(i)] ^ neg_real; + + d[H2(i)] = float16_add(e0, e1, fpst); + d[H2(i + 1)] = float16_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd; + float32 *n = vn; + float32 *m = vm; + float_status *fpst = vfpst; + uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. 
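+ * Taking even lanes as the real parts and odd lanes as the imaginary
+ * parts (as the pairing below assumes), this computes
+ * d = (n.re - m.im, n.im + m.re) when neg_imag is set, and
+ * d = (n.re + m.im, n.im - m.re) when neg_real is set.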
*/ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < opr_sz / 4; i += 2) { + float32 e0 = n[H4(i)]; + float32 e1 = m[H4(i + 1)] ^ neg_imag; + float32 e2 = n[H4(i + 1)]; + float32 e3 = m[H4(i)] ^ neg_real; + + d[H4(i)] = float32_add(e0, e1, fpst); + d[H4(i + 1)] = float32_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float64 *d = vd; + float64 *n = vn; + float64 *m = vm; + float_status *fpst = vfpst; + uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); + uint64_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 63; + neg_imag <<= 63; + + for (i = 0; i < opr_sz / 8; i += 2) { + float64 e0 = n[i]; + float64 e1 = m[i + 1] ^ neg_imag; + float64 e2 = n[i + 1]; + float64 e3 = m[i] ^ neg_real; + + d[i] = float64_add(e0, e1, fpst); + d[i + 1] = float64_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e2 = n[H2(i + flip)]; + float16 e1 = m[H2(i + flip)] ^ neg_real; + float16 e4 = e2; + float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; + + d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); + d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); + uint32_t neg_real = flip ^ neg_imag; + intptr_t elements = opr_sz / sizeof(float16); + intptr_t eltspersegment = 16 / sizeof(float16); + intptr_t i, j; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < elements; i += eltspersegment) { + float16 mr = m[H2(i + 2 * index + 0)]; + float16 mi = m[H2(i + 2 * index + 1)]; + float16 e1 = neg_real ^ (flip ? mi : mr); + float16 e3 = neg_imag ^ (flip ? mr : mi); + + for (j = i; j < i + eltspersegment; j += 2) { + float16 e2 = n[H2(j + flip)]; + float16 e4 = e2; + + d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); + d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. 
*/ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < opr_sz / 4; i += 2) { + float32 e2 = n[H4(i + flip)]; + float32 e1 = m[H4(i + flip)] ^ neg_real; + float32 e4 = e2; + float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; + + d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); + d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); + uint32_t neg_real = flip ^ neg_imag; + intptr_t elements = opr_sz / sizeof(float32); + intptr_t eltspersegment = 16 / sizeof(float32); + intptr_t i, j; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < elements; i += eltspersegment) { + float32 mr = m[H4(i + 2 * index + 0)]; + float32 mi = m[H4(i + 2 * index + 1)]; + float32 e1 = neg_real ^ (flip ? mi : mr); + float32 e3 = neg_imag ^ (flip ? mr : mi); + + for (j = i; j < i + eltspersegment; j += 2) { + float32 e2 = n[H4(j + flip)]; + float32 e4 = e2; + + d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); + d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float64 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint64_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 63; + neg_imag <<= 63; + + for (i = 0; i < opr_sz / 8; i += 2) { + float64 e2 = n[i + flip]; + float64 e1 = m[i + flip] ^ neg_real; + float64 e4 = e2; + float64 e3 = m[i + 1 - flip] ^ neg_imag; + + d[i] = float64_muladd(e2, e1, a[i], 0, fpst); + d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * Floating point comparisons producing an integer result (all 1s or all 0s). + * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. + * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
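+ * (Negating the softfloat 0/1 result gives 0/-1, i.e. an all-zeroes or
+ * all-ones mask in two's complement, hence the unary minus in the
+ * wrappers below.)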
+ */ +static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) +{ + return -float16_eq_quiet(op1, op2, stat); +} + +static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) +{ + return -float32_eq_quiet(op1, op2, stat); +} + +static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) +{ + return -float16_le(op2, op1, stat); +} + +static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) +{ + return -float32_le(op2, op1, stat); +} + +static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) +{ + return -float16_lt(op2, op1, stat); +} + +static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) +{ + return -float32_lt(op2, op1, stat); +} + +static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) +{ + return -float16_le(float16_abs(op2), float16_abs(op1), stat); +} + +static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) +{ + return -float32_le(float32_abs(op2), float32_abs(op1), stat); +} + +static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) +{ + return -float16_lt(float16_abs(op2), float16_abs(op1), stat); +} + +static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) +{ + return -float32_lt(float32_abs(op2), float32_abs(op1), stat); +} + +static int16_t vfp_tosszh(float16 x, void *fpstp) +{ + float_status *fpst = fpstp; + if (float16_is_any_nan(x)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_int16_round_to_zero(x, fpst); +} + +static uint16_t vfp_touszh(float16 x, void *fpstp) +{ + float_status *fpst = fpstp; + if (float16_is_any_nan(x)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_uint16_round_to_zero(x, fpst); +} + +#define DO_2OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], stat); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) +DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) +DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) + +DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) +DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) +DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) + +DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) +DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) + +DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) +DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) +DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) +DO_2OP(gvec_touizs, helper_vfp_touizs, float32) +DO_2OP(gvec_sstoh, int16_to_float16, int16_t) +DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) +DO_2OP(gvec_tosszh, vfp_tosszh, float16) +DO_2OP(gvec_touszh, vfp_touszh, float16) + +#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ + static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ + { \ + return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ + } + +#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ + static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ + { \ + return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ + } + +#define DO_2OP_CMP0(FN, CMPOP, DIRN) \ + WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ + WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ + DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ + DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) + +DO_2OP_CMP0(cgt, cgt, FWD) +DO_2OP_CMP0(cge, cge, FWD) +DO_2OP_CMP0(ceq, ceq, FWD) 
+DO_2OP_CMP0(clt, cgt, REV) +DO_2OP_CMP0(cle, cge, REV) + +#undef DO_2OP +#undef DO_2OP_CMP0 + +/* Floating-point trigonometric starting value. + * See the ARM ARM pseudocode function FPTrigSMul. + */ +static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) +{ + float16 result = float16_mul(op1, op1, stat); + if (!float16_is_any_nan(result)) { + result = float16_set_sign(result, op2 & 1); + } + return result; +} + +static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) +{ + float32 result = float32_mul(op1, op1, stat); + if (!float32_is_any_nan(result)) { + result = float32_set_sign(result, op2 & 1); + } + return result; +} + +static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) +{ + float64 result = float64_mul(op1, op1, stat); + if (!float64_is_any_nan(result)) { + result = float64_set_sign(result, op2 & 1); + } + return result; +} + +static float16 float16_abd(float16 op1, float16 op2, float_status *stat) +{ + return float16_abs(float16_sub(op1, op2, stat)); +} + +static float32 float32_abd(float32 op1, float32 op2, float_status *stat) +{ + return float32_abs(float32_sub(op1, op2, stat)); +} + +/* + * Reciprocal step. These are the AArch32 version which uses a + * non-fused multiply-and-subtract. + */ +static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) +{ + op1 = float16_squash_input_denormal(op1, stat); + op2 = float16_squash_input_denormal(op2, stat); + + if ((float16_is_infinity(op1) && float16_is_zero(op2)) || + (float16_is_infinity(op2) && float16_is_zero(op1))) { + return float16_two; + } + return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); +} + +static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) +{ + op1 = float32_squash_input_denormal(op1, stat); + op2 = float32_squash_input_denormal(op2, stat); + + if ((float32_is_infinity(op1) && float32_is_zero(op2)) || + (float32_is_infinity(op2) && float32_is_zero(op1))) { + return float32_two; + } + return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); +} + +/* Reciprocal square-root step. AArch32 non-fused semantics. 
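+ * These compute (3.0 - op1 * op2) / 2.0, with the 0 * Inf special case
+ * returning 1.5 directly instead of propagating a NaN.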
*/ +static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) +{ + op1 = float16_squash_input_denormal(op1, stat); + op2 = float16_squash_input_denormal(op2, stat); + + if ((float16_is_infinity(op1) && float16_is_zero(op2)) || + (float16_is_infinity(op2) && float16_is_zero(op1))) { + return float16_one_point_five; + } + op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); + return float16_div(op1, float16_two, stat); +} + +static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) +{ + op1 = float32_squash_input_denormal(op1, stat); + op2 = float32_squash_input_denormal(op2, stat); + + if ((float32_is_infinity(op1) && float32_is_zero(op2)) || + (float32_is_infinity(op2) && float32_is_zero(op1))) { + return float32_one_point_five; + } + op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); + return float32_div(op1, float32_two, stat); +} + +#define DO_3OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], m[i], stat); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_3OP(gvec_fadd_h, float16_add, float16) +DO_3OP(gvec_fadd_s, float32_add, float32) +DO_3OP(gvec_fadd_d, float64_add, float64) + +DO_3OP(gvec_fsub_h, float16_sub, float16) +DO_3OP(gvec_fsub_s, float32_sub, float32) +DO_3OP(gvec_fsub_d, float64_sub, float64) + +DO_3OP(gvec_fmul_h, float16_mul, float16) +DO_3OP(gvec_fmul_s, float32_mul, float32) +DO_3OP(gvec_fmul_d, float64_mul, float64) + +DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) +DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) +DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) + +DO_3OP(gvec_fabd_h, float16_abd, float16) +DO_3OP(gvec_fabd_s, float32_abd, float32) + +DO_3OP(gvec_fceq_h, float16_ceq, float16) +DO_3OP(gvec_fceq_s, float32_ceq, float32) + +DO_3OP(gvec_fcge_h, float16_cge, float16) +DO_3OP(gvec_fcge_s, float32_cge, float32) + +DO_3OP(gvec_fcgt_h, float16_cgt, float16) +DO_3OP(gvec_fcgt_s, float32_cgt, float32) + +DO_3OP(gvec_facge_h, float16_acge, float16) +DO_3OP(gvec_facge_s, float32_acge, float32) + +DO_3OP(gvec_facgt_h, float16_acgt, float16) +DO_3OP(gvec_facgt_s, float32_acgt, float32) + +DO_3OP(gvec_fmax_h, float16_max, float16) +DO_3OP(gvec_fmax_s, float32_max, float32) + +DO_3OP(gvec_fmin_h, float16_min, float16) +DO_3OP(gvec_fmin_s, float32_min, float32) + +DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) +DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) + +DO_3OP(gvec_fminnum_h, float16_minnum, float16) +DO_3OP(gvec_fminnum_s, float32_minnum, float32) + +DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) +DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) + +DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) +DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) + +#ifdef TARGET_AARCH64 + +DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) +DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) +DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) + +DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) +DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) +DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) + +#endif +#undef DO_3OP + +/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ +static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_add(dest, float16_mul(op1, op2, stat), stat); +} + +static float32 
float32_muladd_nf(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_add(dest, float32_mul(op1, op2, stat), stat); +} + +static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_sub(dest, float16_mul(op1, op2, stat), stat); +} + +static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_sub(dest, float32_mul(op1, op2, stat), stat); +} + +/* Fused versions; these have the semantics Neon VFMA/VFMS want */ +static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_muladd(op1, op2, dest, 0, stat); +} + +static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_muladd(op1, op2, dest, 0, stat); +} + +static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_muladd(float16_chs(op1), op2, dest, 0, stat); +} + +static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_muladd(float32_chs(op1), op2, dest, 0, stat); +} + +#define DO_MULADD(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(d[i], n[i], m[i], stat); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) +DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) + +DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) +DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) + +DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) +DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) + +DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) +DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) + +/* For the indexed ops, SVE applies the index per 128-bit vector segment. + * For AdvSIMD, there is of course only one such vector segment. 
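+ *
+ * For example, with 32-bit elements and idx == 1, element 1 of each
+ * segment of the m operand is broadcast across that segment before the
+ * per-element operation is applied.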
+ */ + +#define DO_MUL_IDX(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = n[i + j] * mm; \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) +DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) +DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) + +#undef DO_MUL_IDX + +#define DO_MLA_IDX(NAME, TYPE, OP, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = a[i + j] OP n[i + j] * mm; \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) +DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) +DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) + +DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) +DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) +DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) + +#undef DO_MLA_IDX + +#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = TYPE##_##ADD(d[i + j], \ + TYPE##_mul(n[i + j], mm, stat), stat); \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +#define float16_nop(N, M, S) (M) +#define float32_nop(N, M, S) (M) +#define float64_nop(N, M, S) (M) + +DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) +DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) +DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) + +/* + * Non-fused multiply-accumulate operations, for Neon. NB that unlike + * the fused ops below they assume accumulate both from and into Vd. 
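+ * In other words they compute d = d +/- (n * m) with two roundings,
+ * one for the multiply and one for the add/sub, rather than a single
+ * fused multiply-add rounding.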
+ */ +DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) +DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) +DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) +DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) + +#undef float16_nop +#undef float32_nop +#undef float64_nop +#undef DO_FMUL_IDX + +#define DO_FMLA_IDX(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ + void *stat, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ + intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + op1_neg <<= (8 * sizeof(TYPE) - 1); \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ + mm, a[i + j], 0, stat); \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) +DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) +DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) + +#undef DO_FMLA_IDX + +#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ +void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ + bool q = false; \ + for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ + WTYPE dd = (WTYPE)n[i] OP m[i]; \ + if (dd < MIN) { \ + dd = MIN; \ + q = true; \ + } else if (dd > MAX) { \ + dd = MAX; \ + q = true; \ + } \ + d[i] = dd; \ + } \ + if (q) { \ + uint32_t *qc = vq; \ + qc[0] = 1; \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) +DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) +DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) + +DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) +DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) +DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) + +DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) +DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) +DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) + +DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) +DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) +DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) + +#undef DO_SAT + +void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[i], mm = m[i], dd = nn + mm; + if (dd < nn) { + dd = UINT64_MAX; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[i], mm = m[i], dd = nn - mm; + if (nn < mm) { + dd = 0; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = 
vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + int64_t nn = n[i], mm = m[i], dd = nn + mm; + if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { + dd = (nn >> 63) ^ ~INT64_MIN; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + int64_t nn = n[i], mm = m[i], dd = nn - mm; + if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { + dd = (nn >> 63) ^ ~INT64_MIN; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + + +#define DO_SRA(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] += n[i] >> shift; \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_SRA(gvec_ssra_b, int8_t) +DO_SRA(gvec_ssra_h, int16_t) +DO_SRA(gvec_ssra_s, int32_t) +DO_SRA(gvec_ssra_d, int64_t) + +DO_SRA(gvec_usra_b, uint8_t) +DO_SRA(gvec_usra_h, uint16_t) +DO_SRA(gvec_usra_s, uint32_t) +DO_SRA(gvec_usra_d, uint64_t) + +#undef DO_SRA + +#define DO_RSHR(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + TYPE tmp = n[i] >> (shift - 1); \ + d[i] = (tmp >> 1) + (tmp & 1); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_RSHR(gvec_srshr_b, int8_t) +DO_RSHR(gvec_srshr_h, int16_t) +DO_RSHR(gvec_srshr_s, int32_t) +DO_RSHR(gvec_srshr_d, int64_t) + +DO_RSHR(gvec_urshr_b, uint8_t) +DO_RSHR(gvec_urshr_h, uint16_t) +DO_RSHR(gvec_urshr_s, uint32_t) +DO_RSHR(gvec_urshr_d, uint64_t) + +#undef DO_RSHR + +#define DO_RSRA(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + TYPE tmp = n[i] >> (shift - 1); \ + d[i] += (tmp >> 1) + (tmp & 1); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_RSRA(gvec_srsra_b, int8_t) +DO_RSRA(gvec_srsra_h, int16_t) +DO_RSRA(gvec_srsra_s, int32_t) +DO_RSRA(gvec_srsra_d, int64_t) + +DO_RSRA(gvec_ursra_b, uint8_t) +DO_RSRA(gvec_ursra_h, uint16_t) +DO_RSRA(gvec_ursra_s, uint32_t) +DO_RSRA(gvec_ursra_d, uint64_t) + +#undef DO_RSRA + +#define DO_SRI(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_SRI(gvec_sri_b, uint8_t) +DO_SRI(gvec_sri_h, uint16_t) +DO_SRI(gvec_sri_s, uint32_t) +DO_SRI(gvec_sri_d, uint64_t) + +#undef DO_SRI + +#define DO_SLI(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + 
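+/*
+ * SLI keeps the low 'shift' bits of the destination and inserts the
+ * source above them; e.g. for 8-bit lanes with shift == 4 this is
+ * d = (d & 0x0f) | (n << 4).
+ */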
+DO_SLI(gvec_sli_b, uint8_t) +DO_SLI(gvec_sli_h, uint16_t) +DO_SLI(gvec_sli_s, uint32_t) +DO_SLI(gvec_sli_d, uint64_t) + +#undef DO_SLI + +/* + * Convert float16 to float32, raising no exceptions and + * preserving exceptional values, including SNaN. + * This is effectively an unpack+repack operation. + */ +static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) +{ + const int f16_bias = 15; + const int f32_bias = 127; + uint32_t sign = extract32(f16, 15, 1); + uint32_t exp = extract32(f16, 10, 5); + uint32_t frac = extract32(f16, 0, 10); + + if (exp == 0x1f) { + /* Inf or NaN */ + exp = 0xff; + } else if (exp == 0) { + /* Zero or denormal. */ + if (frac != 0) { + if (fz16) { + frac = 0; + } else { + /* + * Denormal; these are all normal float32. + * Shift the fraction so that the msb is at bit 11, + * then remove bit 11 as the implicit bit of the + * normalized float32. Note that we still go through + * the shift for normal numbers below, to put the + * float32 fraction at the right place. + */ + int shift = clz32(frac) - 21; + frac = (frac << shift) & 0x3ff; + exp = f32_bias - f16_bias - shift + 1; + } + } + } else { + /* Normal number; adjust the bias. */ + exp += f32_bias - f16_bias; + } + sign <<= 31; + exp <<= 23; + frac <<= 23 - 10; + + return sign | exp | frac; +} + +static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) +{ + /* + * Branchless load of u32[0], u64[0], u32[1], or u64[1]. + * Load the 2nd qword iff is_q & is_2. + * Shift to the 2nd dword iff !is_q & is_2. + * For !is_q & !is_2, the upper bits of the result are garbage. + */ + return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); +} + +/* + * Note that FMLAL requires oprsz == 8 or oprsz == 16, + * as there is not yet SVE versions that might use blocking. + */ + +static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, + uint32_t desc, bool fz16) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + int is_q = oprsz == 16; + uint64_t n_4, m_4; + + /* Pre-load all of the f16 data, avoiding overlap issues. */ + n_4 = load4_f16(vn, is_q, is_2); + m_4 = load4_f16(vm, is_q, is_2); + + /* Negate all inputs for FMLSL at once. 
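+ * (The XOR constant flips the sign bit, 0x8000, of each of the four
+ * packed float16 inputs.)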
*/ + if (is_s) { + n_4 ^= 0x8000800080008000ull; + } + + for (i = 0; i < oprsz / 4; i++) { + float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); + float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, + void *venv, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); + CPUARMState *env = venv; + float_status *status = &env->vfp.fp_status; + bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); + + for (i = 0; i < oprsz; i += sizeof(float32)) { + float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; + float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); + float32 nn = float16_to_float32_by_bits(nn_16, fz16); + float32 mm = float16_to_float32_by_bits(mm_16, fz16); + float32 aa = *(float32 *)(va + H1_4(i)); + + *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); + } +} + +static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, + uint32_t desc, bool fz16) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); + int is_q = oprsz == 16; + uint64_t n_4; + float32 m_1; + + /* Pre-load all of the f16 data, avoiding overlap issues. */ + n_4 = load4_f16(vn, is_q, is_2); + + /* Negate all inputs for FMLSL at once. 
*/ + if (is_s) { + n_4 ^= 0x8000800080008000ull; + } + + m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); + + for (i = 0; i < oprsz / 4; i++) { + float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, + void *venv, uint32_t desc) +{ + intptr_t i, j, oprsz = simd_oprsz(desc); + uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); + CPUARMState *env = venv; + float_status *status = &env->vfp.fp_status; + bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); + + for (i = 0; i < oprsz; i += 16) { + float16 mm_16 = *(float16 *)(vm + i + idx); + float32 mm = float16_to_float32_by_bits(mm_16, fz16); + + for (j = 0; j < 16; j += sizeof(float32)) { + float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; + float32 nn = float16_to_float32_by_bits(nn_16, fz16); + float32 aa = *(float32 *)(va + H1_4(i + j)); + + *(float32 *)(vd + H1_4(i + j)) = + float32_muladd(nn, mm, aa, 0, status); + } + } +} + +void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + int8_t mm = m[i]; + int8_t nn = n[i]; + int8_t res = 0; + if (mm >= 0) { + if (mm < 8) { + res = nn << mm; + } + } else { + res = nn >> (mm > -8 ? -mm : 7); + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + int8_t mm = m[i]; /* only 8 bits of shift are significant */ + int16_t nn = n[i]; + int16_t res = 0; + if (mm >= 0) { + if (mm < 16) { + res = nn << mm; + } + } else { + res = nn >> (mm > -16 ? 
-mm : 15); + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + int8_t mm = m[i]; + uint8_t nn = n[i]; + uint8_t res = 0; + if (mm >= 0) { + if (mm < 8) { + res = nn << mm; + } + } else { + if (mm > -8) { + res = nn >> -mm; + } + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + int8_t mm = m[i]; /* only 8 bits of shift are significant */ + uint16_t nn = n[i]; + uint16_t res = 0; + if (mm >= 0) { + if (mm < 16) { + res = nn << mm; + } + } else { + if (mm > -16) { + res = nn >> -mm; + } + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * 8x8->8 polynomial multiply. + * + * Polynomial multiplication is like integer multiplication except the + * partial products are XORed, not added. + * + * TODO: expose this as a generic vector operation, as it is a common + * crypto building block. + */ +void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t nn = n[i]; + uint64_t mm = m[i]; + uint64_t rr = 0; + + for (j = 0; j < 8; ++j) { + uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; + rr ^= mm & mask; + mm = (mm << 1) & 0xfefefefefefefefeull; + nn >>= 1; + } + d[i] = rr; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * 64x64->128 polynomial multiply. + * Because of the lanes are not accessed in strict columns, + * this probably cannot be turned into a generic helper. + */ +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t hi = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; i += 2) { + uint64_t nn = n[i + hi]; + uint64_t mm = m[i + hi]; + uint64_t rhi = 0; + uint64_t rlo = 0; + + /* Bit 0 can only influence the low 64-bit result. */ + if (nn & 1) { + rlo = mm; + } + + for (j = 1; j < 64; ++j) { + uint64_t mask = -((nn >> j) & 1); + rlo ^= (mm << j) & mask; + rhi ^= (mm >> (64 - j)) & mask; + } + d[i] = rlo; + d[i + 1] = rhi; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * 8x8->16 polynomial multiply. + * + * The byte inputs are expanded to (or extracted from) half-words. + * Note that neon and sve2 get the inputs from different positions. + * This allows 4 bytes to be processed in parallel with uint64_t. 
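+ *
+ * As a reminder, partial products in polynomial (carry-less) arithmetic
+ * are XORed rather than added, so e.g. 0x03 * 0x03 = 0x05.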
+ */ + +static uint64_t expand_byte_to_half(uint64_t x) +{ + return (x & 0x000000ff) + | ((x & 0x0000ff00) << 8) + | ((x & 0x00ff0000) << 16) + | ((x & 0xff000000) << 24); +} + +uint64_t pmull_w(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + for (i = 0; i < 16; ++i) { + uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; + result ^= op2 & mask; + op1 >>= 1; + op2 <<= 1; + } + return result; +} + +uint64_t pmull_h(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + for (i = 0; i < 8; ++i) { + uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; + result ^= op2 & mask; + op1 >>= 1; + op2 <<= 1; + } + return result; +} + +void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + int hi = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t nn = n[hi], mm = m[hi]; + + d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + nn >>= 32; + mm >>= 32; + d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + + clear_tail(d, 16, simd_maxsz(desc)); +} + +#ifdef TARGET_AARCH64 +void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + int shift = simd_data(desc) * 8; + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; + uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; + + d[i] = pmull_h(nn, mm); + } +} + +static uint64_t pmull_d(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + + for (i = 0; i < 32; ++i) { + uint64_t mask = -((op1 >> i) & 1); + result ^= (op2 << i) & mask; + } + return result; +} + +void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t sel = H4(simd_data(desc)); + intptr_t i, opr_sz = simd_oprsz(desc); + uint32_t *n = vn, *m = vm; + uint64_t *d = vd; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); + } +} +#endif + +#define DO_CMP0(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + *(TYPE *)(vd + i) = -(nn OP 0); \ + } \ + clear_tail(vd, opr_sz, simd_maxsz(desc)); \ +} + +DO_CMP0(gvec_ceq0_b, int8_t, ==) +DO_CMP0(gvec_clt0_b, int8_t, <) +DO_CMP0(gvec_cle0_b, int8_t, <=) +DO_CMP0(gvec_cgt0_b, int8_t, >) +DO_CMP0(gvec_cge0_b, int8_t, >=) + +DO_CMP0(gvec_ceq0_h, int16_t, ==) +DO_CMP0(gvec_clt0_h, int16_t, <) +DO_CMP0(gvec_cle0_h, int16_t, <=) +DO_CMP0(gvec_cgt0_h, int16_t, >) +DO_CMP0(gvec_cge0_h, int16_t, >=) + +#undef DO_CMP0 + +#define DO_ABD(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + \ + for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ + d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_ABD(gvec_sabd_b, int8_t) +DO_ABD(gvec_sabd_h, int16_t) +DO_ABD(gvec_sabd_s, int32_t) +DO_ABD(gvec_sabd_d, int64_t) + +DO_ABD(gvec_uabd_b, uint8_t) +DO_ABD(gvec_uabd_h, uint16_t) +DO_ABD(gvec_uabd_s, uint32_t) +DO_ABD(gvec_uabd_d, uint64_t) + +#undef DO_ABD + +#define DO_ABA(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + \ + for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ + d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_ABA(gvec_saba_b, int8_t) +DO_ABA(gvec_saba_h, int16_t) +DO_ABA(gvec_saba_s, int32_t) +DO_ABA(gvec_saba_d, int64_t) + +DO_ABA(gvec_uaba_b, uint8_t) +DO_ABA(gvec_uaba_h, uint16_t) +DO_ABA(gvec_uaba_s, uint32_t) +DO_ABA(gvec_uaba_d, uint64_t) + +#undef DO_ABA + +#define DO_NEON_PAIRWISE(NAME, OP) \ + void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ + void *stat, uint32_t oprsz) \ + { \ + float_status *fpst = stat; \ + float32 *d = vd; \ + float32 *n = vn; \ + float32 *m = vm; \ + float32 r0, r1; \ + \ + /* Read all inputs before writing outputs in case vm == vd */ \ + r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ + r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ + \ + d[H4(0)] = r0; \ + d[H4(1)] = r1; \ + } \ + \ + void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ + void *stat, uint32_t oprsz) \ + { \ + float_status *fpst = stat; \ + float16 *d = vd; \ + float16 *n = vn; \ + float16 *m = vm; \ + float16 r0, r1, r2, r3; \ + \ + /* Read all inputs before writing outputs in case vm == vd */ \ + r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ + r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ + r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ + r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ + \ + d[H2(0)] = r0; \ + d[H2(1)] = r1; \ + d[H2(2)] = r2; \ + d[H2(3)] = r3; \ + } + +DO_NEON_PAIRWISE(neon_padd, add) +DO_NEON_PAIRWISE(neon_pmax, max) +DO_NEON_PAIRWISE(neon_pmin, min) + +#undef DO_NEON_PAIRWISE + +#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ + void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ + { \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + float_status *fpst = stat; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], shift, fpst); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ + } + +DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) +DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) +DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) +DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) + +#undef DO_VCVT_FIXED + +#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ + void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ + { \ + float_status *fpst = stat; \ + intptr_t i, oprsz = simd_oprsz(desc); \ + uint32_t rmode = simd_data(desc); \ + uint32_t prev_rmode = get_float_rounding_mode(fpst); \ + TYPE *d = vd, *n = vn; \ + set_float_rounding_mode(rmode, fpst); \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], 0, fpst); \ + } \ + set_float_rounding_mode(prev_rmode, fpst); \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ + } + +DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) +DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) +DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) +DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) + +#undef DO_VCVT_RMODE + +#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ + void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ + { \ + float_status *fpst = stat; \ + intptr_t i, oprsz = simd_oprsz(desc); \ + uint32_t rmode = simd_data(desc); \ + uint32_t prev_rmode = get_float_rounding_mode(fpst); \ + TYPE *d = vd, *n = 
vn; \ + set_float_rounding_mode(rmode, fpst); \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], fpst); \ + } \ + set_float_rounding_mode(prev_rmode, fpst); \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ + } + +DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) +DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) + +#undef DO_VRINT_RMODE + +#ifdef TARGET_AARCH64 +void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) +{ + const uint8_t *indices = vm; + CPUARMState *env = venv; + size_t oprsz = simd_oprsz(desc); + uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); + bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); + uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); + union { + uint8_t b[16]; + uint64_t d[2]; + } result; + + /* + * We must construct the final result in a temp, lest the output + * overlaps the input table. For TBL, begin with zero; for TBX, + * begin with the original register contents. Note that we always + * copy 16 bytes here to avoid an extra branch; clearing the high + * bits of the register for oprsz == 8 is handled below. + */ + if (is_tbx) { + memcpy(&result, vd, 16); + } else { + memset(&result, 0, 16); + } + + for (size_t i = 0; i < oprsz; ++i) { + uint32_t index = indices[H1(i)]; + + if (index < table_len) { + /* + * Convert index (a byte offset into the virtual table + * which is a series of 128-bit vectors concatenated) + * into the correct register element, bearing in mind + * that the table can wrap around from V31 to V0. + */ + const uint8_t *table = (const uint8_t *) + aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); + result.b[H1(i)] = table[H1(index % 16)]; + } + } + + memcpy(vd, &result, 16); + clear_tail(vd, oprsz, simd_maxsz(desc)); +} +#endif + +/* + * NxN -> N highpart multiply + * + * TODO: expose this as a generic vector operation. 
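 * For illustration: each helper below keeps only the upper half of the
 * double-width product, e.g. in the byte case 0x50 * 0x50 = 0x1900 stores
 * d[i] = 0x19; the 64-bit variants use muls64()/mulu64() and discard the
 * low half of the 128-bit product instead.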
+ */ + +void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ((int32_t)n[i] * m[i]) >> 8; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = ((int32_t)n[i] * m[i]) >> 16; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = ((int64_t)n[i] * m[i]) >> 32; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t discard; + + for (i = 0; i < opr_sz / 8; ++i) { + muls64(&discard, &d[i], n[i], m[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ((uint32_t)n[i] * m[i]) >> 8; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = ((uint32_t)n[i] * m[i]) >> 16; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = ((uint64_t)n[i] * m[i]) >> 32; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t discard; + + for (i = 0; i < opr_sz / 8; ++i) { + mulu64(&discard, &d[i], n[i], m[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + int shr = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ror64(n[i] ^ m[i], shr); + } + clear_tail(d, opr_sz * 8, simd_maxsz(desc)); +} + +/* + * Integer matrix-multiply accumulate + */ + +static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) +{ + int8_t *n = vn, *m = vm; + + for (intptr_t k = 0; k < 8; ++k) { + sum += n[H1(k)] * m[H1(k)]; + } + return sum; +} + +static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) +{ + uint8_t *n = vn, *m = vm; + + for (intptr_t k = 0; k < 8; ++k) { + sum += n[H1(k)] * m[H1(k)]; + } + return sum; +} + +static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) +{ + uint8_t *n = vn; + int8_t *m = vm; + + for (intptr_t k = 0; k < 8; ++k) { + sum += n[H1(k)] * m[H1(k)]; + } + return sum; +} + +static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, + uint32_t (*inner_loop)(uint32_t, void *, void *)) +{ + intptr_t seg, opr_sz = simd_oprsz(desc); + + for (seg = 0; seg < opr_sz; seg += 16) { + uint32_t *d = vd + seg; + uint32_t *a = va + seg; + uint32_t sum0, sum1, sum2, sum3; + + /* + * 
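 * As the loop body below reads, each of the four sums within a 16-byte
 * segment is the widening dot product of one 8-byte half of n with one
 * 8-byte half of m (sum0: n-low/m-low, sum1: n-low/m-high,
 * sum2: n-high/m-low, sum3: n-high/m-high), i.e. a 2x2 block of 32-bit
 * accumulators.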
Process the entire segment at once, writing back the + * results only after we've consumed all of the inputs. + * + * Key to indices by column: + * i j i j + */ + sum0 = a[H4(0 + 0)]; + sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); + sum1 = a[H4(0 + 1)]; + sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); + sum2 = a[H4(2 + 0)]; + sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); + sum3 = a[H4(2 + 1)]; + sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); + + d[H4(0)] = sum0; + d[H4(1)] = sum1; + d[H4(2)] = sum2; + d[H4(3)] = sum3; + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +#define DO_MMLA_B(NAME, INNER) \ + void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ + { do_mmla_b(vd, vn, vm, va, desc, INNER); } + +DO_MMLA_B(gvec_smmla_b, do_smmla_b) +DO_MMLA_B(gvec_ummla_b, do_ummla_b) +DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) + +/* + * BFloat16 Dot Product + */ + +float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) +{ + /* FPCR is ignored for BFDOT and BFMMLA. */ + float_status bf_status = { + .tininess_before_rounding = float_tininess_before_rounding, + .float_rounding_mode = float_round_to_odd_inf, + .flush_to_zero = true, + .flush_inputs_to_zero = true, + .default_nan_mode = true, + }; + float32 t1, t2; + + /* + * Extract each BFloat16 from the element pair, and shift + * them such that they become float32. + */ + t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); + t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); + t1 = float32_add(t1, t2, &bf_status); + t1 = float32_add(sum, t1, &bf_status); + + return t1; +} + +void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = bfdotadd(a[i], n[i], m[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t index = simd_data(desc); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + for (i = 0; i < elements; i += eltspersegment) { + uint32_t m_idx = m[i + H4(index)]; + + for (j = i; j < i + eltspersegment; j++) { + d[j] = bfdotadd(a[j], n[j], m_idx); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t s, opr_sz = simd_oprsz(desc); + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + for (s = 0; s < opr_sz / 4; s += 4) { + float32 sum00, sum01, sum10, sum11; + + /* + * Process the entire segment at once, writing back the + * results only after we've consumed all of the inputs. 
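 * For reference, each 32-bit element handed to bfdotadd() packs two
 * bfloat16 values; a bfloat16 becomes the corresponding float32 by being
 * placed in the top 16 bits (e.g. 0x3f80 -> 0x3f800000 == 1.0f), which is
 * how bfdotadd() widens both halves before multiplying and accumulating.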
+ * + * Key to indicies by column: + * i j i k j k + */ + sum00 = a[s + H4(0 + 0)]; + sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); + sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); + + sum01 = a[s + H4(0 + 1)]; + sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); + sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); + + sum10 = a[s + H4(2 + 0)]; + sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); + sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); + + sum11 = a[s + H4(2 + 1)]; + sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); + sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); + + d[s + H4(0 + 0)] = sum00; + d[s + H4(0 + 1)] = sum01; + d[s + H4(2 + 0)] = sum10; + d[s + H4(2 + 1)] = sum11; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, + void *stat, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + intptr_t sel = simd_data(desc); + float32 *d = vd, *a = va; + bfloat16 *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + float32 nn = n[H2(i * 2 + sel)] << 16; + float32 mm = m[H2(i * 2 + sel)] << 16; + d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, + void *va, void *stat, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); + float32 *d = vd, *a = va; + bfloat16 *n = vn, *m = vm; + + for (i = 0; i < elements; i += eltspersegment) { + float32 m_idx = m[H2(2 * i + index)] << 16; + + for (j = i; j < i + eltspersegment; j++) { + float32 n_j = n[H2(2 * j + sel)] << 16; + d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +#define DO_CLAMP(NAME, TYPE) \ +void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE aa = *(TYPE *)(a + i); \ + TYPE nn = *(TYPE *)(n + i); \ + TYPE mm = *(TYPE *)(m + i); \ + TYPE dd = MIN(MAX(aa, nn), mm); \ + *(TYPE *)(d + i) = dd; \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_CLAMP(gvec_sclamp_b, int8_t) +DO_CLAMP(gvec_sclamp_h, int16_t) +DO_CLAMP(gvec_sclamp_s, int32_t) +DO_CLAMP(gvec_sclamp_d, int64_t) + +DO_CLAMP(gvec_uclamp_b, uint8_t) +DO_CLAMP(gvec_uclamp_h, uint16_t) +DO_CLAMP(gvec_uclamp_s, uint32_t) +DO_CLAMP(gvec_uclamp_d, uint64_t) diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h new file mode 100644 index 0000000..1f4ed80 --- /dev/null +++ b/target/arm/tcg/vec_internal.h @@ -0,0 +1,246 @@ +/* + * ARM AdvSIMD / SVE Vector Helpers + * + * Copyright (c) 2020 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#ifndef TARGET_ARM_VEC_INTERNAL_H +#define TARGET_ARM_VEC_INTERNAL_H + +/* + * Note that vector data is stored in host-endian 64-bit chunks, + * so addressing units smaller than that needs a host-endian fixup. + * + * The H macros are used when indexing an array of elements of size N. + * + * The H1_ macros are used when performing byte arithmetic and then + * casting the final pointer to a type of size N. + */ +#if HOST_BIG_ENDIAN +#define H1(x) ((x) ^ 7) +#define H1_2(x) ((x) ^ 6) +#define H1_4(x) ((x) ^ 4) +#define H2(x) ((x) ^ 3) +#define H4(x) ((x) ^ 1) +#else +#define H1(x) (x) +#define H1_2(x) (x) +#define H1_4(x) (x) +#define H2(x) (x) +#define H4(x) (x) +#endif +/* + * Access to 64-bit elements isn't host-endian dependent; we provide H8 + * and H1_8 so that when a function is being generated from a macro we + * can pass these rather than an empty macro argument, for clarity. + */ +#define H8(x) (x) +#define H1_8(x) (x) + +/* + * Expand active predicate bits to bytes, for byte elements. + */ +extern const uint64_t expand_pred_b_data[256]; +static inline uint64_t expand_pred_b(uint8_t byte) +{ + return expand_pred_b_data[byte]; +} + +/* Similarly for half-word elements. */ +extern const uint64_t expand_pred_h_data[0x55 + 1]; +static inline uint64_t expand_pred_h(uint8_t byte) +{ + return expand_pred_h_data[byte & 0x55]; +} + +static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) +{ + uint64_t *d = vd + opr_sz; + uintptr_t i; + + for (i = opr_sz; i < max_sz; i += 8) { + *d++ = 0; + } +} + +static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits, + bool round, uint32_t *sat) +{ + if (shift <= -bits) { + /* Rounding the sign bit always produces 0. */ + if (round) { + return 0; + } + return src >> 31; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < bits) { + int32_t val = src << shift; + if (bits == 32) { + if (!sat || val >> shift == src) { + return val; + } + } else { + int32_t extval = sextract32(val, 0, bits); + if (!sat || val == extval) { + return extval; + } + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return (1u << (bits - 1)) - (src >= 0); +} + +static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits, + bool round, uint32_t *sat) +{ + if (shift <= -(bits + round)) { + return 0; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < bits) { + uint32_t val = src << shift; + if (bits == 32) { + if (!sat || val >> shift == src) { + return val; + } + } else { + uint32_t extval = extract32(val, 0, bits); + if (!sat || val == extval) { + return extval; + } + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return MAKE_64BIT_MASK(0, bits); +} + +static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits, + bool round, uint32_t *sat) +{ + if (sat && src < 0) { + *sat = 1; + return 0; + } + return do_uqrshl_bhs(src, shift, bits, round, sat); +} + +static inline int64_t do_sqrshl_d(int64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + if (shift <= -64) { + /* Rounding the sign bit always produces 0. 
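 * (All that remains of src here is its sign: for negative src the
 * arithmetic shift yields -1 and the +1 rounding carry from the last bit
 * shifted out cancels it; for non-negative src both terms are already 0.)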
*/ + if (round) { + return 0; + } + return src >> 63; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < 64) { + int64_t val = src << shift; + if (!sat || val >> shift == src) { + return val; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return src < 0 ? INT64_MIN : INT64_MAX; +} + +static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + if (shift <= -(64 + round)) { + return 0; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < 64) { + uint64_t val = src << shift; + if (!sat || val >> shift == src) { + return val; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return UINT64_MAX; +} + +static inline int64_t do_suqrshl_d(int64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + if (sat && src < 0) { + *sat = 1; + return 0; + } + return do_uqrshl_d(src, shift, round, sat); +} + +int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool); +int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); +int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); +int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); + +/* + * 8 x 8 -> 16 vector polynomial multiply where the inputs are + * in the low 8 bits of each 16-bit element +*/ +uint64_t pmull_h(uint64_t op1, uint64_t op2); +/* + * 16 x 16 -> 32 vector polynomial multiply where the inputs are + * in the low 16 bits of each 32-bit element + */ +uint64_t pmull_w(uint64_t op1, uint64_t op2); + +/** + * bfdotadd: + * @sum: addend + * @e1, @e2: multiplicand vectors + * + * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. + * The @e1 and @e2 operands correspond to the 32-bit source vector + * slots and contain two Bfloat16 values each. + * + * Corresponds to the ARM pseudocode function BFDotAdd. + */ +float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2); + +#endif /* TARGET_ARM_VEC_INTERNAL_H */ diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c deleted file mode 100644 index 60abcbe..0000000 --- a/target/arm/tlb_helper.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * ARM TLB (Translation lookaside buffer) helpers. - * - * This code is licensed under the GNU GPL v2 or later. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/helper-proto.h" - - -/* Return true if the translation regime is using LPAE format page tables */ -bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - int el = regime_el(env, mmu_idx); - if (el == 2 || arm_el_is_aa64(env, el)) { - return true; - } - if (arm_feature(env, ARM_FEATURE_PMSA) && - arm_feature(env, ARM_FEATURE_V8)) { - return true; - } - if (arm_feature(env, ARM_FEATURE_LPAE) - && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) { - return true; - } - return false; -} - -/* - * Returns true if the stage 1 translation regime is using LPAE format page - * tables. Used when raising alignment exceptions, whose FSR changes depending - * on whether the long or short descriptor format is in use. 
- */ -bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - mmu_idx = stage_1_mmu_idx(mmu_idx); - return regime_using_lpae_format(env, mmu_idx); -} - -static inline uint32_t merge_syn_data_abort(uint32_t template_syn, - unsigned int target_el, - bool same_el, bool ea, - bool s1ptw, bool is_write, - int fsc) -{ - uint32_t syn; - - /* - * ISV is only set for data aborts routed to EL2 and - * never for stage-1 page table walks faulting on stage 2. - * - * Furthermore, ISV is only set for certain kinds of load/stores. - * If the template syndrome does not have ISV set, we should leave - * it cleared. - * - * See ARMv8 specs, D7-1974: - * ISS encoding for an exception from a Data Abort, the - * ISV field. - */ - if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) { - syn = syn_data_abort_no_iss(same_el, 0, - ea, 0, s1ptw, is_write, fsc); - } else { - /* - * Fields: IL, ISV, SAS, SSE, SRT, SF and AR come from the template - * syndrome created at translation time. - * Now we create the runtime syndrome with the remaining fields. - */ - syn = syn_data_abort_with_iss(same_el, - 0, 0, 0, 0, 0, - ea, 0, s1ptw, is_write, fsc, - true); - /* Merge the runtime syndrome with the template syndrome. */ - syn |= template_syn; - } - return syn; -} - -static uint32_t compute_fsr_fsc(CPUARMState *env, ARMMMUFaultInfo *fi, - int target_el, int mmu_idx, uint32_t *ret_fsc) -{ - ARMMMUIdx arm_mmu_idx = core_to_arm_mmu_idx(env, mmu_idx); - uint32_t fsr, fsc; - - if (target_el == 2 || arm_el_is_aa64(env, target_el) || - arm_s1_regime_using_lpae_format(env, arm_mmu_idx)) { - /* - * LPAE format fault status register : bottom 6 bits are - * status code in the same form as needed for syndrome - */ - fsr = arm_fi_to_lfsc(fi); - fsc = extract32(fsr, 0, 6); - } else { - fsr = arm_fi_to_sfsc(fi); - /* - * Short format FSR : this fault will never actually be reported - * to an EL that uses a syndrome register. Use a (currently) - * reserved FSR code in case the constructed syndrome does leak - * into the guest somehow. 
- */ - fsc = 0x3f; - } - - *ret_fsc = fsc; - return fsr; -} - -static G_NORETURN -void arm_deliver_fault(ARMCPU *cpu, vaddr addr, - MMUAccessType access_type, - int mmu_idx, ARMMMUFaultInfo *fi) -{ - CPUARMState *env = &cpu->env; - int target_el; - bool same_el; - uint32_t syn, exc, fsr, fsc; - - target_el = exception_target_el(env); - if (fi->stage2) { - target_el = 2; - env->cp15.hpfar_el2 = extract64(fi->s2addr, 12, 47) << 4; - if (arm_is_secure_below_el3(env) && fi->s1ns) { - env->cp15.hpfar_el2 |= HPFAR_NS; - } - } - same_el = (arm_current_el(env) == target_el); - - fsr = compute_fsr_fsc(env, fi, target_el, mmu_idx, &fsc); - - if (access_type == MMU_INST_FETCH) { - syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc); - exc = EXCP_PREFETCH_ABORT; - } else { - syn = merge_syn_data_abort(env->exception.syndrome, target_el, - same_el, fi->ea, fi->s1ptw, - access_type == MMU_DATA_STORE, - fsc); - if (access_type == MMU_DATA_STORE - && arm_feature(env, ARM_FEATURE_V6)) { - fsr |= (1 << 11); - } - exc = EXCP_DATA_ABORT; - } - - env->exception.vaddress = addr; - env->exception.fsr = fsr; - raise_exception(env, exc, syn, target_el); -} - -/* Raise a data fault alignment exception for the specified virtual address */ -void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, - MMUAccessType access_type, - int mmu_idx, uintptr_t retaddr) -{ - ARMCPU *cpu = ARM_CPU(cs); - ARMMMUFaultInfo fi = {}; - - /* now we have a real cpu fault */ - cpu_restore_state(cs, retaddr); - - fi.type = ARMFault_Alignment; - arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi); -} - -void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc) -{ - ARMMMUFaultInfo fi = { .type = ARMFault_Alignment }; - int target_el = exception_target_el(env); - int mmu_idx = cpu_mmu_index(env, true); - uint32_t fsc; - - env->exception.vaddress = pc; - - /* - * Note that the fsc is not applicable to this exception, - * since any syndrome is pcalignment not insn_abort. - */ - env->exception.fsr = compute_fsr_fsc(env, &fi, target_el, mmu_idx, &fsc); - raise_exception(env, EXCP_PREFETCH_ABORT, syn_pcalignment(), target_el); -} - -#if !defined(CONFIG_USER_ONLY) - -/* - * arm_cpu_do_transaction_failed: handle a memory system error response - * (eg "no device/memory present at address") by raising an external abort - * exception - */ -void arm_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, - vaddr addr, unsigned size, - MMUAccessType access_type, - int mmu_idx, MemTxAttrs attrs, - MemTxResult response, uintptr_t retaddr) -{ - ARMCPU *cpu = ARM_CPU(cs); - ARMMMUFaultInfo fi = {}; - - /* now we have a real cpu fault */ - cpu_restore_state(cs, retaddr); - - fi.ea = arm_extabort_type(response); - fi.type = ARMFault_SyncExternal; - arm_deliver_fault(cpu, addr, access_type, mmu_idx, &fi); -} - -bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, - MMUAccessType access_type, int mmu_idx, - bool probe, uintptr_t retaddr) -{ - ARMCPU *cpu = ARM_CPU(cs); - GetPhysAddrResult res = {}; - ARMMMUFaultInfo local_fi, *fi; - int ret; - - /* - * Allow S1_ptw_translate to see any fault generated here. - * Since this may recurse, read and clear. - */ - fi = cpu->env.tlb_fi; - if (fi) { - cpu->env.tlb_fi = NULL; - } else { - fi = memset(&local_fi, 0, sizeof(local_fi)); - } - - /* - * Walk the page table and (if the mapping exists) add the page - * to the TLB. On success, return true. Otherwise, if probing, - * return false. Otherwise populate fsr with ARM DFSR/IFSR fault - * register format, and signal the fault. 
- */ - ret = get_phys_addr(&cpu->env, address, access_type, - core_to_arm_mmu_idx(&cpu->env, mmu_idx), - &res, fi); - if (likely(!ret)) { - /* - * Map a single [sub]page. Regions smaller than our declared - * target page size are handled specially, so for those we - * pass in the exact addresses. - */ - if (res.f.lg_page_size >= TARGET_PAGE_BITS) { - res.f.phys_addr &= TARGET_PAGE_MASK; - address &= TARGET_PAGE_MASK; - } - - res.f.pte_attrs = res.cacheattrs.attrs; - res.f.shareability = res.cacheattrs.shareability; - - tlb_set_page_full(cs, mmu_idx, address, &res.f); - return true; - } else if (probe) { - return false; - } else { - /* now we have a real cpu fault */ - cpu_restore_state(cs, retaddr); - arm_deliver_fault(cpu, address, access_type, mmu_idx, fi); - } -} -#else -void arm_cpu_record_sigsegv(CPUState *cs, vaddr addr, - MMUAccessType access_type, - bool maperr, uintptr_t ra) -{ - ARMMMUFaultInfo fi = { - .type = maperr ? ARMFault_Translation : ARMFault_Permission, - .level = 3, - }; - ARMCPU *cpu = ARM_CPU(cs); - - /* - * We report both ESR and FAR to signal handlers. - * For now, it's easiest to deliver the fault normally. - */ - cpu_restore_state(cs, ra); - arm_deliver_fault(cpu, addr, access_type, MMU_USER_IDX, &fi); -} - -void arm_cpu_record_sigbus(CPUState *cs, vaddr addr, - MMUAccessType access_type, uintptr_t ra) -{ - arm_cpu_do_unaligned_access(cs, addr, access_type, MMU_USER_IDX, ra); -} -#endif /* !defined(CONFIG_USER_ONLY) */ diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c deleted file mode 100644 index f59d3b2..0000000 --- a/target/arm/vec_helper.c +++ /dev/null @@ -1,2716 +0,0 @@ -/* - * ARM AdvSIMD / SVE Vector Operations - * - * Copyright (c) 2018 Linaro - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "fpu/softfloat.h" -#include "qemu/int128.h" -#include "vec_internal.h" - -/* - * Data for expanding active predicate bits to bytes, for byte elements. 
- * - * for (i = 0; i < 256; ++i) { - * unsigned long m = 0; - * for (j = 0; j < 8; j++) { - * if ((i >> j) & 1) { - * m |= 0xfful << (j << 3); - * } - * } - * printf("0x%016lx,\n", m); - * } - */ -const uint64_t expand_pred_b_data[256] = { - 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, - 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, - 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, - 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, - 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, - 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, - 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, - 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, - 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, - 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, - 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, - 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, - 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, - 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, - 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, - 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, - 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, - 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, - 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, - 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, - 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, - 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, - 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, - 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, - 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, - 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, - 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, - 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, - 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, - 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, - 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, - 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, - 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, - 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, - 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, - 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, - 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, - 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, - 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, - 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, - 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, - 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, - 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, - 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, - 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, - 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, - 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, - 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, - 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, - 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, - 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, - 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, - 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, - 0xff0000ffffffffff, 
0xff00ff0000000000, 0xff00ff00000000ff, - 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, - 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, - 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, - 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, - 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, - 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, - 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, - 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, - 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, - 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, - 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, - 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, - 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, - 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, - 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, - 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, - 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, - 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, - 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, - 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, - 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, - 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, - 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, - 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, - 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, - 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, - 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, - 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, - 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, - 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, - 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, - 0xffffffffffffffff, -}; - -/* - * Similarly for half-word elements. - * for (i = 0; i < 256; ++i) { - * unsigned long m = 0; - * if (i & 0xaa) { - * continue; - * } - * for (j = 0; j < 8; j += 2) { - * if ((i >> j) & 1) { - * m |= 0xfffful << (j << 3); - * } - * } - * printf("[0x%x] = 0x%016lx,\n", i, m); - * } - */ -const uint64_t expand_pred_h_data[0x55 + 1] = { - [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, - [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, - [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, - [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, - [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, - [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, - [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, - [0x55] = 0xffffffffffffffff, -}; - -/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ -int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, - bool neg, bool round) -{ - /* - * Simplify: - * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 - * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 - */ - int32_t ret = (int32_t)src1 * src2; - if (neg) { - ret = -ret; - } - ret += ((int32_t)src3 << 7) + (round << 6); - ret >>= 7; - - if (ret != (int8_t)ret) { - ret = (ret < 0 ? 
INT8_MIN : INT8_MAX); - } - return ret; -} - -void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); - } -} - -void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); - } -} - -void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); - } -} - -void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); - } -} - -/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ -int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, - bool neg, bool round, uint32_t *sat) -{ - /* Simplify similarly to do_sqrdmlah_b above. */ - int32_t ret = (int32_t)src1 * src2; - if (neg) { - ret = -ret; - } - ret += ((int32_t)src3 << 15) + (round << 14); - ret >>= 15; - - if (ret != (int16_t)ret) { - *sat = 1; - ret = (ret < 0 ? INT16_MIN : INT16_MAX); - } - return ret; -} - -uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, - uint32_t src2, uint32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); - uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, - false, true, sat); - return deposit32(e1, 16, 16, e2); -} - -void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int16_t *d = vd; - int16_t *n = vn; - int16_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, - uint32_t src2, uint32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); - uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, - true, true, sat); - return deposit32(e1, 16, 16, e2); -} - -void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int16_t *d = vd; - int16_t *n = vn; - int16_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); - } - clear_tail(d, opr_sz, 
simd_maxsz(desc)); -} - -void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); - } -} - -void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); - } -} - -void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 2; i += 16 / 2) { - int16_t mm = m[i]; - for (j = 0; j < 16 / 2; ++j) { - d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); - } - } -} - -void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 2; i += 16 / 2) { - int16_t mm = m[i]; - for (j = 0; j < 16 / 2; ++j) { - d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); - } - } -} - -/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ -int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, - bool neg, bool round, uint32_t *sat) -{ - /* Simplify similarly to do_sqrdmlah_b above. */ - int64_t ret = (int64_t)src1 * src2; - if (neg) { - ret = -ret; - } - ret += ((int64_t)src3 << 31) + (round << 30); - ret >>= 31; - - if (ret != (int32_t)ret) { - *sat = 1; - ret = (ret < 0 ? 
INT32_MIN : INT32_MAX); - } - return ret; -} - -uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, - int32_t src2, int32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - return do_sqrdmlah_s(src1, src2, src3, false, true, sat); -} - -void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int32_t *d = vd; - int32_t *n = vn; - int32_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, - int32_t src2, int32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - return do_sqrdmlah_s(src1, src2, src3, true, true, sat); -} - -void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int32_t *d = vd; - int32_t *n = vn; - int32_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); - } -} - -void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); - } -} - -void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 4; i += 16 / 4) { - int32_t mm = m[i]; - for (j = 0; j < 16 / 4; ++j) { - d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); - } - } -} - -void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int32_t *d = 
vd, *n = vn, *m = (int32_t *)vm + H4(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 4; i += 16 / 4) { - int32_t mm = m[i]; - for (j = 0; j < 16 / 4; ++j) { - d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); - } - } -} - -/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ -static int64_t do_sat128_d(Int128 r) -{ - int64_t ls = int128_getlo(r); - int64_t hs = int128_gethi(r); - - if (unlikely(hs != (ls >> 63))) { - return hs < 0 ? INT64_MIN : INT64_MAX; - } - return ls; -} - -int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) -{ - uint64_t l, h; - Int128 r, t; - - /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ - muls64(&l, &h, m, n); - r = int128_make128(l, h); - if (neg) { - r = int128_neg(r); - } - if (a) { - t = int128_exts64(a); - t = int128_lshift(t, 63); - r = int128_add(r, t); - } - if (round) { - t = int128_exts64(1ll << 62); - r = int128_add(r, t); - } - r = int128_rshift(r, 63); - - return do_sat128_d(r); -} - -void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); - } -} - -void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); - } -} - -void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); - } -} - -void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); - } -} - -void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; - - for (i = 0; i < opr_sz / 8; i += 16 / 8) { - int64_t mm = m[i]; - for (j = 0; j < 16 / 8; ++j) { - d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); - } - } -} - -void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; - - for (i = 0; i < opr_sz / 8; i += 16 / 8) { - int64_t mm = m[i]; - for (j = 0; j < 16 / 8; ++j) { - d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); - } - } -} - -/* Integer 8 and 16-bit dot-product. - * - * Note that for the loops herein, host endianness does not matter - * with respect to the ordering of data within the quad-width lanes. - * All elements are treated equally, no matter where they are. 
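 * (Concretely, each widened result lane accumulates a 4-way dot product,
 * e.g. for gvec_sdot_b: d[i] = a[i] + n[4*i]*m[4*i] + n[4*i+1]*m[4*i+1]
 * + n[4*i+2]*m[4*i+2] + n[4*i+3]*m[4*i+3], with int8_t inputs widened
 * to int32_t.)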
- */ - -#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPED *d = vd, *a = va; \ - TYPEN *n = vn; \ - TYPEM *m = vm; \ - for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ - d[i] = (a[i] + \ - (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ - (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ - (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ - (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) -DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) -DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) -DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) -DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) - -#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i = 0, opr_sz = simd_oprsz(desc); \ - intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ - intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ - intptr_t index = simd_data(desc); \ - TYPED *d = vd, *a = va; \ - TYPEN *n = vn; \ - TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ - do { \ - TYPED m0 = m_indexed[i * 4 + 0]; \ - TYPED m1 = m_indexed[i * 4 + 1]; \ - TYPED m2 = m_indexed[i * 4 + 2]; \ - TYPED m3 = m_indexed[i * 4 + 3]; \ - do { \ - d[i] = (a[i] + \ - n[i * 4 + 0] * m0 + \ - n[i * 4 + 1] * m1 + \ - n[i * 4 + 2] * m2 + \ - n[i * 4 + 3] * m3); \ - } while (++i < segend); \ - segend = i + 4; \ - } while (i < opr_sz_n); \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) -DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) -DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) -DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) -DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) -DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) - -void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float16 *d = vd; - float16 *n = vn; - float16 *m = vm; - float_status *fpst = vfpst; - uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = neg_real ^ 1; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - - for (i = 0; i < opr_sz / 2; i += 2) { - float16 e0 = n[H2(i)]; - float16 e1 = m[H2(i + 1)] ^ neg_imag; - float16 e2 = n[H2(i + 1)]; - float16 e3 = m[H2(i)] ^ neg_real; - - d[H2(i)] = float16_add(e0, e1, fpst); - d[H2(i + 1)] = float16_add(e2, e3, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float32 *d = vd; - float32 *n = vn; - float32 *m = vm; - float_status *fpst = vfpst; - uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = neg_real ^ 1; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. 
*/ - neg_real <<= 31; - neg_imag <<= 31; - - for (i = 0; i < opr_sz / 4; i += 2) { - float32 e0 = n[H4(i)]; - float32 e1 = m[H4(i + 1)] ^ neg_imag; - float32 e2 = n[H4(i + 1)]; - float32 e3 = m[H4(i)] ^ neg_real; - - d[H4(i)] = float32_add(e0, e1, fpst); - d[H4(i + 1)] = float32_add(e2, e3, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float64 *d = vd; - float64 *n = vn; - float64 *m = vm; - float_status *fpst = vfpst; - uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); - uint64_t neg_imag = neg_real ^ 1; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 63; - neg_imag <<= 63; - - for (i = 0; i < opr_sz / 8; i += 2) { - float64 e0 = n[i]; - float64 e1 = m[i + 1] ^ neg_imag; - float64 e2 = n[i + 1]; - float64 e3 = m[i] ^ neg_real; - - d[i] = float64_add(e0, e1, fpst); - d[i + 1] = float64_add(e2, e3, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float16 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t neg_real = flip ^ neg_imag; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - - for (i = 0; i < opr_sz / 2; i += 2) { - float16 e2 = n[H2(i + flip)]; - float16 e1 = m[H2(i + flip)] ^ neg_real; - float16 e4 = e2; - float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; - - d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); - d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float16 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; - intptr_t elements = opr_sz / sizeof(float16); - intptr_t eltspersegment = 16 / sizeof(float16); - intptr_t i, j; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - - for (i = 0; i < elements; i += eltspersegment) { - float16 mr = m[H2(i + 2 * index + 0)]; - float16 mi = m[H2(i + 2 * index + 1)]; - float16 e1 = neg_real ^ (flip ? mi : mr); - float16 e3 = neg_imag ^ (flip ? mr : mi); - - for (j = i; j < i + eltspersegment; j += 2) { - float16 e2 = n[H2(j + flip)]; - float16 e4 = e2; - - d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); - d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float32 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t neg_real = flip ^ neg_imag; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. 
*/ - neg_real <<= 31; - neg_imag <<= 31; - - for (i = 0; i < opr_sz / 4; i += 2) { - float32 e2 = n[H4(i + flip)]; - float32 e1 = m[H4(i + flip)] ^ neg_real; - float32 e4 = e2; - float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; - - d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); - d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float32 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; - intptr_t elements = opr_sz / sizeof(float32); - intptr_t eltspersegment = 16 / sizeof(float32); - intptr_t i, j; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 31; - neg_imag <<= 31; - - for (i = 0; i < elements; i += eltspersegment) { - float32 mr = m[H4(i + 2 * index + 0)]; - float32 mi = m[H4(i + 2 * index + 1)]; - float32 e1 = neg_real ^ (flip ? mi : mr); - float32 e3 = neg_imag ^ (flip ? mr : mi); - - for (j = i; j < i + eltspersegment; j += 2) { - float32 e2 = n[H4(j + flip)]; - float32 e4 = e2; - - d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); - d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float64 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint64_t neg_real = flip ^ neg_imag; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 63; - neg_imag <<= 63; - - for (i = 0; i < opr_sz / 8; i += 2) { - float64 e2 = n[i + flip]; - float64 e1 = m[i + flip] ^ neg_real; - float64 e4 = e2; - float64 e3 = m[i + 1 - flip] ^ neg_imag; - - d[i] = float64_muladd(e2, e1, a[i], 0, fpst); - d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * Floating point comparisons producing an integer result (all 1s or all 0s). - * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. - * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
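 * (Hence the leading minus in the helpers below: negating softfloat's 0/1
 * in an unsigned return type yields 0x0000.../0xffff..., i.e. the all-zeros
 * or all-ones lane value the comparison must produce.)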
- */ -static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) -{ - return -float16_eq_quiet(op1, op2, stat); -} - -static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) -{ - return -float32_eq_quiet(op1, op2, stat); -} - -static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) -{ - return -float16_le(op2, op1, stat); -} - -static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) -{ - return -float32_le(op2, op1, stat); -} - -static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) -{ - return -float16_lt(op2, op1, stat); -} - -static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) -{ - return -float32_lt(op2, op1, stat); -} - -static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) -{ - return -float16_le(float16_abs(op2), float16_abs(op1), stat); -} - -static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) -{ - return -float32_le(float32_abs(op2), float32_abs(op1), stat); -} - -static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) -{ - return -float16_lt(float16_abs(op2), float16_abs(op1), stat); -} - -static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) -{ - return -float32_lt(float32_abs(op2), float32_abs(op1), stat); -} - -static int16_t vfp_tosszh(float16 x, void *fpstp) -{ - float_status *fpst = fpstp; - if (float16_is_any_nan(x)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_int16_round_to_zero(x, fpst); -} - -static uint16_t vfp_touszh(float16 x, void *fpstp) -{ - float_status *fpst = fpstp; - if (float16_is_any_nan(x)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_uint16_round_to_zero(x, fpst); -} - -#define DO_2OP(NAME, FUNC, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], stat); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) -DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) -DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) - -DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) -DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) -DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) - -DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) -DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) - -DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) -DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) -DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) -DO_2OP(gvec_touizs, helper_vfp_touizs, float32) -DO_2OP(gvec_sstoh, int16_to_float16, int16_t) -DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) -DO_2OP(gvec_tosszh, vfp_tosszh, float16) -DO_2OP(gvec_touszh, vfp_touszh, float16) - -#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ - static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ - { \ - return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ - } - -#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ - static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ - { \ - return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ - } - -#define DO_2OP_CMP0(FN, CMPOP, DIRN) \ - WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ - WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ - DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ - DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) - -DO_2OP_CMP0(cgt, cgt, FWD) -DO_2OP_CMP0(cge, cge, FWD) -DO_2OP_CMP0(ceq, ceq, FWD) 
-DO_2OP_CMP0(clt, cgt, REV) -DO_2OP_CMP0(cle, cge, REV) - -#undef DO_2OP -#undef DO_2OP_CMP0 - -/* Floating-point trigonometric starting value. - * See the ARM ARM pseudocode function FPTrigSMul. - */ -static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) -{ - float16 result = float16_mul(op1, op1, stat); - if (!float16_is_any_nan(result)) { - result = float16_set_sign(result, op2 & 1); - } - return result; -} - -static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) -{ - float32 result = float32_mul(op1, op1, stat); - if (!float32_is_any_nan(result)) { - result = float32_set_sign(result, op2 & 1); - } - return result; -} - -static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) -{ - float64 result = float64_mul(op1, op1, stat); - if (!float64_is_any_nan(result)) { - result = float64_set_sign(result, op2 & 1); - } - return result; -} - -static float16 float16_abd(float16 op1, float16 op2, float_status *stat) -{ - return float16_abs(float16_sub(op1, op2, stat)); -} - -static float32 float32_abd(float32 op1, float32 op2, float_status *stat) -{ - return float32_abs(float32_sub(op1, op2, stat)); -} - -/* - * Reciprocal step. These are the AArch32 version which uses a - * non-fused multiply-and-subtract. - */ -static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) -{ - op1 = float16_squash_input_denormal(op1, stat); - op2 = float16_squash_input_denormal(op2, stat); - - if ((float16_is_infinity(op1) && float16_is_zero(op2)) || - (float16_is_infinity(op2) && float16_is_zero(op1))) { - return float16_two; - } - return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); -} - -static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) -{ - op1 = float32_squash_input_denormal(op1, stat); - op2 = float32_squash_input_denormal(op2, stat); - - if ((float32_is_infinity(op1) && float32_is_zero(op2)) || - (float32_is_infinity(op2) && float32_is_zero(op1))) { - return float32_two; - } - return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); -} - -/* Reciprocal square-root step. AArch32 non-fused semantics. 
*/ -static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) -{ - op1 = float16_squash_input_denormal(op1, stat); - op2 = float16_squash_input_denormal(op2, stat); - - if ((float16_is_infinity(op1) && float16_is_zero(op2)) || - (float16_is_infinity(op2) && float16_is_zero(op1))) { - return float16_one_point_five; - } - op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); - return float16_div(op1, float16_two, stat); -} - -static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) -{ - op1 = float32_squash_input_denormal(op1, stat); - op2 = float32_squash_input_denormal(op2, stat); - - if ((float32_is_infinity(op1) && float32_is_zero(op2)) || - (float32_is_infinity(op2) && float32_is_zero(op1))) { - return float32_one_point_five; - } - op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); - return float32_div(op1, float32_two, stat); -} - -#define DO_3OP(NAME, FUNC, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], m[i], stat); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_3OP(gvec_fadd_h, float16_add, float16) -DO_3OP(gvec_fadd_s, float32_add, float32) -DO_3OP(gvec_fadd_d, float64_add, float64) - -DO_3OP(gvec_fsub_h, float16_sub, float16) -DO_3OP(gvec_fsub_s, float32_sub, float32) -DO_3OP(gvec_fsub_d, float64_sub, float64) - -DO_3OP(gvec_fmul_h, float16_mul, float16) -DO_3OP(gvec_fmul_s, float32_mul, float32) -DO_3OP(gvec_fmul_d, float64_mul, float64) - -DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) -DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) -DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) - -DO_3OP(gvec_fabd_h, float16_abd, float16) -DO_3OP(gvec_fabd_s, float32_abd, float32) - -DO_3OP(gvec_fceq_h, float16_ceq, float16) -DO_3OP(gvec_fceq_s, float32_ceq, float32) - -DO_3OP(gvec_fcge_h, float16_cge, float16) -DO_3OP(gvec_fcge_s, float32_cge, float32) - -DO_3OP(gvec_fcgt_h, float16_cgt, float16) -DO_3OP(gvec_fcgt_s, float32_cgt, float32) - -DO_3OP(gvec_facge_h, float16_acge, float16) -DO_3OP(gvec_facge_s, float32_acge, float32) - -DO_3OP(gvec_facgt_h, float16_acgt, float16) -DO_3OP(gvec_facgt_s, float32_acgt, float32) - -DO_3OP(gvec_fmax_h, float16_max, float16) -DO_3OP(gvec_fmax_s, float32_max, float32) - -DO_3OP(gvec_fmin_h, float16_min, float16) -DO_3OP(gvec_fmin_s, float32_min, float32) - -DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) -DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) - -DO_3OP(gvec_fminnum_h, float16_minnum, float16) -DO_3OP(gvec_fminnum_s, float32_minnum, float32) - -DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) -DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) - -DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) -DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) - -#ifdef TARGET_AARCH64 - -DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) -DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) -DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) - -DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) -DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) -DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) - -#endif -#undef DO_3OP - -/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ -static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_add(dest, float16_mul(op1, op2, stat), stat); -} - -static float32 
float32_muladd_nf(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_add(dest, float32_mul(op1, op2, stat), stat); -} - -static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_sub(dest, float16_mul(op1, op2, stat), stat); -} - -static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_sub(dest, float32_mul(op1, op2, stat), stat); -} - -/* Fused versions; these have the semantics Neon VFMA/VFMS want */ -static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_muladd(op1, op2, dest, 0, stat); -} - -static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_muladd(op1, op2, dest, 0, stat); -} - -static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_muladd(float16_chs(op1), op2, dest, 0, stat); -} - -static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_muladd(float32_chs(op1), op2, dest, 0, stat); -} - -#define DO_MULADD(NAME, FUNC, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(d[i], n[i], m[i], stat); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) -DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) - -DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) -DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) - -DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) -DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) - -DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) -DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) - -/* For the indexed ops, SVE applies the index per 128-bit vector segment. - * For AdvSIMD, there is of course only one such vector segment. 
- */ - -#define DO_MUL_IDX(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - intptr_t idx = simd_data(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = n[i + j] * mm; \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) -DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) -DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) - -#undef DO_MUL_IDX - -#define DO_MLA_IDX(NAME, TYPE, OP, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - intptr_t idx = simd_data(desc); \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = a[i + j] OP n[i + j] * mm; \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) -DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) -DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) - -DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) -DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) -DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) - -#undef DO_MLA_IDX - -#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - intptr_t idx = simd_data(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = TYPE##_##ADD(d[i + j], \ - TYPE##_mul(n[i + j], mm, stat), stat); \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -#define float16_nop(N, M, S) (M) -#define float32_nop(N, M, S) (M) -#define float64_nop(N, M, S) (M) - -DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) -DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) -DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) - -/* - * Non-fused multiply-accumulate operations, for Neon. NB that unlike - * the fused ops below they assume accumulate both from and into Vd. 
- */ -DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) -DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) -DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) -DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) - -#undef float16_nop -#undef float32_nop -#undef float64_nop -#undef DO_FMUL_IDX - -#define DO_FMLA_IDX(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ - void *stat, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ - intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - op1_neg <<= (8 * sizeof(TYPE) - 1); \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ - mm, a[i + j], 0, stat); \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) -DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) -DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) - -#undef DO_FMLA_IDX - -#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ -void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ - bool q = false; \ - for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ - WTYPE dd = (WTYPE)n[i] OP m[i]; \ - if (dd < MIN) { \ - dd = MIN; \ - q = true; \ - } else if (dd > MAX) { \ - dd = MAX; \ - q = true; \ - } \ - d[i] = dd; \ - } \ - if (q) { \ - uint32_t *qc = vq; \ - qc[0] = 1; \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) -DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) -DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) - -DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) -DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) -DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) - -DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) -DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) -DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) - -DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) -DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) -DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) - -#undef DO_SAT - -void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[i], mm = m[i], dd = nn + mm; - if (dd < nn) { - dd = UINT64_MAX; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[i], mm = m[i], dd = nn - mm; - if (nn < mm) { - dd = 0; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = 
vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - int64_t nn = n[i], mm = m[i], dd = nn + mm; - if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { - dd = (nn >> 63) ^ ~INT64_MIN; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - int64_t nn = n[i], mm = m[i], dd = nn - mm; - if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { - dd = (nn >> 63) ^ ~INT64_MIN; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - - -#define DO_SRA(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] += n[i] >> shift; \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_SRA(gvec_ssra_b, int8_t) -DO_SRA(gvec_ssra_h, int16_t) -DO_SRA(gvec_ssra_s, int32_t) -DO_SRA(gvec_ssra_d, int64_t) - -DO_SRA(gvec_usra_b, uint8_t) -DO_SRA(gvec_usra_h, uint16_t) -DO_SRA(gvec_usra_s, uint32_t) -DO_SRA(gvec_usra_d, uint64_t) - -#undef DO_SRA - -#define DO_RSHR(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - TYPE tmp = n[i] >> (shift - 1); \ - d[i] = (tmp >> 1) + (tmp & 1); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_RSHR(gvec_srshr_b, int8_t) -DO_RSHR(gvec_srshr_h, int16_t) -DO_RSHR(gvec_srshr_s, int32_t) -DO_RSHR(gvec_srshr_d, int64_t) - -DO_RSHR(gvec_urshr_b, uint8_t) -DO_RSHR(gvec_urshr_h, uint16_t) -DO_RSHR(gvec_urshr_s, uint32_t) -DO_RSHR(gvec_urshr_d, uint64_t) - -#undef DO_RSHR - -#define DO_RSRA(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - TYPE tmp = n[i] >> (shift - 1); \ - d[i] += (tmp >> 1) + (tmp & 1); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_RSRA(gvec_srsra_b, int8_t) -DO_RSRA(gvec_srsra_h, int16_t) -DO_RSRA(gvec_srsra_s, int32_t) -DO_RSRA(gvec_srsra_d, int64_t) - -DO_RSRA(gvec_ursra_b, uint8_t) -DO_RSRA(gvec_ursra_h, uint16_t) -DO_RSRA(gvec_ursra_s, uint32_t) -DO_RSRA(gvec_ursra_d, uint64_t) - -#undef DO_RSRA - -#define DO_SRI(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_SRI(gvec_sri_b, uint8_t) -DO_SRI(gvec_sri_h, uint16_t) -DO_SRI(gvec_sri_s, uint32_t) -DO_SRI(gvec_sri_d, uint64_t) - -#undef DO_SRI - -#define DO_SLI(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - 
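/*
 * Illustrative sketch (hypothetical helpers, not part of the patch): the
 * DO_SRI and DO_SLI macros above implement Neon VSRI/VSLI by using
 * deposit64() to merge the shifted source lane into the destination lane
 * while preserving the destination bits that the shift does not reach.
 * For a single 8-bit lane the same merge can be written with plain masks;
 * the function names below are made up for illustration and assume the
 * usual <stdint.h> types.
 */
static inline uint8_t sri8_example(uint8_t d, uint8_t n, int shift)
{
    /* VSRI: keep the top 'shift' bits of d, insert n >> shift below them. */
    uint8_t keep = shift >= 8 ? 0xff : (uint8_t)(0xff << (8 - shift));
    return (d & keep) | (uint8_t)(n >> shift);
}

static inline uint8_t sli8_example(uint8_t d, uint8_t n, int shift)
{
    /* VSLI: keep the low 'shift' bits of d, insert n << shift above them. */
    uint8_t keep = (uint8_t)((1u << shift) - 1);
    return (d & keep) | (uint8_t)(n << shift);
}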
-DO_SLI(gvec_sli_b, uint8_t) -DO_SLI(gvec_sli_h, uint16_t) -DO_SLI(gvec_sli_s, uint32_t) -DO_SLI(gvec_sli_d, uint64_t) - -#undef DO_SLI - -/* - * Convert float16 to float32, raising no exceptions and - * preserving exceptional values, including SNaN. - * This is effectively an unpack+repack operation. - */ -static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) -{ - const int f16_bias = 15; - const int f32_bias = 127; - uint32_t sign = extract32(f16, 15, 1); - uint32_t exp = extract32(f16, 10, 5); - uint32_t frac = extract32(f16, 0, 10); - - if (exp == 0x1f) { - /* Inf or NaN */ - exp = 0xff; - } else if (exp == 0) { - /* Zero or denormal. */ - if (frac != 0) { - if (fz16) { - frac = 0; - } else { - /* - * Denormal; these are all normal float32. - * Shift the fraction so that the msb is at bit 11, - * then remove bit 11 as the implicit bit of the - * normalized float32. Note that we still go through - * the shift for normal numbers below, to put the - * float32 fraction at the right place. - */ - int shift = clz32(frac) - 21; - frac = (frac << shift) & 0x3ff; - exp = f32_bias - f16_bias - shift + 1; - } - } - } else { - /* Normal number; adjust the bias. */ - exp += f32_bias - f16_bias; - } - sign <<= 31; - exp <<= 23; - frac <<= 23 - 10; - - return sign | exp | frac; -} - -static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) -{ - /* - * Branchless load of u32[0], u64[0], u32[1], or u64[1]. - * Load the 2nd qword iff is_q & is_2. - * Shift to the 2nd dword iff !is_q & is_2. - * For !is_q & !is_2, the upper bits of the result are garbage. - */ - return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); -} - -/* - * Note that FMLAL requires oprsz == 8 or oprsz == 16, - * as there is not yet SVE versions that might use blocking. - */ - -static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, - uint32_t desc, bool fz16) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); - int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - int is_q = oprsz == 16; - uint64_t n_4, m_4; - - /* Pre-load all of the f16 data, avoiding overlap issues. */ - n_4 = load4_f16(vn, is_q, is_2); - m_4 = load4_f16(vm, is_q, is_2); - - /* Negate all inputs for FMLSL at once. 
*/ - if (is_s) { - n_4 ^= 0x8000800080008000ull; - } - - for (i = 0; i < oprsz / 4; i++) { - float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); - float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); - d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, - void *venv, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - CPUARMState *env = venv; - float_status *status = &env->vfp.fp_status; - bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); - - for (i = 0; i < oprsz; i += sizeof(float32)) { - float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; - float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); - float32 nn = float16_to_float32_by_bits(nn_16, fz16); - float32 mm = float16_to_float32_by_bits(mm_16, fz16); - float32 aa = *(float32 *)(va + H1_4(i)); - - *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); - } -} - -static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, - uint32_t desc, bool fz16) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); - int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); - int is_q = oprsz == 16; - uint64_t n_4; - float32 m_1; - - /* Pre-load all of the f16 data, avoiding overlap issues. */ - n_4 = load4_f16(vn, is_q, is_2); - - /* Negate all inputs for FMLSL at once. 
*/ - if (is_s) { - n_4 ^= 0x8000800080008000ull; - } - - m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); - - for (i = 0; i < oprsz / 4; i++) { - float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); - d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, - void *venv, uint32_t desc) -{ - intptr_t i, j, oprsz = simd_oprsz(desc); - uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); - CPUARMState *env = venv; - float_status *status = &env->vfp.fp_status; - bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); - - for (i = 0; i < oprsz; i += 16) { - float16 mm_16 = *(float16 *)(vm + i + idx); - float32 mm = float16_to_float32_by_bits(mm_16, fz16); - - for (j = 0; j < 16; j += sizeof(float32)) { - float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; - float32 nn = float16_to_float32_by_bits(nn_16, fz16); - float32 aa = *(float32 *)(va + H1_4(i + j)); - - *(float32 *)(vd + H1_4(i + j)) = - float32_muladd(nn, mm, aa, 0, status); - } - } -} - -void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - int8_t mm = m[i]; - int8_t nn = n[i]; - int8_t res = 0; - if (mm >= 0) { - if (mm < 8) { - res = nn << mm; - } - } else { - res = nn >> (mm > -8 ? -mm : 7); - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - int8_t mm = m[i]; /* only 8 bits of shift are significant */ - int16_t nn = n[i]; - int16_t res = 0; - if (mm >= 0) { - if (mm < 16) { - res = nn << mm; - } - } else { - res = nn >> (mm > -16 ? 
-mm : 15); - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - int8_t mm = m[i]; - uint8_t nn = n[i]; - uint8_t res = 0; - if (mm >= 0) { - if (mm < 8) { - res = nn << mm; - } - } else { - if (mm > -8) { - res = nn >> -mm; - } - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - int8_t mm = m[i]; /* only 8 bits of shift are significant */ - uint16_t nn = n[i]; - uint16_t res = 0; - if (mm >= 0) { - if (mm < 16) { - res = nn << mm; - } - } else { - if (mm > -16) { - res = nn >> -mm; - } - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * 8x8->8 polynomial multiply. - * - * Polynomial multiplication is like integer multiplication except the - * partial products are XORed, not added. - * - * TODO: expose this as a generic vector operation, as it is a common - * crypto building block. - */ -void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - uint64_t rr = 0; - - for (j = 0; j < 8; ++j) { - uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; - rr ^= mm & mask; - mm = (mm << 1) & 0xfefefefefefefefeull; - nn >>= 1; - } - d[i] = rr; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * 64x64->128 polynomial multiply. - * Because of the lanes are not accessed in strict columns, - * this probably cannot be turned into a generic helper. - */ -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - intptr_t hi = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; i += 2) { - uint64_t nn = n[i + hi]; - uint64_t mm = m[i + hi]; - uint64_t rhi = 0; - uint64_t rlo = 0; - - /* Bit 0 can only influence the low 64-bit result. */ - if (nn & 1) { - rlo = mm; - } - - for (j = 1; j < 64; ++j) { - uint64_t mask = -((nn >> j) & 1); - rlo ^= (mm << j) & mask; - rhi ^= (mm >> (64 - j)) & mask; - } - d[i] = rlo; - d[i + 1] = rhi; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * 8x8->16 polynomial multiply. - * - * The byte inputs are expanded to (or extracted from) half-words. - * Note that neon and sve2 get the inputs from different positions. - * This allows 4 bytes to be processed in parallel with uint64_t. 
- */ - -static uint64_t expand_byte_to_half(uint64_t x) -{ - return (x & 0x000000ff) - | ((x & 0x0000ff00) << 8) - | ((x & 0x00ff0000) << 16) - | ((x & 0xff000000) << 24); -} - -uint64_t pmull_w(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 16; ++i) { - uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - -uint64_t pmull_h(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 8; ++i) { - uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - -void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - int hi = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t nn = n[hi], mm = m[hi]; - - d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); - nn >>= 32; - mm >>= 32; - d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); - - clear_tail(d, 16, simd_maxsz(desc)); -} - -#ifdef TARGET_AARCH64 -void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - int shift = simd_data(desc) * 8; - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; - uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; - - d[i] = pmull_h(nn, mm); - } -} - -static uint64_t pmull_d(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - - for (i = 0; i < 32; ++i) { - uint64_t mask = -((op1 >> i) & 1); - result ^= (op2 << i) & mask; - } - return result; -} - -void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t sel = H4(simd_data(desc)); - intptr_t i, opr_sz = simd_oprsz(desc); - uint32_t *n = vn, *m = vm; - uint64_t *d = vd; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); - } -} -#endif - -#define DO_CMP0(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - *(TYPE *)(vd + i) = -(nn OP 0); \ - } \ - clear_tail(vd, opr_sz, simd_maxsz(desc)); \ -} - -DO_CMP0(gvec_ceq0_b, int8_t, ==) -DO_CMP0(gvec_clt0_b, int8_t, <) -DO_CMP0(gvec_cle0_b, int8_t, <=) -DO_CMP0(gvec_cgt0_b, int8_t, >) -DO_CMP0(gvec_cge0_b, int8_t, >=) - -DO_CMP0(gvec_ceq0_h, int16_t, ==) -DO_CMP0(gvec_clt0_h, int16_t, <) -DO_CMP0(gvec_cle0_h, int16_t, <=) -DO_CMP0(gvec_cgt0_h, int16_t, >) -DO_CMP0(gvec_cge0_h, int16_t, >=) - -#undef DO_CMP0 - -#define DO_ABD(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - \ - for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ - d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_ABD(gvec_sabd_b, int8_t) -DO_ABD(gvec_sabd_h, int16_t) -DO_ABD(gvec_sabd_s, int32_t) -DO_ABD(gvec_sabd_d, int64_t) - -DO_ABD(gvec_uabd_b, uint8_t) -DO_ABD(gvec_uabd_h, uint16_t) -DO_ABD(gvec_uabd_s, uint32_t) -DO_ABD(gvec_uabd_d, uint64_t) - -#undef DO_ABD - -#define DO_ABA(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - \ - for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ - d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_ABA(gvec_saba_b, int8_t) -DO_ABA(gvec_saba_h, int16_t) -DO_ABA(gvec_saba_s, int32_t) -DO_ABA(gvec_saba_d, int64_t) - -DO_ABA(gvec_uaba_b, uint8_t) -DO_ABA(gvec_uaba_h, uint16_t) -DO_ABA(gvec_uaba_s, uint32_t) -DO_ABA(gvec_uaba_d, uint64_t) - -#undef DO_ABA - -#define DO_NEON_PAIRWISE(NAME, OP) \ - void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ - void *stat, uint32_t oprsz) \ - { \ - float_status *fpst = stat; \ - float32 *d = vd; \ - float32 *n = vn; \ - float32 *m = vm; \ - float32 r0, r1; \ - \ - /* Read all inputs before writing outputs in case vm == vd */ \ - r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ - r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ - \ - d[H4(0)] = r0; \ - d[H4(1)] = r1; \ - } \ - \ - void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ - void *stat, uint32_t oprsz) \ - { \ - float_status *fpst = stat; \ - float16 *d = vd; \ - float16 *n = vn; \ - float16 *m = vm; \ - float16 r0, r1, r2, r3; \ - \ - /* Read all inputs before writing outputs in case vm == vd */ \ - r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ - r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ - r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ - r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ - \ - d[H2(0)] = r0; \ - d[H2(1)] = r1; \ - d[H2(2)] = r2; \ - d[H2(3)] = r3; \ - } - -DO_NEON_PAIRWISE(neon_padd, add) -DO_NEON_PAIRWISE(neon_pmax, max) -DO_NEON_PAIRWISE(neon_pmin, min) - -#undef DO_NEON_PAIRWISE - -#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ - void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ - { \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - float_status *fpst = stat; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], shift, fpst); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ - } - -DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) -DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) -DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) -DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) - -#undef DO_VCVT_FIXED - -#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ - void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ - { \ - float_status *fpst = stat; \ - intptr_t i, oprsz = simd_oprsz(desc); \ - uint32_t rmode = simd_data(desc); \ - uint32_t prev_rmode = get_float_rounding_mode(fpst); \ - TYPE *d = vd, *n = vn; \ - set_float_rounding_mode(rmode, fpst); \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], 0, fpst); \ - } \ - set_float_rounding_mode(prev_rmode, fpst); \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ - } - -DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) -DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) -DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) -DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) - -#undef DO_VCVT_RMODE - -#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ - void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ - { \ - float_status *fpst = stat; \ - intptr_t i, oprsz = simd_oprsz(desc); \ - uint32_t rmode = simd_data(desc); \ - uint32_t prev_rmode = get_float_rounding_mode(fpst); \ - TYPE *d = vd, *n = 
vn; \ - set_float_rounding_mode(rmode, fpst); \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], fpst); \ - } \ - set_float_rounding_mode(prev_rmode, fpst); \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ - } - -DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) -DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) - -#undef DO_VRINT_RMODE - -#ifdef TARGET_AARCH64 -void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) -{ - const uint8_t *indices = vm; - CPUARMState *env = venv; - size_t oprsz = simd_oprsz(desc); - uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); - bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); - uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); - union { - uint8_t b[16]; - uint64_t d[2]; - } result; - - /* - * We must construct the final result in a temp, lest the output - * overlaps the input table. For TBL, begin with zero; for TBX, - * begin with the original register contents. Note that we always - * copy 16 bytes here to avoid an extra branch; clearing the high - * bits of the register for oprsz == 8 is handled below. - */ - if (is_tbx) { - memcpy(&result, vd, 16); - } else { - memset(&result, 0, 16); - } - - for (size_t i = 0; i < oprsz; ++i) { - uint32_t index = indices[H1(i)]; - - if (index < table_len) { - /* - * Convert index (a byte offset into the virtual table - * which is a series of 128-bit vectors concatenated) - * into the correct register element, bearing in mind - * that the table can wrap around from V31 to V0. - */ - const uint8_t *table = (const uint8_t *) - aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); - result.b[H1(i)] = table[H1(index % 16)]; - } - } - - memcpy(vd, &result, 16); - clear_tail(vd, oprsz, simd_maxsz(desc)); -} -#endif - -/* - * NxN -> N highpart multiply - * - * TODO: expose this as a generic vector operation. 
- */ - -void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ((int32_t)n[i] * m[i]) >> 8; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = ((int32_t)n[i] * m[i]) >> 16; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = ((int64_t)n[i] * m[i]) >> 32; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t discard; - - for (i = 0; i < opr_sz / 8; ++i) { - muls64(&discard, &d[i], n[i], m[i]); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ((uint32_t)n[i] * m[i]) >> 8; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = ((uint32_t)n[i] * m[i]) >> 16; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = ((uint64_t)n[i] * m[i]) >> 32; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t discard; - - for (i = 0; i < opr_sz / 8; ++i) { - mulu64(&discard, &d[i], n[i], m[i]); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - int shr = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ror64(n[i] ^ m[i], shr); - } - clear_tail(d, opr_sz * 8, simd_maxsz(desc)); -} - -/* - * Integer matrix-multiply accumulate - */ - -static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) -{ - int8_t *n = vn, *m = vm; - - for (intptr_t k = 0; k < 8; ++k) { - sum += n[H1(k)] * m[H1(k)]; - } - return sum; -} - -static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) -{ - uint8_t *n = vn, *m = vm; - - for (intptr_t k = 0; k < 8; ++k) { - sum += n[H1(k)] * m[H1(k)]; - } - return sum; -} - -static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) -{ - uint8_t *n = vn; - int8_t *m = vm; - - for (intptr_t k = 0; k < 8; ++k) { - sum += n[H1(k)] * m[H1(k)]; - } - return sum; -} - -static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, - uint32_t (*inner_loop)(uint32_t, void *, void *)) -{ - intptr_t seg, opr_sz = simd_oprsz(desc); - - for (seg = 0; seg < opr_sz; seg += 16) { - uint32_t *d = vd + seg; - uint32_t *a = va + seg; - uint32_t sum0, sum1, sum2, sum3; - - /* - * 
Process the entire segment at once, writing back the - * results only after we've consumed all of the inputs. - * - * Key to indices by column: - * i j i j - */ - sum0 = a[H4(0 + 0)]; - sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); - sum1 = a[H4(0 + 1)]; - sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); - sum2 = a[H4(2 + 0)]; - sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); - sum3 = a[H4(2 + 1)]; - sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); - - d[H4(0)] = sum0; - d[H4(1)] = sum1; - d[H4(2)] = sum2; - d[H4(3)] = sum3; - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -#define DO_MMLA_B(NAME, INNER) \ - void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ - { do_mmla_b(vd, vn, vm, va, desc, INNER); } - -DO_MMLA_B(gvec_smmla_b, do_smmla_b) -DO_MMLA_B(gvec_ummla_b, do_ummla_b) -DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) - -/* - * BFloat16 Dot Product - */ - -float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) -{ - /* FPCR is ignored for BFDOT and BFMMLA. */ - float_status bf_status = { - .tininess_before_rounding = float_tininess_before_rounding, - .float_rounding_mode = float_round_to_odd_inf, - .flush_to_zero = true, - .flush_inputs_to_zero = true, - .default_nan_mode = true, - }; - float32 t1, t2; - - /* - * Extract each BFloat16 from the element pair, and shift - * them such that they become float32. - */ - t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); - t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); - t1 = float32_add(t1, t2, &bf_status); - t1 = float32_add(sum, t1, &bf_status); - - return t1; -} - -void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - float32 *d = vd, *a = va; - uint32_t *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = bfdotadd(a[i], n[i], m[i]); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - intptr_t index = simd_data(desc); - intptr_t elements = opr_sz / 4; - intptr_t eltspersegment = MIN(16 / 4, elements); - float32 *d = vd, *a = va; - uint32_t *n = vn, *m = vm; - - for (i = 0; i < elements; i += eltspersegment) { - uint32_t m_idx = m[i + H4(index)]; - - for (j = i; j < i + eltspersegment; j++) { - d[j] = bfdotadd(a[j], n[j], m_idx); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t s, opr_sz = simd_oprsz(desc); - float32 *d = vd, *a = va; - uint32_t *n = vn, *m = vm; - - for (s = 0; s < opr_sz / 4; s += 4) { - float32 sum00, sum01, sum10, sum11; - - /* - * Process the entire segment at once, writing back the - * results only after we've consumed all of the inputs. 
- * - * Key to indicies by column: - * i j i k j k - */ - sum00 = a[s + H4(0 + 0)]; - sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); - sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); - - sum01 = a[s + H4(0 + 1)]; - sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); - sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); - - sum10 = a[s + H4(2 + 0)]; - sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); - sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); - - sum11 = a[s + H4(2 + 1)]; - sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); - sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); - - d[s + H4(0 + 0)] = sum00; - d[s + H4(0 + 1)] = sum01; - d[s + H4(2 + 0)] = sum10; - d[s + H4(2 + 1)] = sum11; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, - void *stat, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - intptr_t sel = simd_data(desc); - float32 *d = vd, *a = va; - bfloat16 *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - float32 nn = n[H2(i * 2 + sel)] << 16; - float32 mm = m[H2(i * 2 + sel)] << 16; - d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, - void *va, void *stat, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); - intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); - intptr_t elements = opr_sz / 4; - intptr_t eltspersegment = MIN(16 / 4, elements); - float32 *d = vd, *a = va; - bfloat16 *n = vn, *m = vm; - - for (i = 0; i < elements; i += eltspersegment) { - float32 m_idx = m[H2(2 * i + index)] << 16; - - for (j = i; j < i + eltspersegment; j++) { - float32 n_j = n[H2(2 * j + sel)] << 16; - d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -#define DO_CLAMP(NAME, TYPE) \ -void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE aa = *(TYPE *)(a + i); \ - TYPE nn = *(TYPE *)(n + i); \ - TYPE mm = *(TYPE *)(m + i); \ - TYPE dd = MIN(MAX(aa, nn), mm); \ - *(TYPE *)(d + i) = dd; \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_CLAMP(gvec_sclamp_b, int8_t) -DO_CLAMP(gvec_sclamp_h, int16_t) -DO_CLAMP(gvec_sclamp_s, int32_t) -DO_CLAMP(gvec_sclamp_d, int64_t) - -DO_CLAMP(gvec_uclamp_b, uint8_t) -DO_CLAMP(gvec_uclamp_h, uint16_t) -DO_CLAMP(gvec_uclamp_s, uint32_t) -DO_CLAMP(gvec_uclamp_d, uint64_t) diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h deleted file mode 100644 index 1f4ed80..0000000 --- a/target/arm/vec_internal.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * ARM AdvSIMD / SVE Vector Helpers - * - * Copyright (c) 2020 Linaro - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#ifndef TARGET_ARM_VEC_INTERNAL_H -#define TARGET_ARM_VEC_INTERNAL_H - -/* - * Note that vector data is stored in host-endian 64-bit chunks, - * so addressing units smaller than that needs a host-endian fixup. - * - * The H macros are used when indexing an array of elements of size N. - * - * The H1_ macros are used when performing byte arithmetic and then - * casting the final pointer to a type of size N. - */ -#if HOST_BIG_ENDIAN -#define H1(x) ((x) ^ 7) -#define H1_2(x) ((x) ^ 6) -#define H1_4(x) ((x) ^ 4) -#define H2(x) ((x) ^ 3) -#define H4(x) ((x) ^ 1) -#else -#define H1(x) (x) -#define H1_2(x) (x) -#define H1_4(x) (x) -#define H2(x) (x) -#define H4(x) (x) -#endif -/* - * Access to 64-bit elements isn't host-endian dependent; we provide H8 - * and H1_8 so that when a function is being generated from a macro we - * can pass these rather than an empty macro argument, for clarity. - */ -#define H8(x) (x) -#define H1_8(x) (x) - -/* - * Expand active predicate bits to bytes, for byte elements. - */ -extern const uint64_t expand_pred_b_data[256]; -static inline uint64_t expand_pred_b(uint8_t byte) -{ - return expand_pred_b_data[byte]; -} - -/* Similarly for half-word elements. */ -extern const uint64_t expand_pred_h_data[0x55 + 1]; -static inline uint64_t expand_pred_h(uint8_t byte) -{ - return expand_pred_h_data[byte & 0x55]; -} - -static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) -{ - uint64_t *d = vd + opr_sz; - uintptr_t i; - - for (i = opr_sz; i < max_sz; i += 8) { - *d++ = 0; - } -} - -static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits, - bool round, uint32_t *sat) -{ - if (shift <= -bits) { - /* Rounding the sign bit always produces 0. */ - if (round) { - return 0; - } - return src >> 31; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < bits) { - int32_t val = src << shift; - if (bits == 32) { - if (!sat || val >> shift == src) { - return val; - } - } else { - int32_t extval = sextract32(val, 0, bits); - if (!sat || val == extval) { - return extval; - } - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return (1u << (bits - 1)) - (src >= 0); -} - -static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits, - bool round, uint32_t *sat) -{ - if (shift <= -(bits + round)) { - return 0; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < bits) { - uint32_t val = src << shift; - if (bits == 32) { - if (!sat || val >> shift == src) { - return val; - } - } else { - uint32_t extval = extract32(val, 0, bits); - if (!sat || val == extval) { - return extval; - } - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return MAKE_64BIT_MASK(0, bits); -} - -static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits, - bool round, uint32_t *sat) -{ - if (sat && src < 0) { - *sat = 1; - return 0; - } - return do_uqrshl_bhs(src, shift, bits, round, sat); -} - -static inline int64_t do_sqrshl_d(int64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - if (shift <= -64) { - /* Rounding the sign bit always produces 0. 
*/ - if (round) { - return 0; - } - return src >> 63; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < 64) { - int64_t val = src << shift; - if (!sat || val >> shift == src) { - return val; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return src < 0 ? INT64_MIN : INT64_MAX; -} - -static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - if (shift <= -(64 + round)) { - return 0; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < 64) { - uint64_t val = src << shift; - if (!sat || val >> shift == src) { - return val; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return UINT64_MAX; -} - -static inline int64_t do_suqrshl_d(int64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - if (sat && src < 0) { - *sat = 1; - return 0; - } - return do_uqrshl_d(src, shift, round, sat); -} - -int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool); -int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); -int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); -int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); - -/* - * 8 x 8 -> 16 vector polynomial multiply where the inputs are - * in the low 8 bits of each 16-bit element -*/ -uint64_t pmull_h(uint64_t op1, uint64_t op2); -/* - * 16 x 16 -> 32 vector polynomial multiply where the inputs are - * in the low 16 bits of each 32-bit element - */ -uint64_t pmull_w(uint64_t op1, uint64_t op2); - -/** - * bfdotadd: - * @sum: addend - * @e1, @e2: multiplicand vectors - * - * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. - * The @e1 and @e2 operands correspond to the 32-bit source vector - * slots and contain two Bfloat16 values each. - * - * Corresponds to the ARM pseudocode function BFDotAdd. 
- */ -float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2); - -#endif /* TARGET_ARM_VEC_INTERNAL_H */ -- cgit v1.1 From 9def656e7a23515d5afd5e5e350574d1dfb7fcc9 Mon Sep 17 00:00:00 2001 From: Claudio Fontana Date: Fri, 17 Feb 2023 17:11:31 -0300 Subject: target/arm: Move psci.c into the tcg directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Claudio Fontana Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- target/arm/meson.build | 1 - target/arm/psci.c | 222 --------------------------------------------- target/arm/tcg/meson.build | 4 + target/arm/tcg/psci.c | 222 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 226 insertions(+), 223 deletions(-) delete mode 100644 target/arm/psci.c create mode 100644 target/arm/tcg/psci.c diff --git a/target/arm/meson.build b/target/arm/meson.build index 3e2f403..a5191b5 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -22,7 +22,6 @@ arm_softmmu_ss.add(files( 'arm-powerctl.c', 'machine.c', 'monitor.c', - 'psci.c', 'ptw.c', )) diff --git a/target/arm/psci.c b/target/arm/psci.c deleted file mode 100644 index 6c1239b..0000000 --- a/target/arm/psci.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (C) 2014 - Linaro - * Author: Rob Herring - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "exec/helper-proto.h" -#include "kvm-consts.h" -#include "qemu/main-loop.h" -#include "sysemu/runstate.h" -#include "internals.h" -#include "arm-powerctl.h" - -bool arm_is_psci_call(ARMCPU *cpu, int excp_type) -{ - /* - * Return true if the exception type matches the configured PSCI conduit. - * This is called before the SMC/HVC instruction is executed, to decide - * whether we should treat it as a PSCI call or with the architecturally - * defined behaviour for an SMC or HVC (which might be UNDEF or trap - * to EL2 or to EL3). - */ - - switch (excp_type) { - case EXCP_HVC: - if (cpu->psci_conduit != QEMU_PSCI_CONDUIT_HVC) { - return false; - } - break; - case EXCP_SMC: - if (cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) { - return false; - } - break; - default: - return false; - } - - return true; -} - -void arm_handle_psci_call(ARMCPU *cpu) -{ - /* - * This function partially implements the logic for dispatching Power State - * Coordination Interface (PSCI) calls (as described in ARM DEN 0022D.b), - * to the extent required for bringing up and taking down secondary cores, - * and for handling reset and poweroff requests. 
- * Additional information about the calling convention used is available in - * the document 'SMC Calling Convention' (ARM DEN 0028) - */ - CPUARMState *env = &cpu->env; - uint64_t param[4]; - uint64_t context_id, mpidr; - target_ulong entry; - int32_t ret = 0; - int i; - - for (i = 0; i < 4; i++) { - /* - * All PSCI functions take explicit 32-bit or native int sized - * arguments so we can simply zero-extend all arguments regardless - * of which exact function we are about to call. - */ - param[i] = is_a64(env) ? env->xregs[i] : env->regs[i]; - } - - if ((param[0] & QEMU_PSCI_0_2_64BIT) && !is_a64(env)) { - ret = QEMU_PSCI_RET_NOT_SUPPORTED; - goto err; - } - - switch (param[0]) { - CPUState *target_cpu_state; - ARMCPU *target_cpu; - - case QEMU_PSCI_0_2_FN_PSCI_VERSION: - ret = QEMU_PSCI_VERSION_1_1; - break; - case QEMU_PSCI_0_2_FN_MIGRATE_INFO_TYPE: - ret = QEMU_PSCI_0_2_RET_TOS_MIGRATION_NOT_REQUIRED; /* No trusted OS */ - break; - case QEMU_PSCI_0_2_FN_AFFINITY_INFO: - case QEMU_PSCI_0_2_FN64_AFFINITY_INFO: - mpidr = param[1]; - - switch (param[2]) { - case 0: - target_cpu_state = arm_get_cpu_by_id(mpidr); - if (!target_cpu_state) { - ret = QEMU_PSCI_RET_INVALID_PARAMS; - break; - } - target_cpu = ARM_CPU(target_cpu_state); - - g_assert(qemu_mutex_iothread_locked()); - ret = target_cpu->power_state; - break; - default: - /* Everything above affinity level 0 is always on. */ - ret = 0; - } - break; - case QEMU_PSCI_0_2_FN_SYSTEM_RESET: - qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); - /* QEMU reset and shutdown are async requests, but PSCI - * mandates that we never return from the reset/shutdown - * call, so power the CPU off now so it doesn't execute - * anything further. - */ - goto cpu_off; - case QEMU_PSCI_0_2_FN_SYSTEM_OFF: - qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); - goto cpu_off; - case QEMU_PSCI_0_1_FN_CPU_ON: - case QEMU_PSCI_0_2_FN_CPU_ON: - case QEMU_PSCI_0_2_FN64_CPU_ON: - { - /* The PSCI spec mandates that newly brought up CPUs start - * in the highest exception level which exists and is enabled - * on the calling CPU. Since the QEMU PSCI implementation is - * acting as a "fake EL3" or "fake EL2" firmware, this for us - * means that we want to start at the highest NS exception level - * that we are providing to the guest. - * The execution mode should be that which is currently in use - * by the same exception level on the calling CPU. - * The CPU should be started with the context_id value - * in x0 (if AArch64) or r0 (if AArch32). - */ - int target_el = arm_feature(env, ARM_FEATURE_EL2) ? 
2 : 1; - bool target_aarch64 = arm_el_is_aa64(env, target_el); - - mpidr = param[1]; - entry = param[2]; - context_id = param[3]; - ret = arm_set_cpu_on(mpidr, entry, context_id, - target_el, target_aarch64); - break; - } - case QEMU_PSCI_0_1_FN_CPU_OFF: - case QEMU_PSCI_0_2_FN_CPU_OFF: - goto cpu_off; - case QEMU_PSCI_0_1_FN_CPU_SUSPEND: - case QEMU_PSCI_0_2_FN_CPU_SUSPEND: - case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: - /* Affinity levels are not supported in QEMU */ - if (param[1] & 0xfffe0000) { - ret = QEMU_PSCI_RET_INVALID_PARAMS; - break; - } - /* Powerdown is not supported, we always go into WFI */ - if (is_a64(env)) { - env->xregs[0] = 0; - } else { - env->regs[0] = 0; - } - helper_wfi(env, 4); - break; - case QEMU_PSCI_1_0_FN_PSCI_FEATURES: - switch (param[1]) { - case QEMU_PSCI_0_2_FN_PSCI_VERSION: - case QEMU_PSCI_0_2_FN_MIGRATE_INFO_TYPE: - case QEMU_PSCI_0_2_FN_AFFINITY_INFO: - case QEMU_PSCI_0_2_FN64_AFFINITY_INFO: - case QEMU_PSCI_0_2_FN_SYSTEM_RESET: - case QEMU_PSCI_0_2_FN_SYSTEM_OFF: - case QEMU_PSCI_0_1_FN_CPU_ON: - case QEMU_PSCI_0_2_FN_CPU_ON: - case QEMU_PSCI_0_2_FN64_CPU_ON: - case QEMU_PSCI_0_1_FN_CPU_OFF: - case QEMU_PSCI_0_2_FN_CPU_OFF: - case QEMU_PSCI_0_1_FN_CPU_SUSPEND: - case QEMU_PSCI_0_2_FN_CPU_SUSPEND: - case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: - case QEMU_PSCI_1_0_FN_PSCI_FEATURES: - if (!(param[1] & QEMU_PSCI_0_2_64BIT) || is_a64(env)) { - ret = 0; - break; - } - /* fallthrough */ - case QEMU_PSCI_0_1_FN_MIGRATE: - case QEMU_PSCI_0_2_FN_MIGRATE: - default: - ret = QEMU_PSCI_RET_NOT_SUPPORTED; - break; - } - break; - case QEMU_PSCI_0_1_FN_MIGRATE: - case QEMU_PSCI_0_2_FN_MIGRATE: - default: - ret = QEMU_PSCI_RET_NOT_SUPPORTED; - break; - } - -err: - if (is_a64(env)) { - env->xregs[0] = ret; - } else { - env->regs[0] = ret; - } - return; - -cpu_off: - ret = arm_set_cpu_off(cpu->mp_affinity); - /* notreached */ - /* sanity check in case something failed */ - assert(ret == QEMU_ARM_POWERCTL_RET_SUCCESS); -} diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index 1f27ba1..fa8a9ea 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -43,3 +43,7 @@ arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'sme_helper.c', 'sve_helper.c', )) + +arm_softmmu_ss.add(files( + 'psci.c', +)) diff --git a/target/arm/tcg/psci.c b/target/arm/tcg/psci.c new file mode 100644 index 0000000..6c1239b --- /dev/null +++ b/target/arm/tcg/psci.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2014 - Linaro + * Author: Rob Herring + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "kvm-consts.h" +#include "qemu/main-loop.h" +#include "sysemu/runstate.h" +#include "internals.h" +#include "arm-powerctl.h" + +bool arm_is_psci_call(ARMCPU *cpu, int excp_type) +{ + /* + * Return true if the exception type matches the configured PSCI conduit. 
+ * This is called before the SMC/HVC instruction is executed, to decide + * whether we should treat it as a PSCI call or with the architecturally + * defined behaviour for an SMC or HVC (which might be UNDEF or trap + * to EL2 or to EL3). + */ + + switch (excp_type) { + case EXCP_HVC: + if (cpu->psci_conduit != QEMU_PSCI_CONDUIT_HVC) { + return false; + } + break; + case EXCP_SMC: + if (cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) { + return false; + } + break; + default: + return false; + } + + return true; +} + +void arm_handle_psci_call(ARMCPU *cpu) +{ + /* + * This function partially implements the logic for dispatching Power State + * Coordination Interface (PSCI) calls (as described in ARM DEN 0022D.b), + * to the extent required for bringing up and taking down secondary cores, + * and for handling reset and poweroff requests. + * Additional information about the calling convention used is available in + * the document 'SMC Calling Convention' (ARM DEN 0028) + */ + CPUARMState *env = &cpu->env; + uint64_t param[4]; + uint64_t context_id, mpidr; + target_ulong entry; + int32_t ret = 0; + int i; + + for (i = 0; i < 4; i++) { + /* + * All PSCI functions take explicit 32-bit or native int sized + * arguments so we can simply zero-extend all arguments regardless + * of which exact function we are about to call. + */ + param[i] = is_a64(env) ? env->xregs[i] : env->regs[i]; + } + + if ((param[0] & QEMU_PSCI_0_2_64BIT) && !is_a64(env)) { + ret = QEMU_PSCI_RET_NOT_SUPPORTED; + goto err; + } + + switch (param[0]) { + CPUState *target_cpu_state; + ARMCPU *target_cpu; + + case QEMU_PSCI_0_2_FN_PSCI_VERSION: + ret = QEMU_PSCI_VERSION_1_1; + break; + case QEMU_PSCI_0_2_FN_MIGRATE_INFO_TYPE: + ret = QEMU_PSCI_0_2_RET_TOS_MIGRATION_NOT_REQUIRED; /* No trusted OS */ + break; + case QEMU_PSCI_0_2_FN_AFFINITY_INFO: + case QEMU_PSCI_0_2_FN64_AFFINITY_INFO: + mpidr = param[1]; + + switch (param[2]) { + case 0: + target_cpu_state = arm_get_cpu_by_id(mpidr); + if (!target_cpu_state) { + ret = QEMU_PSCI_RET_INVALID_PARAMS; + break; + } + target_cpu = ARM_CPU(target_cpu_state); + + g_assert(qemu_mutex_iothread_locked()); + ret = target_cpu->power_state; + break; + default: + /* Everything above affinity level 0 is always on. */ + ret = 0; + } + break; + case QEMU_PSCI_0_2_FN_SYSTEM_RESET: + qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); + /* QEMU reset and shutdown are async requests, but PSCI + * mandates that we never return from the reset/shutdown + * call, so power the CPU off now so it doesn't execute + * anything further. + */ + goto cpu_off; + case QEMU_PSCI_0_2_FN_SYSTEM_OFF: + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + goto cpu_off; + case QEMU_PSCI_0_1_FN_CPU_ON: + case QEMU_PSCI_0_2_FN_CPU_ON: + case QEMU_PSCI_0_2_FN64_CPU_ON: + { + /* The PSCI spec mandates that newly brought up CPUs start + * in the highest exception level which exists and is enabled + * on the calling CPU. Since the QEMU PSCI implementation is + * acting as a "fake EL3" or "fake EL2" firmware, this for us + * means that we want to start at the highest NS exception level + * that we are providing to the guest. + * The execution mode should be that which is currently in use + * by the same exception level on the calling CPU. + * The CPU should be started with the context_id value + * in x0 (if AArch64) or r0 (if AArch32). + */ + int target_el = arm_feature(env, ARM_FEATURE_EL2) ? 
2 : 1; + bool target_aarch64 = arm_el_is_aa64(env, target_el); + + mpidr = param[1]; + entry = param[2]; + context_id = param[3]; + ret = arm_set_cpu_on(mpidr, entry, context_id, + target_el, target_aarch64); + break; + } + case QEMU_PSCI_0_1_FN_CPU_OFF: + case QEMU_PSCI_0_2_FN_CPU_OFF: + goto cpu_off; + case QEMU_PSCI_0_1_FN_CPU_SUSPEND: + case QEMU_PSCI_0_2_FN_CPU_SUSPEND: + case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: + /* Affinity levels are not supported in QEMU */ + if (param[1] & 0xfffe0000) { + ret = QEMU_PSCI_RET_INVALID_PARAMS; + break; + } + /* Powerdown is not supported, we always go into WFI */ + if (is_a64(env)) { + env->xregs[0] = 0; + } else { + env->regs[0] = 0; + } + helper_wfi(env, 4); + break; + case QEMU_PSCI_1_0_FN_PSCI_FEATURES: + switch (param[1]) { + case QEMU_PSCI_0_2_FN_PSCI_VERSION: + case QEMU_PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case QEMU_PSCI_0_2_FN_AFFINITY_INFO: + case QEMU_PSCI_0_2_FN64_AFFINITY_INFO: + case QEMU_PSCI_0_2_FN_SYSTEM_RESET: + case QEMU_PSCI_0_2_FN_SYSTEM_OFF: + case QEMU_PSCI_0_1_FN_CPU_ON: + case QEMU_PSCI_0_2_FN_CPU_ON: + case QEMU_PSCI_0_2_FN64_CPU_ON: + case QEMU_PSCI_0_1_FN_CPU_OFF: + case QEMU_PSCI_0_2_FN_CPU_OFF: + case QEMU_PSCI_0_1_FN_CPU_SUSPEND: + case QEMU_PSCI_0_2_FN_CPU_SUSPEND: + case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: + case QEMU_PSCI_1_0_FN_PSCI_FEATURES: + if (!(param[1] & QEMU_PSCI_0_2_64BIT) || is_a64(env)) { + ret = 0; + break; + } + /* fallthrough */ + case QEMU_PSCI_0_1_FN_MIGRATE: + case QEMU_PSCI_0_2_FN_MIGRATE: + default: + ret = QEMU_PSCI_RET_NOT_SUPPORTED; + break; + } + break; + case QEMU_PSCI_0_1_FN_MIGRATE: + case QEMU_PSCI_0_2_FN_MIGRATE: + default: + ret = QEMU_PSCI_RET_NOT_SUPPORTED; + break; + } + +err: + if (is_a64(env)) { + env->xregs[0] = ret; + } else { + env->regs[0] = ret; + } + return; + +cpu_off: + ret = arm_set_cpu_off(cpu->mp_affinity); + /* notreached */ + /* sanity check in case something failed */ + assert(ret == QEMU_ARM_POWERCTL_RET_SUCCESS); +} -- cgit v1.1 From 2b77ad4de615542dd8f6b9886a816e744b0abffd Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:32 -0300 Subject: target/arm: Wrap arm_rebuild_hflags calls with tcg_enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is in preparation to moving the hflags code into its own file under the tcg/ directory. 
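For orientation, every touched call site in this patch converges on the same shape — a minimal sketch, not part of the patch itself; the wrapper name is hypothetical, while tcg_enabled(), the breakpoint/watchpoint updaters and arm_rebuild_hflags() are the real helpers used in the hunks below:

    #include "sysemu/tcg.h"     /* tcg_enabled() */

    /* Hypothetical reset-path excerpt showing the guard pattern. */
    static void hypothetical_reset_debug_and_hflags(ARMCPU *cpu)
    {
        CPUARMState *env = &cpu->env;

        if (tcg_enabled()) {
            /* Debug state and cached hflags are only consumed by TCG. */
            hw_breakpoint_update_all(cpu);
            hw_watchpoint_update_all(cpu);
            arm_rebuild_hflags(env);
        }
    }

Since the hflags are only read by TCG-generated code, skipping the rebuild on other accelerators changes nothing observable.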
Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- hw/arm/boot.c | 6 +++++- hw/intc/armv7m_nvic.c | 20 +++++++++++++------- target/arm/arm-powerctl.c | 7 +++++-- target/arm/cpu.c | 3 ++- target/arm/helper.c | 18 +++++++++++++----- target/arm/machine.c | 5 ++++- 6 files changed, 42 insertions(+), 17 deletions(-) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 3d7d11f..1e021c4 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -15,6 +15,7 @@ #include "hw/arm/boot.h" #include "hw/arm/linux-boot-if.h" #include "sysemu/kvm.h" +#include "sysemu/tcg.h" #include "sysemu/sysemu.h" #include "sysemu/numa.h" #include "hw/boards.h" @@ -827,7 +828,10 @@ static void do_cpu_reset(void *opaque) info->secondary_cpu_reset_hook(cpu, info); } } - arm_rebuild_hflags(env); + + if (tcg_enabled()) { + arm_rebuild_hflags(env); + } } } diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c index e545532..8e28905 100644 --- a/hw/intc/armv7m_nvic.c +++ b/hw/intc/armv7m_nvic.c @@ -18,6 +18,7 @@ #include "hw/intc/armv7m_nvic.h" #include "hw/irq.h" #include "hw/qdev-properties.h" +#include "sysemu/tcg.h" #include "sysemu/runstate.h" #include "target/arm/cpu.h" #include "exec/exec-all.h" @@ -2454,8 +2455,10 @@ static MemTxResult nvic_sysreg_write(void *opaque, hwaddr addr, /* This is UNPREDICTABLE; treat as RAZ/WI */ exit_ok: - /* Ensure any changes made are reflected in the cached hflags. */ - arm_rebuild_hflags(&s->cpu->env); + if (tcg_enabled()) { + /* Ensure any changes made are reflected in the cached hflags. */ + arm_rebuild_hflags(&s->cpu->env); + } return MEMTX_OK; } @@ -2636,11 +2639,14 @@ static void armv7m_nvic_reset(DeviceState *dev) } } - /* - * We updated state that affects the CPU's MMUidx and thus its hflags; - * and we can't guarantee that we run before the CPU reset function. - */ - arm_rebuild_hflags(&s->cpu->env); + if (tcg_enabled()) { + /* + * We updated state that affects the CPU's MMUidx and thus its + * hflags; and we can't guarantee that we run before the CPU + * reset function. 
+ */ + arm_rebuild_hflags(&s->cpu->env); + } } static void nvic_systick_trigger(void *opaque, int n, int level) diff --git a/target/arm/arm-powerctl.c b/target/arm/arm-powerctl.c index b75f813..326a031 100644 --- a/target/arm/arm-powerctl.c +++ b/target/arm/arm-powerctl.c @@ -15,6 +15,7 @@ #include "arm-powerctl.h" #include "qemu/log.h" #include "qemu/main-loop.h" +#include "sysemu/tcg.h" #ifndef DEBUG_ARM_POWERCTL #define DEBUG_ARM_POWERCTL 0 @@ -127,8 +128,10 @@ static void arm_set_cpu_on_async_work(CPUState *target_cpu_state, target_cpu->env.regs[0] = info->context_id; } - /* CP15 update requires rebuilding hflags */ - arm_rebuild_hflags(&target_cpu->env); + if (tcg_enabled()) { + /* CP15 update requires rebuilding hflags */ + arm_rebuild_hflags(&target_cpu->env); + } /* Start the new CPU at the requested address */ cpu_set_pc(target_cpu_state, info->entry); diff --git a/target/arm/cpu.c b/target/arm/cpu.c index da416f7..0b333a7 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -542,8 +542,9 @@ static void arm_cpu_reset_hold(Object *obj) if (tcg_enabled()) { hw_breakpoint_update_all(cpu); hw_watchpoint_update_all(cpu); + + arm_rebuild_hflags(env); } - arm_rebuild_hflags(env); } #if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) diff --git a/target/arm/helper.c b/target/arm/helper.c index 07d4100..af72e6d 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -5173,7 +5173,7 @@ static void sctlr_write(CPUARMState *env, const ARMCPRegInfo *ri, /* This may enable/disable the MMU, so do a TLB flush. */ tlb_flush(CPU(cpu)); - if (ri->type & ARM_CP_SUPPRESS_TB_END) { + if (tcg_enabled() && ri->type & ARM_CP_SUPPRESS_TB_END) { /* * Normally we would always end the TB on an SCTLR write; see the * comment in ARMCPRegInfo sctlr initialization below for why Xscale @@ -6841,7 +6841,9 @@ void aarch64_set_svcr(CPUARMState *env, uint64_t new, uint64_t mask) memset(env->zarray, 0, sizeof(env->zarray)); } - arm_rebuild_hflags(env); + if (tcg_enabled()) { + arm_rebuild_hflags(env); + } } static void svcr_write(CPUARMState *env, const ARMCPRegInfo *ri, @@ -9886,7 +9888,7 @@ void cpsr_write(CPUARMState *env, uint32_t val, uint32_t mask, } mask &= ~CACHED_CPSR_BITS; env->uncached_cpsr = (env->uncached_cpsr & ~mask) | (val & mask); - if (rebuild_hflags) { + if (tcg_enabled() && rebuild_hflags) { arm_rebuild_hflags(env); } } @@ -10445,7 +10447,10 @@ static void take_aarch32_exception(CPUARMState *env, int new_mode, env->regs[14] = env->regs[15] + offset; } env->regs[15] = newpc; - arm_rebuild_hflags(env); + + if (tcg_enabled()) { + arm_rebuild_hflags(env); + } } static void arm_cpu_do_interrupt_aarch32_hyp(CPUState *cs) @@ -11001,7 +11006,10 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) pstate_write(env, PSTATE_DAIF | new_mode); env->aarch64 = true; aarch64_restore_sp(env, new_el); - helper_rebuild_hflags_a64(env, new_el); + + if (tcg_enabled()) { + helper_rebuild_hflags_a64(env, new_el); + } env->pc = addr; diff --git a/target/arm/machine.c b/target/arm/machine.c index fd6323f..fc4a4a4 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -871,7 +871,10 @@ static int cpu_post_load(void *opaque, int version_id) if (!kvm_enabled()) { pmu_op_finish(&cpu->env); } - arm_rebuild_hflags(&cpu->env); + + if (tcg_enabled()) { + arm_rebuild_hflags(&cpu->env); + } return 0; } -- cgit v1.1 From 671efad16a242b3fb5fb5111e9981d56887f7755 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:33 -0300 Subject: target/arm: Move hflags code into the tcg directory 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hflags are used only for TCG code, so introduce a new file hflags.c to keep that code. Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- target/arm/helper.c | 393 +------------------------------------------ target/arm/internals.h | 2 + target/arm/tcg-stubs.c | 4 + target/arm/tcg/hflags.c | 403 +++++++++++++++++++++++++++++++++++++++++++++ target/arm/tcg/meson.build | 1 + 5 files changed, 411 insertions(+), 392 deletions(-) create mode 100644 target/arm/tcg/hflags.c diff --git a/target/arm/helper.c b/target/arm/helper.c index af72e6d..14af7ba 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -6669,32 +6669,6 @@ int sme_exception_el(CPUARMState *env, int el) return 0; } -/* This corresponds to the ARM pseudocode function IsFullA64Enabled(). */ -static bool sme_fa64(CPUARMState *env, int el) -{ - if (!cpu_isar_feature(aa64_sme_fa64, env_archcpu(env))) { - return false; - } - - if (el <= 1 && !el_is_in_host(env, el)) { - if (!FIELD_EX64(env->vfp.smcr_el[1], SMCR, FA64)) { - return false; - } - } - if (el <= 2 && arm_is_el2_enabled(env)) { - if (!FIELD_EX64(env->vfp.smcr_el[2], SMCR, FA64)) { - return false; - } - } - if (arm_feature(env, ARM_FEATURE_EL3)) { - if (!FIELD_EX64(env->vfp.smcr_el[3], SMCR, FA64)) { - return false; - } - } - - return true; -} - /* * Given that SVE is enabled, return the vector length for EL. */ @@ -11150,7 +11124,7 @@ int aa64_va_parameter_tbid(uint64_t tcr, ARMMMUIdx mmu_idx) } } -static int aa64_va_parameter_tcma(uint64_t tcr, ARMMMUIdx mmu_idx) +int aa64_va_parameter_tcma(uint64_t tcr, ARMMMUIdx mmu_idx) { if (regime_has_2_ranges(mmu_idx)) { return extract64(tcr, 57, 2); @@ -11861,371 +11835,6 @@ ARMMMUIdx arm_mmu_idx(CPUARMState *env) return arm_mmu_idx_el(env, arm_current_el(env)); } -static inline bool fgt_svc(CPUARMState *env, int el) -{ - /* - * Assuming fine-grained-traps are active, return true if we - * should be trapping on SVC instructions. Only AArch64 can - * trap on an SVC at EL1, but we don't need to special-case this - * because if this is AArch32 EL1 then arm_fgt_active() is false. - * We also know el is 0 or 1. - */ - return el == 0 ? - FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], HFGITR_EL2, SVC_EL0) : - FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], HFGITR_EL2, SVC_EL1); -} - -static CPUARMTBFlags rebuild_hflags_common(CPUARMState *env, int fp_el, - ARMMMUIdx mmu_idx, - CPUARMTBFlags flags) -{ - DP_TBFLAG_ANY(flags, FPEXC_EL, fp_el); - DP_TBFLAG_ANY(flags, MMUIDX, arm_to_core_mmu_idx(mmu_idx)); - - if (arm_singlestep_active(env)) { - DP_TBFLAG_ANY(flags, SS_ACTIVE, 1); - } - - return flags; -} - -static CPUARMTBFlags rebuild_hflags_common_32(CPUARMState *env, int fp_el, - ARMMMUIdx mmu_idx, - CPUARMTBFlags flags) -{ - bool sctlr_b = arm_sctlr_b(env); - - if (sctlr_b) { - DP_TBFLAG_A32(flags, SCTLR__B, 1); - } - if (arm_cpu_data_is_big_endian_a32(env, sctlr_b)) { - DP_TBFLAG_ANY(flags, BE_DATA, 1); - } - DP_TBFLAG_A32(flags, NS, !access_secure_reg(env)); - - return rebuild_hflags_common(env, fp_el, mmu_idx, flags); -} - -static CPUARMTBFlags rebuild_hflags_m32(CPUARMState *env, int fp_el, - ARMMMUIdx mmu_idx) -{ - CPUARMTBFlags flags = {}; - uint32_t ccr = env->v7m.ccr[env->v7m.secure]; - - /* Without HaveMainExt, CCR.UNALIGN_TRP is RES1. 
*/ - if (ccr & R_V7M_CCR_UNALIGN_TRP_MASK) { - DP_TBFLAG_ANY(flags, ALIGN_MEM, 1); - } - - if (arm_v7m_is_handler_mode(env)) { - DP_TBFLAG_M32(flags, HANDLER, 1); - } - - /* - * v8M always applies stack limit checks unless CCR.STKOFHFNMIGN - * is suppressing them because the requested execution priority - * is less than 0. - */ - if (arm_feature(env, ARM_FEATURE_V8) && - !((mmu_idx & ARM_MMU_IDX_M_NEGPRI) && - (ccr & R_V7M_CCR_STKOFHFNMIGN_MASK))) { - DP_TBFLAG_M32(flags, STACKCHECK, 1); - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY) && env->v7m.secure) { - DP_TBFLAG_M32(flags, SECURE, 1); - } - - return rebuild_hflags_common_32(env, fp_el, mmu_idx, flags); -} - -static CPUARMTBFlags rebuild_hflags_a32(CPUARMState *env, int fp_el, - ARMMMUIdx mmu_idx) -{ - CPUARMTBFlags flags = {}; - int el = arm_current_el(env); - - if (arm_sctlr(env, el) & SCTLR_A) { - DP_TBFLAG_ANY(flags, ALIGN_MEM, 1); - } - - if (arm_el_is_aa64(env, 1)) { - DP_TBFLAG_A32(flags, VFPEN, 1); - } - - if (el < 2 && env->cp15.hstr_el2 && arm_is_el2_enabled(env) && - (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { - DP_TBFLAG_A32(flags, HSTR_ACTIVE, 1); - } - - if (arm_fgt_active(env, el)) { - DP_TBFLAG_ANY(flags, FGT_ACTIVE, 1); - if (fgt_svc(env, el)) { - DP_TBFLAG_ANY(flags, FGT_SVC, 1); - } - } - - if (env->uncached_cpsr & CPSR_IL) { - DP_TBFLAG_ANY(flags, PSTATE__IL, 1); - } - - /* - * The SME exception we are testing for is raised via - * AArch64.CheckFPAdvSIMDEnabled(), as called from - * AArch32.CheckAdvSIMDOrFPEnabled(). - */ - if (el == 0 - && FIELD_EX64(env->svcr, SVCR, SM) - && (!arm_is_el2_enabled(env) - || (arm_el_is_aa64(env, 2) && !(env->cp15.hcr_el2 & HCR_TGE))) - && arm_el_is_aa64(env, 1) - && !sme_fa64(env, el)) { - DP_TBFLAG_A32(flags, SME_TRAP_NONSTREAMING, 1); - } - - return rebuild_hflags_common_32(env, fp_el, mmu_idx, flags); -} - -static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, - ARMMMUIdx mmu_idx) -{ - CPUARMTBFlags flags = {}; - ARMMMUIdx stage1 = stage_1_mmu_idx(mmu_idx); - uint64_t tcr = regime_tcr(env, mmu_idx); - uint64_t sctlr; - int tbii, tbid; - - DP_TBFLAG_ANY(flags, AARCH64_STATE, 1); - - /* Get control bits for tagged addresses. */ - tbid = aa64_va_parameter_tbi(tcr, mmu_idx); - tbii = tbid & ~aa64_va_parameter_tbid(tcr, mmu_idx); - - DP_TBFLAG_A64(flags, TBII, tbii); - DP_TBFLAG_A64(flags, TBID, tbid); - - if (cpu_isar_feature(aa64_sve, env_archcpu(env))) { - int sve_el = sve_exception_el(env, el); - - /* - * If either FP or SVE are disabled, translator does not need len. - * If SVE EL > FP EL, FP exception has precedence, and translator - * does not need SVE EL. Save potential re-translations by forcing - * the unneeded data to zero. - */ - if (fp_el != 0) { - if (sve_el > fp_el) { - sve_el = 0; - } - } else if (sve_el == 0) { - DP_TBFLAG_A64(flags, VL, sve_vqm1_for_el(env, el)); - } - DP_TBFLAG_A64(flags, SVEEXC_EL, sve_el); - } - if (cpu_isar_feature(aa64_sme, env_archcpu(env))) { - int sme_el = sme_exception_el(env, el); - bool sm = FIELD_EX64(env->svcr, SVCR, SM); - - DP_TBFLAG_A64(flags, SMEEXC_EL, sme_el); - if (sme_el == 0) { - /* Similarly, do not compute SVL if SME is disabled. */ - int svl = sve_vqm1_for_el_sm(env, el, true); - DP_TBFLAG_A64(flags, SVL, svl); - if (sm) { - /* If SVE is disabled, we will not have set VL above. 
*/ - DP_TBFLAG_A64(flags, VL, svl); - } - } - if (sm) { - DP_TBFLAG_A64(flags, PSTATE_SM, 1); - DP_TBFLAG_A64(flags, SME_TRAP_NONSTREAMING, !sme_fa64(env, el)); - } - DP_TBFLAG_A64(flags, PSTATE_ZA, FIELD_EX64(env->svcr, SVCR, ZA)); - } - - sctlr = regime_sctlr(env, stage1); - - if (sctlr & SCTLR_A) { - DP_TBFLAG_ANY(flags, ALIGN_MEM, 1); - } - - if (arm_cpu_data_is_big_endian_a64(el, sctlr)) { - DP_TBFLAG_ANY(flags, BE_DATA, 1); - } - - if (cpu_isar_feature(aa64_pauth, env_archcpu(env))) { - /* - * In order to save space in flags, we record only whether - * pauth is "inactive", meaning all insns are implemented as - * a nop, or "active" when some action must be performed. - * The decision of which action to take is left to a helper. - */ - if (sctlr & (SCTLR_EnIA | SCTLR_EnIB | SCTLR_EnDA | SCTLR_EnDB)) { - DP_TBFLAG_A64(flags, PAUTH_ACTIVE, 1); - } - } - - if (cpu_isar_feature(aa64_bti, env_archcpu(env))) { - /* Note that SCTLR_EL[23].BT == SCTLR_BT1. */ - if (sctlr & (el == 0 ? SCTLR_BT0 : SCTLR_BT1)) { - DP_TBFLAG_A64(flags, BT, 1); - } - } - - /* Compute the condition for using AccType_UNPRIV for LDTR et al. */ - if (!(env->pstate & PSTATE_UAO)) { - switch (mmu_idx) { - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - /* TODO: ARMv8.3-NV */ - DP_TBFLAG_A64(flags, UNPRIV, 1); - break; - case ARMMMUIdx_E20_2: - case ARMMMUIdx_E20_2_PAN: - /* - * Note that EL20_2 is gated by HCR_EL2.E2H == 1, but EL20_0 is - * gated by HCR_EL2. == '11', and so is LDTR. - */ - if (env->cp15.hcr_el2 & HCR_TGE) { - DP_TBFLAG_A64(flags, UNPRIV, 1); - } - break; - default: - break; - } - } - - if (env->pstate & PSTATE_IL) { - DP_TBFLAG_ANY(flags, PSTATE__IL, 1); - } - - if (arm_fgt_active(env, el)) { - DP_TBFLAG_ANY(flags, FGT_ACTIVE, 1); - if (FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], HFGITR_EL2, ERET)) { - DP_TBFLAG_A64(flags, FGT_ERET, 1); - } - if (fgt_svc(env, el)) { - DP_TBFLAG_ANY(flags, FGT_SVC, 1); - } - } - - if (cpu_isar_feature(aa64_mte, env_archcpu(env))) { - /* - * Set MTE_ACTIVE if any access may be Checked, and leave clear - * if all accesses must be Unchecked: - * 1) If no TBI, then there are no tags in the address to check, - * 2) If Tag Check Override, then all accesses are Unchecked, - * 3) If Tag Check Fail == 0, then Checked access have no effect, - * 4) If no Allocation Tag Access, then all accesses are Unchecked. - */ - if (allocation_tag_access_enabled(env, el, sctlr)) { - DP_TBFLAG_A64(flags, ATA, 1); - if (tbid - && !(env->pstate & PSTATE_TCO) - && (sctlr & (el == 0 ? SCTLR_TCF0 : SCTLR_TCF))) { - DP_TBFLAG_A64(flags, MTE_ACTIVE, 1); - } - } - /* And again for unprivileged accesses, if required. */ - if (EX_TBFLAG_A64(flags, UNPRIV) - && tbid - && !(env->pstate & PSTATE_TCO) - && (sctlr & SCTLR_TCF0) - && allocation_tag_access_enabled(env, 0, sctlr)) { - DP_TBFLAG_A64(flags, MTE0_ACTIVE, 1); - } - /* Cache TCMA as well as TBI. 
*/ - DP_TBFLAG_A64(flags, TCMA, aa64_va_parameter_tcma(tcr, mmu_idx)); - } - - return rebuild_hflags_common(env, fp_el, mmu_idx, flags); -} - -static CPUARMTBFlags rebuild_hflags_internal(CPUARMState *env) -{ - int el = arm_current_el(env); - int fp_el = fp_exception_el(env, el); - ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); - - if (is_a64(env)) { - return rebuild_hflags_a64(env, el, fp_el, mmu_idx); - } else if (arm_feature(env, ARM_FEATURE_M)) { - return rebuild_hflags_m32(env, fp_el, mmu_idx); - } else { - return rebuild_hflags_a32(env, fp_el, mmu_idx); - } -} - -void arm_rebuild_hflags(CPUARMState *env) -{ - env->hflags = rebuild_hflags_internal(env); -} - -/* - * If we have triggered a EL state change we can't rely on the - * translator having passed it to us, we need to recompute. - */ -void HELPER(rebuild_hflags_m32_newel)(CPUARMState *env) -{ - int el = arm_current_el(env); - int fp_el = fp_exception_el(env, el); - ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); - - env->hflags = rebuild_hflags_m32(env, fp_el, mmu_idx); -} - -void HELPER(rebuild_hflags_m32)(CPUARMState *env, int el) -{ - int fp_el = fp_exception_el(env, el); - ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); - - env->hflags = rebuild_hflags_m32(env, fp_el, mmu_idx); -} - -/* - * If we have triggered a EL state change we can't rely on the - * translator having passed it to us, we need to recompute. - */ -void HELPER(rebuild_hflags_a32_newel)(CPUARMState *env) -{ - int el = arm_current_el(env); - int fp_el = fp_exception_el(env, el); - ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); - env->hflags = rebuild_hflags_a32(env, fp_el, mmu_idx); -} - -void HELPER(rebuild_hflags_a32)(CPUARMState *env, int el) -{ - int fp_el = fp_exception_el(env, el); - ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); - - env->hflags = rebuild_hflags_a32(env, fp_el, mmu_idx); -} - -void HELPER(rebuild_hflags_a64)(CPUARMState *env, int el) -{ - int fp_el = fp_exception_el(env, el); - ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); - - env->hflags = rebuild_hflags_a64(env, el, fp_el, mmu_idx); -} - -static inline void assert_hflags_rebuild_correctly(CPUARMState *env) -{ -#ifdef CONFIG_DEBUG_TCG - CPUARMTBFlags c = env->hflags; - CPUARMTBFlags r = rebuild_hflags_internal(env); - - if (unlikely(c.flags != r.flags || c.flags2 != r.flags2)) { - fprintf(stderr, "TCG hflags mismatch " - "(current:(0x%08x,0x" TARGET_FMT_lx ")" - " rebuilt:(0x%08x,0x" TARGET_FMT_lx ")\n", - c.flags, c.flags2, r.flags, r.flags2); - abort(); - } -#endif -} - static bool mve_no_pred(CPUARMState *env) { /* diff --git a/target/arm/internals.h b/target/arm/internals.h index 759b70c..ed48f8c 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1073,6 +1073,7 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, int aa64_va_parameter_tbi(uint64_t tcr, ARMMMUIdx mmu_idx); int aa64_va_parameter_tbid(uint64_t tcr, ARMMMUIdx mmu_idx); +int aa64_va_parameter_tcma(uint64_t tcr, ARMMMUIdx mmu_idx); /* Determine if allocation tags are available. 
*/ static inline bool allocation_tag_access_enabled(CPUARMState *env, int el, @@ -1383,4 +1384,5 @@ static inline bool arm_fgt_active(CPUARMState *env, int el) (!arm_feature(env, ARM_FEATURE_EL3) || (env->cp15.scr_el3 & SCR_FGTEN)); } +void assert_hflags_rebuild_correctly(CPUARMState *env); #endif diff --git a/target/arm/tcg-stubs.c b/target/arm/tcg-stubs.c index 1a7ddb3..152b172 100644 --- a/target/arm/tcg-stubs.c +++ b/target/arm/tcg-stubs.c @@ -21,3 +21,7 @@ void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, { g_assert_not_reached(); } +/* Temporarily while cpu_get_tb_cpu_state() is still in common code */ +void assert_hflags_rebuild_correctly(CPUARMState *env) +{ +} diff --git a/target/arm/tcg/hflags.c b/target/arm/tcg/hflags.c new file mode 100644 index 0000000..b2ccd77 --- /dev/null +++ b/target/arm/tcg/hflags.c @@ -0,0 +1,403 @@ +/* + * ARM hflags + * + * This code is licensed under the GNU GPL v2 or later. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/helper-proto.h" +#include "cpregs.h" + +static inline bool fgt_svc(CPUARMState *env, int el) +{ + /* + * Assuming fine-grained-traps are active, return true if we + * should be trapping on SVC instructions. Only AArch64 can + * trap on an SVC at EL1, but we don't need to special-case this + * because if this is AArch32 EL1 then arm_fgt_active() is false. + * We also know el is 0 or 1. + */ + return el == 0 ? + FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], HFGITR_EL2, SVC_EL0) : + FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], HFGITR_EL2, SVC_EL1); +} + +static CPUARMTBFlags rebuild_hflags_common(CPUARMState *env, int fp_el, + ARMMMUIdx mmu_idx, + CPUARMTBFlags flags) +{ + DP_TBFLAG_ANY(flags, FPEXC_EL, fp_el); + DP_TBFLAG_ANY(flags, MMUIDX, arm_to_core_mmu_idx(mmu_idx)); + + if (arm_singlestep_active(env)) { + DP_TBFLAG_ANY(flags, SS_ACTIVE, 1); + } + + return flags; +} + +static CPUARMTBFlags rebuild_hflags_common_32(CPUARMState *env, int fp_el, + ARMMMUIdx mmu_idx, + CPUARMTBFlags flags) +{ + bool sctlr_b = arm_sctlr_b(env); + + if (sctlr_b) { + DP_TBFLAG_A32(flags, SCTLR__B, 1); + } + if (arm_cpu_data_is_big_endian_a32(env, sctlr_b)) { + DP_TBFLAG_ANY(flags, BE_DATA, 1); + } + DP_TBFLAG_A32(flags, NS, !access_secure_reg(env)); + + return rebuild_hflags_common(env, fp_el, mmu_idx, flags); +} + +static CPUARMTBFlags rebuild_hflags_m32(CPUARMState *env, int fp_el, + ARMMMUIdx mmu_idx) +{ + CPUARMTBFlags flags = {}; + uint32_t ccr = env->v7m.ccr[env->v7m.secure]; + + /* Without HaveMainExt, CCR.UNALIGN_TRP is RES1. */ + if (ccr & R_V7M_CCR_UNALIGN_TRP_MASK) { + DP_TBFLAG_ANY(flags, ALIGN_MEM, 1); + } + + if (arm_v7m_is_handler_mode(env)) { + DP_TBFLAG_M32(flags, HANDLER, 1); + } + + /* + * v8M always applies stack limit checks unless CCR.STKOFHFNMIGN + * is suppressing them because the requested execution priority + * is less than 0. + */ + if (arm_feature(env, ARM_FEATURE_V8) && + !((mmu_idx & ARM_MMU_IDX_M_NEGPRI) && + (ccr & R_V7M_CCR_STKOFHFNMIGN_MASK))) { + DP_TBFLAG_M32(flags, STACKCHECK, 1); + } + + if (arm_feature(env, ARM_FEATURE_M_SECURITY) && env->v7m.secure) { + DP_TBFLAG_M32(flags, SECURE, 1); + } + + return rebuild_hflags_common_32(env, fp_el, mmu_idx, flags); +} + +/* This corresponds to the ARM pseudocode function IsFullA64Enabled(). 
*/ +static bool sme_fa64(CPUARMState *env, int el) +{ + if (!cpu_isar_feature(aa64_sme_fa64, env_archcpu(env))) { + return false; + } + + if (el <= 1 && !el_is_in_host(env, el)) { + if (!FIELD_EX64(env->vfp.smcr_el[1], SMCR, FA64)) { + return false; + } + } + if (el <= 2 && arm_is_el2_enabled(env)) { + if (!FIELD_EX64(env->vfp.smcr_el[2], SMCR, FA64)) { + return false; + } + } + if (arm_feature(env, ARM_FEATURE_EL3)) { + if (!FIELD_EX64(env->vfp.smcr_el[3], SMCR, FA64)) { + return false; + } + } + + return true; +} + +static CPUARMTBFlags rebuild_hflags_a32(CPUARMState *env, int fp_el, + ARMMMUIdx mmu_idx) +{ + CPUARMTBFlags flags = {}; + int el = arm_current_el(env); + + if (arm_sctlr(env, el) & SCTLR_A) { + DP_TBFLAG_ANY(flags, ALIGN_MEM, 1); + } + + if (arm_el_is_aa64(env, 1)) { + DP_TBFLAG_A32(flags, VFPEN, 1); + } + + if (el < 2 && env->cp15.hstr_el2 && arm_is_el2_enabled(env) && + (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { + DP_TBFLAG_A32(flags, HSTR_ACTIVE, 1); + } + + if (arm_fgt_active(env, el)) { + DP_TBFLAG_ANY(flags, FGT_ACTIVE, 1); + if (fgt_svc(env, el)) { + DP_TBFLAG_ANY(flags, FGT_SVC, 1); + } + } + + if (env->uncached_cpsr & CPSR_IL) { + DP_TBFLAG_ANY(flags, PSTATE__IL, 1); + } + + /* + * The SME exception we are testing for is raised via + * AArch64.CheckFPAdvSIMDEnabled(), as called from + * AArch32.CheckAdvSIMDOrFPEnabled(). + */ + if (el == 0 + && FIELD_EX64(env->svcr, SVCR, SM) + && (!arm_is_el2_enabled(env) + || (arm_el_is_aa64(env, 2) && !(env->cp15.hcr_el2 & HCR_TGE))) + && arm_el_is_aa64(env, 1) + && !sme_fa64(env, el)) { + DP_TBFLAG_A32(flags, SME_TRAP_NONSTREAMING, 1); + } + + return rebuild_hflags_common_32(env, fp_el, mmu_idx, flags); +} + +static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, + ARMMMUIdx mmu_idx) +{ + CPUARMTBFlags flags = {}; + ARMMMUIdx stage1 = stage_1_mmu_idx(mmu_idx); + uint64_t tcr = regime_tcr(env, mmu_idx); + uint64_t sctlr; + int tbii, tbid; + + DP_TBFLAG_ANY(flags, AARCH64_STATE, 1); + + /* Get control bits for tagged addresses. */ + tbid = aa64_va_parameter_tbi(tcr, mmu_idx); + tbii = tbid & ~aa64_va_parameter_tbid(tcr, mmu_idx); + + DP_TBFLAG_A64(flags, TBII, tbii); + DP_TBFLAG_A64(flags, TBID, tbid); + + if (cpu_isar_feature(aa64_sve, env_archcpu(env))) { + int sve_el = sve_exception_el(env, el); + + /* + * If either FP or SVE are disabled, translator does not need len. + * If SVE EL > FP EL, FP exception has precedence, and translator + * does not need SVE EL. Save potential re-translations by forcing + * the unneeded data to zero. + */ + if (fp_el != 0) { + if (sve_el > fp_el) { + sve_el = 0; + } + } else if (sve_el == 0) { + DP_TBFLAG_A64(flags, VL, sve_vqm1_for_el(env, el)); + } + DP_TBFLAG_A64(flags, SVEEXC_EL, sve_el); + } + if (cpu_isar_feature(aa64_sme, env_archcpu(env))) { + int sme_el = sme_exception_el(env, el); + bool sm = FIELD_EX64(env->svcr, SVCR, SM); + + DP_TBFLAG_A64(flags, SMEEXC_EL, sme_el); + if (sme_el == 0) { + /* Similarly, do not compute SVL if SME is disabled. */ + int svl = sve_vqm1_for_el_sm(env, el, true); + DP_TBFLAG_A64(flags, SVL, svl); + if (sm) { + /* If SVE is disabled, we will not have set VL above. 
*/ + DP_TBFLAG_A64(flags, VL, svl); + } + } + if (sm) { + DP_TBFLAG_A64(flags, PSTATE_SM, 1); + DP_TBFLAG_A64(flags, SME_TRAP_NONSTREAMING, !sme_fa64(env, el)); + } + DP_TBFLAG_A64(flags, PSTATE_ZA, FIELD_EX64(env->svcr, SVCR, ZA)); + } + + sctlr = regime_sctlr(env, stage1); + + if (sctlr & SCTLR_A) { + DP_TBFLAG_ANY(flags, ALIGN_MEM, 1); + } + + if (arm_cpu_data_is_big_endian_a64(el, sctlr)) { + DP_TBFLAG_ANY(flags, BE_DATA, 1); + } + + if (cpu_isar_feature(aa64_pauth, env_archcpu(env))) { + /* + * In order to save space in flags, we record only whether + * pauth is "inactive", meaning all insns are implemented as + * a nop, or "active" when some action must be performed. + * The decision of which action to take is left to a helper. + */ + if (sctlr & (SCTLR_EnIA | SCTLR_EnIB | SCTLR_EnDA | SCTLR_EnDB)) { + DP_TBFLAG_A64(flags, PAUTH_ACTIVE, 1); + } + } + + if (cpu_isar_feature(aa64_bti, env_archcpu(env))) { + /* Note that SCTLR_EL[23].BT == SCTLR_BT1. */ + if (sctlr & (el == 0 ? SCTLR_BT0 : SCTLR_BT1)) { + DP_TBFLAG_A64(flags, BT, 1); + } + } + + /* Compute the condition for using AccType_UNPRIV for LDTR et al. */ + if (!(env->pstate & PSTATE_UAO)) { + switch (mmu_idx) { + case ARMMMUIdx_E10_1: + case ARMMMUIdx_E10_1_PAN: + /* TODO: ARMv8.3-NV */ + DP_TBFLAG_A64(flags, UNPRIV, 1); + break; + case ARMMMUIdx_E20_2: + case ARMMMUIdx_E20_2_PAN: + /* + * Note that EL20_2 is gated by HCR_EL2.E2H == 1, but EL20_0 is + * gated by HCR_EL2. == '11', and so is LDTR. + */ + if (env->cp15.hcr_el2 & HCR_TGE) { + DP_TBFLAG_A64(flags, UNPRIV, 1); + } + break; + default: + break; + } + } + + if (env->pstate & PSTATE_IL) { + DP_TBFLAG_ANY(flags, PSTATE__IL, 1); + } + + if (arm_fgt_active(env, el)) { + DP_TBFLAG_ANY(flags, FGT_ACTIVE, 1); + if (FIELD_EX64(env->cp15.fgt_exec[FGTREG_HFGITR], HFGITR_EL2, ERET)) { + DP_TBFLAG_A64(flags, FGT_ERET, 1); + } + if (fgt_svc(env, el)) { + DP_TBFLAG_ANY(flags, FGT_SVC, 1); + } + } + + if (cpu_isar_feature(aa64_mte, env_archcpu(env))) { + /* + * Set MTE_ACTIVE if any access may be Checked, and leave clear + * if all accesses must be Unchecked: + * 1) If no TBI, then there are no tags in the address to check, + * 2) If Tag Check Override, then all accesses are Unchecked, + * 3) If Tag Check Fail == 0, then Checked access have no effect, + * 4) If no Allocation Tag Access, then all accesses are Unchecked. + */ + if (allocation_tag_access_enabled(env, el, sctlr)) { + DP_TBFLAG_A64(flags, ATA, 1); + if (tbid + && !(env->pstate & PSTATE_TCO) + && (sctlr & (el == 0 ? SCTLR_TCF0 : SCTLR_TCF))) { + DP_TBFLAG_A64(flags, MTE_ACTIVE, 1); + } + } + /* And again for unprivileged accesses, if required. */ + if (EX_TBFLAG_A64(flags, UNPRIV) + && tbid + && !(env->pstate & PSTATE_TCO) + && (sctlr & SCTLR_TCF0) + && allocation_tag_access_enabled(env, 0, sctlr)) { + DP_TBFLAG_A64(flags, MTE0_ACTIVE, 1); + } + /* Cache TCMA as well as TBI. 
*/ + DP_TBFLAG_A64(flags, TCMA, aa64_va_parameter_tcma(tcr, mmu_idx)); + } + + return rebuild_hflags_common(env, fp_el, mmu_idx, flags); +} + +static CPUARMTBFlags rebuild_hflags_internal(CPUARMState *env) +{ + int el = arm_current_el(env); + int fp_el = fp_exception_el(env, el); + ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); + + if (is_a64(env)) { + return rebuild_hflags_a64(env, el, fp_el, mmu_idx); + } else if (arm_feature(env, ARM_FEATURE_M)) { + return rebuild_hflags_m32(env, fp_el, mmu_idx); + } else { + return rebuild_hflags_a32(env, fp_el, mmu_idx); + } +} + +void arm_rebuild_hflags(CPUARMState *env) +{ + env->hflags = rebuild_hflags_internal(env); +} + +/* + * If we have triggered a EL state change we can't rely on the + * translator having passed it to us, we need to recompute. + */ +void HELPER(rebuild_hflags_m32_newel)(CPUARMState *env) +{ + int el = arm_current_el(env); + int fp_el = fp_exception_el(env, el); + ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); + + env->hflags = rebuild_hflags_m32(env, fp_el, mmu_idx); +} + +void HELPER(rebuild_hflags_m32)(CPUARMState *env, int el) +{ + int fp_el = fp_exception_el(env, el); + ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); + + env->hflags = rebuild_hflags_m32(env, fp_el, mmu_idx); +} + +/* + * If we have triggered a EL state change we can't rely on the + * translator having passed it to us, we need to recompute. + */ +void HELPER(rebuild_hflags_a32_newel)(CPUARMState *env) +{ + int el = arm_current_el(env); + int fp_el = fp_exception_el(env, el); + ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); + env->hflags = rebuild_hflags_a32(env, fp_el, mmu_idx); +} + +void HELPER(rebuild_hflags_a32)(CPUARMState *env, int el) +{ + int fp_el = fp_exception_el(env, el); + ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); + + env->hflags = rebuild_hflags_a32(env, fp_el, mmu_idx); +} + +void HELPER(rebuild_hflags_a64)(CPUARMState *env, int el) +{ + int fp_el = fp_exception_el(env, el); + ARMMMUIdx mmu_idx = arm_mmu_idx_el(env, el); + + env->hflags = rebuild_hflags_a64(env, el, fp_el, mmu_idx); +} + +void assert_hflags_rebuild_correctly(CPUARMState *env) +{ +#ifdef CONFIG_DEBUG_TCG + CPUARMTBFlags c = env->hflags; + CPUARMTBFlags r = rebuild_hflags_internal(env); + + if (unlikely(c.flags != r.flags || c.flags2 != r.flags2)) { + fprintf(stderr, "TCG hflags mismatch " + "(current:(0x%08x,0x" TARGET_FMT_lx ")" + " rebuilt:(0x%08x,0x" TARGET_FMT_lx ")\n", + c.flags, c.flags2, r.flags, r.flags2); + abort(); + } +#endif +} diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index fa8a9ea..d27e76a 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -24,6 +24,7 @@ arm_ss.add(files( 'translate-neon.c', 'translate-vfp.c', 'crypto_helper.c', + 'hflags.c', 'iwmmxt_helper.c', 'm_helper.c', 'mve_helper.c', -- cgit v1.1 From 2ea2998f27da92ae1225c1da95cee51a4a6783af Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:34 -0300 Subject: target/arm: Move regime_using_lpae_format into internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is needed by common code (ptw.c), so move it along with the other regime_* functions in internal.h. When we enable the build without TCG, the tlb_helper.c file will not be present. 
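In other words, the function becomes a static inline in internals.h so that common code still sees a definition even when the TCG-only tlb_helper.c is not built. A sketch of the resulting call path — the caller name is made up for illustration; regime_using_lpae_format() itself is real and appears in the hunk below:

    /* ptw.c is compiled for both TCG and --disable-tcg (KVM-only) builds. */
    #include "qemu/osdep.h"
    #include "cpu.h"
    #include "internals.h"

    static bool hypothetical_walk_uses_lpae(CPUARMState *env, ARMMMUIdx mmu_idx)
    {
        /*
         * Resolves to the static inline in internals.h, so no symbol from
         * the TCG-only tlb_helper.c object is needed at link time.
         */
        return regime_using_lpae_format(env, mmu_idx);
    }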
Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- target/arm/internals.h | 21 ++++++++++++++++++--- target/arm/tcg/tlb_helper.c | 18 ------------------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/target/arm/internals.h b/target/arm/internals.h index ed48f8c..680c574 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -600,9 +600,6 @@ int arm_mmu_idx_to_el(ARMMMUIdx mmu_idx); /* Return the MMU index for a v7M CPU in the specified security state */ ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate); -/* Return true if the translation regime is using LPAE format page tables */ -bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx); - /* * Return true if the stage 1 translation regime is using LPAE * format page tables @@ -767,6 +764,24 @@ static inline uint64_t regime_tcr(CPUARMState *env, ARMMMUIdx mmu_idx) return env->cp15.tcr_el[regime_el(env, mmu_idx)]; } +/* Return true if the translation regime is using LPAE format page tables */ +static inline bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + int el = regime_el(env, mmu_idx); + if (el == 2 || arm_el_is_aa64(env, el)) { + return true; + } + if (arm_feature(env, ARM_FEATURE_PMSA) && + arm_feature(env, ARM_FEATURE_V8)) { + return true; + } + if (arm_feature(env, ARM_FEATURE_LPAE) + && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) { + return true; + } + return false; +} + /** * arm_num_brps: Return number of implemented breakpoints. * Note that the ID register BRPS field is "number of bps - 1", diff --git a/target/arm/tcg/tlb_helper.c b/target/arm/tcg/tlb_helper.c index 60abcbe..31eb77f 100644 --- a/target/arm/tcg/tlb_helper.c +++ b/target/arm/tcg/tlb_helper.c @@ -12,24 +12,6 @@ #include "exec/helper-proto.h" -/* Return true if the translation regime is using LPAE format page tables */ -bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - int el = regime_el(env, mmu_idx); - if (el == 2 || arm_el_is_aa64(env, el)) { - return true; - } - if (arm_feature(env, ARM_FEATURE_PMSA) && - arm_feature(env, ARM_FEATURE_V8)) { - return true; - } - if (arm_feature(env, ARM_FEATURE_LPAE) - && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) { - return true; - } - return false; -} - /* * Returns true if the stage 1 translation regime is using LPAE format page * tables. Used when raising alignment exceptions, whose FSR changes depending -- cgit v1.1 From 0d3de77a07f4f774f7a9248afa8ea497ad5f2ae5 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:35 -0300 Subject: target/arm: Don't access TCG code when debugging with KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When TCG is disabled this part of the code should not be reachable, so wrap it with an ifdef for now. 
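The guard here is a compile-time fence rather than a runtime check, because a --disable-tcg binary must not even reference the TCG softmmu TLB types (CPUTLBEntryFull). A minimal sketch of the pattern — the function name and body are hypothetical; CONFIG_TCG and g_assert_not_reached() are exactly what the hunk below uses:

    static bool hypothetical_ptw_probe(CPUARMState *env)
    {
    #ifdef CONFIG_TCG
        /* TCG build: it is legitimate to consult TCG-only TLB state here. */
        return true;
    #else
        /* KVM-only build: this branch must be unreachable. */
        g_assert_not_reached();
    #endif
    }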
Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- target/arm/ptw.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 2b125ff..be0cc6b 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -254,6 +254,7 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, ptw->out_host = NULL; ptw->out_rw = false; } else { +#ifdef CONFIG_TCG CPUTLBEntryFull *full; int flags; @@ -270,6 +271,9 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate *ptw, ptw->out_rw = full->prot & PAGE_WRITE; pte_attrs = full->pte_attrs; pte_secure = full->attrs.secure; +#else + g_assert_not_reached(); +#endif } if (regime_is_stage2(s2_mmu_idx)) { -- cgit v1.1 From 4cb884e994a85a86e4e305ea3e6296434f4af0df Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:36 -0300 Subject: cpu-defs.h: Expose CPUTLBEntryFull to non-TCG code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This struct has no dependencies on TCG code and it is being used in target/arm/ptw.c to simplify the passing around of page table walk results. Those routines can be reached by KVM code via the gdbstub breakpoint code, so take the structure out of CONFIG_TCG to make it visible when building with --disable-tcg. Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- include/exec/cpu-defs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index 21309cf..d5a4f30 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -135,6 +135,10 @@ typedef struct CPUTLBEntry { QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS)); + +#endif /* !CONFIG_USER_ONLY && CONFIG_TCG */ + +#if !defined(CONFIG_USER_ONLY) /* * The full TLB entry, which is not accessed by generated TCG code, * so the layout is not as critical as that of CPUTLBEntry. This is @@ -176,7 +180,9 @@ typedef struct CPUTLBEntryFull { TARGET_PAGE_ENTRY_EXTRA #endif } CPUTLBEntryFull; +#endif /* !CONFIG_USER_ONLY */ +#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG) /* * Data elements that are per MMU mode, minus the bits accessed by * the TCG fast path. 
-- cgit v1.1 From 9880e9bc63efcaf3f5230c2f93fb03068df2e465 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 17 Feb 2023 17:11:46 -0300 Subject: tests/avocado: add machine:none tag to version.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This test currently fails when run on a host for which the QEMU target has no default machine set: ERROR| Output: qemu-system-aarch64: No machine specified, and there is no default Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Fabiano Rosas Signed-off-by: Peter Maydell --- tests/avocado/version.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/avocado/version.py b/tests/avocado/version.py index ded7f03..dd77595 100644 --- a/tests/avocado/version.py +++ b/tests/avocado/version.py @@ -15,6 +15,7 @@ from avocado_qemu import QemuSystemTest class Version(QemuSystemTest): """ :avocado: tags=quick + :avocado: tags=machine:none """ def test_qmp_human_info_version(self): self.vm.add_args('-nodefaults') -- cgit v1.1 From cb4c33f014925d860387ecef4d74a7f03cab6626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:07 +0100 Subject: hw/gpio/max7310: Simplify max7310_realize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since &I2C_SLAVE(dev)->qdev == dev, no need to go back and forth with QOM type casting. Directly use 'dev'. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Message-id: 20230220115114.25237-2-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/gpio/max7310.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/gpio/max7310.c b/hw/gpio/max7310.c index db6b5e3..031482d 100644 --- a/hw/gpio/max7310.c +++ b/hw/gpio/max7310.c @@ -183,11 +183,10 @@ static void max7310_gpio_set(void *opaque, int line, int level) * but also accepts sequences that are not SMBus so return an I2C device. */ static void max7310_realize(DeviceState *dev, Error **errp) { - I2CSlave *i2c = I2C_SLAVE(dev); MAX7310State *s = MAX7310(dev); - qdev_init_gpio_in(&i2c->qdev, max7310_gpio_set, 8); - qdev_init_gpio_out(&i2c->qdev, s->handler, 8); + qdev_init_gpio_in(dev, max7310_gpio_set, ARRAY_SIZE(s->handler)); + qdev_init_gpio_out(dev, s->handler, ARRAY_SIZE(s->handler)); } static void max7310_class_init(ObjectClass *klass, void *data) -- cgit v1.1 From 11f2ee1db6ed333a74a71fa38ce66fda926e230d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:08 +0100 Subject: hw/char/pl011: Un-inline pl011_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pl011_create() is only used in DeviceRealize handlers, not a hot-path. Inlining is not justified. 
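Callers are unaffected by the move; a board realize path keeps the same one-liner, now resolved through pl011.c instead of a header inline. A sketch only — the MMIO address and IRQ wiring are illustrative, not taken from any board:

    #include "hw/char/pl011.h"
    #include "sysemu/sysemu.h"      /* serial_hd() */

    /* Hypothetical board wiring for a PL011 at an arbitrary address. */
    static void hypothetical_wire_pl011(DeviceState *intc)
    {
        pl011_create(0x10009000, qdev_get_gpio_in(intc, 5), serial_hd(0));
    }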
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Message-id: 20230220115114.25237-3-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/char/pl011.c | 17 +++++++++++++++++ include/hw/char/pl011.h | 19 +------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/hw/char/pl011.c b/hw/char/pl011.c index c15cb7a..77bbc2a 100644 --- a/hw/char/pl011.c +++ b/hw/char/pl011.c @@ -19,10 +19,12 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "hw/char/pl011.h" #include "hw/irq.h" #include "hw/sysbus.h" #include "hw/qdev-clock.h" +#include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "migration/vmstate.h" #include "chardev/char-fe.h" @@ -31,6 +33,21 @@ #include "qemu/module.h" #include "trace.h" +DeviceState *pl011_create(hwaddr addr, qemu_irq irq, Chardev *chr) +{ + DeviceState *dev; + SysBusDevice *s; + + dev = qdev_new("pl011"); + s = SYS_BUS_DEVICE(dev); + qdev_prop_set_chr(dev, "chardev", chr); + sysbus_realize_and_unref(s, &error_fatal); + sysbus_mmio_map(s, 0, addr); + sysbus_connect_irq(s, 0, irq); + + return dev; +} + #define PL011_INT_TX 0x20 #define PL011_INT_RX 0x10 diff --git a/include/hw/char/pl011.h b/include/hw/char/pl011.h index 926322e..d82870c 100644 --- a/include/hw/char/pl011.h +++ b/include/hw/char/pl011.h @@ -15,10 +15,8 @@ #ifndef HW_PL011_H #define HW_PL011_H -#include "hw/qdev-properties.h" #include "hw/sysbus.h" #include "chardev/char-fe.h" -#include "qapi/error.h" #include "qom/object.h" #define TYPE_PL011 "pl011" @@ -57,22 +55,7 @@ struct PL011State { const unsigned char *id; }; -static inline DeviceState *pl011_create(hwaddr addr, - qemu_irq irq, - Chardev *chr) -{ - DeviceState *dev; - SysBusDevice *s; - - dev = qdev_new("pl011"); - s = SYS_BUS_DEVICE(dev); - qdev_prop_set_chr(dev, "chardev", chr); - sysbus_realize_and_unref(s, &error_fatal); - sysbus_mmio_map(s, 0, addr); - sysbus_connect_irq(s, 0, irq); - - return dev; -} +DeviceState *pl011_create(hwaddr addr, qemu_irq irq, Chardev *chr); static inline DeviceState *pl011_luminary_create(hwaddr addr, qemu_irq irq, -- cgit v1.1 From b7f93098d1899c88eec4026b0a2118c94cbdefc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:09 +0100 Subject: hw/char/pl011: Open-code pl011_luminary_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pl011_luminary_create() is only used for the Stellaris board, open-code it. 
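The open-coded replacement is the usual sysbus bring-up sequence, exactly as the hunk below performs it per Stellaris UART; shown in isolation here as a sketch (wrapper name hypothetical, arguments illustrative):

    /* Create, configure, realize, map and wire a "pl011_luminary" UART. */
    static DeviceState *hypothetical_luminary_uart(hwaddr addr, qemu_irq irq,
                                                   Chardev *chr)
    {
        DeviceState *dev = qdev_new("pl011_luminary");
        SysBusDevice *sbd = SYS_BUS_DEVICE(dev);

        qdev_prop_set_chr(dev, "chardev", chr);      /* character backend */
        sysbus_realize_and_unref(sbd, &error_fatal); /* realize the device */
        sysbus_mmio_map(sbd, 0, addr);               /* map MMIO region 0 */
        sysbus_connect_irq(sbd, 0, irq);             /* wire output IRQ 0 */
        return dev;
    }

This is essentially the body of the inline helper being deleted from pl011.h; keeping it at the single call site leaves the header free of the qdev-properties and qapi/error dependencies that the previous patch started removing.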
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Message-id: 20230220115114.25237-4-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/arm/stellaris.c | 11 ++++++++--- include/hw/char/pl011.h | 17 ----------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c index 67a2293..f7e99ba 100644 --- a/hw/arm/stellaris.c +++ b/hw/arm/stellaris.c @@ -1146,9 +1146,14 @@ static void stellaris_init(MachineState *ms, stellaris_board_info *board) for (i = 0; i < 4; i++) { if (board->dc2 & (1 << i)) { - pl011_luminary_create(0x4000c000 + i * 0x1000, - qdev_get_gpio_in(nvic, uart_irq[i]), - serial_hd(i)); + SysBusDevice *sbd; + + dev = qdev_new("pl011_luminary"); + sbd = SYS_BUS_DEVICE(dev); + qdev_prop_set_chr(dev, "chardev", serial_hd(i)); + sysbus_realize_and_unref(sbd, &error_fatal); + sysbus_mmio_map(sbd, 0, 0x4000c000 + i * 0x1000); + sysbus_connect_irq(sbd, 0, qdev_get_gpio_in(nvic, uart_irq[i])); } } if (board->dc2 & (1 << 4)) { diff --git a/include/hw/char/pl011.h b/include/hw/char/pl011.h index d82870c..d853802 100644 --- a/include/hw/char/pl011.h +++ b/include/hw/char/pl011.h @@ -57,21 +57,4 @@ struct PL011State { DeviceState *pl011_create(hwaddr addr, qemu_irq irq, Chardev *chr); -static inline DeviceState *pl011_luminary_create(hwaddr addr, - qemu_irq irq, - Chardev *chr) -{ - DeviceState *dev; - SysBusDevice *s; - - dev = qdev_new("pl011_luminary"); - s = SYS_BUS_DEVICE(dev); - qdev_prop_set_chr(dev, "chardev", chr); - sysbus_realize_and_unref(s, &error_fatal); - sysbus_mmio_map(s, 0, addr); - sysbus_connect_irq(s, 0, irq); - - return dev; -} - #endif -- cgit v1.1 From 3440a4a93aebb2abe9d9ed6ba4a0f8db86b2a6ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:10 +0100 Subject: hw/char/xilinx_uartlite: Expose XILINX_UARTLITE QOM type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Message-id: 20230220115114.25237-5-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/char/xilinx_uartlite.c | 4 +--- include/hw/char/xilinx_uartlite.h | 6 +++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/char/xilinx_uartlite.c b/hw/char/xilinx_uartlite.c index 99b9a6f..180bb97 100644 --- a/hw/char/xilinx_uartlite.c +++ b/hw/char/xilinx_uartlite.c @@ -24,6 +24,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" +#include "hw/char/xilinx_uartlite.h" #include "hw/irq.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" @@ -53,9 +54,6 @@ #define CONTROL_RST_RX 0x02 #define CONTROL_IE 0x10 -#define TYPE_XILINX_UARTLITE "xlnx.xps-uartlite" -OBJECT_DECLARE_SIMPLE_TYPE(XilinxUARTLite, XILINX_UARTLITE) - struct XilinxUARTLite { SysBusDevice parent_obj; diff --git a/include/hw/char/xilinx_uartlite.h b/include/hw/char/xilinx_uartlite.h index dd09c06..753d3a4 100644 --- a/include/hw/char/xilinx_uartlite.h +++ b/include/hw/char/xilinx_uartlite.h @@ -18,6 +18,10 @@ #include "hw/qdev-properties.h" #include "hw/sysbus.h" #include "qapi/error.h" +#include "qom/object.h" + +#define TYPE_XILINX_UARTLITE "xlnx.xps-uartlite" +OBJECT_DECLARE_SIMPLE_TYPE(XilinxUARTLite, XILINX_UARTLITE) static inline DeviceState *xilinx_uartlite_create(hwaddr addr, qemu_irq irq, @@ -26,7 +30,7 @@ static inline DeviceState *xilinx_uartlite_create(hwaddr addr, DeviceState *dev; SysBusDevice *s; - dev = 
qdev_new("xlnx.xps-uartlite"); + dev = qdev_new(TYPE_XILINX_UARTLITE); s = SYS_BUS_DEVICE(dev); qdev_prop_set_chr(dev, "chardev", chr); sysbus_realize_and_unref(s, &error_fatal); -- cgit v1.1 From dc1daf392c7cc6d1481bd9ce40a5594e624e7b47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:11 +0100 Subject: hw/char/xilinx_uartlite: Open-code xilinx_uartlite_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Open-code the single use of xilinx_uartlite_create(). Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Message-id: 20230220115114.25237-6-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/microblaze/petalogix_s3adsp1800_mmu.c | 7 +++++-- include/hw/char/xilinx_uartlite.h | 20 -------------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/hw/microblaze/petalogix_s3adsp1800_mmu.c b/hw/microblaze/petalogix_s3adsp1800_mmu.c index 9d959d1..505639c 100644 --- a/hw/microblaze/petalogix_s3adsp1800_mmu.c +++ b/hw/microblaze/petalogix_s3adsp1800_mmu.c @@ -100,8 +100,11 @@ petalogix_s3adsp1800_init(MachineState *machine) irq[i] = qdev_get_gpio_in(dev, i); } - xilinx_uartlite_create(UARTLITE_BASEADDR, irq[UARTLITE_IRQ], - serial_hd(0)); + dev = qdev_new(TYPE_XILINX_UARTLITE); + qdev_prop_set_chr(dev, "chardev", serial_hd(0)); + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, UARTLITE_BASEADDR); + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, irq[UARTLITE_IRQ]); /* 2 timers at irq 2 @ 62 Mhz. */ dev = qdev_new("xlnx.xps-timer"); diff --git a/include/hw/char/xilinx_uartlite.h b/include/hw/char/xilinx_uartlite.h index 753d3a4..36d4e84 100644 --- a/include/hw/char/xilinx_uartlite.h +++ b/include/hw/char/xilinx_uartlite.h @@ -15,29 +15,9 @@ #ifndef XILINX_UARTLITE_H #define XILINX_UARTLITE_H -#include "hw/qdev-properties.h" -#include "hw/sysbus.h" -#include "qapi/error.h" #include "qom/object.h" #define TYPE_XILINX_UARTLITE "xlnx.xps-uartlite" OBJECT_DECLARE_SIMPLE_TYPE(XilinxUARTLite, XILINX_UARTLITE) -static inline DeviceState *xilinx_uartlite_create(hwaddr addr, - qemu_irq irq, - Chardev *chr) -{ - DeviceState *dev; - SysBusDevice *s; - - dev = qdev_new(TYPE_XILINX_UARTLITE); - s = SYS_BUS_DEVICE(dev); - qdev_prop_set_chr(dev, "chardev", chr); - sysbus_realize_and_unref(s, &error_fatal); - sysbus_mmio_map(s, 0, addr); - sysbus_connect_irq(s, 0, irq); - - return dev; -} - #endif -- cgit v1.1 From 4ab694b9a81df82f3ac7ce1e59f7855c57af2eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:12 +0100 Subject: hw/char/cmsdk-apb-uart: Open-code cmsdk_apb_uart_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cmsdk_apb_uart_create() is only used twice in the same file. Open-code it. 
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Peter Maydell Message-id: 20230220115114.25237-7-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/arm/mps2.c | 41 ++++++++++++++++++++++++++-------------- include/hw/char/cmsdk-apb-uart.h | 34 --------------------------------- 2 files changed, 27 insertions(+), 48 deletions(-) diff --git a/hw/arm/mps2.c b/hw/arm/mps2.c index a86a994..d92fd60 100644 --- a/hw/arm/mps2.c +++ b/hw/arm/mps2.c @@ -35,6 +35,7 @@ #include "hw/boards.h" #include "exec/address-spaces.h" #include "sysemu/sysemu.h" +#include "hw/qdev-properties.h" #include "hw/misc/unimp.h" #include "hw/char/cmsdk-apb-uart.h" #include "hw/timer/cmsdk-apb-timer.h" @@ -282,6 +283,9 @@ static void mps2_common_init(MachineState *machine) qdev_connect_gpio_out(orgate_dev, 0, qdev_get_gpio_in(armv7m, 12)); for (i = 0; i < 5; i++) { + DeviceState *dev; + SysBusDevice *s; + static const hwaddr uartbase[] = {0x40004000, 0x40005000, 0x40006000, 0x40007000, 0x40009000}; @@ -294,12 +298,16 @@ static void mps2_common_init(MachineState *machine) rxovrint = qdev_get_gpio_in(orgate_dev, i * 2 + 1); } - cmsdk_apb_uart_create(uartbase[i], - qdev_get_gpio_in(armv7m, uartirq[i] + 1), - qdev_get_gpio_in(armv7m, uartirq[i]), - txovrint, rxovrint, - NULL, - serial_hd(i), SYSCLK_FRQ); + dev = qdev_new(TYPE_CMSDK_APB_UART); + s = SYS_BUS_DEVICE(dev); + qdev_prop_set_chr(dev, "chardev", serial_hd(i)); + qdev_prop_set_uint32(dev, "pclk-frq", SYSCLK_FRQ); + sysbus_realize_and_unref(s, &error_fatal); + sysbus_mmio_map(s, 0, uartbase[i]); + sysbus_connect_irq(s, 0, qdev_get_gpio_in(armv7m, uartirq[i] + 1)); + sysbus_connect_irq(s, 1, qdev_get_gpio_in(armv7m, uartirq[i])); + sysbus_connect_irq(s, 2, txovrint); + sysbus_connect_irq(s, 3, rxovrint); } break; } @@ -324,7 +332,8 @@ static void mps2_common_init(MachineState *machine) 0x4002c000, 0x4002d000, 0x4002e000}; Object *txrx_orgate; - DeviceState *txrx_orgate_dev; + DeviceState *txrx_orgate_dev, *dev; + SysBusDevice *s; txrx_orgate = object_new(TYPE_OR_IRQ); object_property_set_int(txrx_orgate, "num-lines", 2, &error_fatal); @@ -332,13 +341,17 @@ static void mps2_common_init(MachineState *machine) txrx_orgate_dev = DEVICE(txrx_orgate); qdev_connect_gpio_out(txrx_orgate_dev, 0, qdev_get_gpio_in(armv7m, uart_txrx_irqno[i])); - cmsdk_apb_uart_create(uartbase[i], - qdev_get_gpio_in(txrx_orgate_dev, 0), - qdev_get_gpio_in(txrx_orgate_dev, 1), - qdev_get_gpio_in(orgate_dev, i * 2), - qdev_get_gpio_in(orgate_dev, i * 2 + 1), - NULL, - serial_hd(i), SYSCLK_FRQ); + + dev = qdev_new(TYPE_CMSDK_APB_UART); + s = SYS_BUS_DEVICE(dev); + qdev_prop_set_chr(dev, "chardev", serial_hd(i)); + qdev_prop_set_uint32(dev, "pclk-frq", SYSCLK_FRQ); + sysbus_realize_and_unref(s, &error_fatal); + sysbus_mmio_map(s, 0, uartbase[i]); + sysbus_connect_irq(s, 0, qdev_get_gpio_in(txrx_orgate_dev, 0)); + sysbus_connect_irq(s, 1, qdev_get_gpio_in(txrx_orgate_dev, 1)); + sysbus_connect_irq(s, 2, qdev_get_gpio_in(orgate_dev, i * 2)); + sysbus_connect_irq(s, 3, qdev_get_gpio_in(orgate_dev, i * 2 + 1)); } break; } diff --git a/include/hw/char/cmsdk-apb-uart.h b/include/hw/char/cmsdk-apb-uart.h index 64b0a3d..7de8f8d 100644 --- a/include/hw/char/cmsdk-apb-uart.h +++ b/include/hw/char/cmsdk-apb-uart.h @@ -12,10 +12,8 @@ #ifndef CMSDK_APB_UART_H #define CMSDK_APB_UART_H -#include "hw/qdev-properties.h" #include "hw/sysbus.h" #include "chardev/char-fe.h" -#include "qapi/error.h" #include "qom/object.h" #define TYPE_CMSDK_APB_UART "cmsdk-apb-uart" @@ -45,36 +43,4 @@ struct CMSDKAPBUART { uint8_t 
rxbuf; }; -/** - * cmsdk_apb_uart_create - convenience function to create TYPE_CMSDK_APB_UART - * @addr: location in system memory to map registers - * @chr: Chardev backend to connect UART to, or NULL if no backend - * @pclk_frq: frequency in Hz of the PCLK clock (used for calculating baud rate) - */ -static inline DeviceState *cmsdk_apb_uart_create(hwaddr addr, - qemu_irq txint, - qemu_irq rxint, - qemu_irq txovrint, - qemu_irq rxovrint, - qemu_irq uartint, - Chardev *chr, - uint32_t pclk_frq) -{ - DeviceState *dev; - SysBusDevice *s; - - dev = qdev_new(TYPE_CMSDK_APB_UART); - s = SYS_BUS_DEVICE(dev); - qdev_prop_set_chr(dev, "chardev", chr); - qdev_prop_set_uint32(dev, "pclk-frq", pclk_frq); - sysbus_realize_and_unref(s, &error_fatal); - sysbus_mmio_map(s, 0, addr); - sysbus_connect_irq(s, 0, txint); - sysbus_connect_irq(s, 1, rxint); - sysbus_connect_irq(s, 2, txovrint); - sysbus_connect_irq(s, 3, rxovrint); - sysbus_connect_irq(s, 4, uartint); - return dev; -} - #endif -- cgit v1.1 From 7c72f2196ac3d1fe46e4f0bff103beef2b555633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:13 +0100 Subject: hw/timer/cmsdk-apb-timer: Remove unused 'qdev-properties.h' header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Message-id: 20230220115114.25237-8-philmd@linaro.org Signed-off-by: Peter Maydell --- include/hw/timer/cmsdk-apb-timer.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/hw/timer/cmsdk-apb-timer.h b/include/hw/timer/cmsdk-apb-timer.h index c4c7eae..2dd615d 100644 --- a/include/hw/timer/cmsdk-apb-timer.h +++ b/include/hw/timer/cmsdk-apb-timer.h @@ -12,7 +12,6 @@ #ifndef CMSDK_APB_TIMER_H #define CMSDK_APB_TIMER_H -#include "hw/qdev-properties.h" #include "hw/sysbus.h" #include "hw/ptimer.h" #include "hw/clock.h" -- cgit v1.1 From d4fb55a6083c8535ad7b46019ae3cbc1be5497e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 20 Feb 2023 12:51:14 +0100 Subject: hw/intc/armv7m_nvic: Use QOM cast CPU() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid accessing 'parent_obj' directly. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alex Bennée Message-id: 20230220115114.25237-9-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/intc/armv7m_nvic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c index 8e28905..63afe1f 100644 --- a/hw/intc/armv7m_nvic.c +++ b/hw/intc/armv7m_nvic.c @@ -578,7 +578,7 @@ static void do_armv7m_nvic_set_pending(void *opaque, int irq, bool secure, * which saves having to have an extra argument is_terminal * that we'd only use in one place. */ - cpu_abort(&s->cpu->parent_obj, + cpu_abort(CPU(s->cpu), "Lockup: can't take terminal derived exception " "(original exception priority %d)\n", s->vectpending_prio); @@ -644,7 +644,7 @@ static void do_armv7m_nvic_set_pending(void *opaque, int irq, bool secure, * Lockup condition due to a guest bug. We don't model * Lockup, so report via cpu_abort() instead. */ - cpu_abort(&s->cpu->parent_obj, + cpu_abort(CPU(s->cpu), "Lockup: can't escalate %d to HardFault " "(current priority %d)\n", irq, running); } @@ -742,7 +742,7 @@ void armv7m_nvic_set_pending_lazyfp(NVICState *s, int irq, bool secure) * We want to escalate to HardFault but the context the * FP state belongs to prevents the exception pre-empting. 
*/ - cpu_abort(&s->cpu->parent_obj, + cpu_abort(CPU(s->cpu), "Lockup: can't escalate to HardFault during " "lazy FP register stacking\n"); } -- cgit v1.1 From 799d6a3c9923308f8ae7d12438e0723c4eadd124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Mon, 9 Jan 2023 10:26:09 +0100 Subject: hw/arm/musicpal: Remove unused dummy MemoryRegion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Peter Maydell --- hw/arm/musicpal.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hw/arm/musicpal.c b/hw/arm/musicpal.c index 89b6660..06d9add 100644 --- a/hw/arm/musicpal.c +++ b/hw/arm/musicpal.c @@ -1072,7 +1072,6 @@ struct musicpal_key_state { SysBusDevice parent_obj; /*< public >*/ - MemoryRegion iomem; uint32_t kbd_extended; uint32_t pressed_keys; qemu_irq out[8]; @@ -1161,9 +1160,6 @@ static void musicpal_key_init(Object *obj) DeviceState *dev = DEVICE(sbd); musicpal_key_state *s = MUSICPAL_KEY(dev); - memory_region_init(&s->iomem, obj, "dummy", 0); - sysbus_init_mmio(sbd, &s->iomem); - s->kbd_extended = 0; s->pressed_keys = 0; -- cgit v1.1 From dc892d1e4c8d3682d624725d7cfeee90f35d6eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 13 Jan 2023 21:01:35 +0100 Subject: iothread: Remove unused IOThreadClass / IOTHREAD_CLASS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit be8d853766 ("iothread: add I/O thread object") we never used IOThreadClass / IOTHREAD_CLASS() / IOTHREAD_GET_CLASS(), remove these definitions. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Alistair Francis Message-id: 20230113200138.52869-2-philmd@linaro.org Signed-off-by: Peter Maydell --- iothread.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/iothread.c b/iothread.c index 3862a64..b41c305 100644 --- a/iothread.c +++ b/iothread.c @@ -25,10 +25,6 @@ #include "qemu/rcu.h" #include "qemu/main-loop.h" -typedef ObjectClass IOThreadClass; - -DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD, - TYPE_IOTHREAD) #ifdef CONFIG_POSIX /* Benchmark results from 2016 on NVMe SSD drives show max polling times around -- cgit v1.1 From 4703f6c2f7a982ccc6e970ad3010bfe1203a828d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 13 Jan 2023 21:01:36 +0100 Subject: hw/irq: Declare QOM macros using OBJECT_DECLARE_SIMPLE_TYPE() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QOM *DECLARE* macros expect a typedef as first argument, not a structure. Replace 'struct IRQState' by 'IRQState' to avoid when modifying the macros: ../hw/core/irq.c:29:1: error: declaration of anonymous struct must be a definition DECLARE_INSTANCE_CHECKER(struct IRQState, IRQ, ^ Use OBJECT_DECLARE_SIMPLE_TYPE instead of DECLARE_INSTANCE_CHECKER. 
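As background, a minimal sketch of the declaration pattern this patch switches to (FooState, FOO and TYPE_FOO are illustrative names invented for this example, not QEMU code): OBJECT_DECLARE_SIMPLE_TYPE() emits both the instance typedef and the instance-cast macro, which is why it has to be given a plain type name rather than a 'struct' tag.

    #include "qemu/osdep.h"
    #include "qom/object.h"

    #define TYPE_FOO "x-foo"

    /* Declares "typedef struct FooState FooState;" plus the FOO()
     * instance-cast macro keyed on TYPE_FOO.
     */
    OBJECT_DECLARE_SIMPLE_TYPE(FooState, FOO)

    struct FooState {
        Object parent_obj;
        int level;
    };

    /* Once the type is registered with a TypeInfo (instance_size =
     * sizeof(FooState), parent = TYPE_OBJECT, ...), callers use the
     * generated typedef and cast macro directly:
     */
    static FooState *foo_new(void)
    {
        return FOO(object_new(TYPE_FOO));
    }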
Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Alistair Francis Message-id: 20230113200138.52869-3-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/core/irq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/hw/core/irq.c b/hw/core/irq.c index 3623f71..3f14e2d 100644 --- a/hw/core/irq.c +++ b/hw/core/irq.c @@ -26,8 +26,7 @@ #include "hw/irq.h" #include "qom/object.h" -DECLARE_INSTANCE_CHECKER(struct IRQState, IRQ, - TYPE_IRQ) +OBJECT_DECLARE_SIMPLE_TYPE(IRQState, IRQ) struct IRQState { Object parent_obj; @@ -68,7 +67,7 @@ qemu_irq *qemu_allocate_irqs(qemu_irq_handler handler, void *opaque, int n) qemu_irq qemu_allocate_irq(qemu_irq_handler handler, void *opaque, int n) { - struct IRQState *irq; + IRQState *irq; irq = IRQ(object_new(TYPE_IRQ)); irq->handler = handler; @@ -94,7 +93,7 @@ void qemu_free_irq(qemu_irq irq) static void qemu_notirq(void *opaque, int line, int level) { - struct IRQState *irq = opaque; + IRQState *irq = opaque; irq->handler(irq->opaque, irq->n, !level); } @@ -120,7 +119,7 @@ void qemu_irq_intercept_in(qemu_irq *gpio_in, qemu_irq_handler handler, int n) static const TypeInfo irq_type_info = { .name = TYPE_IRQ, .parent = TYPE_OBJECT, - .instance_size = sizeof(struct IRQState), + .instance_size = sizeof(IRQState), }; static void irq_register_types(void) -- cgit v1.1 From 60d3ccfbe119974d3adda799de85cdae698c5434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 13 Jan 2023 21:01:37 +0100 Subject: hw/or-irq: Declare QOM macros using OBJECT_DECLARE_SIMPLE_TYPE() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missed during automatic conversion from commit 8063396bf3 ("Use OBJECT_DECLARE_SIMPLE_TYPE when possible"). Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Alistair Francis Message-id: 20230113200138.52869-4-philmd@linaro.org Signed-off-by: Peter Maydell --- include/hw/or-irq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/hw/or-irq.h b/include/hw/or-irq.h index f2f0a27..131abc2 100644 --- a/include/hw/or-irq.h +++ b/include/hw/or-irq.h @@ -37,8 +37,7 @@ typedef struct OrIRQState qemu_or_irq; -DECLARE_INSTANCE_CHECKER(qemu_or_irq, OR_IRQ, - TYPE_OR_IRQ) +OBJECT_DECLARE_SIMPLE_TYPE(OrIRQState, OR_IRQ) struct OrIRQState { DeviceState parent_obj; -- cgit v1.1 From e844f0c5d0bd2c4d8d3c1622eb2a88586c9c4677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 13 Jan 2023 21:01:38 +0100 Subject: hw: Replace qemu_or_irq typedef by OrIRQState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OBJECT_DECLARE_SIMPLE_TYPE() macro provides the OrIRQState declaration for free. Besides, the QOM code style is to use the structure name as typedef, and QEMU style is to use Camel Case, so rename qemu_or_irq as OrIRQState. 
Mechanical change using: $ sed -i -e 's/qemu_or_irq/OrIRQState/g' $(git grep -l qemu_or_irq) Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Alistair Francis Message-id: 20230113200138.52869-5-philmd@linaro.org Signed-off-by: Peter Maydell --- hw/arm/exynos4210.c | 4 ++-- hw/arm/mps2-tz.c | 2 +- hw/core/or-irq.c | 18 +++++++++--------- hw/pci-host/raven.c | 2 +- include/hw/arm/armsse.h | 6 +++--- include/hw/arm/bcm2835_peripherals.h | 2 +- include/hw/arm/exynos4210.h | 4 ++-- include/hw/arm/stm32f205_soc.h | 2 +- include/hw/arm/stm32f405_soc.h | 2 +- include/hw/arm/xlnx-versal.h | 6 +++--- include/hw/arm/xlnx-zynqmp.h | 2 +- include/hw/or-irq.h | 2 -- 12 files changed, 25 insertions(+), 27 deletions(-) diff --git a/hw/arm/exynos4210.c b/hw/arm/exynos4210.c index 8dafa22..6f2dda1 100644 --- a/hw/arm/exynos4210.c +++ b/hw/arm/exynos4210.c @@ -507,7 +507,7 @@ static uint64_t exynos4210_calc_affinity(int cpu) return (0x9 << ARM_AFF1_SHIFT) | cpu; } -static DeviceState *pl330_create(uint32_t base, qemu_or_irq *orgate, +static DeviceState *pl330_create(uint32_t base, OrIRQState *orgate, qemu_irq irq, int nreq, int nevents, int width) { SysBusDevice *busdev; @@ -806,7 +806,7 @@ static void exynos4210_init(Object *obj) for (i = 0; i < ARRAY_SIZE(s->pl330_irq_orgate); i++) { char *name = g_strdup_printf("pl330-irq-orgate%d", i); - qemu_or_irq *orgate = &s->pl330_irq_orgate[i]; + OrIRQState *orgate = &s->pl330_irq_orgate[i]; object_initialize_child(obj, name, orgate, TYPE_OR_IRQ); g_free(name); diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c index 284c09c..07aecd9 100644 --- a/hw/arm/mps2-tz.c +++ b/hw/arm/mps2-tz.c @@ -152,7 +152,7 @@ struct MPS2TZMachineState { TZMSC msc[4]; CMSDKAPBUART uart[6]; SplitIRQ sec_resp_splitter; - qemu_or_irq uart_irq_orgate; + OrIRQState uart_irq_orgate; DeviceState *lan9118; SplitIRQ cpu_irq_splitter[MPS2TZ_NUMIRQ_MAX]; Clock *sysclk; diff --git a/hw/core/or-irq.c b/hw/core/or-irq.c index d8f3754..1df4bc0 100644 --- a/hw/core/or-irq.c +++ b/hw/core/or-irq.c @@ -31,7 +31,7 @@ static void or_irq_handler(void *opaque, int n, int level) { - qemu_or_irq *s = OR_IRQ(opaque); + OrIRQState *s = OR_IRQ(opaque); int or_level = 0; int i; @@ -46,7 +46,7 @@ static void or_irq_handler(void *opaque, int n, int level) static void or_irq_reset(DeviceState *dev) { - qemu_or_irq *s = OR_IRQ(dev); + OrIRQState *s = OR_IRQ(dev); int i; for (i = 0; i < MAX_OR_LINES; i++) { @@ -56,7 +56,7 @@ static void or_irq_reset(DeviceState *dev) static void or_irq_realize(DeviceState *dev, Error **errp) { - qemu_or_irq *s = OR_IRQ(dev); + OrIRQState *s = OR_IRQ(dev); assert(s->num_lines <= MAX_OR_LINES); @@ -65,7 +65,7 @@ static void or_irq_realize(DeviceState *dev, Error **errp) static void or_irq_init(Object *obj) { - qemu_or_irq *s = OR_IRQ(obj); + OrIRQState *s = OR_IRQ(obj); qdev_init_gpio_out(DEVICE(obj), &s->out_irq, 1); } @@ -84,7 +84,7 @@ static void or_irq_init(Object *obj) static bool vmstate_extras_needed(void *opaque) { - qemu_or_irq *s = OR_IRQ(opaque); + OrIRQState *s = OR_IRQ(opaque); return s->num_lines >= OLD_MAX_OR_LINES; } @@ -95,7 +95,7 @@ static const VMStateDescription vmstate_or_irq_extras = { .minimum_version_id = 1, .needed = vmstate_extras_needed, .fields = (VMStateField[]) { - VMSTATE_VARRAY_UINT16_UNSAFE(levels, qemu_or_irq, num_lines, 0, + VMSTATE_VARRAY_UINT16_UNSAFE(levels, OrIRQState, num_lines, 0, vmstate_info_bool, bool), VMSTATE_END_OF_LIST(), }, @@ -106,7 +106,7 @@ static const VMStateDescription vmstate_or_irq = { .version_id = 1, 
.minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_BOOL_SUB_ARRAY(levels, qemu_or_irq, 0, OLD_MAX_OR_LINES), + VMSTATE_BOOL_SUB_ARRAY(levels, OrIRQState, 0, OLD_MAX_OR_LINES), VMSTATE_END_OF_LIST(), }, .subsections = (const VMStateDescription*[]) { @@ -116,7 +116,7 @@ static const VMStateDescription vmstate_or_irq = { }; static Property or_irq_properties[] = { - DEFINE_PROP_UINT16("num-lines", qemu_or_irq, num_lines, 1), + DEFINE_PROP_UINT16("num-lines", OrIRQState, num_lines, 1), DEFINE_PROP_END_OF_LIST(), }; @@ -136,7 +136,7 @@ static void or_irq_class_init(ObjectClass *klass, void *data) static const TypeInfo or_irq_type_info = { .name = TYPE_OR_IRQ, .parent = TYPE_DEVICE, - .instance_size = sizeof(qemu_or_irq), + .instance_size = sizeof(OrIRQState), .instance_init = or_irq_init, .class_init = or_irq_class_init, }; diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c index cdfb62a..072ffe3 100644 --- a/hw/pci-host/raven.c +++ b/hw/pci-host/raven.c @@ -60,7 +60,7 @@ DECLARE_INSTANCE_CHECKER(PREPPCIState, RAVEN_PCI_HOST_BRIDGE, struct PRePPCIState { PCIHostState parent_obj; - qemu_or_irq *or_irq; + OrIRQState *or_irq; qemu_irq pci_irqs[PCI_NUM_PINS]; PCIBus pci_bus; AddressSpace pci_io_as; diff --git a/include/hw/arm/armsse.h b/include/hw/arm/armsse.h index 9648e7a..cd0931d 100644 --- a/include/hw/arm/armsse.h +++ b/include/hw/arm/armsse.h @@ -155,12 +155,12 @@ struct ARMSSE { TZPPC apb_ppc[NUM_INTERNAL_PPCS]; TZMPC mpc[IOTS_NUM_MPC]; CMSDKAPBTimer timer[3]; - qemu_or_irq ppc_irq_orgate; + OrIRQState ppc_irq_orgate; SplitIRQ sec_resp_splitter; SplitIRQ ppc_irq_splitter[NUM_PPCS]; SplitIRQ mpc_irq_splitter[IOTS_NUM_EXP_MPC + IOTS_NUM_MPC]; - qemu_or_irq mpc_irq_orgate; - qemu_or_irq nmi_orgate; + OrIRQState mpc_irq_orgate; + OrIRQState nmi_orgate; SplitIRQ cpu_irq_splitter[NUM_SSE_IRQS]; diff --git a/include/hw/arm/bcm2835_peripherals.h b/include/hw/arm/bcm2835_peripherals.h index c9d25d4..d724a2f 100644 --- a/include/hw/arm/bcm2835_peripherals.h +++ b/include/hw/arm/bcm2835_peripherals.h @@ -56,7 +56,7 @@ struct BCM2835PeripheralState { BCM2835AuxState aux; BCM2835FBState fb; BCM2835DMAState dma; - qemu_or_irq orgated_dma_irq; + OrIRQState orgated_dma_irq; BCM2835ICState ic; BCM2835PropertyState property; BCM2835RngState rng; diff --git a/include/hw/arm/exynos4210.h b/include/hw/arm/exynos4210.h index 97353f1..68db19f 100644 --- a/include/hw/arm/exynos4210.h +++ b/include/hw/arm/exynos4210.h @@ -96,8 +96,8 @@ struct Exynos4210State { MemoryRegion boot_secondary; MemoryRegion bootreg_mem; I2CBus *i2c_if[EXYNOS4210_I2C_NUMBER]; - qemu_or_irq pl330_irq_orgate[EXYNOS4210_NUM_DMA]; - qemu_or_irq cpu_irq_orgate[EXYNOS4210_NCPUS]; + OrIRQState pl330_irq_orgate[EXYNOS4210_NUM_DMA]; + OrIRQState cpu_irq_orgate[EXYNOS4210_NCPUS]; A9MPPrivState a9mpcore; Exynos4210GicState ext_gic; Exynos4210CombinerState int_combiner; diff --git a/include/hw/arm/stm32f205_soc.h b/include/hw/arm/stm32f205_soc.h index 849d3ed..5a4f776 100644 --- a/include/hw/arm/stm32f205_soc.h +++ b/include/hw/arm/stm32f205_soc.h @@ -63,7 +63,7 @@ struct STM32F205State { STM32F2XXADCState adc[STM_NUM_ADCS]; STM32F2XXSPIState spi[STM_NUM_SPIS]; - qemu_or_irq *adc_irqs; + OrIRQState *adc_irqs; MemoryRegion sram; MemoryRegion flash; diff --git a/include/hw/arm/stm32f405_soc.h b/include/hw/arm/stm32f405_soc.h index 249ab54..c968ce3 100644 --- a/include/hw/arm/stm32f405_soc.h +++ b/include/hw/arm/stm32f405_soc.h @@ -63,7 +63,7 @@ struct STM32F405State { STM32F4xxExtiState exti; STM32F2XXUsartState usart[STM_NUM_USARTS]; 
STM32F2XXTimerState timer[STM_NUM_TIMERS]; - qemu_or_irq adc_irqs; + OrIRQState adc_irqs; STM32F2XXADCState adc[STM_NUM_ADCS]; STM32F2XXSPIState spi[STM_NUM_SPIS]; diff --git a/include/hw/arm/xlnx-versal.h b/include/hw/arm/xlnx-versal.h index cbe8a19..b6786e9 100644 --- a/include/hw/arm/xlnx-versal.h +++ b/include/hw/arm/xlnx-versal.h @@ -85,7 +85,7 @@ struct Versal { } rpu; struct { - qemu_or_irq irq_orgate; + OrIRQState irq_orgate; XlnxXramCtrl ctrl[XLNX_VERSAL_NR_XRAM]; } xram; @@ -103,7 +103,7 @@ struct Versal { XlnxCSUDMA dma_src; XlnxCSUDMA dma_dst; MemoryRegion linear_mr; - qemu_or_irq irq_orgate; + OrIRQState irq_orgate; } ospi; } iou; @@ -113,7 +113,7 @@ struct Versal { XlnxVersalEFuseCtrl efuse_ctrl; XlnxVersalEFuseCache efuse_cache; - qemu_or_irq apb_irq_orgate; + OrIRQState apb_irq_orgate; } pmc; struct { diff --git a/include/hw/arm/xlnx-zynqmp.h b/include/hw/arm/xlnx-zynqmp.h index 20bdf89..687c75e 100644 --- a/include/hw/arm/xlnx-zynqmp.h +++ b/include/hw/arm/xlnx-zynqmp.h @@ -130,7 +130,7 @@ struct XlnxZynqMPState { XlnxZDMA gdma[XLNX_ZYNQMP_NUM_GDMA_CH]; XlnxZDMA adma[XLNX_ZYNQMP_NUM_ADMA_CH]; XlnxCSUDMA qspi_dma; - qemu_or_irq qspi_irq_orgate; + OrIRQState qspi_irq_orgate; XlnxZynqMPAPUCtrl apu_ctrl; XlnxZynqMPCRF crf; CadenceTTCState ttc[XLNX_ZYNQMP_NUM_TTC]; diff --git a/include/hw/or-irq.h b/include/hw/or-irq.h index 131abc2..c0a42f3 100644 --- a/include/hw/or-irq.h +++ b/include/hw/or-irq.h @@ -35,8 +35,6 @@ */ #define MAX_OR_LINES 48 -typedef struct OrIRQState qemu_or_irq; - OBJECT_DECLARE_SIMPLE_TYPE(OrIRQState, OR_IRQ) struct OrIRQState { -- cgit v1.1
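As a closing illustration of the renamed type, here is a minimal sketch of how an SoC model embeds and wires an IRQ OR gate after this series. MySoCState, my_soc_wire_uart_irqs, the "uart-irq-orgate" child name and the two-input configuration are invented for this example; the QOM/qdev calls are the same ones used by the boards touched above, and only the spelling of the embedded field's type changes.

    /* Minimal sketch (illustrative names, not part of the series). */
    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "qom/object.h"
    #include "hw/irq.h"
    #include "hw/or-irq.h"
    #include "hw/qdev-core.h"

    typedef struct MySoCState {
        DeviceState parent_obj;
        OrIRQState uart_irq_orgate;     /* was: qemu_or_irq uart_irq_orgate; */
    } MySoCState;

    static void my_soc_wire_uart_irqs(MySoCState *s, qemu_irq cpu_irq,
                                      qemu_irq uart_irq_in[2])
    {
        DeviceState *orgate;

        /* Create the OR gate as a QOM child, configure and realize it. */
        object_initialize_child(OBJECT(s), "uart-irq-orgate",
                                &s->uart_irq_orgate, TYPE_OR_IRQ);
        orgate = DEVICE(&s->uart_irq_orgate);
        object_property_set_int(OBJECT(orgate), "num-lines", 2, &error_fatal);
        qdev_realize(orgate, NULL, &error_fatal);

        /* Single output feeds the CPU; the two inputs go to the UARTs. */
        qdev_connect_gpio_out(orgate, 0, cpu_irq);
        uart_irq_in[0] = qdev_get_gpio_in(orgate, 0);
        uart_irq_in[1] = qdev_get_gpio_in(orgate, 1);
    }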