17 files changed, 3083 insertions, 583 deletions
diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs
index 356fbfc..dcd167e 100644
--- a/target-arm/Makefile.objs
+++ b/target-arm/Makefile.objs
@@ -1,8 +1,11 @@
 obj-y += arm-semi.o
 obj-$(CONFIG_SOFTMMU) += machine.o
 obj-$(CONFIG_KVM) += kvm.o
+obj-$(call land,$(CONFIG_KVM),$(call lnot,$(TARGET_AARCH64))) += kvm32.o
+obj-$(call land,$(CONFIG_KVM),$(TARGET_AARCH64)) += kvm64.o
 obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o
 obj-y += translate.o op_helper.o helper.o cpu.o
 obj-y += neon_helper.o iwmmxt_helper.o
 obj-y += gdbstub.o
-obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o gdbstub64.o
+obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o helper-a64.o gdbstub64.o
+obj-y += crypto_helper.o
diff --git a/target-arm/cpu-qom.h b/target-arm/cpu-qom.h
index f32178a..afbd422 100644
--- a/target-arm/cpu-qom.h
+++ b/target-arm/cpu-qom.h
@@ -139,6 +139,7 @@ typedef struct ARMCPU {
     uint32_t ccsidr[16];
     uint32_t reset_cbar;
     uint32_t reset_auxcr;
+    bool reset_hivecs;
 } ARMCPU;
 
 #define TYPE_AARCH64_CPU "aarch64-cpu"
diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index 0635e78..408d207 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -21,6 +21,7 @@
 #include "cpu.h"
 #include "qemu-common.h"
 #include "hw/qdev-properties.h"
+#include "qapi/qmp/qerror.h"
 #if !defined(CONFIG_USER_ONLY)
 #include "hw/loader.h"
 #endif
@@ -88,6 +89,12 @@ static void arm_cpu_reset(CPUState *s)
     if (arm_feature(env, ARM_FEATURE_AARCH64)) {
         /* 64 bit CPUs always start in 64 bit mode */
         env->aarch64 = 1;
+#if defined(CONFIG_USER_ONLY)
+        env->pstate = PSTATE_MODE_EL0t;
+#else
+        env->pstate = PSTATE_D | PSTATE_A | PSTATE_I | PSTATE_F
+            | PSTATE_MODE_EL1h;
+#endif
     }
 
 #if defined(CONFIG_USER_ONLY)
@@ -120,6 +127,11 @@ static void arm_cpu_reset(CPUState *s)
             env->regs[15] = pc & ~1;
         }
     }
+
+    if (env->cp15.c1_sys & (1 << 13)) {
+            env->regs[15] = 0xFFFF0000;
+    }
+
     env->vfp.xregs[ARM_VFP_FPEXC] = 0;
 #endif
     set_flush_to_zero(1, &env->vfp.standard_fp_status);
@@ -231,6 +243,30 @@ static void arm_cpu_initfn(Object *obj)
     }
 }
 
+static Property arm_cpu_reset_cbar_property =
+            DEFINE_PROP_UINT32("reset-cbar", ARMCPU, reset_cbar, 0);
+
+static Property arm_cpu_reset_hivecs_property =
+            DEFINE_PROP_BOOL("reset-hivecs", ARMCPU, reset_hivecs, false);
+
+static void arm_cpu_post_init(Object *obj)
+{
+    ARMCPU *cpu = ARM_CPU(obj);
+    Error *err = NULL;
+
+    if (arm_feature(&cpu->env, ARM_FEATURE_CBAR)) {
+        qdev_property_add_static(DEVICE(obj), &arm_cpu_reset_cbar_property,
+                                 &err);
+        assert_no_error(err);
+    }
+
+    if (!arm_feature(&cpu->env, ARM_FEATURE_M)) {
+        qdev_property_add_static(DEVICE(obj), &arm_cpu_reset_hivecs_property,
+                                 &err);
+        assert_no_error(err);
+    }
+}
+
 static void arm_cpu_finalizefn(Object *obj)
 {
     ARMCPU *cpu = ARM_CPU(obj);
@@ -249,6 +285,7 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
         set_feature(env, ARM_FEATURE_V7);
         set_feature(env, ARM_FEATURE_ARM_DIV);
         set_feature(env, ARM_FEATURE_LPAE);
+        set_feature(env, ARM_FEATURE_V8_AES);
     }
     if (arm_feature(env, ARM_FEATURE_V7)) {
         set_feature(env, ARM_FEATURE_VAPA);
@@ -290,6 +327,10 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
         set_feature(env, ARM_FEATURE_PXN);
     }
 
+    if (cpu->reset_hivecs) {
+            cpu->reset_sctlr |= (1 << 13);
+    }
+
     register_cp_regs_for_features(cpu);
     arm_cpu_register_gdb_regs_for_features(cpu);
 
@@ -616,6 +657,7 @@ static void cortex_a9_initfn(Object *obj)
      * and valid configurations; we don't model A9UP).
      */
     set_feature(&cpu->env, ARM_FEATURE_V7MP);
+    set_feature(&cpu->env, ARM_FEATURE_CBAR);
     cpu->midr = 0x410fc090;
     cpu->reset_fpsid = 0x41033090;
     cpu->mvfr0 = 0x11110222;
@@ -638,15 +680,7 @@ static void cortex_a9_initfn(Object *obj)
     cpu->clidr = (1 << 27) | (1 << 24) | 3;
     cpu->ccsidr[0] = 0xe00fe015; /* 16k L1 dcache. */
     cpu->ccsidr[1] = 0x200fe015; /* 16k L1 icache. */
-    {
-        ARMCPRegInfo cbar = {
-            .name = "CBAR", .cp = 15, .crn = 15,  .crm = 0, .opc1 = 4,
-            .opc2 = 0, .access = PL1_R|PL3_W, .resetvalue = cpu->reset_cbar,
-            .fieldoffset = offsetof(CPUARMState, cp15.c15_config_base_address)
-        };
-        define_one_arm_cp_reg(cpu, &cbar);
-        define_arm_cp_regs(cpu, cortexa9_cp_reginfo);
-    }
+    define_arm_cp_regs(cpu, cortexa9_cp_reginfo);
 }
 
 #ifndef CONFIG_USER_ONLY
@@ -685,6 +719,7 @@ static void cortex_a15_initfn(Object *obj)
     set_feature(&cpu->env, ARM_FEATURE_ARM_DIV);
     set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER);
     set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
+    set_feature(&cpu->env, ARM_FEATURE_CBAR);
     set_feature(&cpu->env, ARM_FEATURE_LPAE);
     cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A15;
     cpu->midr = 0x412fc0f1;
@@ -999,6 +1034,7 @@ static const TypeInfo arm_cpu_type_info = {
     .parent = TYPE_CPU,
     .instance_size = sizeof(ARMCPU),
     .instance_init = arm_cpu_initfn,
+    .instance_post_init = arm_cpu_post_init,
     .instance_finalize = arm_cpu_finalizefn,
     .abstract = true,
     .class_size = sizeof(ARMCPUClass),
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index c3f007f..56ed591 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -113,8 +113,15 @@ typedef struct CPUARMState {
     /* Regs for A64 mode.  */
     uint64_t xregs[32];
     uint64_t pc;
-    /* TODO: pstate doesn't correspond to an architectural register;
-     * it would be better modelled as the underlying fields.
+    /* PSTATE isn't an architectural register for ARMv8. However, it is
+     * convenient for us to assemble the underlying state into a 32 bit format
+     * identical to the architectural format used for the SPSR. (This is also
+     * what the Linux kernel's 'pstate' field in signal handlers and KVM's
+     * 'pstate' register are.) Of the PSTATE bits:
+     *  NZCV are kept in the split out env->CF/VF/NF/ZF, (which have the same
+     *    semantics as for AArch32, as described in the comments on each field)
+     *  nRW (also known as M[4]) is kept, inverted, in env->aarch64
+     *  all other bits are stored in their correct places in env->pstate
      */
     uint32_t pstate;
     uint32_t aarch64; /* 1 if CPU is in aarch64 state; inverse of PSTATE.nRW */
@@ -309,15 +316,6 @@ static inline bool is_a64(CPUARMState *env)
     return env->aarch64;
 }
 
-#define PSTATE_N_SHIFT 3
-#define PSTATE_N  (1 << PSTATE_N_SHIFT)
-#define PSTATE_Z_SHIFT 2
-#define PSTATE_Z  (1 << PSTATE_Z_SHIFT)
-#define PSTATE_C_SHIFT 1
-#define PSTATE_C  (1 << PSTATE_C_SHIFT)
-#define PSTATE_V_SHIFT 0
-#define PSTATE_V  (1 << PSTATE_V_SHIFT)
-
 /* you can call this signal handler from your SIGBUS and SIGSEGV
    signal handlers to inform the virtual CPU of exceptions. non zero
    is returned if the signal was handled by the virtual CPU.  */
@@ -352,6 +350,56 @@ int cpu_arm_handle_mmu_fault (CPUARMState *env, target_ulong address, int rw,
 /* Execution state bits.  MRS read as zero, MSR writes ignored.  */
 #define CPSR_EXEC (CPSR_T | CPSR_IT | CPSR_J)
 
+/* Bit definitions for ARMv8 SPSR (PSTATE) format.
+ * Only these are valid when in AArch64 mode; in
+ * AArch32 mode SPSRs are basically CPSR-format.
+ */
+#define PSTATE_M (0xFU)
+#define PSTATE_nRW (1U << 4)
+#define PSTATE_F (1U << 6)
+#define PSTATE_I (1U << 7)
+#define PSTATE_A (1U << 8)
+#define PSTATE_D (1U << 9)
+#define PSTATE_IL (1U << 20)
+#define PSTATE_SS (1U << 21)
+#define PSTATE_V (1U << 28)
+#define PSTATE_C (1U << 29)
+#define PSTATE_Z (1U << 30)
+#define PSTATE_N (1U << 31)
+#define PSTATE_NZCV (PSTATE_N | PSTATE_Z | PSTATE_C | PSTATE_V)
+#define CACHED_PSTATE_BITS (PSTATE_NZCV)
+/* Mode values for AArch64 */
+#define PSTATE_MODE_EL3h 13
+#define PSTATE_MODE_EL3t 12
+#define PSTATE_MODE_EL2h 9
+#define PSTATE_MODE_EL2t 8
+#define PSTATE_MODE_EL1h 5
+#define PSTATE_MODE_EL1t 4
+#define PSTATE_MODE_EL0t 0
+
+/* Return the current PSTATE value. For the moment we don't support 32<->64 bit
+ * interprocessing, so we don't attempt to sync with the cpsr state used by
+ * the 32 bit decoder.
+ */
+static inline uint32_t pstate_read(CPUARMState *env)
+{
+    int ZF;
+
+    ZF = (env->ZF == 0);
+    return (env->NF & 0x80000000) | (ZF << 30)
+        | (env->CF << 29) | ((env->VF & 0x80000000) >> 3)
+        | env->pstate;
+}
+
+static inline void pstate_write(CPUARMState *env, uint32_t val)
+{
+    env->ZF = (~val) & PSTATE_Z;
+    env->NF = val;
+    env->CF = (val >> 29) & 1;
+    env->VF = (val << 3) & 0x80000000;
+    env->pstate = val & ~CACHED_PSTATE_BITS;
+}
+
 /* Return the current CPSR value.  */
 uint32_t cpsr_read(CPUARMState *env);
 /* Set the CPSR.  Note that some bits of mask must be all-set or all-clear.  */
@@ -399,6 +447,34 @@ static inline void xpsr_write(CPUARMState *env, uint32_t val, uint32_t mask)
 uint32_t vfp_get_fpscr(CPUARMState *env);
 void vfp_set_fpscr(CPUARMState *env, uint32_t val);
 
+/* For A64 the FPSCR is split into two logically distinct registers,
+ * FPCR and FPSR. However since they still use non-overlapping bits
+ * we store the underlying state in fpscr and just mask on read/write.
+ */
+#define FPSR_MASK 0xf800009f
+#define FPCR_MASK 0x07f79f00
+static inline uint32_t vfp_get_fpsr(CPUARMState *env)
+{
+    return vfp_get_fpscr(env) & FPSR_MASK;
+}
+
+static inline void vfp_set_fpsr(CPUARMState *env, uint32_t val)
+{
+    uint32_t new_fpscr = (vfp_get_fpscr(env) & ~FPSR_MASK) | (val & FPSR_MASK);
+    vfp_set_fpscr(env, new_fpscr);
+}
+
+static inline uint32_t vfp_get_fpcr(CPUARMState *env)
+{
+    return vfp_get_fpscr(env) & FPCR_MASK;
+}
+
+static inline void vfp_set_fpcr(CPUARMState *env, uint32_t val)
+{
+    uint32_t new_fpscr = (vfp_get_fpscr(env) & ~FPCR_MASK) | (val & FPCR_MASK);
+    vfp_set_fpscr(env, new_fpscr);
+}
+
 enum arm_cpu_mode {
   ARM_CPU_MODE_USR = 0x10,
   ARM_CPU_MODE_FIQ = 0x11,
@@ -467,6 +543,8 @@ enum arm_features {
     ARM_FEATURE_LPAE, /* has Large Physical Address Extension */
     ARM_FEATURE_V8,
     ARM_FEATURE_AARCH64, /* supports 64 bit mode */
+    ARM_FEATURE_V8_AES, /* implements AES part of v8 Crypto Extensions */
+    ARM_FEATURE_CBAR, /* has cp15 CBAR */
 };
 
 static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target-arm/cpu64.c b/target-arm/cpu64.c
index 3e99c21..04ce879 100644
--- a/target-arm/cpu64.c
+++ b/target-arm/cpu64.c
@@ -68,11 +68,22 @@ static void aarch64_cpu_finalizefn(Object *obj)
 {
 }
 
+static void aarch64_cpu_set_pc(CPUState *cs, vaddr value)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    /*
+     * TODO: this will need updating for system emulation,
+     * when the core may be in AArch32 mode.
+     */
+    cpu->env.pc = value;
+}
+
 static void aarch64_cpu_class_init(ObjectClass *oc, void *data)
 {
     CPUClass *cc = CPU_CLASS(oc);
 
     cc->dump_state = aarch64_cpu_dump_state;
+    cc->set_pc = aarch64_cpu_set_pc;
     cc->gdb_read_register = aarch64_cpu_gdb_read_register;
     cc->gdb_write_register = aarch64_cpu_gdb_write_register;
     cc->gdb_num_core_regs = 34;
diff --git a/target-arm/crypto_helper.c b/target-arm/crypto_helper.c
new file mode 100644
index 0000000..f94be69
--- /dev/null
+++ b/target-arm/crypto_helper.c
@@ -0,0 +1,281 @@
+/*
+ * crypto_helper.c - emulate v8 Crypto Extensions instructions
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ */
+
+#include <stdlib.h>
+
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "helper.h"
+
+union AES_STATE {
+    uint8_t    bytes[16];
+    uint32_t   cols[4];
+    uint64_t   l[2];
+};
+
+void HELPER(crypto_aese)(CPUARMState *env, uint32_t rd, uint32_t rm,
+                         uint32_t decrypt)
+{
+    static uint8_t const sbox[][256] = { {
+        /* S-box for encryption */
+        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+        0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+        0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+        0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+        0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+        0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+        0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+        0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+        0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+        0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+        0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+        0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+        0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+        0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+        0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+        0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+        0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+        0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+        0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+        0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+        0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+        0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+    }, {
+        /* S-box for decryption */
+        0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+        0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+        0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+        0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+        0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+        0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+        0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+        0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+        0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+        0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+        0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+        0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+        0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+        0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+        0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+        0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+        0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+        0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+        0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+        0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+        0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+        0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+        0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+        0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+        0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+        0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+        0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+        0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+        0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+        0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+        0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+        0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+    } };
+    static uint8_t const shift[][16] = {
+        /* ShiftRows permutation vector for encryption */
+        { 0,  5, 10, 15, 4, 9, 14,  3, 8, 13, 2,  7, 12, 1, 6, 11 },
+        /* ShiftRows permutation vector for decryption */
+        { 0, 13, 10,  7, 4, 1, 14, 11, 8,  5, 2, 15, 12, 9, 6,  3 },
+    };
+    union AES_STATE rk = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+    union AES_STATE st = { .l = {
+        float64_val(env->vfp.regs[rd]),
+        float64_val(env->vfp.regs[rd + 1])
+    } };
+    int i;
+
+    assert(decrypt < 2);
+
+    /* xor state vector with round key */
+    rk.l[0] ^= st.l[0];
+    rk.l[1] ^= st.l[1];
+
+    /* combine ShiftRows operation and sbox substitution */
+    for (i = 0; i < 16; i++) {
+        st.bytes[i] = sbox[decrypt][rk.bytes[shift[decrypt][i]]];
+    }
+
+    env->vfp.regs[rd] = make_float64(st.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(st.l[1]);
+}
+
+void HELPER(crypto_aesmc)(CPUARMState *env, uint32_t rd, uint32_t rm,
+                          uint32_t decrypt)
+{
+    static uint32_t const mc[][256] = { {
+        /* MixColumns lookup table */
+        0x00000000, 0x03010102, 0x06020204, 0x05030306,
+        0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e,
+        0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16,
+        0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e,
+        0x30101020, 0x33111122, 0x36121224, 0x35131326,
+        0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e,
+        0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36,
+        0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e,
+        0x60202040, 0x63212142, 0x66222244, 0x65232346,
+        0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e,
+        0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56,
+        0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e,
+        0x50303060, 0x53313162, 0x56323264, 0x55333366,
+        0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e,
+        0x48383870, 0x4b393972, 0x4e3a3a74, 0x4d3b3b76,
+        0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e,
+        0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386,
+        0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e,
+        0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96,
+        0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e,
+        0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6,
+        0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae,
+        0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6,
+        0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe,
+        0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6,
+        0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce,
+        0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6,
+        0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde,
+        0x907070e0, 0x937171e2, 0x967272e4, 0x957373e6,
+        0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee,
+        0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6,
+        0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe,
+        0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d,
+        0x97848413, 0x94858511, 0x91868617, 0x92878715,
+        0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d,
+        0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05,
+        0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d,
+        0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735,
+        0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d,
+        0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25,
+        0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d,
+        0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755,
+        0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d,
+        0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45,
+        0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d,
+        0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775,
+        0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d,
+        0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65,
+        0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d,
+        0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795,
+        0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d,
+        0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85,
+        0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd,
+        0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5,
+        0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad,
+        0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5,
+        0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd,
+        0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5,
+        0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd,
+        0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5,
+        0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd,
+        0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5,
+        0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed,
+        0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5,
+    }, {
+        /* Inverse MixColumns lookup table */
+        0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
+        0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
+        0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
+        0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
+        0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
+        0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
+        0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
+        0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
+        0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
+        0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
+        0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
+        0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
+        0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
+        0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
+        0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
+        0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
+        0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
+        0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
+        0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
+        0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
+        0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
+        0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
+        0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
+        0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
+        0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
+        0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
+        0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
+        0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
+        0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
+        0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
+        0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
+        0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
+        0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
+        0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
+        0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
+        0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
+        0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
+        0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
+        0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
+        0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
+        0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
+        0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
+        0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
+        0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
+        0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
+        0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
+        0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
+        0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
+        0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
+        0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
+        0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
+        0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
+        0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
+        0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
+        0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
+        0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
+        0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
+        0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
+        0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
+        0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
+        0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
+        0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
+        0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
+        0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d,
+    } };
+    union AES_STATE st = { .l = {
+        float64_val(env->vfp.regs[rm]),
+        float64_val(env->vfp.regs[rm + 1])
+    } };
+    int i;
+
+    assert(decrypt < 2);
+
+    for (i = 0; i < 16; i += 4) {
+        st.cols[i >> 2] = cpu_to_le32(
+            mc[decrypt][st.bytes[i]] ^
+            rol32(mc[decrypt][st.bytes[i + 1]], 8) ^
+            rol32(mc[decrypt][st.bytes[i + 2]], 16) ^
+            rol32(mc[decrypt][st.bytes[i + 3]], 24));
+    }
+
+    env->vfp.regs[rd] = make_float64(st.l[0]);
+    env->vfp.regs[rd + 1] = make_float64(st.l[1]);
+}
diff --git a/target-arm/gdbstub64.c b/target-arm/gdbstub64.c
index 7cb6a7c..e8a8295 100644
--- a/target-arm/gdbstub64.c
+++ b/target-arm/gdbstub64.c
@@ -37,7 +37,7 @@ int aarch64_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n)
         return gdb_get_reg64(mem_buf, env->pc);
         break;
     case 33:
-        return gdb_get_reg32(mem_buf, env->pstate);
+        return gdb_get_reg32(mem_buf, pstate_read(env));
     }
     /* Unknown register.  */
     return 0;
@@ -65,7 +65,7 @@ int aarch64_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n)
         return 8;
     case 33:
         /* CPSR */
-        env->pstate = tmp;
+        pstate_write(env, tmp);
         return 4;
     }
     /* Unknown register.  */
diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c
new file mode 100644
index 0000000..d3f7067
--- /dev/null
+++ b/target-arm/helper-a64.c
@@ -0,0 +1,79 @@
+/*
+ *  AArch64 specific helpers
+ *
+ *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cpu.h"
+#include "exec/gdbstub.h"
+#include "helper.h"
+#include "qemu/host-utils.h"
+#include "sysemu/sysemu.h"
+#include "qemu/bitops.h"
+
+/* C2.4.7 Multiply and divide */
+/* special cases for 0 and LLONG_MIN are mandated by the standard */
+uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
+{
+    if (den == 0) {
+        return 0;
+    }
+    return num / den;
+}
+
+int64_t HELPER(sdiv64)(int64_t num, int64_t den)
+{
+    if (den == 0) {
+        return 0;
+    }
+    if (num == LLONG_MIN && den == -1) {
+        return LLONG_MIN;
+    }
+    return num / den;
+}
+
+uint64_t HELPER(clz64)(uint64_t x)
+{
+    return clz64(x);
+}
+
+uint64_t HELPER(cls64)(uint64_t x)
+{
+    return clrsb64(x);
+}
+
+uint32_t HELPER(cls32)(uint32_t x)
+{
+    return clrsb32(x);
+}
+
+uint64_t HELPER(rbit64)(uint64_t x)
+{
+    /* assign the correct byte position */
+    x = bswap64(x);
+
+    /* assign the correct nibble position */
+    x = ((x & 0xf0f0f0f0f0f0f0f0ULL) >> 4)
+        | ((x & 0x0f0f0f0f0f0f0f0fULL) << 4);
+
+    /* assign the correct bit position */
+    x = ((x & 0x8888888888888888ULL) >> 3)
+        | ((x & 0x4444444444444444ULL) >> 1)
+        | ((x & 0x2222222222222222ULL) << 1)
+        | ((x & 0x1111111111111111ULL) << 3);
+
+    return x;
+}
diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h
new file mode 100644
index 0000000..a163a94
--- /dev/null
+++ b/target-arm/helper-a64.h
@@ -0,0 +1,24 @@
+/*
+ *  AArch64 specific helper definitions
+ *
+ *  Copyright (c) 2013 Alexander Graf <agraf@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+DEF_HELPER_FLAGS_2(udiv64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(sdiv64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
+DEF_HELPER_FLAGS_1(clz64, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(cls64, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(cls32, TCG_CALL_NO_RWG_SE, i32, i32)
+DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64)
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 5e5e5aa..6ebd7dc 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -65,6 +65,48 @@ static int vfp_gdb_set_reg(CPUARMState *env, uint8_t *buf, int reg)
     return 0;
 }
 
+static int aarch64_fpu_gdb_get_reg(CPUARMState *env, uint8_t *buf, int reg)
+{
+    switch (reg) {
+    case 0 ... 31:
+        /* 128 bit FP register */
+        stfq_le_p(buf, env->vfp.regs[reg * 2]);
+        stfq_le_p(buf + 8, env->vfp.regs[reg * 2 + 1]);
+        return 16;
+    case 32:
+        /* FPSR */
+        stl_p(buf, vfp_get_fpsr(env));
+        return 4;
+    case 33:
+        /* FPCR */
+        stl_p(buf, vfp_get_fpcr(env));
+        return 4;
+    default:
+        return 0;
+    }
+}
+
+static int aarch64_fpu_gdb_set_reg(CPUARMState *env, uint8_t *buf, int reg)
+{
+    switch (reg) {
+    case 0 ... 31:
+        /* 128 bit FP register */
+        env->vfp.regs[reg * 2] = ldfq_le_p(buf);
+        env->vfp.regs[reg * 2 + 1] = ldfq_le_p(buf + 8);
+        return 16;
+    case 32:
+        /* FPSR */
+        vfp_set_fpsr(env, ldl_p(buf));
+        return 4;
+    case 33:
+        /* FPCR */
+        vfp_set_fpcr(env, ldl_p(buf));
+        return 4;
+    default:
+        return 0;
+    }
+}
+
 static int raw_read(CPUARMState *env, const ARMCPRegInfo *ri,
                     uint64_t *value)
 {
@@ -1338,7 +1380,8 @@ static const ARMCPRegInfo dummy_c15_cp_reginfo[] = {
      */
     { .name = "C15_IMPDEF", .cp = 15, .crn = 15,
       .crm = CP_ANY, .opc1 = CP_ANY, .opc2 = CP_ANY,
-      .access = PL1_RW, .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE,
+      .access = PL1_RW,
+      .type = ARM_CP_CONST | ARM_CP_NO_MIGRATE | ARM_CP_OVERRIDE,
       .resetvalue = 0 },
     REGINFO_SENTINEL
 };
@@ -1744,6 +1787,15 @@ void register_cp_regs_for_features(ARMCPU *cpu)
         define_one_arm_cp_reg(cpu, &auxcr);
     }
 
+    if (arm_feature(env, ARM_FEATURE_CBAR)) {
+        ARMCPRegInfo cbar = {
+            .name = "CBAR", .cp = 15, .crn = 15, .crm = 0, .opc1 = 4, .opc2 = 0,
+            .access = PL1_R|PL3_W, .resetvalue = cpu->reset_cbar,
+            .fieldoffset = offsetof(CPUARMState, cp15.c15_config_base_address)
+        };
+        define_one_arm_cp_reg(cpu, &cbar);
+    }
+
     /* Generic registers whose values depend on the implementation */
     {
         ARMCPRegInfo sctlr = {
@@ -1785,7 +1837,11 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
     CPUState *cs = CPU(cpu);
     CPUARMState *env = &cpu->env;
 
-    if (arm_feature(env, ARM_FEATURE_NEON)) {
+    if (arm_feature(env, ARM_FEATURE_AARCH64)) {
+        gdb_register_coprocessor(cs, aarch64_fpu_gdb_get_reg,
+                                 aarch64_fpu_gdb_set_reg,
+                                 34, "aarch64-fpu.xml", 0);
+    } else if (arm_feature(env, ARM_FEATURE_NEON)) {
         gdb_register_coprocessor(cs, vfp_gdb_get_reg, vfp_gdb_set_reg,
                                  51, "arm-neon.xml", 0);
     } else if (arm_feature(env, ARM_FEATURE_VFP3)) {
diff --git a/target-arm/helper.h b/target-arm/helper.h
index d459a39..73d67dc 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -463,4 +463,11 @@ DEF_HELPER_3(neon_qzip8, void, env, i32, i32)
 DEF_HELPER_3(neon_qzip16, void, env, i32, i32)
 DEF_HELPER_3(neon_qzip32, void, env, i32, i32)
 
+DEF_HELPER_4(crypto_aese, void, env, i32, i32, i32)
+DEF_HELPER_4(crypto_aesmc, void, env, i32, i32, i32)
+
+#ifdef TARGET_AARCH64
+#include "helper-a64.h"
+#endif
+
 #include "exec/def-helper.h"
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index f865dac..1d2688d 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -100,120 +100,6 @@ void kvm_arm_destroy_scratch_host_vcpu(int *fdarray)
     }
 }
 
-static inline void set_feature(uint64_t *features, int feature)
-{
-    *features |= 1ULL << feature;
-}
-
-bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
-{
-    /* Identify the feature bits corresponding to the host CPU, and
-     * fill out the ARMHostCPUClass fields accordingly. To do this
-     * we have to create a scratch VM, create a single CPU inside it,
-     * and then query that CPU for the relevant ID registers.
-     */
-    int i, ret, fdarray[3];
-    uint32_t midr, id_pfr0, id_isar0, mvfr1;
-    uint64_t features = 0;
-    /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
-     * we know these will only support creating one kind of guest CPU,
-     * which is its preferred CPU type.
-     */
-    static const uint32_t cpus_to_try[] = {
-        QEMU_KVM_ARM_TARGET_CORTEX_A15,
-        QEMU_KVM_ARM_TARGET_NONE
-    };
-    struct kvm_vcpu_init init;
-    struct kvm_one_reg idregs[] = {
-        {
-            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
-            | ENCODE_CP_REG(15, 0, 0, 0, 0, 0),
-            .addr = (uintptr_t)&midr,
-        },
-        {
-            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
-            | ENCODE_CP_REG(15, 0, 0, 1, 0, 0),
-            .addr = (uintptr_t)&id_pfr0,
-        },
-        {
-            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
-            | ENCODE_CP_REG(15, 0, 0, 2, 0, 0),
-            .addr = (uintptr_t)&id_isar0,
-        },
-        {
-            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
-            | KVM_REG_ARM_VFP | KVM_REG_ARM_VFP_MVFR1,
-            .addr = (uintptr_t)&mvfr1,
-        },
-    };
-
-    if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
-        return false;
-    }
-
-    ahcc->target = init.target;
-
-    /* This is not strictly blessed by the device tree binding docs yet,
-     * but in practice the kernel does not care about this string so
-     * there is no point maintaining an KVM_ARM_TARGET_* -> string table.
-     */
-    ahcc->dtb_compatible = "arm,arm-v7";
-
-    for (i = 0; i < ARRAY_SIZE(idregs); i++) {
-        ret = ioctl(fdarray[2], KVM_GET_ONE_REG, &idregs[i]);
-        if (ret) {
-            break;
-        }
-    }
-
-    kvm_arm_destroy_scratch_host_vcpu(fdarray);
-
-    if (ret) {
-        return false;
-    }
-
-    /* Now we've retrieved all the register information we can
-     * set the feature bits based on the ID register fields.
-     * We can assume any KVM supporting CPU is at least a v7
-     * with VFPv3, LPAE and the generic timers; this in turn implies
-     * most of the other feature bits, but a few must be tested.
-     */
-    set_feature(&features, ARM_FEATURE_V7);
-    set_feature(&features, ARM_FEATURE_VFP3);
-    set_feature(&features, ARM_FEATURE_LPAE);
-    set_feature(&features, ARM_FEATURE_GENERIC_TIMER);
-
-    switch (extract32(id_isar0, 24, 4)) {
-    case 1:
-        set_feature(&features, ARM_FEATURE_THUMB_DIV);
-        break;
-    case 2:
-        set_feature(&features, ARM_FEATURE_ARM_DIV);
-        set_feature(&features, ARM_FEATURE_THUMB_DIV);
-        break;
-    default:
-        break;
-    }
-
-    if (extract32(id_pfr0, 12, 4) == 1) {
-        set_feature(&features, ARM_FEATURE_THUMB2EE);
-    }
-    if (extract32(mvfr1, 20, 4) == 1) {
-        set_feature(&features, ARM_FEATURE_VFP_FP16);
-    }
-    if (extract32(mvfr1, 12, 4) == 1) {
-        set_feature(&features, ARM_FEATURE_NEON);
-    }
-    if (extract32(mvfr1, 28, 4) == 1) {
-        /* FMAC support implies VFPv4 */
-        set_feature(&features, ARM_FEATURE_VFP4);
-    }
-
-    ahcc->features = features;
-
-    return true;
-}
-
 static void kvm_arm_host_cpu_class_init(ObjectClass *oc, void *data)
 {
     ARMHostCPUClass *ahcc = ARM_HOST_CPU_CLASS(oc);
@@ -242,7 +128,11 @@ static void kvm_arm_host_cpu_initfn(Object *obj)
 
 static const TypeInfo host_arm_cpu_type_info = {
     .name = TYPE_ARM_HOST_CPU,
+#ifdef TARGET_AARCH64
+    .parent = TYPE_AARCH64_CPU,
+#else
     .parent = TYPE_ARM_CPU,
+#endif
     .instance_init = kvm_arm_host_cpu_initfn,
     .class_init = kvm_arm_host_cpu_class_init,
     .class_size = sizeof(ARMHostCPUClass),
@@ -265,144 +155,6 @@ unsigned long kvm_arch_vcpu_id(CPUState *cpu)
     return cpu->cpu_index;
 }
 
-static bool reg_syncs_via_tuple_list(uint64_t regidx)
-{
-    /* Return true if the regidx is a register we should synchronize
-     * via the cpreg_tuples array (ie is not a core reg we sync by
-     * hand in kvm_arch_get/put_registers())
-     */
-    switch (regidx & KVM_REG_ARM_COPROC_MASK) {
-    case KVM_REG_ARM_CORE:
-    case KVM_REG_ARM_VFP:
-        return false;
-    default:
-        return true;
-    }
-}
-
-static int compare_u64(const void *a, const void *b)
-{
-    if (*(uint64_t *)a > *(uint64_t *)b) {
-        return 1;
-    }
-    if (*(uint64_t *)a < *(uint64_t *)b) {
-        return -1;
-    }
-    return 0;
-}
-
-int kvm_arch_init_vcpu(CPUState *cs)
-{
-    struct kvm_vcpu_init init;
-    int i, ret, arraylen;
-    uint64_t v;
-    struct kvm_one_reg r;
-    struct kvm_reg_list rl;
-    struct kvm_reg_list *rlp;
-    ARMCPU *cpu = ARM_CPU(cs);
-
-    if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE) {
-        fprintf(stderr, "KVM is not supported for this guest CPU type\n");
-        return -EINVAL;
-    }
-
-    init.target = cpu->kvm_target;
-    memset(init.features, 0, sizeof(init.features));
-    if (cpu->start_powered_off) {
-        init.features[0] = 1 << KVM_ARM_VCPU_POWER_OFF;
-    }
-    ret = kvm_vcpu_ioctl(cs, KVM_ARM_VCPU_INIT, &init);
-    if (ret) {
-        return ret;
-    }
-    /* Query the kernel to make sure it supports 32 VFP
-     * registers: QEMU's "cortex-a15" CPU is always a
-     * VFP-D32 core. The simplest way to do this is just
-     * to attempt to read register d31.
-     */
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP | 31;
-    r.addr = (uintptr_t)(&v);
-    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
-    if (ret == -ENOENT) {
-        return -EINVAL;
-    }
-
-    /* Populate the cpreg list based on the kernel's idea
-     * of what registers exist (and throw away the TCG-created list).
-     */
-    rl.n = 0;
-    ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, &rl);
-    if (ret != -E2BIG) {
-        return ret;
-    }
-    rlp = g_malloc(sizeof(struct kvm_reg_list) + rl.n * sizeof(uint64_t));
-    rlp->n = rl.n;
-    ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, rlp);
-    if (ret) {
-        goto out;
-    }
-    /* Sort the list we get back from the kernel, since cpreg_tuples
-     * must be in strictly ascending order.
-     */
-    qsort(&rlp->reg, rlp->n, sizeof(rlp->reg[0]), compare_u64);
-
-    for (i = 0, arraylen = 0; i < rlp->n; i++) {
-        if (!reg_syncs_via_tuple_list(rlp->reg[i])) {
-            continue;
-        }
-        switch (rlp->reg[i] & KVM_REG_SIZE_MASK) {
-        case KVM_REG_SIZE_U32:
-        case KVM_REG_SIZE_U64:
-            break;
-        default:
-            fprintf(stderr, "Can't handle size of register in kernel list\n");
-            ret = -EINVAL;
-            goto out;
-        }
-
-        arraylen++;
-    }
-
-    cpu->cpreg_indexes = g_renew(uint64_t, cpu->cpreg_indexes, arraylen);
-    cpu->cpreg_values = g_renew(uint64_t, cpu->cpreg_values, arraylen);
-    cpu->cpreg_vmstate_indexes = g_renew(uint64_t, cpu->cpreg_vmstate_indexes,
-                                         arraylen);
-    cpu->cpreg_vmstate_values = g_renew(uint64_t, cpu->cpreg_vmstate_values,
-                                        arraylen);
-    cpu->cpreg_array_len = arraylen;
-    cpu->cpreg_vmstate_array_len = arraylen;
-
-    for (i = 0, arraylen = 0; i < rlp->n; i++) {
-        uint64_t regidx = rlp->reg[i];
-        if (!reg_syncs_via_tuple_list(regidx)) {
-            continue;
-        }
-        cpu->cpreg_indexes[arraylen] = regidx;
-        arraylen++;
-    }
-    assert(cpu->cpreg_array_len == arraylen);
-
-    if (!write_kvmstate_to_list(cpu)) {
-        /* Shouldn't happen unless kernel is inconsistent about
-         * what registers exist.
-         */
-        fprintf(stderr, "Initial read of kernel register state failed\n");
-        ret = -EINVAL;
-        goto out;
-    }
-
-    /* Save a copy of the initial register values so that we can
-     * feed it back to the kernel on VCPU reset.
-     */
-    cpu->cpreg_reset_values = g_memdup(cpu->cpreg_values,
-                                       cpu->cpreg_array_len *
-                                       sizeof(cpu->cpreg_values[0]));
-
-out:
-    g_free(rlp);
-    return ret;
-}
-
 /* We track all the KVM devices which need their memory addresses
  * passing to the kernel in a list of these structures.
  * When board init is complete we run through the list and
@@ -563,232 +315,6 @@ bool write_list_to_kvmstate(ARMCPU *cpu)
     return ok;
 }
 
-typedef struct Reg {
-    uint64_t id;
-    int offset;
-} Reg;
-
-#define COREREG(KERNELNAME, QEMUFIELD)                       \
-    {                                                        \
-        KVM_REG_ARM | KVM_REG_SIZE_U32 |                     \
-        KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(KERNELNAME), \
-        offsetof(CPUARMState, QEMUFIELD)                     \
-    }
-
-#define VFPSYSREG(R)                                       \
-    {                                                      \
-        KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | \
-        KVM_REG_ARM_VFP_##R,                               \
-        offsetof(CPUARMState, vfp.xregs[ARM_VFP_##R])      \
-    }
-
-static const Reg regs[] = {
-    /* R0_usr .. R14_usr */
-    COREREG(usr_regs.uregs[0], regs[0]),
-    COREREG(usr_regs.uregs[1], regs[1]),
-    COREREG(usr_regs.uregs[2], regs[2]),
-    COREREG(usr_regs.uregs[3], regs[3]),
-    COREREG(usr_regs.uregs[4], regs[4]),
-    COREREG(usr_regs.uregs[5], regs[5]),
-    COREREG(usr_regs.uregs[6], regs[6]),
-    COREREG(usr_regs.uregs[7], regs[7]),
-    COREREG(usr_regs.uregs[8], usr_regs[0]),
-    COREREG(usr_regs.uregs[9], usr_regs[1]),
-    COREREG(usr_regs.uregs[10], usr_regs[2]),
-    COREREG(usr_regs.uregs[11], usr_regs[3]),
-    COREREG(usr_regs.uregs[12], usr_regs[4]),
-    COREREG(usr_regs.uregs[13], banked_r13[0]),
-    COREREG(usr_regs.uregs[14], banked_r14[0]),
-    /* R13, R14, SPSR for SVC, ABT, UND, IRQ banks */
-    COREREG(svc_regs[0], banked_r13[1]),
-    COREREG(svc_regs[1], banked_r14[1]),
-    COREREG(svc_regs[2], banked_spsr[1]),
-    COREREG(abt_regs[0], banked_r13[2]),
-    COREREG(abt_regs[1], banked_r14[2]),
-    COREREG(abt_regs[2], banked_spsr[2]),
-    COREREG(und_regs[0], banked_r13[3]),
-    COREREG(und_regs[1], banked_r14[3]),
-    COREREG(und_regs[2], banked_spsr[3]),
-    COREREG(irq_regs[0], banked_r13[4]),
-    COREREG(irq_regs[1], banked_r14[4]),
-    COREREG(irq_regs[2], banked_spsr[4]),
-    /* R8_fiq .. R14_fiq and SPSR_fiq */
-    COREREG(fiq_regs[0], fiq_regs[0]),
-    COREREG(fiq_regs[1], fiq_regs[1]),
-    COREREG(fiq_regs[2], fiq_regs[2]),
-    COREREG(fiq_regs[3], fiq_regs[3]),
-    COREREG(fiq_regs[4], fiq_regs[4]),
-    COREREG(fiq_regs[5], banked_r13[5]),
-    COREREG(fiq_regs[6], banked_r14[5]),
-    COREREG(fiq_regs[7], banked_spsr[5]),
-    /* R15 */
-    COREREG(usr_regs.uregs[15], regs[15]),
-    /* VFP system registers */
-    VFPSYSREG(FPSID),
-    VFPSYSREG(MVFR1),
-    VFPSYSREG(MVFR0),
-    VFPSYSREG(FPEXC),
-    VFPSYSREG(FPINST),
-    VFPSYSREG(FPINST2),
-};
-
-int kvm_arch_put_registers(CPUState *cs, int level)
-{
-    ARMCPU *cpu = ARM_CPU(cs);
-    CPUARMState *env = &cpu->env;
-    struct kvm_one_reg r;
-    int mode, bn;
-    int ret, i;
-    uint32_t cpsr, fpscr;
-
-    /* Make sure the banked regs are properly set */
-    mode = env->uncached_cpsr & CPSR_M;
-    bn = bank_number(mode);
-    if (mode == ARM_CPU_MODE_FIQ) {
-        memcpy(env->fiq_regs, env->regs + 8, 5 * sizeof(uint32_t));
-    } else {
-        memcpy(env->usr_regs, env->regs + 8, 5 * sizeof(uint32_t));
-    }
-    env->banked_r13[bn] = env->regs[13];
-    env->banked_r14[bn] = env->regs[14];
-    env->banked_spsr[bn] = env->spsr;
-
-    /* Now we can safely copy stuff down to the kernel */
-    for (i = 0; i < ARRAY_SIZE(regs); i++) {
-        r.id = regs[i].id;
-        r.addr = (uintptr_t)(env) + regs[i].offset;
-        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
-        if (ret) {
-            return ret;
-        }
-    }
-
-    /* Special cases which aren't a single CPUARMState field */
-    cpsr = cpsr_read(env);
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 |
-        KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(usr_regs.ARM_cpsr);
-    r.addr = (uintptr_t)(&cpsr);
-    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
-    if (ret) {
-        return ret;
-    }
-
-    /* VFP registers */
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
-    for (i = 0; i < 32; i++) {
-        r.addr = (uintptr_t)(&env->vfp.regs[i]);
-        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
-        if (ret) {
-            return ret;
-        }
-        r.id++;
-    }
-
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
-        KVM_REG_ARM_VFP_FPSCR;
-    fpscr = vfp_get_fpscr(env);
-    r.addr = (uintptr_t)&fpscr;
-    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
-    if (ret) {
-        return ret;
-    }
-
-    /* Note that we do not call write_cpustate_to_list()
-     * here, so we are only writing the tuple list back to
-     * KVM. This is safe because nothing can change the
-     * CPUARMState cp15 fields (in particular gdb accesses cannot)
-     * and so there are no changes to sync. In fact syncing would
-     * be wrong at this point: for a constant register where TCG and
-     * KVM disagree about its value, the preceding write_list_to_cpustate()
-     * would not have had any effect on the CPUARMState value (since the
-     * register is read-only), and a write_cpustate_to_list() here would
-     * then try to write the TCG value back into KVM -- this would either
-     * fail or incorrectly change the value the guest sees.
-     *
-     * If we ever want to allow the user to modify cp15 registers via
-     * the gdb stub, we would need to be more clever here (for instance
-     * tracking the set of registers kvm_arch_get_registers() successfully
-     * managed to update the CPUARMState with, and only allowing those
-     * to be written back up into the kernel).
-     */
-    if (!write_list_to_kvmstate(cpu)) {
-        return EINVAL;
-    }
-
-    return ret;
-}
-
-int kvm_arch_get_registers(CPUState *cs)
-{
-    ARMCPU *cpu = ARM_CPU(cs);
-    CPUARMState *env = &cpu->env;
-    struct kvm_one_reg r;
-    int mode, bn;
-    int ret, i;
-    uint32_t cpsr, fpscr;
-
-    for (i = 0; i < ARRAY_SIZE(regs); i++) {
-        r.id = regs[i].id;
-        r.addr = (uintptr_t)(env) + regs[i].offset;
-        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
-        if (ret) {
-            return ret;
-        }
-    }
-
-    /* Special cases which aren't a single CPUARMState field */
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 |
-        KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(usr_regs.ARM_cpsr);
-    r.addr = (uintptr_t)(&cpsr);
-    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
-    if (ret) {
-        return ret;
-    }
-    cpsr_write(env, cpsr, 0xffffffff);
-
-    /* Make sure the current mode regs are properly set */
-    mode = env->uncached_cpsr & CPSR_M;
-    bn = bank_number(mode);
-    if (mode == ARM_CPU_MODE_FIQ) {
-        memcpy(env->regs + 8, env->fiq_regs, 5 * sizeof(uint32_t));
-    } else {
-        memcpy(env->regs + 8, env->usr_regs, 5 * sizeof(uint32_t));
-    }
-    env->regs[13] = env->banked_r13[bn];
-    env->regs[14] = env->banked_r14[bn];
-    env->spsr = env->banked_spsr[bn];
-
-    /* VFP registers */
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
-    for (i = 0; i < 32; i++) {
-        r.addr = (uintptr_t)(&env->vfp.regs[i]);
-        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
-        if (ret) {
-            return ret;
-        }
-        r.id++;
-    }
-
-    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
-        KVM_REG_ARM_VFP_FPSCR;
-    r.addr = (uintptr_t)&fpscr;
-    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
-    if (ret) {
-        return ret;
-    }
-    vfp_set_fpscr(env, fpscr);
-
-    if (!write_kvmstate_to_list(cpu)) {
-        return EINVAL;
-    }
-    /* Note that it's OK to have registers which aren't in CPUState,
-     * so we can ignore a failure return here.
-     */
-    write_list_to_cpustate(cpu);
-
-    return 0;
-}
-
 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 {
 }
@@ -802,19 +328,6 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
     return 0;
 }
 
-void kvm_arch_reset_vcpu(CPUState *cs)
-{
-    /* Feed the kernel back its initial register state */
-    ARMCPU *cpu = ARM_CPU(cs);
-
-    memmove(cpu->cpreg_values, cpu->cpreg_reset_values,
-            cpu->cpreg_array_len * sizeof(cpu->cpreg_values[0]));
-
-    if (!write_list_to_kvmstate(cpu)) {
-        abort();
-    }
-}
-
 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
 {
     return true;
diff --git a/target-arm/kvm32.c b/target-arm/kvm32.c
new file mode 100644
index 0000000..a4fde07
--- /dev/null
+++ b/target-arm/kvm32.c
@@ -0,0 +1,515 @@
+/*
+ * ARM implementation of KVM hooks, 32 bit specific code.
+ *
+ * Copyright Christoffer Dall 2009-2010
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/kvm.h>
+
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "kvm_arm.h"
+#include "cpu.h"
+#include "hw/arm/arm.h"
+
+static inline void set_feature(uint64_t *features, int feature)
+{
+    *features |= 1ULL << feature;
+}
+
+bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
+{
+    /* Identify the feature bits corresponding to the host CPU, and
+     * fill out the ARMHostCPUClass fields accordingly. To do this
+     * we have to create a scratch VM, create a single CPU inside it,
+     * and then query that CPU for the relevant ID registers.
+     */
+    int i, ret, fdarray[3];
+    uint32_t midr, id_pfr0, id_isar0, mvfr1;
+    uint64_t features = 0;
+    /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
+     * we know these will only support creating one kind of guest CPU,
+     * which is its preferred CPU type.
+     */
+    static const uint32_t cpus_to_try[] = {
+        QEMU_KVM_ARM_TARGET_CORTEX_A15,
+        QEMU_KVM_ARM_TARGET_NONE
+    };
+    struct kvm_vcpu_init init;
+    struct kvm_one_reg idregs[] = {
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | ENCODE_CP_REG(15, 0, 0, 0, 0, 0),
+            .addr = (uintptr_t)&midr,
+        },
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | ENCODE_CP_REG(15, 0, 0, 1, 0, 0),
+            .addr = (uintptr_t)&id_pfr0,
+        },
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | ENCODE_CP_REG(15, 0, 0, 2, 0, 0),
+            .addr = (uintptr_t)&id_isar0,
+        },
+        {
+            .id = KVM_REG_ARM | KVM_REG_SIZE_U32
+            | KVM_REG_ARM_VFP | KVM_REG_ARM_VFP_MVFR1,
+            .addr = (uintptr_t)&mvfr1,
+        },
+    };
+
+    if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
+        return false;
+    }
+
+    ahcc->target = init.target;
+
+    /* This is not strictly blessed by the device tree binding docs yet,
+     * but in practice the kernel does not care about this string so
+     * there is no point maintaining an KVM_ARM_TARGET_* -> string table.
+     */
+    ahcc->dtb_compatible = "arm,arm-v7";
+
+    for (i = 0; i < ARRAY_SIZE(idregs); i++) {
+        ret = ioctl(fdarray[2], KVM_GET_ONE_REG, &idregs[i]);
+        if (ret) {
+            break;
+        }
+    }
+
+    kvm_arm_destroy_scratch_host_vcpu(fdarray);
+
+    if (ret) {
+        return false;
+    }
+
+    /* Now we've retrieved all the register information we can
+     * set the feature bits based on the ID register fields.
+     * We can assume any KVM supporting CPU is at least a v7
+     * with VFPv3, LPAE and the generic timers; this in turn implies
+     * most of the other feature bits, but a few must be tested.
+     */
+    set_feature(&features, ARM_FEATURE_V7);
+    set_feature(&features, ARM_FEATURE_VFP3);
+    set_feature(&features, ARM_FEATURE_LPAE);
+    set_feature(&features, ARM_FEATURE_GENERIC_TIMER);
+
+    switch (extract32(id_isar0, 24, 4)) {
+    case 1:
+        set_feature(&features, ARM_FEATURE_THUMB_DIV);
+        break;
+    case 2:
+        set_feature(&features, ARM_FEATURE_ARM_DIV);
+        set_feature(&features, ARM_FEATURE_THUMB_DIV);
+        break;
+    default:
+        break;
+    }
+
+    if (extract32(id_pfr0, 12, 4) == 1) {
+        set_feature(&features, ARM_FEATURE_THUMB2EE);
+    }
+    if (extract32(mvfr1, 20, 4) == 1) {
+        set_feature(&features, ARM_FEATURE_VFP_FP16);
+    }
+    if (extract32(mvfr1, 12, 4) == 1) {
+        set_feature(&features, ARM_FEATURE_NEON);
+    }
+    if (extract32(mvfr1, 28, 4) == 1) {
+        /* FMAC support implies VFPv4 */
+        set_feature(&features, ARM_FEATURE_VFP4);
+    }
+
+    ahcc->features = features;
+
+    return true;
+}
+
+static bool reg_syncs_via_tuple_list(uint64_t regidx)
+{
+    /* Return true if the regidx is a register we should synchronize
+     * via the cpreg_tuples array (ie is not a core reg we sync by
+     * hand in kvm_arch_get/put_registers())
+     */
+    switch (regidx & KVM_REG_ARM_COPROC_MASK) {
+    case KVM_REG_ARM_CORE:
+    case KVM_REG_ARM_VFP:
+        return false;
+    default:
+        return true;
+    }
+}
+
+static int compare_u64(const void *a, const void *b)
+{
+    if (*(uint64_t *)a > *(uint64_t *)b) {
+        return 1;
+    }
+    if (*(uint64_t *)a < *(uint64_t *)b) {
+        return -1;
+    }
+    return 0;
+}
+
+int kvm_arch_init_vcpu(CPUState *cs)
+{
+    struct kvm_vcpu_init init;
+    int i, ret, arraylen;
+    uint64_t v;
+    struct kvm_one_reg r;
+    struct kvm_reg_list rl;
+    struct kvm_reg_list *rlp;
+    ARMCPU *cpu = ARM_CPU(cs);
+
+    if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE) {
+        fprintf(stderr, "KVM is not supported for this guest CPU type\n");
+        return -EINVAL;
+    }
+
+    init.target = cpu->kvm_target;
+    memset(init.features, 0, sizeof(init.features));
+    if (cpu->start_powered_off) {
+        init.features[0] = 1 << KVM_ARM_VCPU_POWER_OFF;
+    }
+    ret = kvm_vcpu_ioctl(cs, KVM_ARM_VCPU_INIT, &init);
+    if (ret) {
+        return ret;
+    }
+    /* Query the kernel to make sure it supports 32 VFP
+     * registers: QEMU's "cortex-a15" CPU is always a
+     * VFP-D32 core. The simplest way to do this is just
+     * to attempt to read register d31.
+     */
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP | 31;
+    r.addr = (uintptr_t)(&v);
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
+    if (ret == -ENOENT) {
+        return -EINVAL;
+    }
+
+    /* Populate the cpreg list based on the kernel's idea
+     * of what registers exist (and throw away the TCG-created list).
+     */
+    rl.n = 0;
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, &rl);
+    if (ret != -E2BIG) {
+        return ret;
+    }
+    rlp = g_malloc(sizeof(struct kvm_reg_list) + rl.n * sizeof(uint64_t));
+    rlp->n = rl.n;
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_REG_LIST, rlp);
+    if (ret) {
+        goto out;
+    }
+    /* Sort the list we get back from the kernel, since cpreg_tuples
+     * must be in strictly ascending order.
+     */
+    qsort(&rlp->reg, rlp->n, sizeof(rlp->reg[0]), compare_u64);
+
+    for (i = 0, arraylen = 0; i < rlp->n; i++) {
+        if (!reg_syncs_via_tuple_list(rlp->reg[i])) {
+            continue;
+        }
+        switch (rlp->reg[i] & KVM_REG_SIZE_MASK) {
+        case KVM_REG_SIZE_U32:
+        case KVM_REG_SIZE_U64:
+            break;
+        default:
+            fprintf(stderr, "Can't handle size of register in kernel list\n");
+            ret = -EINVAL;
+            goto out;
+        }
+
+        arraylen++;
+    }
+
+    cpu->cpreg_indexes = g_renew(uint64_t, cpu->cpreg_indexes, arraylen);
+    cpu->cpreg_values = g_renew(uint64_t, cpu->cpreg_values, arraylen);
+    cpu->cpreg_vmstate_indexes = g_renew(uint64_t, cpu->cpreg_vmstate_indexes,
+                                         arraylen);
+    cpu->cpreg_vmstate_values = g_renew(uint64_t, cpu->cpreg_vmstate_values,
+                                        arraylen);
+    cpu->cpreg_array_len = arraylen;
+    cpu->cpreg_vmstate_array_len = arraylen;
+
+    for (i = 0, arraylen = 0; i < rlp->n; i++) {
+        uint64_t regidx = rlp->reg[i];
+        if (!reg_syncs_via_tuple_list(regidx)) {
+            continue;
+        }
+        cpu->cpreg_indexes[arraylen] = regidx;
+        arraylen++;
+    }
+    assert(cpu->cpreg_array_len == arraylen);
+
+    if (!write_kvmstate_to_list(cpu)) {
+        /* Shouldn't happen unless kernel is inconsistent about
+         * what registers exist.
+         */
+        fprintf(stderr, "Initial read of kernel register state failed\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* Save a copy of the initial register values so that we can
+     * feed it back to the kernel on VCPU reset.
+     */
+    cpu->cpreg_reset_values = g_memdup(cpu->cpreg_values,
+                                       cpu->cpreg_array_len *
+                                       sizeof(cpu->cpreg_values[0]));
+
+out:
+    g_free(rlp);
+    return ret;
+}
+
+typedef struct Reg {
+    uint64_t id;
+    int offset;
+} Reg;
+
+#define COREREG(KERNELNAME, QEMUFIELD)                       \
+    {                                                        \
+        KVM_REG_ARM | KVM_REG_SIZE_U32 |                     \
+        KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(KERNELNAME), \
+        offsetof(CPUARMState, QEMUFIELD)                     \
+    }
+
+#define VFPSYSREG(R)                                       \
+    {                                                      \
+        KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | \
+        KVM_REG_ARM_VFP_##R,                               \
+        offsetof(CPUARMState, vfp.xregs[ARM_VFP_##R])      \
+    }
+
+static const Reg regs[] = {
+    /* R0_usr .. R14_usr */
+    COREREG(usr_regs.uregs[0], regs[0]),
+    COREREG(usr_regs.uregs[1], regs[1]),
+    COREREG(usr_regs.uregs[2], regs[2]),
+    COREREG(usr_regs.uregs[3], regs[3]),
+    COREREG(usr_regs.uregs[4], regs[4]),
+    COREREG(usr_regs.uregs[5], regs[5]),
+    COREREG(usr_regs.uregs[6], regs[6]),
+    COREREG(usr_regs.uregs[7], regs[7]),
+    COREREG(usr_regs.uregs[8], usr_regs[0]),
+    COREREG(usr_regs.uregs[9], usr_regs[1]),
+    COREREG(usr_regs.uregs[10], usr_regs[2]),
+    COREREG(usr_regs.uregs[11], usr_regs[3]),
+    COREREG(usr_regs.uregs[12], usr_regs[4]),
+    COREREG(usr_regs.uregs[13], banked_r13[0]),
+    COREREG(usr_regs.uregs[14], banked_r14[0]),
+    /* R13, R14, SPSR for SVC, ABT, UND, IRQ banks */
+    COREREG(svc_regs[0], banked_r13[1]),
+    COREREG(svc_regs[1], banked_r14[1]),
+    COREREG(svc_regs[2], banked_spsr[1]),
+    COREREG(abt_regs[0], banked_r13[2]),
+    COREREG(abt_regs[1], banked_r14[2]),
+    COREREG(abt_regs[2], banked_spsr[2]),
+    COREREG(und_regs[0], banked_r13[3]),
+    COREREG(und_regs[1], banked_r14[3]),
+    COREREG(und_regs[2], banked_spsr[3]),
+    COREREG(irq_regs[0], banked_r13[4]),
+    COREREG(irq_regs[1], banked_r14[4]),
+    COREREG(irq_regs[2], banked_spsr[4]),
+    /* R8_fiq .. R14_fiq and SPSR_fiq */
+    COREREG(fiq_regs[0], fiq_regs[0]),
+    COREREG(fiq_regs[1], fiq_regs[1]),
+    COREREG(fiq_regs[2], fiq_regs[2]),
+    COREREG(fiq_regs[3], fiq_regs[3]),
+    COREREG(fiq_regs[4], fiq_regs[4]),
+    COREREG(fiq_regs[5], banked_r13[5]),
+    COREREG(fiq_regs[6], banked_r14[5]),
+    COREREG(fiq_regs[7], banked_spsr[5]),
+    /* R15 */
+    COREREG(usr_regs.uregs[15], regs[15]),
+    /* VFP system registers */
+    VFPSYSREG(FPSID),
+    VFPSYSREG(MVFR1),
+    VFPSYSREG(MVFR0),
+    VFPSYSREG(FPEXC),
+    VFPSYSREG(FPINST),
+    VFPSYSREG(FPINST2),
+};
+
+int kvm_arch_put_registers(CPUState *cs, int level)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+    struct kvm_one_reg r;
+    int mode, bn;
+    int ret, i;
+    uint32_t cpsr, fpscr;
+
+    /* Make sure the banked regs are properly set */
+    mode = env->uncached_cpsr & CPSR_M;
+    bn = bank_number(mode);
+    if (mode == ARM_CPU_MODE_FIQ) {
+        memcpy(env->fiq_regs, env->regs + 8, 5 * sizeof(uint32_t));
+    } else {
+        memcpy(env->usr_regs, env->regs + 8, 5 * sizeof(uint32_t));
+    }
+    env->banked_r13[bn] = env->regs[13];
+    env->banked_r14[bn] = env->regs[14];
+    env->banked_spsr[bn] = env->spsr;
+
+    /* Now we can safely copy stuff down to the kernel */
+    for (i = 0; i < ARRAY_SIZE(regs); i++) {
+        r.id = regs[i].id;
+        r.addr = (uintptr_t)(env) + regs[i].offset;
+        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    /* Special cases which aren't a single CPUARMState field */
+    cpsr = cpsr_read(env);
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 |
+        KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(usr_regs.ARM_cpsr);
+    r.addr = (uintptr_t)(&cpsr);
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
+    if (ret) {
+        return ret;
+    }
+
+    /* VFP registers */
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
+    for (i = 0; i < 32; i++) {
+        r.addr = (uintptr_t)(&env->vfp.regs[i]);
+        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
+        if (ret) {
+            return ret;
+        }
+        r.id++;
+    }
+
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
+        KVM_REG_ARM_VFP_FPSCR;
+    fpscr = vfp_get_fpscr(env);
+    r.addr = (uintptr_t)&fpscr;
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &r);
+    if (ret) {
+        return ret;
+    }
+
+    /* Note that we do not call write_cpustate_to_list()
+     * here, so we are only writing the tuple list back to
+     * KVM. This is safe because nothing can change the
+     * CPUARMState cp15 fields (in particular gdb accesses cannot)
+     * and so there are no changes to sync. In fact syncing would
+     * be wrong at this point: for a constant register where TCG and
+     * KVM disagree about its value, the preceding write_list_to_cpustate()
+     * would not have had any effect on the CPUARMState value (since the
+     * register is read-only), and a write_cpustate_to_list() here would
+     * then try to write the TCG value back into KVM -- this would either
+     * fail or incorrectly change the value the guest sees.
+     *
+     * If we ever want to allow the user to modify cp15 registers via
+     * the gdb stub, we would need to be more clever here (for instance
+     * tracking the set of registers kvm_arch_get_registers() successfully
+     * managed to update the CPUARMState with, and only allowing those
+     * to be written back up into the kernel).
+     */
+    if (!write_list_to_kvmstate(cpu)) {
+        return EINVAL;
+    }
+
+    return ret;
+}
+
+int kvm_arch_get_registers(CPUState *cs)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+    struct kvm_one_reg r;
+    int mode, bn;
+    int ret, i;
+    uint32_t cpsr, fpscr;
+
+    for (i = 0; i < ARRAY_SIZE(regs); i++) {
+        r.id = regs[i].id;
+        r.addr = (uintptr_t)(env) + regs[i].offset;
+        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    /* Special cases which aren't a single CPUARMState field */
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 |
+        KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(usr_regs.ARM_cpsr);
+    r.addr = (uintptr_t)(&cpsr);
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
+    if (ret) {
+        return ret;
+    }
+    cpsr_write(env, cpsr, 0xffffffff);
+
+    /* Make sure the current mode regs are properly set */
+    mode = env->uncached_cpsr & CPSR_M;
+    bn = bank_number(mode);
+    if (mode == ARM_CPU_MODE_FIQ) {
+        memcpy(env->regs + 8, env->fiq_regs, 5 * sizeof(uint32_t));
+    } else {
+        memcpy(env->regs + 8, env->usr_regs, 5 * sizeof(uint32_t));
+    }
+    env->regs[13] = env->banked_r13[bn];
+    env->regs[14] = env->banked_r14[bn];
+    env->spsr = env->banked_spsr[bn];
+
+    /* VFP registers */
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
+    for (i = 0; i < 32; i++) {
+        r.addr = (uintptr_t)(&env->vfp.regs[i]);
+        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
+        if (ret) {
+            return ret;
+        }
+        r.id++;
+    }
+
+    r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
+        KVM_REG_ARM_VFP_FPSCR;
+    r.addr = (uintptr_t)&fpscr;
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &r);
+    if (ret) {
+        return ret;
+    }
+    vfp_set_fpscr(env, fpscr);
+
+    if (!write_kvmstate_to_list(cpu)) {
+        return EINVAL;
+    }
+    /* Note that it's OK to have registers which aren't in CPUState,
+     * so we can ignore a failure return here.
+     */
+    write_list_to_cpustate(cpu);
+
+    return 0;
+}
+
+void kvm_arch_reset_vcpu(CPUState *cs)
+{
+    /* Feed the kernel back its initial register state */
+    ARMCPU *cpu = ARM_CPU(cs);
+
+    memmove(cpu->cpreg_values, cpu->cpreg_reset_values,
+            cpu->cpreg_array_len * sizeof(cpu->cpreg_values[0]));
+
+    if (!write_list_to_kvmstate(cpu)) {
+        abort();
+    }
+}
diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
new file mode 100644
index 0000000..1b7ca90
--- /dev/null
+++ b/target-arm/kvm64.c
@@ -0,0 +1,204 @@
+/*
+ * ARM implementation of KVM hooks, 64 bit specific code
+ *
+ * Copyright Mian-M. Hamayun 2013, Virtual Open Systems
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/kvm.h>
+
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "kvm_arm.h"
+#include "cpu.h"
+#include "hw/arm/arm.h"
+
+static inline void set_feature(uint64_t *features, int feature)
+{
+    *features |= 1ULL << feature;
+}
+
+bool kvm_arm_get_host_cpu_features(ARMHostCPUClass *ahcc)
+{
+    /* Identify the feature bits corresponding to the host CPU, and
+     * fill out the ARMHostCPUClass fields accordingly. To do this
+     * we have to create a scratch VM, create a single CPU inside it,
+     * and then query that CPU for the relevant ID registers.
+     * For AArch64 we currently don't care about ID registers at
+     * all; we just want to know the CPU type.
+     */
+    int fdarray[3];
+    uint64_t features = 0;
+    /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
+     * we know these will only support creating one kind of guest CPU,
+     * which is its preferred CPU type. Fortunately these old kernels
+     * support only a very limited number of CPUs.
+     */
+    static const uint32_t cpus_to_try[] = {
+        KVM_ARM_TARGET_AEM_V8,
+        KVM_ARM_TARGET_FOUNDATION_V8,
+        KVM_ARM_TARGET_CORTEX_A57,
+        QEMU_KVM_ARM_TARGET_NONE
+    };
+    struct kvm_vcpu_init init;
+
+    if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
+        return false;
+    }
+
+    ahcc->target = init.target;
+    ahcc->dtb_compatible = "arm,arm-v8";
+
+    kvm_arm_destroy_scratch_host_vcpu(fdarray);
+
+   /* We can assume any KVM supporting CPU is at least a v8
+     * with VFPv4+Neon; this in turn implies most of the other
+     * feature bits.
+     */
+    set_feature(&features, ARM_FEATURE_V8);
+    set_feature(&features, ARM_FEATURE_VFP4);
+    set_feature(&features, ARM_FEATURE_NEON);
+    set_feature(&features, ARM_FEATURE_AARCH64);
+
+    ahcc->features = features;
+
+    return true;
+}
+
+int kvm_arch_init_vcpu(CPUState *cs)
+{
+    ARMCPU *cpu = ARM_CPU(cs);
+    struct kvm_vcpu_init init;
+    int ret;
+
+    if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE ||
+        !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
+        fprintf(stderr, "KVM is not supported for this guest CPU type\n");
+        return -EINVAL;
+    }
+
+    init.target = cpu->kvm_target;
+    memset(init.features, 0, sizeof(init.features));
+    if (cpu->start_powered_off) {
+        init.features[0] = 1 << KVM_ARM_VCPU_POWER_OFF;
+    }
+    ret = kvm_vcpu_ioctl(cs, KVM_ARM_VCPU_INIT, &init);
+
+    /* TODO : support for save/restore/reset of system regs via tuple list */
+
+    return ret;
+}
+
+#define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+                 KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
+
+int kvm_arch_put_registers(CPUState *cs, int level)
+{
+    struct kvm_one_reg reg;
+    uint64_t val;
+    int i;
+    int ret;
+
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+
+    for (i = 0; i < 31; i++) {
+        reg.id = AARCH64_CORE_REG(regs.regs[i]);
+        reg.addr = (uintptr_t) &env->xregs[i];
+        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    reg.id = AARCH64_CORE_REG(regs.sp);
+    reg.addr = (uintptr_t) &env->xregs[31];
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    /* Note that KVM thinks pstate is 64 bit but we use a uint32_t */
+    val = pstate_read(env);
+    reg.id = AARCH64_CORE_REG(regs.pstate);
+    reg.addr = (uintptr_t) &val;
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    reg.id = AARCH64_CORE_REG(regs.pc);
+    reg.addr = (uintptr_t) &env->pc;
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    /* TODO:
+     * SP_EL1
+     * ELR_EL1
+     * SPSR[]
+     * FP state
+     * system registers
+     */
+    return ret;
+}
+
+int kvm_arch_get_registers(CPUState *cs)
+{
+    struct kvm_one_reg reg;
+    uint64_t val;
+    int i;
+    int ret;
+
+    ARMCPU *cpu = ARM_CPU(cs);
+    CPUARMState *env = &cpu->env;
+
+    for (i = 0; i < 31; i++) {
+        reg.id = AARCH64_CORE_REG(regs.regs[i]);
+        reg.addr = (uintptr_t) &env->xregs[i];
+        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    reg.id = AARCH64_CORE_REG(regs.sp);
+    reg.addr = (uintptr_t) &env->xregs[31];
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    reg.id = AARCH64_CORE_REG(regs.pstate);
+    reg.addr = (uintptr_t) &val;
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+    pstate_write(env, val);
+
+    reg.id = AARCH64_CORE_REG(regs.pc);
+    reg.addr = (uintptr_t) &env->pc;
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+    if (ret) {
+        return ret;
+    }
+
+    /* TODO: other registers */
+    return ret;
+}
+
+void kvm_arch_reset_vcpu(CPUState *cs)
+{
+}
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f120088..0a76130 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -28,13 +28,15 @@
 #include "translate.h"
 #include "qemu/host-utils.h"
 
+#include "exec/gen-icount.h"
+
 #include "helper.h"
 #define GEN_HELPER 1
 #include "helper.h"
 
 static TCGv_i64 cpu_X[32];
 static TCGv_i64 cpu_pc;
-static TCGv_i32 pstate;
+static TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF;
 
 static const char *regnames[] = {
     "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
@@ -43,6 +45,13 @@ static const char *regnames[] = {
     "x24", "x25", "x26", "x27", "x28", "x29", "lr", "sp"
 };
 
+enum a64_shift_type {
+    A64_SHIFT_TYPE_LSL = 0,
+    A64_SHIFT_TYPE_LSR = 1,
+    A64_SHIFT_TYPE_ASR = 2,
+    A64_SHIFT_TYPE_ROR = 3
+};
+
 /* initialize TCG globals.  */
 void a64_translate_init(void)
 {
@@ -57,9 +66,10 @@ void a64_translate_init(void)
                                           regnames[i]);
     }
 
-    pstate = tcg_global_mem_new_i32(TCG_AREG0,
-                                    offsetof(CPUARMState, pstate),
-                                    "pstate");
+    cpu_NF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, NF), "NF");
+    cpu_ZF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, ZF), "ZF");
+    cpu_CF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, CF), "CF");
+    cpu_VF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, VF), "VF");
 }
 
 void aarch64_cpu_dump_state(CPUState *cs, FILE *f,
@@ -67,6 +77,7 @@ void aarch64_cpu_dump_state(CPUState *cs, FILE *f,
 {
     ARMCPU *cpu = ARM_CPU(cs);
     CPUARMState *env = &cpu->env;
+    uint32_t psr = pstate_read(env);
     int i;
 
     cpu_fprintf(f, "PC=%016"PRIx64"  SP=%016"PRIx64"\n",
@@ -79,11 +90,12 @@ void aarch64_cpu_dump_state(CPUState *cs, FILE *f,
             cpu_fprintf(f, " ");
         }
     }
-    cpu_fprintf(f, "PSTATE=%c%c%c%c\n",
-        env->pstate & PSTATE_N ? 'n' : '.',
-        env->pstate & PSTATE_Z ? 'z' : '.',
-        env->pstate & PSTATE_C ? 'c' : '.',
-        env->pstate & PSTATE_V ? 'v' : '.');
+    cpu_fprintf(f, "PSTATE=%08x (flags %c%c%c%c)\n",
+                psr,
+                psr & PSTATE_N ? 'N' : '-',
+                psr & PSTATE_Z ? 'Z' : '-',
+                psr & PSTATE_C ? 'C' : '-',
+                psr & PSTATE_V ? 'V' : '-');
     cpu_fprintf(f, "\n");
 }
 
@@ -104,21 +116,1474 @@ static void gen_exception_insn(DisasContext *s, int offset, int excp)
 {
     gen_a64_set_pc_im(s->pc - offset);
     gen_exception(excp);
-    s->is_jmp = DISAS_JUMP;
+    s->is_jmp = DISAS_EXC;
+}
+
+static inline bool use_goto_tb(DisasContext *s, int n, uint64_t dest)
+{
+    /* No direct tb linking with singlestep or deterministic io */
+    if (s->singlestep_enabled || (s->tb->cflags & CF_LAST_IO)) {
+        return false;
+    }
+
+    /* Only link tbs from inside the same guest page */
+    if ((s->tb->pc & TARGET_PAGE_MASK) != (dest & TARGET_PAGE_MASK)) {
+        return false;
+    }
+
+    return true;
+}
+
+static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
+{
+    TranslationBlock *tb;
+
+    tb = s->tb;
+    if (use_goto_tb(s, n, dest)) {
+        tcg_gen_goto_tb(n);
+        gen_a64_set_pc_im(dest);
+        tcg_gen_exit_tb((tcg_target_long)tb + n);
+        s->is_jmp = DISAS_TB_JUMP;
+    } else {
+        gen_a64_set_pc_im(dest);
+        if (s->singlestep_enabled) {
+            gen_exception(EXCP_DEBUG);
+        }
+        tcg_gen_exit_tb(0);
+        s->is_jmp = DISAS_JUMP;
+    }
 }
 
-static void real_unallocated_encoding(DisasContext *s)
+static void unallocated_encoding(DisasContext *s)
 {
-    fprintf(stderr, "Unknown instruction: %#x\n", s->insn);
     gen_exception_insn(s, 4, EXCP_UDEF);
 }
 
-#define unallocated_encoding(s) do { \
-    fprintf(stderr, "unallocated encoding at line: %d\n", __LINE__); \
-    real_unallocated_encoding(s); \
-    } while (0)
+#define unsupported_encoding(s, insn)                                    \
+    do {                                                                 \
+        qemu_log_mask(LOG_UNIMP,                                         \
+                      "%s:%d: unsupported instruction encoding 0x%08x "  \
+                      "at pc=%016" PRIx64 "\n",                          \
+                      __FILE__, __LINE__, insn, s->pc - 4);              \
+        unallocated_encoding(s);                                         \
+    } while (0);
+
+static void init_tmp_a64_array(DisasContext *s)
+{
+#ifdef CONFIG_DEBUG_TCG
+    int i;
+    for (i = 0; i < ARRAY_SIZE(s->tmp_a64); i++) {
+        TCGV_UNUSED_I64(s->tmp_a64[i]);
+    }
+#endif
+    s->tmp_a64_count = 0;
+}
+
+static void free_tmp_a64(DisasContext *s)
+{
+    int i;
+    for (i = 0; i < s->tmp_a64_count; i++) {
+        tcg_temp_free_i64(s->tmp_a64[i]);
+    }
+    init_tmp_a64_array(s);
+}
+
+static TCGv_i64 new_tmp_a64(DisasContext *s)
+{
+    assert(s->tmp_a64_count < TMP_A64_MAX);
+    return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_new_i64();
+}
+
+static TCGv_i64 new_tmp_a64_zero(DisasContext *s)
+{
+    TCGv_i64 t = new_tmp_a64(s);
+    tcg_gen_movi_i64(t, 0);
+    return t;
+}
+
+/*
+ * Register access functions
+ *
+ * These functions are used for directly accessing a register in where
+ * changes to the final register value are likely to be made. If you
+ * need to use a register for temporary calculation (e.g. index type
+ * operations) use the read_* form.
+ *
+ * B1.2.1 Register mappings
+ *
+ * In instruction register encoding 31 can refer to ZR (zero register) or
+ * the SP (stack pointer) depending on context. In QEMU's case we map SP
+ * to cpu_X[31] and ZR accesses to a temporary which can be discarded.
+ * This is the point of the _sp forms.
+ */
+static TCGv_i64 cpu_reg(DisasContext *s, int reg)
+{
+    if (reg == 31) {
+        return new_tmp_a64_zero(s);
+    } else {
+        return cpu_X[reg];
+    }
+}
+
+/* register access for when 31 == SP */
+static TCGv_i64 cpu_reg_sp(DisasContext *s, int reg)
+{
+    return cpu_X[reg];
+}
+
+/* read a cpu register in 32bit/64bit mode. Returns a TCGv_i64
+ * representing the register contents. This TCGv is an auto-freed
+ * temporary so it need not be explicitly freed, and may be modified.
+ */
+static TCGv_i64 read_cpu_reg(DisasContext *s, int reg, int sf)
+{
+    TCGv_i64 v = new_tmp_a64(s);
+    if (reg != 31) {
+        if (sf) {
+            tcg_gen_mov_i64(v, cpu_X[reg]);
+        } else {
+            tcg_gen_ext32u_i64(v, cpu_X[reg]);
+        }
+    } else {
+        tcg_gen_movi_i64(v, 0);
+    }
+    return v;
+}
+
+/* Set ZF and NF based on a 64 bit result. This is alas fiddlier
+ * than the 32 bit equivalent.
+ */
+static inline void gen_set_NZ64(TCGv_i64 result)
+{
+    TCGv_i64 flag = tcg_temp_new_i64();
+
+    tcg_gen_setcondi_i64(TCG_COND_NE, flag, result, 0);
+    tcg_gen_trunc_i64_i32(cpu_ZF, flag);
+    tcg_gen_shri_i64(flag, result, 32);
+    tcg_gen_trunc_i64_i32(cpu_NF, flag);
+    tcg_temp_free_i64(flag);
+}
+
+/* Set NZCV as for a logical operation: NZ as per result, CV cleared. */
+static inline void gen_logic_CC(int sf, TCGv_i64 result)
+{
+    if (sf) {
+        gen_set_NZ64(result);
+    } else {
+        tcg_gen_trunc_i64_i32(cpu_ZF, result);
+        tcg_gen_trunc_i64_i32(cpu_NF, result);
+    }
+    tcg_gen_movi_i32(cpu_CF, 0);
+    tcg_gen_movi_i32(cpu_VF, 0);
+}
+
+/*
+ * the instruction disassembly implemented here matches
+ * the instruction encoding classifications in chapter 3 (C3)
+ * of the ARM Architecture Reference Manual (DDI0487A_a)
+ */
+
+/* C3.2.7 Unconditional branch (immediate)
+ *   31  30       26 25                                  0
+ * +----+-----------+-------------------------------------+
+ * | op | 0 0 1 0 1 |                 imm26               |
+ * +----+-----------+-------------------------------------+
+ */
+static void disas_uncond_b_imm(DisasContext *s, uint32_t insn)
+{
+    uint64_t addr = s->pc + sextract32(insn, 0, 26) * 4 - 4;
+
+    if (insn & (1 << 31)) {
+        /* C5.6.26 BL Branch with link */
+        tcg_gen_movi_i64(cpu_reg(s, 30), s->pc);
+    }
+
+    /* C5.6.20 B Branch / C5.6.26 BL Branch with link */
+    gen_goto_tb(s, 0, addr);
+}
+
+/* C3.2.1 Compare & branch (immediate)
+ *   31  30         25  24  23                  5 4      0
+ * +----+-------------+----+---------------------+--------+
+ * | sf | 0 1 1 0 1 0 | op |         imm19       |   Rt   |
+ * +----+-------------+----+---------------------+--------+
+ */
+static void disas_comp_b_imm(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, op, rt;
+    uint64_t addr;
+    int label_match;
+    TCGv_i64 tcg_cmp;
+
+    sf = extract32(insn, 31, 1);
+    op = extract32(insn, 24, 1); /* 0: CBZ; 1: CBNZ */
+    rt = extract32(insn, 0, 5);
+    addr = s->pc + sextract32(insn, 5, 19) * 4 - 4;
+
+    tcg_cmp = read_cpu_reg(s, rt, sf);
+    label_match = gen_new_label();
+
+    tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ,
+                        tcg_cmp, 0, label_match);
+
+    gen_goto_tb(s, 0, s->pc);
+    gen_set_label(label_match);
+    gen_goto_tb(s, 1, addr);
+}
+
+/* C3.2.5 Test & branch (immediate)
+ *   31  30         25  24  23   19 18          5 4    0
+ * +----+-------------+----+-------+-------------+------+
+ * | b5 | 0 1 1 0 1 1 | op |  b40  |    imm14    |  Rt  |
+ * +----+-------------+----+-------+-------------+------+
+ */
+static void disas_test_b_imm(DisasContext *s, uint32_t insn)
+{
+    unsigned int bit_pos, op, rt;
+    uint64_t addr;
+    int label_match;
+    TCGv_i64 tcg_cmp;
+
+    bit_pos = (extract32(insn, 31, 1) << 5) | extract32(insn, 19, 5);
+    op = extract32(insn, 24, 1); /* 0: TBZ; 1: TBNZ */
+    addr = s->pc + sextract32(insn, 5, 14) * 4 - 4;
+    rt = extract32(insn, 0, 5);
+
+    tcg_cmp = tcg_temp_new_i64();
+    tcg_gen_andi_i64(tcg_cmp, cpu_reg(s, rt), (1ULL << bit_pos));
+    label_match = gen_new_label();
+    tcg_gen_brcondi_i64(op ? TCG_COND_NE : TCG_COND_EQ,
+                        tcg_cmp, 0, label_match);
+    tcg_temp_free_i64(tcg_cmp);
+    gen_goto_tb(s, 0, s->pc);
+    gen_set_label(label_match);
+    gen_goto_tb(s, 1, addr);
+}
+
+/* C3.2.2 / C5.6.19 Conditional branch (immediate)
+ *  31           25  24  23                  5   4  3    0
+ * +---------------+----+---------------------+----+------+
+ * | 0 1 0 1 0 1 0 | o1 |         imm19       | o0 | cond |
+ * +---------------+----+---------------------+----+------+
+ */
+static void disas_cond_b_imm(DisasContext *s, uint32_t insn)
+{
+    unsigned int cond;
+    uint64_t addr;
+
+    if ((insn & (1 << 4)) || (insn & (1 << 24))) {
+        unallocated_encoding(s);
+        return;
+    }
+    addr = s->pc + sextract32(insn, 5, 19) * 4 - 4;
+    cond = extract32(insn, 0, 4);
+
+    if (cond < 0x0e) {
+        /* genuinely conditional branches */
+        int label_match = gen_new_label();
+        arm_gen_test_cc(cond, label_match);
+        gen_goto_tb(s, 0, s->pc);
+        gen_set_label(label_match);
+        gen_goto_tb(s, 1, addr);
+    } else {
+        /* 0xe and 0xf are both "always" conditions */
+        gen_goto_tb(s, 0, addr);
+    }
+}
+
+/* C5.6.68 HINT */
+static void handle_hint(DisasContext *s, uint32_t insn,
+                        unsigned int op1, unsigned int op2, unsigned int crm)
+{
+    unsigned int selector = crm << 3 | op2;
+
+    if (op1 != 3) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (selector) {
+    case 0: /* NOP */
+        return;
+    case 1: /* YIELD */
+    case 2: /* WFE */
+    case 3: /* WFI */
+    case 4: /* SEV */
+    case 5: /* SEVL */
+        /* we treat all as NOP at least for now */
+        return;
+    default:
+        /* default specified as NOP equivalent */
+        return;
+    }
+}
+
+/* CLREX, DSB, DMB, ISB */
+static void handle_sync(DisasContext *s, uint32_t insn,
+                        unsigned int op1, unsigned int op2, unsigned int crm)
+{
+    if (op1 != 3) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (op2) {
+    case 2: /* CLREX */
+        unsupported_encoding(s, insn);
+        return;
+    case 4: /* DSB */
+    case 5: /* DMB */
+    case 6: /* ISB */
+        /* We don't emulate caches so barriers are no-ops */
+        return;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+}
+
+/* C5.6.130 MSR (immediate) - move immediate to processor state field */
+static void handle_msr_i(DisasContext *s, uint32_t insn,
+                         unsigned int op1, unsigned int op2, unsigned int crm)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C5.6.204 SYS */
+static void handle_sys(DisasContext *s, uint32_t insn, unsigned int l,
+                       unsigned int op1, unsigned int op2,
+                       unsigned int crn, unsigned int crm, unsigned int rt)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C5.6.129 MRS - move from system register */
+static void handle_mrs(DisasContext *s, uint32_t insn, unsigned int op0,
+                       unsigned int op1, unsigned int op2,
+                       unsigned int crn, unsigned int crm, unsigned int rt)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C5.6.131 MSR (register) - move to system register */
+static void handle_msr(DisasContext *s, uint32_t insn, unsigned int op0,
+                       unsigned int op1, unsigned int op2,
+                       unsigned int crn, unsigned int crm, unsigned int rt)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.2.4 System
+ *  31                 22 21  20 19 18 16 15   12 11    8 7   5 4    0
+ * +---------------------+---+-----+-----+-------+-------+-----+------+
+ * | 1 1 0 1 0 1 0 1 0 0 | L | op0 | op1 |  CRn  |  CRm  | op2 |  Rt  |
+ * +---------------------+---+-----+-----+-------+-------+-----+------+
+ */
+static void disas_system(DisasContext *s, uint32_t insn)
+{
+    unsigned int l, op0, op1, crn, crm, op2, rt;
+    l = extract32(insn, 21, 1);
+    op0 = extract32(insn, 19, 2);
+    op1 = extract32(insn, 16, 3);
+    crn = extract32(insn, 12, 4);
+    crm = extract32(insn, 8, 4);
+    op2 = extract32(insn, 5, 3);
+    rt = extract32(insn, 0, 5);
+
+    if (op0 == 0) {
+        if (l || rt != 31) {
+            unallocated_encoding(s);
+            return;
+        }
+        switch (crn) {
+        case 2: /* C5.6.68 HINT */
+            handle_hint(s, insn, op1, op2, crm);
+            break;
+        case 3: /* CLREX, DSB, DMB, ISB */
+            handle_sync(s, insn, op1, op2, crm);
+            break;
+        case 4: /* C5.6.130 MSR (immediate) */
+            handle_msr_i(s, insn, op1, op2, crm);
+            break;
+        default:
+            unallocated_encoding(s);
+            break;
+        }
+        return;
+    }
+
+    if (op0 == 1) {
+        /* C5.6.204 SYS */
+        handle_sys(s, insn, l, op1, op2, crn, crm, rt);
+    } else if (l) { /* op0 > 1 */
+        /* C5.6.129 MRS - move from system register */
+        handle_mrs(s, insn, op0, op1, op2, crn, crm, rt);
+    } else {
+        /* C5.6.131 MSR (register) - move to system register */
+        handle_msr(s, insn, op0, op1, op2, crn, crm, rt);
+    }
+}
+
+/* Exception generation */
+static void disas_exc(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.2.7 Unconditional branch (register)
+ *  31           25 24   21 20   16 15   10 9    5 4     0
+ * +---------------+-------+-------+-------+------+-------+
+ * | 1 1 0 1 0 1 1 |  opc  |  op2  |  op3  |  Rn  |  op4  |
+ * +---------------+-------+-------+-------+------+-------+
+ */
+static void disas_uncond_b_reg(DisasContext *s, uint32_t insn)
+{
+    unsigned int opc, op2, op3, rn, op4;
+
+    opc = extract32(insn, 21, 4);
+    op2 = extract32(insn, 16, 5);
+    op3 = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    op4 = extract32(insn, 0, 5);
+
+    if (op4 != 0x0 || op3 != 0x0 || op2 != 0x1f) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opc) {
+    case 0: /* BR */
+    case 2: /* RET */
+        break;
+    case 1: /* BLR */
+        tcg_gen_movi_i64(cpu_reg(s, 30), s->pc);
+        break;
+    case 4: /* ERET */
+    case 5: /* DRPS */
+        if (rn != 0x1f) {
+            unallocated_encoding(s);
+        } else {
+            unsupported_encoding(s, insn);
+        }
+        return;
+    default:
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_gen_mov_i64(cpu_pc, cpu_reg(s, rn));
+    s->is_jmp = DISAS_JUMP;
+}
+
+/* C3.2 Branches, exception generating and system instructions */
+static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
+{
+    switch (extract32(insn, 25, 7)) {
+    case 0x0a: case 0x0b:
+    case 0x4a: case 0x4b: /* Unconditional branch (immediate) */
+        disas_uncond_b_imm(s, insn);
+        break;
+    case 0x1a: case 0x5a: /* Compare & branch (immediate) */
+        disas_comp_b_imm(s, insn);
+        break;
+    case 0x1b: case 0x5b: /* Test & branch (immediate) */
+        disas_test_b_imm(s, insn);
+        break;
+    case 0x2a: /* Conditional branch (immediate) */
+        disas_cond_b_imm(s, insn);
+        break;
+    case 0x6a: /* Exception generation / System */
+        if (insn & (1 << 24)) {
+            disas_system(s, insn);
+        } else {
+            disas_exc(s, insn);
+        }
+        break;
+    case 0x6b: /* Unconditional branch (register) */
+        disas_uncond_b_reg(s, insn);
+        break;
+    default:
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* Load/store exclusive */
+static void disas_ldst_excl(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Load register (literal) */
+static void disas_ld_lit(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Load/store pair (all forms) */
+static void disas_ldst_pair(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Load/store register (all forms) */
+static void disas_ldst_reg(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* AdvSIMD load/store multiple structures */
+static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* AdvSIMD load/store single structure */
+static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.3 Loads and stores */
+static void disas_ldst(DisasContext *s, uint32_t insn)
+{
+    switch (extract32(insn, 24, 6)) {
+    case 0x08: /* Load/store exclusive */
+        disas_ldst_excl(s, insn);
+        break;
+    case 0x18: case 0x1c: /* Load register (literal) */
+        disas_ld_lit(s, insn);
+        break;
+    case 0x28: case 0x29:
+    case 0x2c: case 0x2d: /* Load/store pair (all forms) */
+        disas_ldst_pair(s, insn);
+        break;
+    case 0x38: case 0x39:
+    case 0x3c: case 0x3d: /* Load/store register (all forms) */
+        disas_ldst_reg(s, insn);
+        break;
+    case 0x0c: /* AdvSIMD load/store multiple structures */
+        disas_ldst_multiple_struct(s, insn);
+        break;
+    case 0x0d: /* AdvSIMD load/store single structure */
+        disas_ldst_single_struct(s, insn);
+        break;
+    default:
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* C3.4.6 PC-rel. addressing
+ *   31  30   29 28       24 23                5 4    0
+ * +----+-------+-----------+-------------------+------+
+ * | op | immlo | 1 0 0 0 0 |       immhi       |  Rd  |
+ * +----+-------+-----------+-------------------+------+
+ */
+static void disas_pc_rel_adr(DisasContext *s, uint32_t insn)
+{
+    unsigned int page, rd;
+    uint64_t base;
+    int64_t offset;
+
+    page = extract32(insn, 31, 1);
+    /* SignExtend(immhi:immlo) -> offset */
+    offset = ((int64_t)sextract32(insn, 5, 19) << 2) | extract32(insn, 29, 2);
+    rd = extract32(insn, 0, 5);
+    base = s->pc - 4;
+
+    if (page) {
+        /* ADRP (page based) */
+        base &= ~0xfff;
+        offset <<= 12;
+    }
+
+    tcg_gen_movi_i64(cpu_reg(s, rd), base + offset);
+}
+
+/* Add/subtract (immediate) */
+static void disas_add_sub_imm(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* The input should be a value in the bottom e bits (with higher
+ * bits zero); returns that value replicated into every element
+ * of size e in a 64 bit integer.
+ */
+static uint64_t bitfield_replicate(uint64_t mask, unsigned int e)
+{
+    assert(e != 0);
+    while (e < 64) {
+        mask |= mask << e;
+        e *= 2;
+    }
+    return mask;
+}
+
+/* Return a value with the bottom len bits set (where 0 < len <= 64) */
+static inline uint64_t bitmask64(unsigned int length)
+{
+    assert(length > 0 && length <= 64);
+    return ~0ULL >> (64 - length);
+}
+
+/* Simplified variant of pseudocode DecodeBitMasks() for the case where we
+ * only require the wmask. Returns false if the imms/immr/immn are a reserved
+ * value (ie should cause a guest UNDEF exception), and true if they are
+ * valid, in which case the decoded bit pattern is written to result.
+ */
+static bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn,
+                                   unsigned int imms, unsigned int immr)
+{
+    uint64_t mask;
+    unsigned e, levels, s, r;
+    int len;
+
+    assert(immn < 2 && imms < 64 && immr < 64);
+
+    /* The bit patterns we create here are 64 bit patterns which
+     * are vectors of identical elements of size e = 2, 4, 8, 16, 32 or
+     * 64 bits each. Each element contains the same value: a run
+     * of between 1 and e-1 non-zero bits, rotated within the
+     * element by between 0 and e-1 bits.
+     *
+     * The element size and run length are encoded into immn (1 bit)
+     * and imms (6 bits) as follows:
+     * 64 bit elements: immn = 1, imms = <length of run - 1>
+     * 32 bit elements: immn = 0, imms = 0 : <length of run - 1>
+     * 16 bit elements: immn = 0, imms = 10 : <length of run - 1>
+     *  8 bit elements: immn = 0, imms = 110 : <length of run - 1>
+     *  4 bit elements: immn = 0, imms = 1110 : <length of run - 1>
+     *  2 bit elements: immn = 0, imms = 11110 : <length of run - 1>
+     * Notice that immn = 0, imms = 11111x is the only combination
+     * not covered by one of the above options; this is reserved.
+     * Further, <length of run - 1> all-ones is a reserved pattern.
+     *
+     * In all cases the rotation is by immr % e (and immr is 6 bits).
+     */
+
+    /* First determine the element size */
+    len = 31 - clz32((immn << 6) | (~imms & 0x3f));
+    if (len < 1) {
+        /* This is the immn == 0, imms == 0x11111x case */
+        return false;
+    }
+    e = 1 << len;
+
+    levels = e - 1;
+    s = imms & levels;
+    r = immr & levels;
+
+    if (s == levels) {
+        /* <length of run - 1> mustn't be all-ones. */
+        return false;
+    }
+
+    /* Create the value of one element: s+1 set bits rotated
+     * by r within the element (which is e bits wide)...
+     */
+    mask = bitmask64(s + 1);
+    mask = (mask >> r) | (mask << (e - r));
+    /* ...then replicate the element over the whole 64 bit value */
+    mask = bitfield_replicate(mask, e);
+    *result = mask;
+    return true;
+}
+
+/* C3.4.4 Logical (immediate)
+ *   31  30 29 28         23 22  21  16 15  10 9    5 4    0
+ * +----+-----+-------------+---+------+------+------+------+
+ * | sf | opc | 1 0 0 1 0 0 | N | immr | imms |  Rn  |  Rd  |
+ * +----+-----+-------------+---+------+------+------+------+
+ */
+static void disas_logic_imm(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, opc, is_n, immr, imms, rn, rd;
+    TCGv_i64 tcg_rd, tcg_rn;
+    uint64_t wmask;
+    bool is_and = false;
+
+    sf = extract32(insn, 31, 1);
+    opc = extract32(insn, 29, 2);
+    is_n = extract32(insn, 22, 1);
+    immr = extract32(insn, 16, 6);
+    imms = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+
+    if (!sf && is_n) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (opc == 0x3) { /* ANDS */
+        tcg_rd = cpu_reg(s, rd);
+    } else {
+        tcg_rd = cpu_reg_sp(s, rd);
+    }
+    tcg_rn = cpu_reg(s, rn);
+
+    if (!logic_imm_decode_wmask(&wmask, is_n, imms, immr)) {
+        /* some immediate field values are reserved */
+        unallocated_encoding(s);
+        return;
+    }
+
+    if (!sf) {
+        wmask &= 0xffffffff;
+    }
+
+    switch (opc) {
+    case 0x3: /* ANDS */
+    case 0x0: /* AND */
+        tcg_gen_andi_i64(tcg_rd, tcg_rn, wmask);
+        is_and = true;
+        break;
+    case 0x1: /* ORR */
+        tcg_gen_ori_i64(tcg_rd, tcg_rn, wmask);
+        break;
+    case 0x2: /* EOR */
+        tcg_gen_xori_i64(tcg_rd, tcg_rn, wmask);
+        break;
+    default:
+        assert(FALSE); /* must handle all above */
+        break;
+    }
+
+    if (!sf && !is_and) {
+        /* zero extend final result; we know we can skip this for AND
+         * since the immediate had the high 32 bits clear.
+         */
+        tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+    }
 
-void disas_a64_insn(CPUARMState *env, DisasContext *s)
+    if (opc == 3) { /* ANDS */
+        gen_logic_CC(sf, tcg_rd);
+    }
+}
+
+/* Move wide (immediate) */
+static void disas_movw_imm(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.4.2 Bitfield
+ *   31  30 29 28         23 22  21  16 15  10 9    5 4    0
+ * +----+-----+-------------+---+------+------+------+------+
+ * | sf | opc | 1 0 0 1 1 0 | N | immr | imms |  Rn  |  Rd  |
+ * +----+-----+-------------+---+------+------+------+------+
+ */
+static void disas_bitfield(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, n, opc, ri, si, rn, rd, bitsize, pos, len;
+    TCGv_i64 tcg_rd, tcg_tmp;
+
+    sf = extract32(insn, 31, 1);
+    opc = extract32(insn, 29, 2);
+    n = extract32(insn, 22, 1);
+    ri = extract32(insn, 16, 6);
+    si = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+    bitsize = sf ? 64 : 32;
+
+    if (sf != n || ri >= bitsize || si >= bitsize || opc > 2) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_rd = cpu_reg(s, rd);
+    tcg_tmp = read_cpu_reg(s, rn, sf);
+
+    /* OPTME: probably worth recognizing common cases of ext{8,16,32}{u,s} */
+
+    if (opc != 1) { /* SBFM or UBFM */
+        tcg_gen_movi_i64(tcg_rd, 0);
+    }
+
+    /* do the bit move operation */
+    if (si >= ri) {
+        /* Wd<s-r:0> = Wn<s:r> */
+        tcg_gen_shri_i64(tcg_tmp, tcg_tmp, ri);
+        pos = 0;
+        len = (si - ri) + 1;
+    } else {
+        /* Wd<32+s-r,32-r> = Wn<s:0> */
+        pos = bitsize - ri;
+        len = si + 1;
+    }
+
+    tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, pos, len);
+
+    if (opc == 0) { /* SBFM - sign extend the destination field */
+        tcg_gen_shli_i64(tcg_rd, tcg_rd, 64 - (pos + len));
+        tcg_gen_sari_i64(tcg_rd, tcg_rd, 64 - (pos + len));
+    }
+
+    if (!sf) { /* zero extend final result */
+        tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+    }
+}
+
+/* C3.4.3 Extract
+ *   31  30  29 28         23 22   21  20  16 15    10 9    5 4    0
+ * +----+------+-------------+---+----+------+--------+------+------+
+ * | sf | op21 | 1 0 0 1 1 1 | N | o0 |  Rm  |  imms  |  Rn  |  Rd  |
+ * +----+------+-------------+---+----+------+--------+------+------+
+ */
+static void disas_extract(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, n, rm, imm, rn, rd, bitsize, op21, op0;
+
+    sf = extract32(insn, 31, 1);
+    n = extract32(insn, 22, 1);
+    rm = extract32(insn, 16, 5);
+    imm = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+    op21 = extract32(insn, 29, 2);
+    op0 = extract32(insn, 21, 1);
+    bitsize = sf ? 64 : 32;
+
+    if (sf != n || op21 || op0 || imm >= bitsize) {
+        unallocated_encoding(s);
+    } else {
+        TCGv_i64 tcg_rd, tcg_rm, tcg_rn;
+
+        tcg_rd = cpu_reg(s, rd);
+
+        if (imm) {
+            /* OPTME: we can special case rm==rn as a rotate */
+            tcg_rm = read_cpu_reg(s, rm, sf);
+            tcg_rn = read_cpu_reg(s, rn, sf);
+            tcg_gen_shri_i64(tcg_rm, tcg_rm, imm);
+            tcg_gen_shli_i64(tcg_rn, tcg_rn, bitsize - imm);
+            tcg_gen_or_i64(tcg_rd, tcg_rm, tcg_rn);
+            if (!sf) {
+                tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+            }
+        } else {
+            /* tcg shl_i32/shl_i64 is undefined for 32/64 bit shifts,
+             * so an extract from bit 0 is a special case.
+             */
+            if (sf) {
+                tcg_gen_mov_i64(tcg_rd, cpu_reg(s, rm));
+            } else {
+                tcg_gen_ext32u_i64(tcg_rd, cpu_reg(s, rm));
+            }
+        }
+
+    }
+}
+
+/* C3.4 Data processing - immediate */
+static void disas_data_proc_imm(DisasContext *s, uint32_t insn)
+{
+    switch (extract32(insn, 23, 6)) {
+    case 0x20: case 0x21: /* PC-rel. addressing */
+        disas_pc_rel_adr(s, insn);
+        break;
+    case 0x22: case 0x23: /* Add/subtract (immediate) */
+        disas_add_sub_imm(s, insn);
+        break;
+    case 0x24: /* Logical (immediate) */
+        disas_logic_imm(s, insn);
+        break;
+    case 0x25: /* Move wide (immediate) */
+        disas_movw_imm(s, insn);
+        break;
+    case 0x26: /* Bitfield */
+        disas_bitfield(s, insn);
+        break;
+    case 0x27: /* Extract */
+        disas_extract(s, insn);
+        break;
+    default:
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* Shift a TCGv src by TCGv shift_amount, put result in dst.
+ * Note that it is the caller's responsibility to ensure that the
+ * shift amount is in range (ie 0..31 or 0..63) and provide the ARM
+ * mandated semantics for out of range shifts.
+ */
+static void shift_reg(TCGv_i64 dst, TCGv_i64 src, int sf,
+                      enum a64_shift_type shift_type, TCGv_i64 shift_amount)
+{
+    switch (shift_type) {
+    case A64_SHIFT_TYPE_LSL:
+        tcg_gen_shl_i64(dst, src, shift_amount);
+        break;
+    case A64_SHIFT_TYPE_LSR:
+        tcg_gen_shr_i64(dst, src, shift_amount);
+        break;
+    case A64_SHIFT_TYPE_ASR:
+        if (!sf) {
+            tcg_gen_ext32s_i64(dst, src);
+        }
+        tcg_gen_sar_i64(dst, sf ? src : dst, shift_amount);
+        break;
+    case A64_SHIFT_TYPE_ROR:
+        if (sf) {
+            tcg_gen_rotr_i64(dst, src, shift_amount);
+        } else {
+            TCGv_i32 t0, t1;
+            t0 = tcg_temp_new_i32();
+            t1 = tcg_temp_new_i32();
+            tcg_gen_trunc_i64_i32(t0, src);
+            tcg_gen_trunc_i64_i32(t1, shift_amount);
+            tcg_gen_rotr_i32(t0, t0, t1);
+            tcg_gen_extu_i32_i64(dst, t0);
+            tcg_temp_free_i32(t0);
+            tcg_temp_free_i32(t1);
+        }
+        break;
+    default:
+        assert(FALSE); /* all shift types should be handled */
+        break;
+    }
+
+    if (!sf) { /* zero extend final result */
+        tcg_gen_ext32u_i64(dst, dst);
+    }
+}
+
+/* Shift a TCGv src by immediate, put result in dst.
+ * The shift amount must be in range (this should always be true as the
+ * relevant instructions will UNDEF on bad shift immediates).
+ */
+static void shift_reg_imm(TCGv_i64 dst, TCGv_i64 src, int sf,
+                          enum a64_shift_type shift_type, unsigned int shift_i)
+{
+    assert(shift_i < (sf ? 64 : 32));
+
+    if (shift_i == 0) {
+        tcg_gen_mov_i64(dst, src);
+    } else {
+        TCGv_i64 shift_const;
+
+        shift_const = tcg_const_i64(shift_i);
+        shift_reg(dst, src, sf, shift_type, shift_const);
+        tcg_temp_free_i64(shift_const);
+    }
+}
+
+/* C3.5.10 Logical (shifted register)
+ *   31  30 29 28       24 23   22 21  20  16 15    10 9    5 4    0
+ * +----+-----+-----------+-------+---+------+--------+------+------+
+ * | sf | opc | 0 1 0 1 0 | shift | N |  Rm  |  imm6  |  Rn  |  Rd  |
+ * +----+-----+-----------+-------+---+------+--------+------+------+
+ */
+static void disas_logic_reg(DisasContext *s, uint32_t insn)
+{
+    TCGv_i64 tcg_rd, tcg_rn, tcg_rm;
+    unsigned int sf, opc, shift_type, invert, rm, shift_amount, rn, rd;
+
+    sf = extract32(insn, 31, 1);
+    opc = extract32(insn, 29, 2);
+    shift_type = extract32(insn, 22, 2);
+    invert = extract32(insn, 21, 1);
+    rm = extract32(insn, 16, 5);
+    shift_amount = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+
+    if (!sf && (shift_amount & (1 << 5))) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    tcg_rd = cpu_reg(s, rd);
+
+    if (opc == 1 && shift_amount == 0 && shift_type == 0 && rn == 31) {
+        /* Unshifted ORR and ORN with WZR/XZR is the standard encoding for
+         * register-register MOV and MVN, so it is worth special casing.
+         */
+        tcg_rm = cpu_reg(s, rm);
+        if (invert) {
+            tcg_gen_not_i64(tcg_rd, tcg_rm);
+            if (!sf) {
+                tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+            }
+        } else {
+            if (sf) {
+                tcg_gen_mov_i64(tcg_rd, tcg_rm);
+            } else {
+                tcg_gen_ext32u_i64(tcg_rd, tcg_rm);
+            }
+        }
+        return;
+    }
+
+    tcg_rm = read_cpu_reg(s, rm, sf);
+
+    if (shift_amount) {
+        shift_reg_imm(tcg_rm, tcg_rm, sf, shift_type, shift_amount);
+    }
+
+    tcg_rn = cpu_reg(s, rn);
+
+    switch (opc | (invert << 2)) {
+    case 0: /* AND */
+    case 3: /* ANDS */
+        tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm);
+        break;
+    case 1: /* ORR */
+        tcg_gen_or_i64(tcg_rd, tcg_rn, tcg_rm);
+        break;
+    case 2: /* EOR */
+        tcg_gen_xor_i64(tcg_rd, tcg_rn, tcg_rm);
+        break;
+    case 4: /* BIC */
+    case 7: /* BICS */
+        tcg_gen_andc_i64(tcg_rd, tcg_rn, tcg_rm);
+        break;
+    case 5: /* ORN */
+        tcg_gen_orc_i64(tcg_rd, tcg_rn, tcg_rm);
+        break;
+    case 6: /* EON */
+        tcg_gen_eqv_i64(tcg_rd, tcg_rn, tcg_rm);
+        break;
+    default:
+        assert(FALSE);
+        break;
+    }
+
+    if (!sf) {
+        tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+    }
+
+    if (opc == 3) {
+        gen_logic_CC(sf, tcg_rd);
+    }
+}
+
+/* Add/subtract (extended register) */
+static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Add/subtract (shifted register) */
+static void disas_add_sub_reg(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Data-processing (3 source) */
+static void disas_data_proc_3src(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Add/subtract (with carry) */
+static void disas_adc_sbc(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Conditional compare (immediate) */
+static void disas_cc_imm(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* Conditional compare (register) */
+static void disas_cc_reg(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.5.6 Conditional select
+ *   31   30  29  28             21 20  16 15  12 11 10 9    5 4    0
+ * +----+----+---+-----------------+------+------+-----+------+------+
+ * | sf | op | S | 1 1 0 1 0 1 0 0 |  Rm  | cond | op2 |  Rn  |  Rd  |
+ * +----+----+---+-----------------+------+------+-----+------+------+
+ */
+static void disas_cond_select(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, else_inv, rm, cond, else_inc, rn, rd;
+    TCGv_i64 tcg_rd, tcg_src;
+
+    if (extract32(insn, 29, 1) || extract32(insn, 11, 1)) {
+        /* S == 1 or op2<1> == 1 */
+        unallocated_encoding(s);
+        return;
+    }
+    sf = extract32(insn, 31, 1);
+    else_inv = extract32(insn, 30, 1);
+    rm = extract32(insn, 16, 5);
+    cond = extract32(insn, 12, 4);
+    else_inc = extract32(insn, 10, 1);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+
+    if (rd == 31) {
+        /* silly no-op write; until we use movcond we must special-case
+         * this to avoid a dead temporary across basic blocks.
+         */
+        return;
+    }
+
+    tcg_rd = cpu_reg(s, rd);
+
+    if (cond >= 0x0e) { /* condition "always" */
+        tcg_src = read_cpu_reg(s, rn, sf);
+        tcg_gen_mov_i64(tcg_rd, tcg_src);
+    } else {
+        /* OPTME: we could use movcond here, at the cost of duplicating
+         * a lot of the arm_gen_test_cc() logic.
+         */
+        int label_match = gen_new_label();
+        int label_continue = gen_new_label();
+
+        arm_gen_test_cc(cond, label_match);
+        /* nomatch: */
+        tcg_src = cpu_reg(s, rm);
+
+        if (else_inv && else_inc) {
+            tcg_gen_neg_i64(tcg_rd, tcg_src);
+        } else if (else_inv) {
+            tcg_gen_not_i64(tcg_rd, tcg_src);
+        } else if (else_inc) {
+            tcg_gen_addi_i64(tcg_rd, tcg_src, 1);
+        } else {
+            tcg_gen_mov_i64(tcg_rd, tcg_src);
+        }
+        if (!sf) {
+            tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+        }
+        tcg_gen_br(label_continue);
+        /* match: */
+        gen_set_label(label_match);
+        tcg_src = read_cpu_reg(s, rn, sf);
+        tcg_gen_mov_i64(tcg_rd, tcg_src);
+        /* continue: */
+        gen_set_label(label_continue);
+    }
+}
+
+static void handle_clz(DisasContext *s, unsigned int sf,
+                       unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_rd, tcg_rn;
+    tcg_rd = cpu_reg(s, rd);
+    tcg_rn = cpu_reg(s, rn);
+
+    if (sf) {
+        gen_helper_clz64(tcg_rd, tcg_rn);
+    } else {
+        TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
+        tcg_gen_trunc_i64_i32(tcg_tmp32, tcg_rn);
+        gen_helper_clz(tcg_tmp32, tcg_tmp32);
+        tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
+        tcg_temp_free_i32(tcg_tmp32);
+    }
+}
+
+static void handle_cls(DisasContext *s, unsigned int sf,
+                       unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_rd, tcg_rn;
+    tcg_rd = cpu_reg(s, rd);
+    tcg_rn = cpu_reg(s, rn);
+
+    if (sf) {
+        gen_helper_cls64(tcg_rd, tcg_rn);
+    } else {
+        TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
+        tcg_gen_trunc_i64_i32(tcg_tmp32, tcg_rn);
+        gen_helper_cls32(tcg_tmp32, tcg_tmp32);
+        tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
+        tcg_temp_free_i32(tcg_tmp32);
+    }
+}
+
+static void handle_rbit(DisasContext *s, unsigned int sf,
+                        unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_rd, tcg_rn;
+    tcg_rd = cpu_reg(s, rd);
+    tcg_rn = cpu_reg(s, rn);
+
+    if (sf) {
+        gen_helper_rbit64(tcg_rd, tcg_rn);
+    } else {
+        TCGv_i32 tcg_tmp32 = tcg_temp_new_i32();
+        tcg_gen_trunc_i64_i32(tcg_tmp32, tcg_rn);
+        gen_helper_rbit(tcg_tmp32, tcg_tmp32);
+        tcg_gen_extu_i32_i64(tcg_rd, tcg_tmp32);
+        tcg_temp_free_i32(tcg_tmp32);
+    }
+}
+
+/* C5.6.149 REV with sf==1, opcode==3 ("REV64") */
+static void handle_rev64(DisasContext *s, unsigned int sf,
+                         unsigned int rn, unsigned int rd)
+{
+    if (!sf) {
+        unallocated_encoding(s);
+        return;
+    }
+    tcg_gen_bswap64_i64(cpu_reg(s, rd), cpu_reg(s, rn));
+}
+
+/* C5.6.149 REV with sf==0, opcode==2
+ * C5.6.151 REV32 (sf==1, opcode==2)
+ */
+static void handle_rev32(DisasContext *s, unsigned int sf,
+                         unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_rd = cpu_reg(s, rd);
+
+    if (sf) {
+        TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+        TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf);
+
+        /* bswap32_i64 requires zero high word */
+        tcg_gen_ext32u_i64(tcg_tmp, tcg_rn);
+        tcg_gen_bswap32_i64(tcg_rd, tcg_tmp);
+        tcg_gen_shri_i64(tcg_tmp, tcg_rn, 32);
+        tcg_gen_bswap32_i64(tcg_tmp, tcg_tmp);
+        tcg_gen_concat32_i64(tcg_rd, tcg_rd, tcg_tmp);
+
+        tcg_temp_free_i64(tcg_tmp);
+    } else {
+        tcg_gen_ext32u_i64(tcg_rd, cpu_reg(s, rn));
+        tcg_gen_bswap32_i64(tcg_rd, tcg_rd);
+    }
+}
+
+/* C5.6.150 REV16 (opcode==1) */
+static void handle_rev16(DisasContext *s, unsigned int sf,
+                         unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_rd = cpu_reg(s, rd);
+    TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+    TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf);
+
+    tcg_gen_andi_i64(tcg_tmp, tcg_rn, 0xffff);
+    tcg_gen_bswap16_i64(tcg_rd, tcg_tmp);
+
+    tcg_gen_shri_i64(tcg_tmp, tcg_rn, 16);
+    tcg_gen_andi_i64(tcg_tmp, tcg_tmp, 0xffff);
+    tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp);
+    tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, 16, 16);
+
+    if (sf) {
+        tcg_gen_shri_i64(tcg_tmp, tcg_rn, 32);
+        tcg_gen_andi_i64(tcg_tmp, tcg_tmp, 0xffff);
+        tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp);
+        tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, 32, 16);
+
+        tcg_gen_shri_i64(tcg_tmp, tcg_rn, 48);
+        tcg_gen_bswap16_i64(tcg_tmp, tcg_tmp);
+        tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_tmp, 48, 16);
+    }
+
+    tcg_temp_free_i64(tcg_tmp);
+}
+
+/* C3.5.7 Data-processing (1 source)
+ *   31  30  29  28             21 20     16 15    10 9    5 4    0
+ * +----+---+---+-----------------+---------+--------+------+------+
+ * | sf | 1 | S | 1 1 0 1 0 1 1 0 | opcode2 | opcode |  Rn  |  Rd  |
+ * +----+---+---+-----------------+---------+--------+------+------+
+ */
+static void disas_data_proc_1src(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, opcode, rn, rd;
+
+    if (extract32(insn, 29, 1) || extract32(insn, 16, 5)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    sf = extract32(insn, 31, 1);
+    opcode = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+
+    switch (opcode) {
+    case 0: /* RBIT */
+        handle_rbit(s, sf, rn, rd);
+        break;
+    case 1: /* REV16 */
+        handle_rev16(s, sf, rn, rd);
+        break;
+    case 2: /* REV32 */
+        handle_rev32(s, sf, rn, rd);
+        break;
+    case 3: /* REV64 */
+        handle_rev64(s, sf, rn, rd);
+        break;
+    case 4: /* CLZ */
+        handle_clz(s, sf, rn, rd);
+        break;
+    case 5: /* CLS */
+        handle_cls(s, sf, rn, rd);
+        break;
+    }
+}
+
+static void handle_div(DisasContext *s, bool is_signed, unsigned int sf,
+                       unsigned int rm, unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_n, tcg_m, tcg_rd;
+    tcg_rd = cpu_reg(s, rd);
+
+    if (!sf && is_signed) {
+        tcg_n = new_tmp_a64(s);
+        tcg_m = new_tmp_a64(s);
+        tcg_gen_ext32s_i64(tcg_n, cpu_reg(s, rn));
+        tcg_gen_ext32s_i64(tcg_m, cpu_reg(s, rm));
+    } else {
+        tcg_n = read_cpu_reg(s, rn, sf);
+        tcg_m = read_cpu_reg(s, rm, sf);
+    }
+
+    if (is_signed) {
+        gen_helper_sdiv64(tcg_rd, tcg_n, tcg_m);
+    } else {
+        gen_helper_udiv64(tcg_rd, tcg_n, tcg_m);
+    }
+
+    if (!sf) { /* zero extend final result */
+        tcg_gen_ext32u_i64(tcg_rd, tcg_rd);
+    }
+}
+
+/* C5.6.115 LSLV, C5.6.118 LSRV, C5.6.17 ASRV, C5.6.154 RORV */
+static void handle_shift_reg(DisasContext *s,
+                             enum a64_shift_type shift_type, unsigned int sf,
+                             unsigned int rm, unsigned int rn, unsigned int rd)
+{
+    TCGv_i64 tcg_shift = tcg_temp_new_i64();
+    TCGv_i64 tcg_rd = cpu_reg(s, rd);
+    TCGv_i64 tcg_rn = read_cpu_reg(s, rn, sf);
+
+    tcg_gen_andi_i64(tcg_shift, cpu_reg(s, rm), sf ? 63 : 31);
+    shift_reg(tcg_rd, tcg_rn, sf, shift_type, tcg_shift);
+    tcg_temp_free_i64(tcg_shift);
+}
+
+/* C3.5.8 Data-processing (2 source)
+ *   31   30  29 28             21 20  16 15    10 9    5 4    0
+ * +----+---+---+-----------------+------+--------+------+------+
+ * | sf | 0 | S | 1 1 0 1 0 1 1 0 |  Rm  | opcode |  Rn  |  Rd  |
+ * +----+---+---+-----------------+------+--------+------+------+
+ */
+static void disas_data_proc_2src(DisasContext *s, uint32_t insn)
+{
+    unsigned int sf, rm, opcode, rn, rd;
+    sf = extract32(insn, 31, 1);
+    rm = extract32(insn, 16, 5);
+    opcode = extract32(insn, 10, 6);
+    rn = extract32(insn, 5, 5);
+    rd = extract32(insn, 0, 5);
+
+    if (extract32(insn, 29, 1)) {
+        unallocated_encoding(s);
+        return;
+    }
+
+    switch (opcode) {
+    case 2: /* UDIV */
+        handle_div(s, false, sf, rm, rn, rd);
+        break;
+    case 3: /* SDIV */
+        handle_div(s, true, sf, rm, rn, rd);
+        break;
+    case 8: /* LSLV */
+        handle_shift_reg(s, A64_SHIFT_TYPE_LSL, sf, rm, rn, rd);
+        break;
+    case 9: /* LSRV */
+        handle_shift_reg(s, A64_SHIFT_TYPE_LSR, sf, rm, rn, rd);
+        break;
+    case 10: /* ASRV */
+        handle_shift_reg(s, A64_SHIFT_TYPE_ASR, sf, rm, rn, rd);
+        break;
+    case 11: /* RORV */
+        handle_shift_reg(s, A64_SHIFT_TYPE_ROR, sf, rm, rn, rd);
+        break;
+    case 16:
+    case 17:
+    case 18:
+    case 19:
+    case 20:
+    case 21:
+    case 22:
+    case 23: /* CRC32 */
+        unsupported_encoding(s, insn);
+        break;
+    default:
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* C3.5 Data processing - register */
+static void disas_data_proc_reg(DisasContext *s, uint32_t insn)
+{
+    switch (extract32(insn, 24, 5)) {
+    case 0x0a: /* Logical (shifted register) */
+        disas_logic_reg(s, insn);
+        break;
+    case 0x0b: /* Add/subtract */
+        if (insn & (1 << 21)) { /* (extended register) */
+            disas_add_sub_ext_reg(s, insn);
+        } else {
+            disas_add_sub_reg(s, insn);
+        }
+        break;
+    case 0x1b: /* Data-processing (3 source) */
+        disas_data_proc_3src(s, insn);
+        break;
+    case 0x1a:
+        switch (extract32(insn, 21, 3)) {
+        case 0x0: /* Add/subtract (with carry) */
+            disas_adc_sbc(s, insn);
+            break;
+        case 0x2: /* Conditional compare */
+            if (insn & (1 << 11)) { /* (immediate) */
+                disas_cc_imm(s, insn);
+            } else {            /* (register) */
+                disas_cc_reg(s, insn);
+            }
+            break;
+        case 0x4: /* Conditional select */
+            disas_cond_select(s, insn);
+            break;
+        case 0x6: /* Data-processing */
+            if (insn & (1 << 30)) { /* (1 source) */
+                disas_data_proc_1src(s, insn);
+            } else {            /* (2 source) */
+                disas_data_proc_2src(s, insn);
+            }
+            break;
+        default:
+            unallocated_encoding(s);
+            break;
+        }
+        break;
+    default:
+        unallocated_encoding(s);
+        break;
+    }
+}
+
+/* C3.6 Data processing - SIMD and floating point */
+static void disas_data_proc_simd_fp(DisasContext *s, uint32_t insn)
+{
+    unsupported_encoding(s, insn);
+}
+
+/* C3.1 A64 instruction index by encoding */
+static void disas_a64_insn(CPUARMState *env, DisasContext *s)
 {
     uint32_t insn;
 
@@ -126,14 +1591,204 @@ void disas_a64_insn(CPUARMState *env, DisasContext *s)
     s->insn = insn;
     s->pc += 4;
 
-    switch ((insn >> 24) & 0x1f) {
-    default:
+    switch (extract32(insn, 25, 4)) {
+    case 0x0: case 0x1: case 0x2: case 0x3: /* UNALLOCATED */
         unallocated_encoding(s);
         break;
+    case 0x8: case 0x9: /* Data processing - immediate */
+        disas_data_proc_imm(s, insn);
+        break;
+    case 0xa: case 0xb: /* Branch, exception generation and system insns */
+        disas_b_exc_sys(s, insn);
+        break;
+    case 0x4:
+    case 0x6:
+    case 0xc:
+    case 0xe:      /* Loads and stores */
+        disas_ldst(s, insn);
+        break;
+    case 0x5:
+    case 0xd:      /* Data processing - register */
+        disas_data_proc_reg(s, insn);
+        break;
+    case 0x7:
+    case 0xf:      /* Data processing - SIMD and floating point */
+        disas_data_proc_simd_fp(s, insn);
+        break;
+    default:
+        assert(FALSE); /* all 15 cases should be handled above */
+        break;
     }
 
-    if (unlikely(s->singlestep_enabled) && (s->is_jmp == DISAS_TB_JUMP)) {
-        /* go through the main loop for single step */
-        s->is_jmp = DISAS_JUMP;
+    /* if we allocated any temporaries, free them here */
+    free_tmp_a64(s);
+}
+
+void gen_intermediate_code_internal_a64(ARMCPU *cpu,
+                                        TranslationBlock *tb,
+                                        bool search_pc)
+{
+    CPUState *cs = CPU(cpu);
+    CPUARMState *env = &cpu->env;
+    DisasContext dc1, *dc = &dc1;
+    CPUBreakpoint *bp;
+    uint16_t *gen_opc_end;
+    int j, lj;
+    target_ulong pc_start;
+    target_ulong next_page_start;
+    int num_insns;
+    int max_insns;
+
+    pc_start = tb->pc;
+
+    dc->tb = tb;
+
+    gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
+
+    dc->is_jmp = DISAS_NEXT;
+    dc->pc = pc_start;
+    dc->singlestep_enabled = cs->singlestep_enabled;
+    dc->condjmp = 0;
+
+    dc->aarch64 = 1;
+    dc->thumb = 0;
+    dc->bswap_code = 0;
+    dc->condexec_mask = 0;
+    dc->condexec_cond = 0;
+#if !defined(CONFIG_USER_ONLY)
+    dc->user = 0;
+#endif
+    dc->vfp_enabled = 0;
+    dc->vec_len = 0;
+    dc->vec_stride = 0;
+
+    init_tmp_a64_array(dc);
+
+    next_page_start = (pc_start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
+    lj = -1;
+    num_insns = 0;
+    max_insns = tb->cflags & CF_COUNT_MASK;
+    if (max_insns == 0) {
+        max_insns = CF_COUNT_MASK;
+    }
+
+    gen_tb_start();
+
+    tcg_clear_temp_count();
+
+    do {
+        if (unlikely(!QTAILQ_EMPTY(&env->breakpoints))) {
+            QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
+                if (bp->pc == dc->pc) {
+                    gen_exception_insn(dc, 0, EXCP_DEBUG);
+                    /* Advance PC so that clearing the breakpoint will
+                       invalidate this TB.  */
+                    dc->pc += 2;
+                    goto done_generating;
+                }
+            }
+        }
+
+        if (search_pc) {
+            j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+            if (lj < j) {
+                lj++;
+                while (lj < j) {
+                    tcg_ctx.gen_opc_instr_start[lj++] = 0;
+                }
+            }
+            tcg_ctx.gen_opc_pc[lj] = dc->pc;
+            tcg_ctx.gen_opc_instr_start[lj] = 1;
+            tcg_ctx.gen_opc_icount[lj] = num_insns;
+        }
+
+        if (num_insns + 1 == max_insns && (tb->cflags & CF_LAST_IO)) {
+            gen_io_start();
+        }
+
+        if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT))) {
+            tcg_gen_debug_insn_start(dc->pc);
+        }
+
+        disas_a64_insn(env, dc);
+
+        if (tcg_check_temp_count()) {
+            fprintf(stderr, "TCG temporary leak before "TARGET_FMT_lx"\n",
+                    dc->pc);
+        }
+
+        /* Translation stops when a conditional branch is encountered.
+         * Otherwise the subsequent code could get translated several times.
+         * Also stop translation when a page boundary is reached.  This
+         * ensures prefetch aborts occur at the right place.
+         */
+        num_insns++;
+    } while (!dc->is_jmp && tcg_ctx.gen_opc_ptr < gen_opc_end &&
+             !cs->singlestep_enabled &&
+             !singlestep &&
+             dc->pc < next_page_start &&
+             num_insns < max_insns);
+
+    if (tb->cflags & CF_LAST_IO) {
+        gen_io_end();
+    }
+
+    if (unlikely(cs->singlestep_enabled) && dc->is_jmp != DISAS_EXC) {
+        /* Note that this means single stepping WFI doesn't halt the CPU.
+         * For conditional branch insns this is harmless unreachable code as
+         * gen_goto_tb() has already handled emitting the debug exception
+         * (and thus a tb-jump is not possible when singlestepping).
+         */
+        assert(dc->is_jmp != DISAS_TB_JUMP);
+        if (dc->is_jmp != DISAS_JUMP) {
+            gen_a64_set_pc_im(dc->pc);
+        }
+        gen_exception(EXCP_DEBUG);
+    } else {
+        switch (dc->is_jmp) {
+        case DISAS_NEXT:
+            gen_goto_tb(dc, 1, dc->pc);
+            break;
+        default:
+        case DISAS_JUMP:
+        case DISAS_UPDATE:
+            /* indicate that the hash table must be used to find the next TB */
+            tcg_gen_exit_tb(0);
+            break;
+        case DISAS_TB_JUMP:
+        case DISAS_EXC:
+        case DISAS_SWI:
+            break;
+        case DISAS_WFI:
+            /* This is a special case because we don't want to just halt the CPU
+             * if trying to debug across a WFI.
+             */
+            gen_helper_wfi(cpu_env);
+            break;
+        }
+    }
+
+done_generating:
+    gen_tb_end(tb, num_insns);
+    *tcg_ctx.gen_opc_ptr = INDEX_op_end;
+
+#ifdef DEBUG_DISAS
+    if (qemu_loglevel_mask(CPU_LOG_TB_IN_ASM)) {
+        qemu_log("----------------\n");
+        qemu_log("IN: %s\n", lookup_symbol(pc_start));
+        log_target_disas(env, pc_start, dc->pc - pc_start,
+                         dc->thumb | (dc->bswap_code << 1));
+        qemu_log("\n");
+    }
+#endif
+    if (search_pc) {
+        j = tcg_ctx.gen_opc_ptr - tcg_ctx.gen_opc_buf;
+        lj++;
+        while (lj <= j) {
+            tcg_ctx.gen_opc_instr_start[lj++] = 0;
+        }
+    } else {
+        tb->size = dc->pc - pc_start;
+        tb->icount = num_insns;
     }
 }
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 8c479ff..1403ecf 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -56,11 +56,6 @@ static uint32_t gen_opc_condexec_bits[OPC_BUF_SIZE];
 #define IS_USER(s) (s->user)
 #endif
 
-/* These instructions trap after executing, so defer them until after the
-   conditional execution state has been updated.  */
-#define DISAS_WFI 4
-#define DISAS_SWI 5
-
 TCGv_ptr cpu_env;
 /* We reuse the same 64-bit temporaries for efficiency.  */
 static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
@@ -676,7 +671,11 @@ static void gen_thumb2_parallel_addsub(int op1, int op2, TCGv_i32 a, TCGv_i32 b)
 }
 #undef PAS_OP
 
-static void gen_test_cc(int cc, int label)
+/*
+ * generate a conditional branch based on ARM condition code cc.
+ * This is common between ARM and Aarch64 targets.
+ */
+void arm_gen_test_cc(int cc, int label)
 {
     TCGv_i32 tmp;
     int inv;
@@ -900,11 +899,7 @@ DO_GEN_ST(32, MO_TEUL)
 
 static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
 {
-    if (s->aarch64) {
-        gen_a64_set_pc_im(val);
-    } else {
-        tcg_gen_movi_i32(cpu_R[15], val);
-    }
+    tcg_gen_movi_i32(cpu_R[15], val);
 }
 
 /* Force a TB lookup after an instruction that changes the CPU state.  */
@@ -4592,6 +4587,8 @@ static const uint8_t neon_3r_sizes[] = {
 #define NEON_2RM_VREV16 2
 #define NEON_2RM_VPADDL 4
 #define NEON_2RM_VPADDL_U 5
+#define NEON_2RM_AESE 6 /* Includes AESD */
+#define NEON_2RM_AESMC 7 /* Includes AESIMC */
 #define NEON_2RM_VCLS 8
 #define NEON_2RM_VCLZ 9
 #define NEON_2RM_VCNT 10
@@ -4649,6 +4646,8 @@ static const uint8_t neon_2rm_sizes[] = {
     [NEON_2RM_VREV16] = 0x1,
     [NEON_2RM_VPADDL] = 0x7,
     [NEON_2RM_VPADDL_U] = 0x7,
+    [NEON_2RM_AESE] = 0x1,
+    [NEON_2RM_AESMC] = 0x1,
     [NEON_2RM_VCLS] = 0x7,
     [NEON_2RM_VCLZ] = 0x7,
     [NEON_2RM_VCNT] = 0x1,
@@ -6184,6 +6183,28 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins
                     tcg_temp_free_i32(tmp2);
                     tcg_temp_free_i32(tmp3);
                     break;
+                case NEON_2RM_AESE: case NEON_2RM_AESMC:
+                    if (!arm_feature(env, ARM_FEATURE_V8_AES)
+                        || ((rm | rd) & 1)) {
+                        return 1;
+                    }
+                    tmp = tcg_const_i32(rd);
+                    tmp2 = tcg_const_i32(rm);
+
+                     /* Bit 6 is the lowest opcode bit; it distinguishes between
+                      * encryption (AESE/AESMC) and decryption (AESD/AESIMC)
+                      */
+                    tmp3 = tcg_const_i32(extract32(insn, 6, 1));
+
+                    if (op == NEON_2RM_AESE) {
+                        gen_helper_crypto_aese(cpu_env, tmp, tmp2, tmp3);
+                    } else {
+                        gen_helper_crypto_aesmc(cpu_env, tmp, tmp2, tmp3);
+                    }
+                    tcg_temp_free_i32(tmp);
+                    tcg_temp_free_i32(tmp2);
+                    tcg_temp_free_i32(tmp3);
+                    break;
                 default:
                 elementwise:
                     for (pass = 0; pass < (q ? 4 : 2); pass++) {
@@ -7114,7 +7135,7 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
         /* if not always execute, we generate a conditional jump to
            next instruction */
         s->condlabel = gen_new_label();
-        gen_test_cc(cond ^ 1, s->condlabel);
+        arm_gen_test_cc(cond ^ 1, s->condlabel);
         s->condjmp = 1;
     }
     if ((insn & 0x0f900000) == 0x03000000) {
@@ -9131,7 +9152,7 @@ static int disas_thumb2_insn(CPUARMState *env, DisasContext *s, uint16_t insn_hw
                 op = (insn >> 22) & 0xf;
                 /* Generate a conditional jump to next instruction.  */
                 s->condlabel = gen_new_label();
-                gen_test_cc(op ^ 1, s->condlabel);
+                arm_gen_test_cc(op ^ 1, s->condlabel);
                 s->condjmp = 1;
 
                 /* offset[11:1] = insn[10:0] */
@@ -9488,7 +9509,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
         cond = s->condexec_cond;
         if (cond != 0x0e) {     /* Skip conditional when condition is AL. */
           s->condlabel = gen_new_label();
-          gen_test_cc(cond ^ 1, s->condlabel);
+          arm_gen_test_cc(cond ^ 1, s->condlabel);
           s->condjmp = 1;
         }
     }
@@ -10161,7 +10182,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
         }
         /* generate a conditional jump to next instruction */
         s->condlabel = gen_new_label();
-        gen_test_cc(cond ^ 1, s->condlabel);
+        arm_gen_test_cc(cond ^ 1, s->condlabel);
         s->condjmp = 1;
 
         /* jump to the offset */
@@ -10217,6 +10238,15 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     int max_insns;
 
     /* generate intermediate code */
+
+    /* The A64 decoder has its own top level loop, because it doesn't need
+     * the A32/T32 complexity to do with conditional execution/IT blocks/etc.
+     */
+    if (ARM_TBFLAG_AARCH64_STATE(tb->flags)) {
+        gen_intermediate_code_internal_a64(cpu, tb, search_pc);
+        return;
+    }
+
     pc_start = tb->pc;
 
     dc->tb = tb;
@@ -10228,31 +10258,18 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     dc->singlestep_enabled = cs->singlestep_enabled;
     dc->condjmp = 0;
 
-    if (ARM_TBFLAG_AARCH64_STATE(tb->flags)) {
-        dc->aarch64 = 1;
-        dc->thumb = 0;
-        dc->bswap_code = 0;
-        dc->condexec_mask = 0;
-        dc->condexec_cond = 0;
-#if !defined(CONFIG_USER_ONLY)
-        dc->user = 0;
-#endif
-        dc->vfp_enabled = 0;
-        dc->vec_len = 0;
-        dc->vec_stride = 0;
-    } else {
-        dc->aarch64 = 0;
-        dc->thumb = ARM_TBFLAG_THUMB(tb->flags);
-        dc->bswap_code = ARM_TBFLAG_BSWAP_CODE(tb->flags);
-        dc->condexec_mask = (ARM_TBFLAG_CONDEXEC(tb->flags) & 0xf) << 1;
-        dc->condexec_cond = ARM_TBFLAG_CONDEXEC(tb->flags) >> 4;
+    dc->aarch64 = 0;
+    dc->thumb = ARM_TBFLAG_THUMB(tb->flags);
+    dc->bswap_code = ARM_TBFLAG_BSWAP_CODE(tb->flags);
+    dc->condexec_mask = (ARM_TBFLAG_CONDEXEC(tb->flags) & 0xf) << 1;
+    dc->condexec_cond = ARM_TBFLAG_CONDEXEC(tb->flags) >> 4;
 #if !defined(CONFIG_USER_ONLY)
-        dc->user = (ARM_TBFLAG_PRIV(tb->flags) == 0);
+    dc->user = (ARM_TBFLAG_PRIV(tb->flags) == 0);
 #endif
-        dc->vfp_enabled = ARM_TBFLAG_VFPEN(tb->flags);
-        dc->vec_len = ARM_TBFLAG_VECLEN(tb->flags);
-        dc->vec_stride = ARM_TBFLAG_VECSTRIDE(tb->flags);
-    }
+    dc->vfp_enabled = ARM_TBFLAG_VFPEN(tb->flags);
+    dc->vec_len = ARM_TBFLAG_VECLEN(tb->flags);
+    dc->vec_stride = ARM_TBFLAG_VECSTRIDE(tb->flags);
+
     cpu_F0s = tcg_temp_new_i32();
     cpu_F1s = tcg_temp_new_i32();
     cpu_F0d = tcg_temp_new_i64();
@@ -10314,7 +10331,7 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
     do {
 #ifdef CONFIG_USER_ONLY
         /* Intercept jump to the magic kernel page.  */
-        if (!dc->aarch64 && dc->pc >= 0xffff0000) {
+        if (dc->pc >= 0xffff0000) {
             /* We always get here via a jump, so know we are not in a
                conditional execution block.  */
             gen_exception(EXCP_KERNEL_TRAP);
@@ -10362,9 +10379,7 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
             tcg_gen_debug_insn_start(dc->pc);
         }
 
-        if (dc->aarch64) {
-            disas_a64_insn(env, dc);
-        } else if (dc->thumb) {
+        if (dc->thumb) {
             disas_thumb_insn(env, dc);
             if (dc->condexec_mask) {
                 dc->condexec_cond = (dc->condexec_cond & 0xe)
@@ -10559,8 +10574,9 @@ void restore_state_to_opc(CPUARMState *env, TranslationBlock *tb, int pc_pos)
 {
     if (is_a64(env)) {
         env->pc = tcg_ctx.gen_opc_pc[pc_pos];
+        env->condexec_bits = 0;
     } else {
         env->regs[15] = tcg_ctx.gen_opc_pc[pc_pos];
+        env->condexec_bits = gen_opc_condexec_bits[pc_pos];
     }
-    env->condexec_bits = gen_opc_condexec_bits[pc_pos];
 }
diff --git a/target-arm/translate.h b/target-arm/translate.h
index 67c7760..a6f6b3e 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -24,20 +24,39 @@ typedef struct DisasContext {
     int vec_len;
     int vec_stride;
     int aarch64;
+#define TMP_A64_MAX 16
+    int tmp_a64_count;
+    TCGv_i64 tmp_a64[TMP_A64_MAX];
 } DisasContext;
 
 extern TCGv_ptr cpu_env;
 
+/* target-specific extra values for is_jmp */
+/* These instructions trap after executing, so the A32/T32 decoder must
+ * defer them until after the conditional execution state has been updated.
+ * WFI also needs special handling when single-stepping.
+ */
+#define DISAS_WFI 4
+#define DISAS_SWI 5
+/* For instructions which unconditionally cause an exception we can skip
+ * emitting unreachable code at the end of the TB in the A64 decoder
+ */
+#define DISAS_EXC 6
+
 #ifdef TARGET_AARCH64
 void a64_translate_init(void);
-void disas_a64_insn(CPUARMState *env, DisasContext *s);
+void gen_intermediate_code_internal_a64(ARMCPU *cpu,
+                                        TranslationBlock *tb,
+                                        bool search_pc);
 void gen_a64_set_pc_im(uint64_t val);
 #else
 static inline void a64_translate_init(void)
 {
 }
 
-static inline void disas_a64_insn(CPUARMState *env, DisasContext *s)
+static inline void gen_intermediate_code_internal_a64(ARMCPU *cpu,
+                                                      TranslationBlock *tb,
+                                                      bool search_pc)
 {
 }
 
@@ -46,4 +65,6 @@ static inline void gen_a64_set_pc_im(uint64_t val)
 }
 #endif
 
+void arm_gen_test_cc(int cc, int label);
+
 #endif /* TARGET_ARM_TRANSLATE_H */