Diffstat (limited to 'target/arm/tcg')
-rw-r--r--   target/arm/tcg/cpu64.c          |   1
-rw-r--r--   target/arm/tcg/helper-a64.c     |   7
-rw-r--r--   target/arm/tcg/helper-a64.h     |   3
-rw-r--r--   target/arm/tcg/hflags.c         |   6
-rw-r--r--   target/arm/tcg/mte_helper.c     |  18
-rw-r--r--   target/arm/tcg/translate-a64.c  | 477
-rw-r--r--   target/arm/tcg/translate-a64.h  |   4
-rw-r--r--   target/arm/tcg/translate-sve.c  | 106
-rw-r--r--   target/arm/tcg/translate.c      |   1
-rw-r--r--   target/arm/tcg/translate.h      |  65
10 files changed, 495 insertions, 193 deletions
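The bulk of the patch below implements the FEAT_LSE2 alignment rule for AccType_ATOMIC and AccType_ORDERED accesses: the access no longer has to be naturally aligned, but it must not cross a 16-byte boundary (see AArch64.CheckAlignment() and the new check_lse2_align() in translate-a64.c). A minimal sketch of that rule in plain C, not part of the patch and with an invented function name, purely for illustration:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Illustrative only: the FEAT_LSE2 check that the generated code
     * performs at run time.  An access of size_bytes at addr passes if
     * it stays entirely within one aligned 16-byte window.
     */
    static bool lse2_access_ok(uint64_t addr, unsigned size_bytes)
    {
        return (addr & 15) + size_bytes <= 16;
    }

When this check fails, the patch calls the new unaligned_access helper so the guest sees the architecturally required alignment fault.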
diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c index 886674a..2976f94 100644 --- a/target/arm/tcg/cpu64.c +++ b/target/arm/tcg/cpu64.c @@ -644,6 +644,7 @@ void aarch64_max_tcg_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64MMFR2, IESB, 1); /* FEAT_IESB */ t = FIELD_DP64(t, ID_AA64MMFR2, VARANGE, 1); /* FEAT_LVA */ t = FIELD_DP64(t, ID_AA64MMFR2, ST, 1); /* FEAT_TTST */ + t = FIELD_DP64(t, ID_AA64MMFR2, AT, 1); /* FEAT_LSE2 */ t = FIELD_DP64(t, ID_AA64MMFR2, IDS, 1); /* FEAT_IDST */ t = FIELD_DP64(t, ID_AA64MMFR2, FWB, 1); /* FEAT_S2FWB */ t = FIELD_DP64(t, ID_AA64MMFR2, TTL, 1); /* FEAT_TTL */ diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c index c3edf16..1c9370f 100644 --- a/target/arm/tcg/helper-a64.c +++ b/target/arm/tcg/helper-a64.c @@ -952,3 +952,10 @@ void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) memset(mem, 0, blocklen); } + +void HELPER(unaligned_access)(CPUARMState *env, uint64_t addr, + uint32_t access_type, uint32_t mmu_idx) +{ + arm_cpu_do_unaligned_access(env_cpu(env), addr, access_type, + mmu_idx, GETPC()); +} diff --git a/target/arm/tcg/helper-a64.h b/target/arm/tcg/helper-a64.h index ff56807..3d5957c 100644 --- a/target/arm/tcg/helper-a64.h +++ b/target/arm/tcg/helper-a64.h @@ -110,3 +110,6 @@ DEF_HELPER_FLAGS_2(st2g_stub, TCG_CALL_NO_WG, void, env, i64) DEF_HELPER_FLAGS_2(ldgm, TCG_CALL_NO_WG, i64, env, i64) DEF_HELPER_FLAGS_3(stgm, TCG_CALL_NO_WG, void, env, i64, i64) DEF_HELPER_FLAGS_3(stzgm_tags, TCG_CALL_NO_WG, void, env, i64, i64) + +DEF_HELPER_FLAGS_4(unaligned_access, TCG_CALL_NO_WG, + noreturn, env, i64, i32, i32) diff --git a/target/arm/tcg/hflags.c b/target/arm/tcg/hflags.c index b2ccd77..616c5fa 100644 --- a/target/arm/tcg/hflags.c +++ b/target/arm/tcg/hflags.c @@ -248,6 +248,12 @@ static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, } } + if (cpu_isar_feature(aa64_lse2, env_archcpu(env))) { + if (sctlr & SCTLR_nAA) { + DP_TBFLAG_A64(flags, NAA, 1); + } + } + /* Compute the condition for using AccType_UNPRIV for LDTR et al. */ if (!(env->pstate & PSTATE_UAO)) { switch (mmu_idx) { diff --git a/target/arm/tcg/mte_helper.c b/target/arm/tcg/mte_helper.c index a4f3f92..9c64def 100644 --- a/target/arm/tcg/mte_helper.c +++ b/target/arm/tcg/mte_helper.c @@ -785,6 +785,24 @@ uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra) uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr) { + /* + * R_XCHFJ: Alignment check not caused by memory type is priority 1, + * higher than any translation fault. When MTE is disabled, tcg + * performs the alignment check during the code generated for the + * memory access. With MTE enabled, we must check this here before + * raising any translation fault in allocation_tag_mem. + */ + unsigned align = FIELD_EX32(desc, MTEDESC, ALIGN); + if (unlikely(align)) { + align = (1u << align) - 1; + if (unlikely(ptr & align)) { + int idx = FIELD_EX32(desc, MTEDESC, MIDX); + bool w = FIELD_EX32(desc, MTEDESC, WRITE); + MMUAccessType type = w ? 
MMU_DATA_STORE : MMU_DATA_LOAD; + arm_cpu_do_unaligned_access(env_cpu(env), ptr, type, idx, GETPC()); + } + } + return mte_check(env, desc, ptr, GETPC()); } diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index d980033..aa93f37 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -253,7 +253,7 @@ static void gen_probe_access(DisasContext *s, TCGv_i64 ptr, */ static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, bool is_write, bool tag_checked, - int log2_size, bool is_unpriv, + MemOp memop, bool is_unpriv, int core_idx) { if (tag_checked && s->mte_active[is_unpriv]) { @@ -264,7 +264,8 @@ static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (1 << log2_size) - 1); + desc = FIELD_DP32(desc, MTEDESC, ALIGN, get_alignment_bits(memop)); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, memop_size(memop) - 1); ret = tcg_temp_new_i64(); gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); @@ -275,9 +276,9 @@ static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, } TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int log2_size) + bool tag_checked, MemOp memop) { - return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, log2_size, + return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, memop, false, get_mem_index(s)); } @@ -285,7 +286,7 @@ TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, * For MTE, check multiple logical sequential accesses. */ TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int size) + bool tag_checked, int total_size, MemOp single_mop) { if (tag_checked && s->mte_active[0]) { TCGv_i64 ret; @@ -295,7 +296,8 @@ TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, size - 1); + desc = FIELD_DP32(desc, MTEDESC, ALIGN, get_alignment_bits(single_mop)); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, total_size - 1); ret = tcg_temp_new_i64(); gen_helper_mte_check(ret, cpu_env, tcg_constant_i32(desc), addr); @@ -305,6 +307,89 @@ TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, return clean_data_tbi(s, addr); } +/* + * Generate the special alignment check that applies to AccType_ATOMIC + * and AccType_ORDERED insns under FEAT_LSE2: the access need not be + * naturally aligned, but it must not cross a 16-byte boundary. + * See AArch64.CheckAlignment(). + */ +static void check_lse2_align(DisasContext *s, int rn, int imm, + bool is_write, MemOp mop) +{ + TCGv_i32 tmp; + TCGv_i64 addr; + TCGLabel *over_label; + MMUAccessType type; + int mmu_idx; + + tmp = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(tmp, cpu_reg_sp(s, rn)); + tcg_gen_addi_i32(tmp, tmp, imm & 15); + tcg_gen_andi_i32(tmp, tmp, 15); + tcg_gen_addi_i32(tmp, tmp, memop_size(mop)); + + over_label = gen_new_label(); + tcg_gen_brcondi_i32(TCG_COND_LEU, tmp, 16, over_label); + + addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm); + + type = is_write ? 
MMU_DATA_STORE : MMU_DATA_LOAD, + mmu_idx = get_mem_index(s); + gen_helper_unaligned_access(cpu_env, addr, tcg_constant_i32(type), + tcg_constant_i32(mmu_idx)); + + gen_set_label(over_label); + +} + +/* Handle the alignment check for AccType_ATOMIC instructions. */ +static MemOp check_atomic_align(DisasContext *s, int rn, MemOp mop) +{ + MemOp size = mop & MO_SIZE; + + if (size == MO_8) { + return mop; + } + + /* + * If size == MO_128, this is a LDXP, and the operation is single-copy + * atomic for each doubleword, not the entire quadword; it still must + * be quadword aligned. + */ + if (size == MO_128) { + return finalize_memop_atom(s, MO_128 | MO_ALIGN, + MO_ATOM_IFALIGN_PAIR); + } + if (dc_isar_feature(aa64_lse2, s)) { + check_lse2_align(s, rn, 0, true, mop); + } else { + mop |= MO_ALIGN; + } + return finalize_memop(s, mop); +} + +/* Handle the alignment check for AccType_ORDERED instructions. */ +static MemOp check_ordered_align(DisasContext *s, int rn, int imm, + bool is_write, MemOp mop) +{ + MemOp size = mop & MO_SIZE; + + if (size == MO_8) { + return mop; + } + if (size == MO_128) { + return finalize_memop_atom(s, MO_128 | MO_ALIGN, + MO_ATOM_IFALIGN_PAIR); + } + if (!dc_isar_feature(aa64_lse2, s)) { + mop |= MO_ALIGN; + } else if (!s->naa) { + check_lse2_align(s, rn, imm, is_write, mop); + } + return finalize_memop(s, mop); +} + typedef struct DisasCompare64 { TCGCond cond; TCGv_i64 value; @@ -838,7 +923,6 @@ static void do_gpr_st_memidx(DisasContext *s, TCGv_i64 source, unsigned int iss_srt, bool iss_sf, bool iss_ar) { - memop = finalize_memop(s, memop); tcg_gen_qemu_st_i64(source, tcg_addr, memidx, memop); if (iss_valid) { @@ -873,7 +957,6 @@ static void do_gpr_ld_memidx(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, bool iss_valid, unsigned int iss_srt, bool iss_sf, bool iss_ar) { - memop = finalize_memop(s, memop); tcg_gen_qemu_ld_i64(dest, tcg_addr, memidx, memop); if (extend && (memop & MO_SIGN)) { @@ -907,59 +990,44 @@ static void do_gpr_ld(DisasContext *s, TCGv_i64 dest, TCGv_i64 tcg_addr, /* * Store from FP register to memory */ -static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, int size) +static void do_fp_st(DisasContext *s, int srcidx, TCGv_i64 tcg_addr, MemOp mop) { /* This writes the bottom N bits of a 128 bit wide vector to memory */ TCGv_i64 tmplo = tcg_temp_new_i64(); - MemOp mop; tcg_gen_ld_i64(tmplo, cpu_env, fp_reg_offset(s, srcidx, MO_64)); - if (size < 4) { - mop = finalize_memop(s, size); + if ((mop & MO_SIZE) < MO_128) { tcg_gen_qemu_st_i64(tmplo, tcg_addr, get_mem_index(s), mop); } else { - bool be = s->be_data == MO_BE; - TCGv_i64 tcg_hiaddr = tcg_temp_new_i64(); TCGv_i64 tmphi = tcg_temp_new_i64(); + TCGv_i128 t16 = tcg_temp_new_i128(); tcg_gen_ld_i64(tmphi, cpu_env, fp_reg_hi_offset(s, srcidx)); + tcg_gen_concat_i64_i128(t16, tmplo, tmphi); - mop = s->be_data | MO_UQ; - tcg_gen_qemu_st_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), - mop | (s->align_mem ? MO_ALIGN_16 : 0)); - tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); - tcg_gen_qemu_st_i64(be ? 
tmplo : tmphi, tcg_hiaddr, - get_mem_index(s), mop); + tcg_gen_qemu_st_i128(t16, tcg_addr, get_mem_index(s), mop); } } /* * Load from memory to FP register */ -static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size) +static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, MemOp mop) { /* This always zero-extends and writes to a full 128 bit wide vector */ TCGv_i64 tmplo = tcg_temp_new_i64(); TCGv_i64 tmphi = NULL; - MemOp mop; - if (size < 4) { - mop = finalize_memop(s, size); + if ((mop & MO_SIZE) < MO_128) { tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), mop); } else { - bool be = s->be_data == MO_BE; - TCGv_i64 tcg_hiaddr; + TCGv_i128 t16 = tcg_temp_new_i128(); - tmphi = tcg_temp_new_i64(); - tcg_hiaddr = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i128(t16, tcg_addr, get_mem_index(s), mop); - mop = s->be_data | MO_UQ; - tcg_gen_qemu_ld_i64(be ? tmphi : tmplo, tcg_addr, get_mem_index(s), - mop | (s->align_mem ? MO_ALIGN_16 : 0)); - tcg_gen_addi_i64(tcg_hiaddr, tcg_addr, 8); - tcg_gen_qemu_ld_i64(be ? tmplo : tmphi, tcg_hiaddr, - get_mem_index(s), mop); + tmphi = tcg_temp_new_i64(); + tcg_gen_extr_i128_i64(tmplo, tmphi, t16); } tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64)); @@ -2382,19 +2450,22 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn) * races in multi-threaded linux-user and when MTTCG softmmu is * enabled. */ -static void gen_load_exclusive(DisasContext *s, int rt, int rt2, - TCGv_i64 addr, int size, bool is_pair) +static void gen_load_exclusive(DisasContext *s, int rt, int rt2, int rn, + int size, bool is_pair) { int idx = get_mem_index(s); - MemOp memop = s->be_data; + TCGv_i64 dirty_addr, clean_addr; + MemOp memop = check_atomic_align(s, rn, size + is_pair); + + s->is_ldex = true; + dirty_addr = cpu_reg_sp(s, rn); + clean_addr = gen_mte_check1(s, dirty_addr, false, rn != 31, memop); g_assert(size <= 3); if (is_pair) { g_assert(size >= 2); if (size == 2) { - /* The pair must be single-copy atomic for the doubleword. */ - memop |= MO_64 | MO_ALIGN; - tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); + tcg_gen_qemu_ld_i64(cpu_exclusive_val, clean_addr, idx, memop); if (s->be_data == MO_LE) { tcg_gen_extract_i64(cpu_reg(s, rt), cpu_exclusive_val, 0, 32); tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 32, 32); @@ -2403,29 +2474,29 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2, tcg_gen_extract_i64(cpu_reg(s, rt2), cpu_exclusive_val, 0, 32); } } else { - /* The pair must be single-copy atomic for *each* doubleword, not - the entire quadword, however it must be quadword aligned. 
*/ - memop |= MO_64; - tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, - memop | MO_ALIGN_16); + TCGv_i128 t16 = tcg_temp_new_i128(); - TCGv_i64 addr2 = tcg_temp_new_i64(); - tcg_gen_addi_i64(addr2, addr, 8); - tcg_gen_qemu_ld_i64(cpu_exclusive_high, addr2, idx, memop); + tcg_gen_qemu_ld_i128(t16, clean_addr, idx, memop); + if (s->be_data == MO_LE) { + tcg_gen_extr_i128_i64(cpu_exclusive_val, + cpu_exclusive_high, t16); + } else { + tcg_gen_extr_i128_i64(cpu_exclusive_high, + cpu_exclusive_val, t16); + } tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); tcg_gen_mov_i64(cpu_reg(s, rt2), cpu_exclusive_high); } } else { - memop |= size | MO_ALIGN; - tcg_gen_qemu_ld_i64(cpu_exclusive_val, addr, idx, memop); + tcg_gen_qemu_ld_i64(cpu_exclusive_val, clean_addr, idx, memop); tcg_gen_mov_i64(cpu_reg(s, rt), cpu_exclusive_val); } - tcg_gen_mov_i64(cpu_exclusive_addr, addr); + tcg_gen_mov_i64(cpu_exclusive_addr, clean_addr); } static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, - TCGv_i64 addr, int size, int is_pair) + int rn, int size, int is_pair) { /* if (env->exclusive_addr == addr && env->exclusive_val == [addr] * && (!is_pair || env->exclusive_high == [addr + datasize])) { @@ -2441,9 +2512,46 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, */ TCGLabel *fail_label = gen_new_label(); TCGLabel *done_label = gen_new_label(); - TCGv_i64 tmp; + TCGv_i64 tmp, clean_addr; + MemOp memop; + + /* + * FIXME: We are out of spec here. We have recorded only the address + * from load_exclusive, not the entire range, and we assume that the + * size of the access on both sides match. The architecture allows the + * store to be smaller than the load, so long as the stored bytes are + * within the range recorded by the load. + */ + + /* See AArch64.ExclusiveMonitorsPass() and AArch64.IsExclusiveVA(). */ + clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + tcg_gen_brcond_i64(TCG_COND_NE, clean_addr, cpu_exclusive_addr, fail_label); - tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label); + /* + * The write, and any associated faults, only happen if the virtual + * and physical addresses pass the exclusive monitor check. These + * faults are exceedingly unlikely, because normally the guest uses + * the exact same address register for the load_exclusive, and we + * would have recognized these faults there. + * + * It is possible to trigger an alignment fault pre-LSE2, e.g. with an + * unaligned 4-byte write within the range of an aligned 8-byte load. + * With LSE2, the store would need to cross a 16-byte boundary when the + * load did not, which would mean the store is outside the range + * recorded for the monitor, which would have failed a corrected monitor + * check above. For now, we assume no size change and retain the + * MO_ALIGN to let tcg know what we checked in the load_exclusive. + * + * It is possible to trigger an MTE fault, by performing the load with + * a virtual address with a valid tag and performing the store with the + * same virtual address and a different invalid tag. 
+ */ + memop = size + is_pair; + if (memop == MO_128 || !dc_isar_feature(aa64_lse2, s)) { + memop |= MO_ALIGN; + } + memop = finalize_memop(s, memop); + gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, memop); tmp = tcg_temp_new_i64(); if (is_pair) { @@ -2455,8 +2563,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, } tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val, tmp, - get_mem_index(s), - MO_64 | MO_ALIGN | s->be_data); + get_mem_index(s), memop); tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); } else { TCGv_i128 t16 = tcg_temp_new_i128(); @@ -2474,8 +2581,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, } tcg_gen_atomic_cmpxchg_i128(t16, cpu_exclusive_addr, c16, t16, - get_mem_index(s), - MO_128 | MO_ALIGN | s->be_data); + get_mem_index(s), memop); a = tcg_temp_new_i64(); b = tcg_temp_new_i64(); @@ -2493,8 +2599,7 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2, } } else { tcg_gen_atomic_cmpxchg_i64(tmp, cpu_exclusive_addr, cpu_exclusive_val, - cpu_reg(s, rt), get_mem_index(s), - size | MO_ALIGN | s->be_data); + cpu_reg(s, rt), get_mem_index(s), memop); tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val); } tcg_gen_mov_i64(cpu_reg(s, rd), tmp); @@ -2513,13 +2618,15 @@ static void gen_compare_and_swap(DisasContext *s, int rs, int rt, TCGv_i64 tcg_rt = cpu_reg(s, rt); int memidx = get_mem_index(s); TCGv_i64 clean_addr; + MemOp memop; if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size); - tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, memidx, - size | MO_ALIGN | s->be_data); + memop = check_atomic_align(s, rn, size); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, memop); + tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, + memidx, memop); } static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, @@ -2531,13 +2638,15 @@ static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, TCGv_i64 t2 = cpu_reg(s, rt + 1); TCGv_i64 clean_addr; int memidx = get_mem_index(s); + MemOp memop; if (rn == 31) { gen_check_sp_alignment(s); } /* This is a single atomic access, despite the "pair". 
*/ - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size + 1); + memop = check_atomic_align(s, rn, size + 1); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, memop); if (size == 2) { TCGv_i64 cmp = tcg_temp_new_i64(); @@ -2551,8 +2660,7 @@ static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, tcg_gen_concat32_i64(cmp, s2, s1); } - tcg_gen_atomic_cmpxchg_i64(cmp, clean_addr, cmp, val, memidx, - MO_64 | MO_ALIGN | s->be_data); + tcg_gen_atomic_cmpxchg_i64(cmp, clean_addr, cmp, val, memidx, memop); if (s->be_data == MO_LE) { tcg_gen_extr32_i64(s1, s2, cmp); @@ -2571,8 +2679,7 @@ static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, tcg_gen_concat_i64_i128(cmp, s2, s1); } - tcg_gen_atomic_cmpxchg_i128(cmp, clean_addr, cmp, val, memidx, - MO_128 | MO_ALIGN | s->be_data); + tcg_gen_atomic_cmpxchg_i128(cmp, clean_addr, cmp, val, memidx, memop); if (s->be_data == MO_LE) { tcg_gen_extr_i128_i64(s1, s2, cmp); @@ -2621,6 +2728,7 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) int o2_L_o1_o0 = extract32(insn, 21, 3) * 2 | is_lasr; int size = extract32(insn, 30, 2); TCGv_i64 clean_addr; + MemOp memop; switch (o2_L_o1_o0) { case 0x0: /* STXR */ @@ -2631,9 +2739,7 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (is_lasr) { tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - true, rn != 31, size); - gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, false); + gen_store_exclusive(s, rs, rt, rt2, rn, size, false); return; case 0x4: /* LDXR */ @@ -2641,10 +2747,7 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - false, rn != 31, size); - s->is_ldex = true; - gen_load_exclusive(s, rt, rt2, clean_addr, size, false); + gen_load_exclusive(s, rt, rt2, rn, size, false); if (is_lasr) { tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); } @@ -2662,10 +2765,10 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) gen_check_sp_alignment(s); } tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); + memop = check_ordered_align(s, rn, 0, true, size); clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - true, rn != 31, size); - /* TODO: ARMv8.4-LSE SCTLR.nAA */ - do_gpr_st(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, true, rt, + true, rn != 31, memop); + do_gpr_st(s, cpu_reg(s, rt), clean_addr, memop, true, rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); return; @@ -2680,10 +2783,10 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (rn == 31) { gen_check_sp_alignment(s); } + memop = check_ordered_align(s, rn, 0, false, size); clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - false, rn != 31, size); - /* TODO: ARMv8.4-LSE SCTLR.nAA */ - do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size | MO_ALIGN, false, true, + false, rn != 31, memop); + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, memop, false, true, rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); return; @@ -2696,9 +2799,7 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (is_lasr) { tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - true, rn != 31, size); - gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, true); + gen_store_exclusive(s, rs, rt, rt2, rn, size, true); return; } if (rt2 == 31 @@ -2715,10 +2816,7 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (rn == 31) { 
gen_check_sp_alignment(s); } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), - false, rn != 31, size); - s->is_ldex = true; - gen_load_exclusive(s, rt, rt2, clean_addr, size, true); + gen_load_exclusive(s, rt, rt2, rn, size, true); if (is_lasr) { tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); } @@ -2768,6 +2866,7 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn) bool is_signed = false; int size = 2; TCGv_i64 tcg_rt, clean_addr; + MemOp memop; if (is_vector) { if (opc == 3) { @@ -2778,6 +2877,7 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn) if (!fp_access_check(s)) { return; } + memop = finalize_memop_asimd(s, size); } else { if (opc == 3) { /* PRFM (literal) : prefetch */ @@ -2785,20 +2885,20 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn) } size = 2 + extract32(opc, 0, 1); is_signed = extract32(opc, 1, 1); + memop = finalize_memop(s, size + is_signed * MO_SIGN); } tcg_rt = cpu_reg(s, rt); clean_addr = tcg_temp_new_i64(); gen_pc_plus_diff(s, clean_addr, imm); + if (is_vector) { - do_fp_ld(s, rt, clean_addr, size); + do_fp_ld(s, rt, clean_addr, memop); } else { /* Only unsigned 32bit loads target 32bit registers. */ bool iss_sf = opc != 0; - - do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, - false, true, rt, iss_sf, false); + do_gpr_ld(s, tcg_rt, clean_addr, memop, false, true, rt, iss_sf, false); } } @@ -2840,14 +2940,12 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn) bool is_vector = extract32(insn, 26, 1); bool is_load = extract32(insn, 22, 1); int opc = extract32(insn, 30, 2); - bool is_signed = false; bool postindex = false; bool wback = false; bool set_tag = false; - TCGv_i64 clean_addr, dirty_addr; - + MemOp mop; int size; if (opc == 3) { @@ -2930,44 +3028,94 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn) } } + if (is_vector) { + mop = finalize_memop_asimd(s, size); + } else { + mop = finalize_memop(s, size); + } clean_addr = gen_mte_checkN(s, dirty_addr, !is_load, - (wback || rn != 31) && !set_tag, 2 << size); + (wback || rn != 31) && !set_tag, + 2 << size, mop); if (is_vector) { + /* LSE2 does not merge FP pairs; leave these as separate operations. */ if (is_load) { - do_fp_ld(s, rt, clean_addr, size); + do_fp_ld(s, rt, clean_addr, mop); } else { - do_fp_st(s, rt, clean_addr, size); + do_fp_st(s, rt, clean_addr, mop); } tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); if (is_load) { - do_fp_ld(s, rt2, clean_addr, size); + do_fp_ld(s, rt2, clean_addr, mop); } else { - do_fp_st(s, rt2, clean_addr, size); + do_fp_st(s, rt2, clean_addr, mop); } } else { TCGv_i64 tcg_rt = cpu_reg(s, rt); TCGv_i64 tcg_rt2 = cpu_reg(s, rt2); + /* + * We built mop above for the single logical access -- rebuild it + * now for the paired operation. + * + * With LSE2, non-sign-extending pairs are treated atomically if + * aligned, and if unaligned one of the pair will be completely + * within a 16-byte block and that element will be atomic. + * Otherwise each element is separately atomic. + * In all cases, issue one operation with the correct atomicity. + * + * This treats sign-extending loads like zero-extending loads, + * since that reuses the most code below. + */ + mop = size + 1; + if (s->align_mem) { + mop |= (size == 2 ? MO_ALIGN_4 : MO_ALIGN_8); + } + mop = finalize_memop_pair(s, mop); + if (is_load) { - TCGv_i64 tmp = tcg_temp_new_i64(); + if (size == 2) { + int o2 = s->be_data == MO_LE ? 32 : 0; + int o1 = o2 ^ 32; - /* Do not modify tcg_rt before recognizing any exception - * from the second load. 
- */ - do_gpr_ld(s, tmp, clean_addr, size + is_signed * MO_SIGN, - false, false, 0, false, false); - tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); - do_gpr_ld(s, tcg_rt2, clean_addr, size + is_signed * MO_SIGN, - false, false, 0, false, false); + tcg_gen_qemu_ld_i64(tcg_rt, clean_addr, get_mem_index(s), mop); + if (is_signed) { + tcg_gen_sextract_i64(tcg_rt2, tcg_rt, o2, 32); + tcg_gen_sextract_i64(tcg_rt, tcg_rt, o1, 32); + } else { + tcg_gen_extract_i64(tcg_rt2, tcg_rt, o2, 32); + tcg_gen_extract_i64(tcg_rt, tcg_rt, o1, 32); + } + } else { + TCGv_i128 tmp = tcg_temp_new_i128(); - tcg_gen_mov_i64(tcg_rt, tmp); + tcg_gen_qemu_ld_i128(tmp, clean_addr, get_mem_index(s), mop); + if (s->be_data == MO_LE) { + tcg_gen_extr_i128_i64(tcg_rt, tcg_rt2, tmp); + } else { + tcg_gen_extr_i128_i64(tcg_rt2, tcg_rt, tmp); + } + } } else { - do_gpr_st(s, tcg_rt, clean_addr, size, - false, 0, false, false); - tcg_gen_addi_i64(clean_addr, clean_addr, 1 << size); - do_gpr_st(s, tcg_rt2, clean_addr, size, - false, 0, false, false); + if (size == 2) { + TCGv_i64 tmp = tcg_temp_new_i64(); + + if (s->be_data == MO_LE) { + tcg_gen_concat32_i64(tmp, tcg_rt, tcg_rt2); + } else { + tcg_gen_concat32_i64(tmp, tcg_rt2, tcg_rt); + } + tcg_gen_qemu_st_i64(tmp, clean_addr, get_mem_index(s), mop); + } else { + TCGv_i128 tmp = tcg_temp_new_i128(); + + if (s->be_data == MO_LE) { + tcg_gen_concat_i64_i128(tmp, tcg_rt, tcg_rt2); + } else { + tcg_gen_concat_i64_i128(tmp, tcg_rt2, tcg_rt); + } + tcg_gen_qemu_st_i128(tmp, clean_addr, get_mem_index(s), mop); + } } } @@ -3012,7 +3160,7 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, bool post_index; bool writeback; int memidx; - + MemOp memop; TCGv_i64 clean_addr, dirty_addr; if (is_vector) { @@ -3025,6 +3173,7 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, if (!fp_access_check(s)) { return; } + memop = finalize_memop_asimd(s, size); } else { if (size == 3 && opc == 2) { /* PRFM - prefetch */ @@ -3039,8 +3188,9 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, return; } is_store = (opc == 0); - is_signed = extract32(opc, 1, 1); + is_signed = !is_store && extract32(opc, 1, 1); is_extended = (size < 3) && extract32(opc, 0, 1); + memop = finalize_memop(s, size + is_signed * MO_SIGN); } switch (idx) { @@ -3073,25 +3223,26 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, } memidx = is_unpriv ? 
get_a64_user_mem_index(s) : get_mem_index(s); + clean_addr = gen_mte_check1_mmuidx(s, dirty_addr, is_store, writeback || rn != 31, size, is_unpriv, memidx); if (is_vector) { if (is_store) { - do_fp_st(s, rt, clean_addr, size); + do_fp_st(s, rt, clean_addr, memop); } else { - do_fp_ld(s, rt, clean_addr, size); + do_fp_ld(s, rt, clean_addr, memop); } } else { TCGv_i64 tcg_rt = cpu_reg(s, rt); bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); if (is_store) { - do_gpr_st_memidx(s, tcg_rt, clean_addr, size, memidx, + do_gpr_st_memidx(s, tcg_rt, clean_addr, memop, memidx, iss_valid, rt, iss_sf, false); } else { - do_gpr_ld_memidx(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + do_gpr_ld_memidx(s, tcg_rt, clean_addr, memop, is_extended, memidx, iss_valid, rt, iss_sf, false); } @@ -3140,8 +3291,8 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, bool is_signed = false; bool is_store = false; bool is_extended = false; - TCGv_i64 tcg_rm, clean_addr, dirty_addr; + MemOp memop; if (extract32(opt, 1, 1) == 0) { unallocated_encoding(s); @@ -3168,7 +3319,7 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, return; } is_store = (opc == 0); - is_signed = extract32(opc, 1, 1); + is_signed = !is_store && extract32(opc, 1, 1); is_extended = (size < 3) && extract32(opc, 0, 1); } @@ -3181,22 +3332,25 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0); tcg_gen_add_i64(dirty_addr, dirty_addr, tcg_rm); - clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, size); + + memop = finalize_memop(s, size + is_signed * MO_SIGN); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, memop); if (is_vector) { if (is_store) { - do_fp_st(s, rt, clean_addr, size); + do_fp_st(s, rt, clean_addr, memop); } else { - do_fp_ld(s, rt, clean_addr, size); + do_fp_ld(s, rt, clean_addr, memop); } } else { TCGv_i64 tcg_rt = cpu_reg(s, rt); bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); + if (is_store) { - do_gpr_st(s, tcg_rt, clean_addr, size, + do_gpr_st(s, tcg_rt, clean_addr, memop, true, rt, iss_sf, false); } else { - do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + do_gpr_ld(s, tcg_rt, clean_addr, memop, is_extended, true, rt, iss_sf, false); } } @@ -3228,12 +3382,11 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, int rn = extract32(insn, 5, 5); unsigned int imm12 = extract32(insn, 10, 12); unsigned int offset; - TCGv_i64 clean_addr, dirty_addr; - bool is_store; bool is_signed = false; bool is_extended = false; + MemOp memop; if (is_vector) { size |= (opc & 2) << 1; @@ -3255,7 +3408,7 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, return; } is_store = (opc == 0); - is_signed = extract32(opc, 1, 1); + is_signed = !is_store && extract32(opc, 1, 1); is_extended = (size < 3) && extract32(opc, 0, 1); } @@ -3265,22 +3418,23 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, dirty_addr = read_cpu_reg_sp(s, rn, 1); offset = imm12 << size; tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, size); + + memop = finalize_memop(s, size + is_signed * MO_SIGN); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, memop); if (is_vector) { if (is_store) { - do_fp_st(s, rt, clean_addr, size); + do_fp_st(s, rt, clean_addr, memop); } else { - do_fp_ld(s, rt, clean_addr, size); + do_fp_ld(s, rt, clean_addr, memop); } } else { 
TCGv_i64 tcg_rt = cpu_reg(s, rt); bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); if (is_store) { - do_gpr_st(s, tcg_rt, clean_addr, size, - true, rt, iss_sf, false); + do_gpr_st(s, tcg_rt, clean_addr, memop, true, rt, iss_sf, false); } else { - do_gpr_ld(s, tcg_rt, clean_addr, size + is_signed * MO_SIGN, + do_gpr_ld(s, tcg_rt, clean_addr, memop, is_extended, true, rt, iss_sf, false); } } @@ -3310,7 +3464,7 @@ static void disas_ldst_atomic(DisasContext *s, uint32_t insn, bool a = extract32(insn, 23, 1); TCGv_i64 tcg_rs, tcg_rt, clean_addr; AtomicThreeOpFn *fn = NULL; - MemOp mop = s->be_data | size | MO_ALIGN; + MemOp mop = size; if (is_vector || !dc_isar_feature(aa64_atomics, s)) { unallocated_encoding(s); @@ -3361,7 +3515,9 @@ static void disas_ldst_atomic(DisasContext *s, uint32_t insn, if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, size); + + mop = check_atomic_align(s, rn, mop); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, mop); if (o3_opc == 014) { /* @@ -3371,7 +3527,7 @@ static void disas_ldst_atomic(DisasContext *s, uint32_t insn, * full load-acquire (we only need "load-acquire processor consistent"), * but we choose to implement them as full LDAQ. */ - do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size, false, + do_gpr_ld(s, cpu_reg(s, rt), clean_addr, mop, false, true, rt, disas_ldst_compute_iss_sf(size, false, 0), true); tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); return; @@ -3417,6 +3573,7 @@ static void disas_ldst_pac(DisasContext *s, uint32_t insn, bool use_key_a = !extract32(insn, 23, 1); int offset; TCGv_i64 clean_addr, dirty_addr, tcg_rt; + MemOp memop; if (size != 3 || is_vector || !dc_isar_feature(aa64_pauth, s)) { unallocated_encoding(s); @@ -3443,12 +3600,14 @@ static void disas_ldst_pac(DisasContext *s, uint32_t insn, offset = sextract32(offset << size, 0, 10 + size); tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); + memop = finalize_memop(s, size); + /* Note that "clean" and "dirty" here refer to TBI not PAC. */ clean_addr = gen_mte_check1(s, dirty_addr, false, - is_wback || rn != 31, size); + is_wback || rn != 31, memop); tcg_rt = cpu_reg(s, rt); - do_gpr_ld(s, tcg_rt, clean_addr, size, + do_gpr_ld(s, tcg_rt, clean_addr, memop, /* extend */ false, /* iss_valid */ !is_wback, /* iss_srt */ rt, /* iss_sf */ true, /* iss_ar */ false); @@ -3482,16 +3641,13 @@ static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn) bool is_store = false; bool extend = false; bool iss_sf; - MemOp mop; + MemOp mop = size; if (!dc_isar_feature(aa64_rcpc_8_4, s)) { unallocated_encoding(s); return; } - /* TODO: ARMv8.4-LSE SCTLR.nAA */ - mop = size | MO_ALIGN; - switch (opc) { case 0: /* STLURB */ is_store = true; @@ -3523,6 +3679,8 @@ static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn) gen_check_sp_alignment(s); } + mop = check_ordered_align(s, rn, offset, is_store, mop); + dirty_addr = read_cpu_reg_sp(s, rn, 1); tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); clean_addr = clean_data_tbi(s, dirty_addr); @@ -3689,7 +3847,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) * promote consecutive little-endian elements below. 
*/ clean_addr = gen_mte_checkN(s, tcg_rn, is_store, is_postidx || rn != 31, - total); + total, finalize_memop(s, size)); /* * Consecutive little-endian elements from a single register @@ -3847,10 +4005,11 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) total = selem << scale; tcg_rn = cpu_reg_sp(s, rn); - clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31, - total); mop = finalize_memop(s, scale); + clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31, + total, mop); + tcg_ebytes = tcg_constant_i64(1 << scale); for (xs = 0; xs < selem; xs++) { if (replicate) { @@ -4062,15 +4221,18 @@ static void disas_ldst_tag(DisasContext *s, uint32_t insn) if (is_zero) { TCGv_i64 clean_addr = clean_data_tbi(s, addr); - TCGv_i64 tcg_zero = tcg_constant_i64(0); + TCGv_i64 zero64 = tcg_constant_i64(0); + TCGv_i128 zero128 = tcg_temp_new_i128(); int mem_index = get_mem_index(s); - int i, n = (1 + is_pair) << LOG2_TAG_GRANULE; + MemOp mop = finalize_memop(s, MO_128 | MO_ALIGN); + + tcg_gen_concat_i64_i128(zero128, zero64, zero64); - tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, - MO_UQ | MO_ALIGN_16); - for (i = 8; i < n; i += 8) { - tcg_gen_addi_i64(clean_addr, clean_addr, 8); - tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_UQ); + /* This is 1 or 2 atomic 16-byte operations. */ + tcg_gen_qemu_st_i128(zero128, clean_addr, mem_index, mop); + if (is_pair) { + tcg_gen_addi_i64(clean_addr, clean_addr, 16); + tcg_gen_qemu_st_i128(zero128, clean_addr, mem_index, mop); } } @@ -14087,6 +14249,7 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->pstate_sm = EX_TBFLAG_A64(tb_flags, PSTATE_SM); dc->pstate_za = EX_TBFLAG_A64(tb_flags, PSTATE_ZA); dc->sme_trap_nonstreaming = EX_TBFLAG_A64(tb_flags, SME_TRAP_NONSTREAMING); + dc->naa = EX_TBFLAG_A64(tb_flags, NAA); dc->vec_len = 0; dc->vec_stride = 0; dc->cp_regs = arm_cpu->cp_regs; @@ -14098,6 +14261,8 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, tcg_debug_assert(dc->tbid & 1); #endif + dc->lse2 = dc_isar_feature(aa64_lse2, dc); + /* Single step state. 
The code-generation logic here is: * SS_ACTIVE == 0: * generate code with no special handling for single-stepping (except diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h index 0576c4e..b55dc43 100644 --- a/target/arm/tcg/translate-a64.h +++ b/target/arm/tcg/translate-a64.h @@ -49,9 +49,9 @@ static inline bool sme_smza_enabled_check(DisasContext *s) TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr); TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int log2_size); + bool tag_checked, MemOp memop); TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, - bool tag_checked, int size); + bool tag_checked, int total_size, MemOp memop); /* We should have at some point before trying to access an FP register * done the necessary access check, so assert that diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index d9d5810..ff05062 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -4167,15 +4167,16 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, int len, int rn, int imm) { - int len_align = QEMU_ALIGN_DOWN(len, 8); - int len_remain = len % 8; - int nparts = len / 8 + ctpop8(len_remain); + int len_align = QEMU_ALIGN_DOWN(len, 16); + int len_remain = len % 16; + int nparts = len / 16 + ctpop8(len_remain); int midx = get_mem_index(s); TCGv_i64 dirty_addr, clean_addr, t0, t1; + TCGv_i128 t16; dirty_addr = tcg_temp_new_i64(); tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); - clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8); /* * Note that unpredicated load/store of vector/predicate registers @@ -4188,10 +4189,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, int i; t0 = tcg_temp_new_i64(); - for (i = 0; i < len_align; i += 8) { - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); + t1 = tcg_temp_new_i64(); + t16 = tcg_temp_new_i128(); + + for (i = 0; i < len_align; i += 16) { + tcg_gen_qemu_ld_i128(t16, clean_addr, midx, + MO_LE | MO_128 | MO_ATOM_NONE); + tcg_gen_extr_i128_i64(t0, t1, t16); tcg_gen_st_i64(t0, base, vofs + i); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_gen_st_i64(t1, base, vofs + i + 8); + tcg_gen_addi_i64(clean_addr, clean_addr, 16); } } else { TCGLabel *loop = gen_new_label(); @@ -4200,14 +4207,21 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, tcg_gen_movi_ptr(i, 0); gen_set_label(loop); - t0 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); + t16 = tcg_temp_new_i128(); + tcg_gen_qemu_ld_i128(t16, clean_addr, midx, + MO_LE | MO_128 | MO_ATOM_NONE); + tcg_gen_addi_i64(clean_addr, clean_addr, 16); tp = tcg_temp_new_ptr(); tcg_gen_add_ptr(tp, base, i); - tcg_gen_addi_ptr(i, i, 8); + tcg_gen_addi_ptr(i, i, 16); + + t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + tcg_gen_extr_i128_i64(t0, t1, t16); + tcg_gen_st_i64(t0, tp, vofs); + tcg_gen_st_i64(t1, tp, vofs + 8); tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); } @@ -4216,6 +4230,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, * Predicate register loads can be any multiple of 2. * Note that we still store the entire 64-bit unit into cpu_env. 
*/ + if (len_remain >= 8) { + t0 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE); + tcg_gen_st_i64(t0, base, vofs + len_align); + len_remain -= 8; + len_align += 8; + if (len_remain) { + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + } + } if (len_remain) { t0 = tcg_temp_new_i64(); switch (len_remain) { @@ -4223,14 +4247,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, case 4: case 8: tcg_gen_qemu_ld_i64(t0, clean_addr, midx, - MO_LE | ctz32(len_remain)); + MO_LE | ctz32(len_remain) | MO_ATOM_NONE); break; case 6: t1 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE); tcg_gen_addi_i64(clean_addr, clean_addr, 4); - tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW); + tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | MO_ATOM_NONE); tcg_gen_deposit_i64(t0, t0, t1, 32, 32); break; @@ -4245,15 +4269,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, int len, int rn, int imm) { - int len_align = QEMU_ALIGN_DOWN(len, 8); - int len_remain = len % 8; - int nparts = len / 8 + ctpop8(len_remain); + int len_align = QEMU_ALIGN_DOWN(len, 16); + int len_remain = len % 16; + int nparts = len / 16 + ctpop8(len_remain); int midx = get_mem_index(s); - TCGv_i64 dirty_addr, clean_addr, t0; + TCGv_i64 dirty_addr, clean_addr, t0, t1; + TCGv_i128 t16; dirty_addr = tcg_temp_new_i64(); tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); - clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8); /* Note that unpredicated load/store of vector/predicate registers * are defined as a stream of bytes, which equates to little-endian @@ -4267,10 +4292,15 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, int i; t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); + t16 = tcg_temp_new_i128(); for (i = 0; i < len_align; i += 8) { tcg_gen_ld_i64(t0, base, vofs + i); - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_gen_ld_i64(t1, base, vofs + i + 8); + tcg_gen_concat_i64_i128(t16, t0, t1); + tcg_gen_qemu_st_i128(t16, clean_addr, midx, + MO_LE | MO_128 | MO_ATOM_NONE); + tcg_gen_addi_i64(clean_addr, clean_addr, 16); } } else { TCGLabel *loop = gen_new_label(); @@ -4280,18 +4310,33 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, gen_set_label(loop); t0 = tcg_temp_new_i64(); + t1 = tcg_temp_new_i64(); tp = tcg_temp_new_ptr(); tcg_gen_add_ptr(tp, base, i); tcg_gen_ld_i64(t0, tp, vofs); - tcg_gen_addi_ptr(i, i, 8); + tcg_gen_ld_i64(t1, tp, vofs + 8); + tcg_gen_addi_ptr(i, i, 16); - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); - tcg_gen_addi_i64(clean_addr, clean_addr, 8); + t16 = tcg_temp_new_i128(); + tcg_gen_concat_i64_i128(t16, t0, t1); + + tcg_gen_qemu_st_i128(t16, clean_addr, midx, MO_LEUQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 16); tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); } /* Predicate register stores can be any multiple of 2. 
*/ + if (len_remain >= 8) { + t0 = tcg_temp_new_i64(); + tcg_gen_st_i64(t0, base, vofs + len_align); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE); + len_remain -= 8; + len_align += 8; + if (len_remain) { + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + } + } if (len_remain) { t0 = tcg_temp_new_i64(); tcg_gen_ld_i64(t0, base, vofs + len_align); @@ -4301,14 +4346,14 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, case 4: case 8: tcg_gen_qemu_st_i64(t0, clean_addr, midx, - MO_LE | ctz32(len_remain)); + MO_LE | ctz32(len_remain) | MO_ATOM_NONE); break; case 6: - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE); tcg_gen_addi_i64(clean_addr, clean_addr, 4); tcg_gen_shri_i64(t0, t0, 32); - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | MO_ATOM_NONE); break; default: @@ -4964,6 +5009,7 @@ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) unsigned msz = dtype_msz(a->dtype); TCGLabel *over; TCGv_i64 temp, clean_addr; + MemOp memop; if (!dc_isar_feature(aa64_sve, s)) { return false; @@ -4993,10 +5039,10 @@ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) /* Load the data. */ temp = tcg_temp_new_i64(); tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << msz); - clean_addr = gen_mte_check1(s, temp, false, true, msz); - tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s), - finalize_memop(s, dtype_mop[a->dtype])); + memop = finalize_memop(s, dtype_mop[a->dtype]); + clean_addr = gen_mte_check1(s, temp, false, true, memop); + tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s), memop); /* Broadcast to *all* elements. */ tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index a68d3c7..13c88ba 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -9168,6 +9168,7 @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) dc->sme_trap_nonstreaming = EX_TBFLAG_A32(tb_flags, SME_TRAP_NONSTREAMING); } + dc->lse2 = false; /* applies only to aarch64 */ dc->cp_regs = cpu->cp_regs; dc->features = env->features; diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index 4d88197..d1cacff 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -90,6 +90,7 @@ typedef struct DisasContext { uint64_t features; /* CPU features bits */ bool aarch64; bool thumb; + bool lse2; /* Because unallocated encodings generate different exception syndrome * information from traps due to FP being disabled, we can't do a single * "is fp access disabled" check at a high level in the decode tree. @@ -141,6 +142,8 @@ typedef struct DisasContext { bool fgt_eret; /* True if fine-grained trap on SVC is enabled */ bool fgt_svc; + /* True if FEAT_LSE2 SCTLR_ELx.nAA is set */ + bool naa; /* * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI. * < 0, set by the current instruction. @@ -557,12 +560,13 @@ static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) } /** - * finalize_memop: + * finalize_memop_atom: * @s: DisasContext * @opc: size+sign+align of the memory operation + * @atom: atomicity of the memory operation * - * Build the complete MemOp for a memory operation, including alignment - * and endianness. + * Build the complete MemOp for a memory operation, including alignment, + * endianness, and atomicity. 
* * If (op & MO_AMASK) then the operation already contains the required * alignment, e.g. for AccType_ATOMIC. Otherwise, this an optionally @@ -572,12 +576,63 @@ static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) * and this is applied here. Note that there is no way to indicate that * no alignment should ever be enforced; this must be handled manually. */ -static inline MemOp finalize_memop(DisasContext *s, MemOp opc) +static inline MemOp finalize_memop_atom(DisasContext *s, MemOp opc, MemOp atom) { if (s->align_mem && !(opc & MO_AMASK)) { opc |= MO_ALIGN; } - return opc | s->be_data; + return opc | atom | s->be_data; +} + +/** + * finalize_memop: + * @s: DisasContext + * @opc: size+sign+align of the memory operation + * + * Like finalize_memop_atom, but with default atomicity. + */ +static inline MemOp finalize_memop(DisasContext *s, MemOp opc) +{ + MemOp atom = s->lse2 ? MO_ATOM_WITHIN16 : MO_ATOM_IFALIGN; + return finalize_memop_atom(s, opc, atom); +} + +/** + * finalize_memop_pair: + * @s: DisasContext + * @opc: size+sign+align of the memory operation + * + * Like finalize_memop_atom, but with atomicity for a pair. + * C.f. Pseudocode for Mem[], operand ispair. + */ +static inline MemOp finalize_memop_pair(DisasContext *s, MemOp opc) +{ + MemOp atom = s->lse2 ? MO_ATOM_WITHIN16_PAIR : MO_ATOM_IFALIGN_PAIR; + return finalize_memop_atom(s, opc, atom); +} + +/** + * finalize_memop_asimd: + * @s: DisasContext + * @opc: size+sign+align of the memory operation + * + * Like finalize_memop_atom, but with atomicity of AccessType_ASIMD. + */ +static inline MemOp finalize_memop_asimd(DisasContext *s, MemOp opc) +{ + /* + * In the pseudocode for Mem[], with AccessType_ASIMD, size == 16, + * if IsAligned(8), the first case provides separate atomicity for + * the pair of 64-bit accesses. If !IsAligned(8), the middle cases + * do not apply, and we're left with the final case of no atomicity. + * Thus MO_ATOM_IFALIGN_PAIR. + * + * For other sizes, normal LSE2 rules apply. + */ + if ((opc & MO_SIZE) == MO_128) { + return finalize_memop_atom(s, opc, MO_ATOM_IFALIGN_PAIR); + } + return finalize_memop(s, opc); } /** |
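As a usage sketch of the new translate.h helpers (hypothetical fragment, not taken from the patch; it assumes the usual translator context with a DisasContext *s and TCGv_i64 temporaries dest and clean_addr already set up):

    /*
     * With s->lse2 set, finalize_memop() folds MO_ATOM_WITHIN16 into the
     * MemOp, so this 32-bit load is single-copy atomic whenever it does
     * not cross a 16-byte boundary; without LSE2 it falls back to
     * MO_ATOM_IFALIGN, i.e. atomic only when naturally aligned.
     */
    MemOp mop = finalize_memop(s, MO_32);
    tcg_gen_qemu_ld_i64(dest, clean_addr, get_mem_index(s), mop);

Pair and ASIMD accesses go through finalize_memop_pair() and finalize_memop_asimd() instead, which select the *_PAIR atomicity variants described above.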