diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2022-10-05 10:17:02 -0400 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2022-10-05 10:17:02 -0400 |
commit | 4a9c04672a875ed00ea807ea4d552c01f6440bc7 (patch) | |
tree | 5ae0c7c85d114c608cf6018b8dc50f4feaaa83b8 /tcg | |
parent | fafd35a6dab8e70a7c395aaa8e1273267cf9f3c8 (diff) | |
parent | ab419fd8a035a65942de4e63effcd55ccbf1a9fe (diff) | |
download | qemu-4a9c04672a875ed00ea807ea4d552c01f6440bc7.zip qemu-4a9c04672a875ed00ea807ea4d552c01f6440bc7.tar.gz qemu-4a9c04672a875ed00ea807ea4d552c01f6440bc7.tar.bz2 |
Merge tag 'pull-tcg-20221004' of https://gitlab.com/rth7680/qemu into staging
Cache CPUClass for use in hot code paths.
Add CPUTLBEntryFull, probe_access_full, tlb_set_page_full.
Add generic support for TARGET_TB_PCREL.
tcg/ppc: Optimize 26-bit jumps using STQ for POWER 2.07
target/sh4: Fix TB_FLAG_UNALIGN
# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmM8jXEdHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/oEggArAHK8FtydfQ4ZwnF
# SjXfpdP50OC0SZn3uBN93FZOrxz9UYG9t1oDHs39J/+b/u2nwJYch//EH2k+NtOW
# hc3iIgS9bWgs/UWZESkViKQccw7gpYlc21Br38WWwFNEFyecX0p+e9pJgld5rSv1
# mRGvCs5J2svH2tcXl/Sb/JWgcumOJoG7qy2aLyJGolR6UOfwcfFMzQXzq8qjpRKH
# Jh84qusE/rLbzBsdN6snJY4+dyvUo03lT5IJ4d+FQg2tUip+Qqt7pnMbsqq6qF6H
# R6fWU1JTbsh7GxXJwQJ83jLBnUsi8cy6FKrZ3jyiBq76+DIpR0PqoEe+PN/weInU
# TN0z4g==
# =RfXJ
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 04 Oct 2022 15:45:53 EDT
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* tag 'pull-tcg-20221004' of https://gitlab.com/rth7680/qemu:
target/sh4: Fix TB_FLAG_UNALIGN
tcg/ppc: Optimize 26-bit jumps
accel/tcg: Introduce TARGET_TB_PCREL
accel/tcg: Introduce tb_pc and log_pc
hw/core: Add CPUClass.get_pc
include/hw/core: Create struct CPUJumpCache
accel/tcg: Inline tb_flush_jmp_cache
accel/tcg: Do not align tb->page_addr[0]
accel/tcg: Use DisasContextBase in plugin_gen_tb_start
accel/tcg: Use bool for page_find_alloc
accel/tcg: Remove PageDesc code_bitmap
include/exec: Introduce TARGET_PAGE_ENTRY_EXTRA
accel/tcg: Introduce tlb_set_page_full
accel/tcg: Introduce probe_access_full
accel/tcg: Suppress auto-invalidate in probe_access_internal
accel/tcg: Drop addr member from SavedIOTLB
accel/tcg: Rename CPUIOTLBEntry to CPUTLBEntryFull
cputlb: used cached CPUClass in our hot-paths
hw/core/cpu-sysemu: used cached class in cpu_asidx_from_attrs
cpu: cache CPUClass in CPUState for hot code paths
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'tcg')
-rw-r--r-- | tcg/ppc/tcg-target.c.inc | 119 | ||||
-rw-r--r-- | tcg/tcg.c | 8 |
2 files changed, 92 insertions, 35 deletions
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 1cbd047..e3dba47 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -1847,44 +1847,101 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0) tcg_out32(s, insn); } -void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, - uintptr_t jmp_rw, uintptr_t addr) +static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2) { - if (TCG_TARGET_REG_BITS == 64) { - tcg_insn_unit i1, i2; - intptr_t tb_diff = addr - tc_ptr; - intptr_t br_diff = addr - (jmp_rx + 4); - uint64_t pair; - - /* This does not exercise the range of the branch, but we do - still need to be able to load the new value of TCG_REG_TB. - But this does still happen quite often. */ - if (tb_diff == (int16_t)tb_diff) { - i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff); - i2 = B | (br_diff & 0x3fffffc); - } else { - intptr_t lo = (int16_t)tb_diff; - intptr_t hi = (int32_t)(tb_diff - lo); - assert(tb_diff == hi + lo); - i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16); - i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo); - } -#if HOST_BIG_ENDIAN - pair = (uint64_t)i1 << 32 | i2; + if (HOST_BIG_ENDIAN) { + return (uint64_t)i1 << 32 | i2; + } + return (uint64_t)i2 << 32 | i1; +} + +static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw, + tcg_insn_unit i0, tcg_insn_unit i1) +{ +#if TCG_TARGET_REG_BITS == 64 + qatomic_set((uint64_t *)rw, make_pair(i0, i1)); + flush_idcache_range(rx, rw, 8); #else - pair = (uint64_t)i2 << 32 | i1; + qemu_build_not_reached(); #endif +} - /* As per the enclosing if, this is ppc64. Avoid the _Static_assert - within qatomic_set that would fail to build a ppc32 host. */ - qatomic_set__nocheck((uint64_t *)jmp_rw, pair); - flush_idcache_range(jmp_rx, jmp_rw, 8); - } else { +static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw, + tcg_insn_unit i0, tcg_insn_unit i1, + tcg_insn_unit i2, tcg_insn_unit i3) +{ + uint64_t p[2]; + + p[!HOST_BIG_ENDIAN] = make_pair(i0, i1); + p[HOST_BIG_ENDIAN] = make_pair(i2, i3); + + /* + * There's no convenient way to get the compiler to allocate a pair + * of registers at an even index, so copy into r6/r7 and clobber. + */ + asm("mr %%r6, %1\n\t" + "mr %%r7, %2\n\t" + "stq %%r6, %0" + : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7"); + flush_idcache_range(rx, rw, 16); +} + +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, + uintptr_t jmp_rw, uintptr_t addr) +{ + tcg_insn_unit i0, i1, i2, i3; + intptr_t tb_diff = addr - tc_ptr; + intptr_t br_diff = addr - (jmp_rx + 4); + intptr_t lo, hi; + + if (TCG_TARGET_REG_BITS == 32) { intptr_t diff = addr - jmp_rx; tcg_debug_assert(in_range_b(diff)); qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc)); flush_idcache_range(jmp_rx, jmp_rw, 4); + return; + } + + /* + * For 16-bit displacements, we can use a single add + branch. + * This happens quite often. + */ + if (tb_diff == (int16_t)tb_diff) { + i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff); + i1 = B | (br_diff & 0x3fffffc); + ppc64_replace2(jmp_rx, jmp_rw, i0, i1); + return; + } + + lo = (int16_t)tb_diff; + hi = (int32_t)(tb_diff - lo); + assert(tb_diff == hi + lo); + i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16); + i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo); + + /* + * Without stq from 2.07, we can only update two insns, + * and those must be the ones that load the target address. + */ + if (!have_isa_2_07) { + ppc64_replace2(jmp_rx, jmp_rw, i0, i1); + return; + } + + /* + * For 26-bit displacements, we can use a direct branch. + * Otherwise we still need the indirect branch, which we + * must restore after a potential direct branch write. + */ + br_diff -= 4; + if (in_range_b(br_diff)) { + i2 = B | (br_diff & 0x3fffffc); + i3 = NOP; + } else { + i2 = MTSPR | RS(TCG_REG_TB) | CTR; + i3 = BCCTR | BO_ALWAYS; } + ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3); } static void tcg_out_call_int(TCGContext *s, int lk, @@ -2574,8 +2631,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, if (s->tb_jmp_insn_offset) { /* Direct jump. */ if (TCG_TARGET_REG_BITS == 64) { - /* Ensure the next insns are 8-byte aligned. */ - if ((uintptr_t)s->code_ptr & 7) { + /* Ensure the next insns are 8 or 16-byte aligned. */ + while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) { tcg_out32(s, NOP); } s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s); @@ -4188,7 +4188,7 @@ int64_t tcg_cpu_exec_time(void) #endif -int tcg_gen_code(TCGContext *s, TranslationBlock *tb) +int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start) { #ifdef CONFIG_PROFILER TCGProfile *prof = &s->prof; @@ -4218,7 +4218,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) #ifdef DEBUG_DISAS if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP) - && qemu_log_in_addr_range(tb->pc))) { + && qemu_log_in_addr_range(pc_start))) { FILE *logfile = qemu_log_trylock(); if (logfile) { fprintf(logfile, "OP:\n"); @@ -4265,7 +4265,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) if (s->nb_indirects > 0) { #ifdef DEBUG_DISAS if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND) - && qemu_log_in_addr_range(tb->pc))) { + && qemu_log_in_addr_range(pc_start))) { FILE *logfile = qemu_log_trylock(); if (logfile) { fprintf(logfile, "OP before indirect lowering:\n"); @@ -4288,7 +4288,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) #ifdef DEBUG_DISAS if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT) - && qemu_log_in_addr_range(tb->pc))) { + && qemu_log_in_addr_range(pc_start))) { FILE *logfile = qemu_log_trylock(); if (logfile) { fprintf(logfile, "OP after optimization and liveness analysis:\n"); |