aboutsummaryrefslogtreecommitdiff
path: root/tcg
diff options
context:
space:
mode:
authorStefan Hajnoczi <stefanha@redhat.com>2022-10-05 10:17:02 -0400
committerStefan Hajnoczi <stefanha@redhat.com>2022-10-05 10:17:02 -0400
commit4a9c04672a875ed00ea807ea4d552c01f6440bc7 (patch)
tree5ae0c7c85d114c608cf6018b8dc50f4feaaa83b8 /tcg
parentfafd35a6dab8e70a7c395aaa8e1273267cf9f3c8 (diff)
parentab419fd8a035a65942de4e63effcd55ccbf1a9fe (diff)
downloadqemu-4a9c04672a875ed00ea807ea4d552c01f6440bc7.zip
qemu-4a9c04672a875ed00ea807ea4d552c01f6440bc7.tar.gz
qemu-4a9c04672a875ed00ea807ea4d552c01f6440bc7.tar.bz2
Merge tag 'pull-tcg-20221004' of https://gitlab.com/rth7680/qemu into staging
Cache CPUClass for use in hot code paths. Add CPUTLBEntryFull, probe_access_full, tlb_set_page_full. Add generic support for TARGET_TB_PCREL. tcg/ppc: Optimize 26-bit jumps using STQ for POWER 2.07 target/sh4: Fix TB_FLAG_UNALIGN # -----BEGIN PGP SIGNATURE----- # # iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmM8jXEdHHJpY2hhcmQu # aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/oEggArAHK8FtydfQ4ZwnF # SjXfpdP50OC0SZn3uBN93FZOrxz9UYG9t1oDHs39J/+b/u2nwJYch//EH2k+NtOW # hc3iIgS9bWgs/UWZESkViKQccw7gpYlc21Br38WWwFNEFyecX0p+e9pJgld5rSv1 # mRGvCs5J2svH2tcXl/Sb/JWgcumOJoG7qy2aLyJGolR6UOfwcfFMzQXzq8qjpRKH # Jh84qusE/rLbzBsdN6snJY4+dyvUo03lT5IJ4d+FQg2tUip+Qqt7pnMbsqq6qF6H # R6fWU1JTbsh7GxXJwQJ83jLBnUsi8cy6FKrZ3jyiBq76+DIpR0PqoEe+PN/weInU # TN0z4g== # =RfXJ # -----END PGP SIGNATURE----- # gpg: Signature made Tue 04 Oct 2022 15:45:53 EDT # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "richard.henderson@linaro.org" # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full] # Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F * tag 'pull-tcg-20221004' of https://gitlab.com/rth7680/qemu: target/sh4: Fix TB_FLAG_UNALIGN tcg/ppc: Optimize 26-bit jumps accel/tcg: Introduce TARGET_TB_PCREL accel/tcg: Introduce tb_pc and log_pc hw/core: Add CPUClass.get_pc include/hw/core: Create struct CPUJumpCache accel/tcg: Inline tb_flush_jmp_cache accel/tcg: Do not align tb->page_addr[0] accel/tcg: Use DisasContextBase in plugin_gen_tb_start accel/tcg: Use bool for page_find_alloc accel/tcg: Remove PageDesc code_bitmap include/exec: Introduce TARGET_PAGE_ENTRY_EXTRA accel/tcg: Introduce tlb_set_page_full accel/tcg: Introduce probe_access_full accel/tcg: Suppress auto-invalidate in probe_access_internal accel/tcg: Drop addr member from SavedIOTLB accel/tcg: Rename CPUIOTLBEntry to CPUTLBEntryFull cputlb: used cached CPUClass in our hot-paths hw/core/cpu-sysemu: used cached class in cpu_asidx_from_attrs cpu: cache CPUClass in CPUState for hot code paths Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'tcg')
-rw-r--r--tcg/ppc/tcg-target.c.inc119
-rw-r--r--tcg/tcg.c8
2 files changed, 92 insertions, 35 deletions
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 1cbd047..e3dba47 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1847,44 +1847,101 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
tcg_out32(s, insn);
}
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
- uintptr_t jmp_rw, uintptr_t addr)
+static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
{
- if (TCG_TARGET_REG_BITS == 64) {
- tcg_insn_unit i1, i2;
- intptr_t tb_diff = addr - tc_ptr;
- intptr_t br_diff = addr - (jmp_rx + 4);
- uint64_t pair;
-
- /* This does not exercise the range of the branch, but we do
- still need to be able to load the new value of TCG_REG_TB.
- But this does still happen quite often. */
- if (tb_diff == (int16_t)tb_diff) {
- i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
- i2 = B | (br_diff & 0x3fffffc);
- } else {
- intptr_t lo = (int16_t)tb_diff;
- intptr_t hi = (int32_t)(tb_diff - lo);
- assert(tb_diff == hi + lo);
- i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
- i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
- }
-#if HOST_BIG_ENDIAN
- pair = (uint64_t)i1 << 32 | i2;
+ if (HOST_BIG_ENDIAN) {
+ return (uint64_t)i1 << 32 | i2;
+ }
+ return (uint64_t)i2 << 32 | i1;
+}
+
+static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
+ tcg_insn_unit i0, tcg_insn_unit i1)
+{
+#if TCG_TARGET_REG_BITS == 64
+ qatomic_set((uint64_t *)rw, make_pair(i0, i1));
+ flush_idcache_range(rx, rw, 8);
#else
- pair = (uint64_t)i2 << 32 | i1;
+ qemu_build_not_reached();
#endif
+}
- /* As per the enclosing if, this is ppc64. Avoid the _Static_assert
- within qatomic_set that would fail to build a ppc32 host. */
- qatomic_set__nocheck((uint64_t *)jmp_rw, pair);
- flush_idcache_range(jmp_rx, jmp_rw, 8);
- } else {
+static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
+ tcg_insn_unit i0, tcg_insn_unit i1,
+ tcg_insn_unit i2, tcg_insn_unit i3)
+{
+ uint64_t p[2];
+
+ p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
+ p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
+
+ /*
+ * There's no convenient way to get the compiler to allocate a pair
+ * of registers at an even index, so copy into r6/r7 and clobber.
+ */
+ asm("mr %%r6, %1\n\t"
+ "mr %%r7, %2\n\t"
+ "stq %%r6, %0"
+ : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
+ flush_idcache_range(rx, rw, 16);
+}
+
+void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+ uintptr_t jmp_rw, uintptr_t addr)
+{
+ tcg_insn_unit i0, i1, i2, i3;
+ intptr_t tb_diff = addr - tc_ptr;
+ intptr_t br_diff = addr - (jmp_rx + 4);
+ intptr_t lo, hi;
+
+ if (TCG_TARGET_REG_BITS == 32) {
intptr_t diff = addr - jmp_rx;
tcg_debug_assert(in_range_b(diff));
qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
flush_idcache_range(jmp_rx, jmp_rw, 4);
+ return;
+ }
+
+ /*
+ * For 16-bit displacements, we can use a single add + branch.
+ * This happens quite often.
+ */
+ if (tb_diff == (int16_t)tb_diff) {
+ i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
+ i1 = B | (br_diff & 0x3fffffc);
+ ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
+ return;
+ }
+
+ lo = (int16_t)tb_diff;
+ hi = (int32_t)(tb_diff - lo);
+ assert(tb_diff == hi + lo);
+ i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
+ i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
+
+ /*
+ * Without stq from 2.07, we can only update two insns,
+ * and those must be the ones that load the target address.
+ */
+ if (!have_isa_2_07) {
+ ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
+ return;
+ }
+
+ /*
+ * For 26-bit displacements, we can use a direct branch.
+ * Otherwise we still need the indirect branch, which we
+ * must restore after a potential direct branch write.
+ */
+ br_diff -= 4;
+ if (in_range_b(br_diff)) {
+ i2 = B | (br_diff & 0x3fffffc);
+ i3 = NOP;
+ } else {
+ i2 = MTSPR | RS(TCG_REG_TB) | CTR;
+ i3 = BCCTR | BO_ALWAYS;
}
+ ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
}
static void tcg_out_call_int(TCGContext *s, int lk,
@@ -2574,8 +2631,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
if (s->tb_jmp_insn_offset) {
/* Direct jump. */
if (TCG_TARGET_REG_BITS == 64) {
- /* Ensure the next insns are 8-byte aligned. */
- if ((uintptr_t)s->code_ptr & 7) {
+ /* Ensure the next insns are 8 or 16-byte aligned. */
+ while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
tcg_out32(s, NOP);
}
s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 0f9cfe9..612a12f 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -4188,7 +4188,7 @@ int64_t tcg_cpu_exec_time(void)
#endif
-int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
+int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
{
#ifdef CONFIG_PROFILER
TCGProfile *prof = &s->prof;
@@ -4218,7 +4218,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
- && qemu_log_in_addr_range(tb->pc))) {
+ && qemu_log_in_addr_range(pc_start))) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
fprintf(logfile, "OP:\n");
@@ -4265,7 +4265,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
if (s->nb_indirects > 0) {
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
- && qemu_log_in_addr_range(tb->pc))) {
+ && qemu_log_in_addr_range(pc_start))) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
fprintf(logfile, "OP before indirect lowering:\n");
@@ -4288,7 +4288,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
- && qemu_log_in_addr_range(tb->pc))) {
+ && qemu_log_in_addr_range(pc_start))) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
fprintf(logfile, "OP after optimization and liveness analysis:\n");