diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2016-08-08 10:39:18 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2016-08-08 10:39:18 +0100 |
commit | cf5198d58088e3b416fe517b3946e5120b342761 (patch) | |
tree | 9d90c54eca81527f3f423e2daf7b29eccc32a35e | |
parent | 51009170d8fc263cfdcd5a60fe3ba213daa3d15b (diff) | |
parent | 5a18407f55ade924aa6397c9a043a9ffd59645fe (diff) | |
download | qemu-cf5198d58088e3b416fe517b3946e5120b342761.zip qemu-cf5198d58088e3b416fe517b3946e5120b342761.tar.gz qemu-cf5198d58088e3b416fe517b3946e5120b342761.tar.bz2 |
Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20160805' into staging
indirect register lowering
# gpg: Signature made Fri 05 Aug 2016 17:34:53 BST
# gpg: using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg: aka "Richard Henderson <rth@redhat.com>"
# gpg: aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC 16A4 AD12 70CC 4DD0 279B
* remotes/rth/tags/pull-tcg-20160805:
tcg: Lower indirect registers in a separate pass
tcg: Require liveness analysis
tcg: Include liveness info in the dumps
tcg: Compress dead_temps and mem_temps into a single array
tcg: Fold life data into TCGOp
tcg: Reorg TCGOp chaining
tcg: Compress liveness data to 16 bits
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | include/exec/gen-icount.h | 2 | ||||
-rw-r--r-- | include/qemu/log.h | 3 | ||||
-rw-r--r-- | tcg/optimize.c | 37 | ||||
-rw-r--r-- | tcg/tcg-op.c | 2 | ||||
-rw-r--r-- | tcg/tcg.c | 588 | ||||
-rw-r--r-- | tcg/tcg.h | 52 | ||||
-rw-r--r-- | util/log.c | 24 |
7 files changed, 441 insertions, 267 deletions
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h index 1af03d8..050de59 100644 --- a/include/exec/gen-icount.h +++ b/include/exec/gen-icount.h @@ -59,7 +59,7 @@ static void gen_tb_end(TranslationBlock *tb, int num_insns) } /* Terminate the linked list. */ - tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1; + tcg_ctx.gen_op_buf[tcg_ctx.gen_op_buf[0].prev].next = 0; } static inline void gen_io_start(void) diff --git a/include/qemu/log.h b/include/qemu/log.h index 8bec6b4..00bf37f 100644 --- a/include/qemu/log.h +++ b/include/qemu/log.h @@ -42,6 +42,7 @@ static inline bool qemu_log_separate(void) #define CPU_LOG_TB_NOCHAIN (1 << 13) #define CPU_LOG_PAGE (1 << 14) #define LOG_TRACE (1 << 15) +#define CPU_LOG_TB_OP_IND (1 << 16) /* Returns true if a bit is set in the current loglevel mask */ @@ -54,7 +55,7 @@ static inline bool qemu_loglevel_mask(int mask) /* main logging function */ -void GCC_FMT_ATTR(1, 2) qemu_log(const char *fmt, ...); +int GCC_FMT_ATTR(1, 2) qemu_log(const char *fmt, ...); /* vfprintf-like logging function */ diff --git a/tcg/optimize.c b/tcg/optimize.c index c0d975b..cffe89b 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -82,37 +82,6 @@ static void init_temp_info(TCGArg temp) } } -static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op, - TCGOpcode opc, int nargs) -{ - int oi = s->gen_next_op_idx; - int pi = s->gen_next_parm_idx; - int prev = old_op->prev; - int next = old_op - s->gen_op_buf; - TCGOp *new_op; - - tcg_debug_assert(oi < OPC_BUF_SIZE); - tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE); - s->gen_next_op_idx = oi + 1; - s->gen_next_parm_idx = pi + nargs; - - new_op = &s->gen_op_buf[oi]; - *new_op = (TCGOp){ - .opc = opc, - .args = pi, - .prev = prev, - .next = next - }; - if (prev >= 0) { - s->gen_op_buf[prev].next = oi; - } else { - s->gen_first_op_idx = oi; - } - old_op->prev = oi; - - return new_op; -} - static int op_bits(TCGOpcode op) { const TCGOpDef *def = &tcg_op_defs[op]; @@ -583,7 +552,7 @@ void tcg_optimize(TCGContext *s) nb_globals = s->nb_globals; reset_all_temps(nb_temps); - for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) { + for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) { tcg_target_ulong mask, partmask, affected; int nb_oargs, nb_iargs, i; TCGArg tmp; @@ -1120,7 +1089,7 @@ void tcg_optimize(TCGContext *s) uint64_t a = ((uint64_t)ah << 32) | al; uint64_t b = ((uint64_t)bh << 32) | bl; TCGArg rl, rh; - TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2); + TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32, 2); TCGArg *args2 = &s->gen_opparam_buf[op2->args]; if (opc == INDEX_op_add2_i32) { @@ -1146,7 +1115,7 @@ void tcg_optimize(TCGContext *s) uint32_t b = temps[args[3]].val; uint64_t r = (uint64_t)a * b; TCGArg rl, rh; - TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2); + TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32, 2); TCGArg *args2 = &s->gen_opparam_buf[op2->args]; rl = args[0]; diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 293b854..0243c99 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -52,7 +52,7 @@ static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args) int pi = oi - 1; tcg_debug_assert(oi < OPC_BUF_SIZE); - ctx->gen_last_op_idx = oi; + ctx->gen_op_buf[0].prev = oi; ctx->gen_next_op_idx = ni; ctx->gen_op_buf[oi] = (TCGOp){ @@ -23,7 +23,6 @@ */ /* define it to use liveness analysis (better code) */ -#define USE_LIVENESS_ANALYSIS #define USE_TCG_OPTIMIZATIONS #include "qemu/osdep.h" @@ -333,7 +332,7 @@ void tcg_context_init(TCGContext *s) memset(s, 0, sizeof(*s)); s->nb_globals = 0; - + /* Count total number of arguments and allocate the corresponding space */ total_args = 0; @@ -438,9 +437,9 @@ void tcg_func_start(TCGContext *s) s->goto_tb_issue_mask = 0; #endif - s->gen_first_op_idx = 0; - s->gen_last_op_idx = -1; - s->gen_next_op_idx = 0; + s->gen_op_buf[0].next = 1; + s->gen_op_buf[0].prev = 0; + s->gen_next_op_idx = 1; s->gen_next_parm_idx = 0; s->be = tcg_malloc(sizeof(TCGBackendData)); @@ -532,8 +531,12 @@ int tcg_global_mem_new_internal(TCGType type, TCGv_ptr base, #endif if (!base_ts->fixed_reg) { - indirect_reg = 1; + /* We do not support double-indirect registers. */ + tcg_debug_assert(!base_ts->indirect_reg); base_ts->indirect_base = 1; + s->nb_indirects += (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64 + ? 2 : 1); + indirect_reg = 1; } if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) { @@ -825,16 +828,16 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret, real_args++; } #endif - /* If stack grows up, then we will be placing successive - arguments at lower addresses, which means we need to - reverse the order compared to how we would normally - treat either big or little-endian. For those arguments - that will wind up in registers, this still works for - HPPA (the only current STACK_GROWSUP target) since the - argument registers are *also* allocated in decreasing - order. If another such target is added, this logic may - have to get more complicated to differentiate between - stack arguments and register arguments. */ + /* If stack grows up, then we will be placing successive + arguments at lower addresses, which means we need to + reverse the order compared to how we would normally + treat either big or little-endian. For those arguments + that will wind up in registers, this still works for + HPPA (the only current STACK_GROWSUP target) since the + argument registers are *also* allocated in decreasing + order. If another such target is added, this logic may + have to get more complicated to differentiate between + stack arguments and register arguments. */ #if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP) s->gen_opparam_buf[pi++] = args[i] + 1; s->gen_opparam_buf[pi++] = args[i]; @@ -869,7 +872,7 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret, /* Make sure the calli field didn't overflow. */ tcg_debug_assert(s->gen_op_buf[i].calli == real_args); - s->gen_last_op_idx = i; + s->gen_op_buf[0].prev = i; s->gen_next_op_idx = i + 1; s->gen_next_parm_idx = pi; @@ -1021,11 +1024,12 @@ void tcg_dump_ops(TCGContext *s) TCGOp *op; int oi; - for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) { + for (oi = s->gen_op_buf[0].next; oi != 0; oi = op->next) { int i, k, nb_oargs, nb_iargs, nb_cargs; const TCGOpDef *def; const TCGArg *args; TCGOpcode c; + int col = 0; op = &s->gen_op_buf[oi]; c = op->opc; @@ -1033,7 +1037,7 @@ void tcg_dump_ops(TCGContext *s) args = &s->gen_opparam_buf[op->args]; if (c == INDEX_op_insn_start) { - qemu_log("%s ----", oi != s->gen_first_op_idx ? "\n" : ""); + col += qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : ""); for (i = 0; i < TARGET_INSN_START_WORDS; ++i) { target_ulong a; @@ -1042,7 +1046,7 @@ void tcg_dump_ops(TCGContext *s) #else a = args[i]; #endif - qemu_log(" " TARGET_FMT_lx, a); + col += qemu_log(" " TARGET_FMT_lx, a); } } else if (c == INDEX_op_call) { /* variable number of arguments */ @@ -1051,12 +1055,12 @@ void tcg_dump_ops(TCGContext *s) nb_cargs = def->nb_cargs; /* function name, flags, out args */ - qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name, - tcg_find_helper(s, args[nb_oargs + nb_iargs]), - args[nb_oargs + nb_iargs + 1], nb_oargs); + col += qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name, + tcg_find_helper(s, args[nb_oargs + nb_iargs]), + args[nb_oargs + nb_iargs + 1], nb_oargs); for (i = 0; i < nb_oargs; i++) { - qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), - args[i])); + col += qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), + args[i])); } for (i = 0; i < nb_iargs; i++) { TCGArg arg = args[nb_oargs + i]; @@ -1064,10 +1068,10 @@ void tcg_dump_ops(TCGContext *s) if (arg != TCG_CALL_DUMMY_ARG) { t = tcg_get_arg_str_idx(s, buf, sizeof(buf), arg); } - qemu_log(",%s", t); + col += qemu_log(",%s", t); } } else { - qemu_log(" %s ", def->name); + col += qemu_log(" %s ", def->name); nb_oargs = def->nb_oargs; nb_iargs = def->nb_iargs; @@ -1076,17 +1080,17 @@ void tcg_dump_ops(TCGContext *s) k = 0; for (i = 0; i < nb_oargs; i++) { if (k != 0) { - qemu_log(","); + col += qemu_log(","); } - qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), - args[k++])); + col += qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), + args[k++])); } for (i = 0; i < nb_iargs; i++) { if (k != 0) { - qemu_log(","); + col += qemu_log(","); } - qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), - args[k++])); + col += qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), + args[k++])); } switch (c) { case INDEX_op_brcond_i32: @@ -1098,9 +1102,9 @@ void tcg_dump_ops(TCGContext *s) case INDEX_op_setcond_i64: case INDEX_op_movcond_i64: if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) { - qemu_log(",%s", cond_name[args[k++]]); + col += qemu_log(",%s", cond_name[args[k++]]); } else { - qemu_log(",$0x%" TCG_PRIlx, args[k++]); + col += qemu_log(",$0x%" TCG_PRIlx, args[k++]); } i = 1; break; @@ -1114,12 +1118,12 @@ void tcg_dump_ops(TCGContext *s) unsigned ix = get_mmuidx(oi); if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) { - qemu_log(",$0x%x,%u", op, ix); + col += qemu_log(",$0x%x,%u", op, ix); } else { const char *s_al, *s_op; s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT]; s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)]; - qemu_log(",%s%s,%u", s_al, s_op, ix); + col += qemu_log(",%s%s,%u", s_al, s_op, ix); } i = 1; } @@ -1134,14 +1138,39 @@ void tcg_dump_ops(TCGContext *s) case INDEX_op_brcond_i32: case INDEX_op_brcond_i64: case INDEX_op_brcond2_i32: - qemu_log("%s$L%d", k ? "," : "", arg_label(args[k])->id); + col += qemu_log("%s$L%d", k ? "," : "", arg_label(args[k])->id); i++, k++; break; default: break; } for (; i < nb_cargs; i++, k++) { - qemu_log("%s$0x%" TCG_PRIlx, k ? "," : "", args[k]); + col += qemu_log("%s$0x%" TCG_PRIlx, k ? "," : "", args[k]); + } + } + if (op->life) { + unsigned life = op->life; + + for (; col < 48; ++col) { + putc(' ', qemu_logfile); + } + + if (life & (SYNC_ARG * 3)) { + qemu_log(" sync:"); + for (i = 0; i < 2; ++i) { + if (life & (SYNC_ARG << i)) { + qemu_log(" %d", i); + } + } + } + life /= DEAD_ARG; + if (life) { + qemu_log(" dead:"); + for (i = 0; life; ++i, life >>= 1) { + if (life & 1) { + qemu_log(" %d", i); + } + } } } qemu_log("\n"); @@ -1298,71 +1327,116 @@ void tcg_op_remove(TCGContext *s, TCGOp *op) int next = op->next; int prev = op->prev; - if (next >= 0) { - s->gen_op_buf[next].prev = prev; - } else { - s->gen_last_op_idx = prev; - } - if (prev >= 0) { - s->gen_op_buf[prev].next = next; - } else { - s->gen_first_op_idx = next; - } + /* We should never attempt to remove the list terminator. */ + tcg_debug_assert(op != &s->gen_op_buf[0]); + + s->gen_op_buf[next].prev = prev; + s->gen_op_buf[prev].next = next; - memset(op, -1, sizeof(*op)); + memset(op, 0, sizeof(*op)); #ifdef CONFIG_PROFILER s->del_op_count++; #endif } -#ifdef USE_LIVENESS_ANALYSIS +TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *old_op, + TCGOpcode opc, int nargs) +{ + int oi = s->gen_next_op_idx; + int pi = s->gen_next_parm_idx; + int prev = old_op->prev; + int next = old_op - s->gen_op_buf; + TCGOp *new_op; + + tcg_debug_assert(oi < OPC_BUF_SIZE); + tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE); + s->gen_next_op_idx = oi + 1; + s->gen_next_parm_idx = pi + nargs; + + new_op = &s->gen_op_buf[oi]; + *new_op = (TCGOp){ + .opc = opc, + .args = pi, + .prev = prev, + .next = next + }; + s->gen_op_buf[prev].next = oi; + old_op->prev = oi; + + return new_op; +} + +TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *old_op, + TCGOpcode opc, int nargs) +{ + int oi = s->gen_next_op_idx; + int pi = s->gen_next_parm_idx; + int prev = old_op - s->gen_op_buf; + int next = old_op->next; + TCGOp *new_op; + + tcg_debug_assert(oi < OPC_BUF_SIZE); + tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE); + s->gen_next_op_idx = oi + 1; + s->gen_next_parm_idx = pi + nargs; + + new_op = &s->gen_op_buf[oi]; + *new_op = (TCGOp){ + .opc = opc, + .args = pi, + .prev = prev, + .next = next + }; + s->gen_op_buf[next].prev = oi; + old_op->next = oi; + + return new_op; +} + +#define TS_DEAD 1 +#define TS_MEM 2 + +#define IS_DEAD_ARG(n) (arg_life & (DEAD_ARG << (n))) +#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n))) + /* liveness analysis: end of function: all temps are dead, and globals should be in memory. */ -static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps, - uint8_t *mem_temps) +static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state) { - memset(dead_temps, 1, s->nb_temps); - memset(mem_temps, 1, s->nb_globals); - memset(mem_temps + s->nb_globals, 0, s->nb_temps - s->nb_globals); + memset(temp_state, TS_DEAD | TS_MEM, s->nb_globals); + memset(temp_state + s->nb_globals, TS_DEAD, s->nb_temps - s->nb_globals); } /* liveness analysis: end of basic block: all temps are dead, globals and local temps should be in memory. */ -static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps, - uint8_t *mem_temps) +static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state) { - int i; + int i, n; - memset(dead_temps, 1, s->nb_temps); - memset(mem_temps, 1, s->nb_globals); - for(i = s->nb_globals; i < s->nb_temps; i++) { - mem_temps[i] = s->temps[i].temp_local; + tcg_la_func_end(s, temp_state); + for (i = s->nb_globals, n = s->nb_temps; i < n; i++) { + if (s->temps[i].temp_local) { + temp_state[i] |= TS_MEM; + } } } -/* Liveness analysis : update the opc_dead_args array to tell if a +/* Liveness analysis : update the opc_arg_life array to tell if a given input arguments is dead. Instructions updating dead temporaries are removed. */ -static void tcg_liveness_analysis(TCGContext *s) +static void liveness_pass_1(TCGContext *s, uint8_t *temp_state) { - uint8_t *dead_temps, *mem_temps; - int oi, oi_prev, nb_ops; + int nb_globals = s->nb_globals; + int oi, oi_prev; - nb_ops = s->gen_next_op_idx; - s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t)); - s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t)); - - dead_temps = tcg_malloc(s->nb_temps); - mem_temps = tcg_malloc(s->nb_temps); - tcg_la_func_end(s, dead_temps, mem_temps); + tcg_la_func_end(s, temp_state); - for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) { + for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) { int i, nb_iargs, nb_oargs; TCGOpcode opc_new, opc_new2; bool have_opc_new2; - uint16_t dead_args; - uint8_t sync_args; + TCGLifeData arg_life = 0; TCGArg arg; TCGOp * const op = &s->gen_op_buf[oi]; @@ -1385,7 +1459,7 @@ static void tcg_liveness_analysis(TCGContext *s) if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) { for (i = 0; i < nb_oargs; i++) { arg = args[i]; - if (!dead_temps[arg] || mem_temps[arg]) { + if (temp_state[arg] != TS_DEAD) { goto do_not_remove_call; } } @@ -1394,46 +1468,44 @@ static void tcg_liveness_analysis(TCGContext *s) do_not_remove_call: /* output args are dead */ - dead_args = 0; - sync_args = 0; for (i = 0; i < nb_oargs; i++) { arg = args[i]; - if (dead_temps[arg]) { - dead_args |= (1 << i); + if (temp_state[arg] & TS_DEAD) { + arg_life |= DEAD_ARG << i; } - if (mem_temps[arg]) { - sync_args |= (1 << i); + if (temp_state[arg] & TS_MEM) { + arg_life |= SYNC_ARG << i; } - dead_temps[arg] = 1; - mem_temps[arg] = 0; + temp_state[arg] = TS_DEAD; } - if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) { - /* globals should be synced to memory */ - memset(mem_temps, 1, s->nb_globals); - } if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS | TCG_CALL_NO_READ_GLOBALS))) { /* globals should go back to memory */ - memset(dead_temps, 1, s->nb_globals); + memset(temp_state, TS_DEAD | TS_MEM, nb_globals); + } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) { + /* globals should be synced to memory */ + for (i = 0; i < nb_globals; i++) { + temp_state[i] |= TS_MEM; + } } /* record arguments that die in this helper */ for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { arg = args[i]; if (arg != TCG_CALL_DUMMY_ARG) { - if (dead_temps[arg]) { - dead_args |= (1 << i); + if (temp_state[arg] & TS_DEAD) { + arg_life |= DEAD_ARG << i; } } } /* input arguments are live for preceding opcodes */ - for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { + for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { arg = args[i]; - dead_temps[arg] = 0; + if (arg != TCG_CALL_DUMMY_ARG) { + temp_state[arg] &= ~TS_DEAD; + } } - s->op_dead_args[oi] = dead_args; - s->op_sync_args[oi] = sync_args; } } break; @@ -1441,8 +1513,7 @@ static void tcg_liveness_analysis(TCGContext *s) break; case INDEX_op_discard: /* mark the temporary as dead */ - dead_temps[args[0]] = 1; - mem_temps[args[0]] = 0; + temp_state[args[0]] = TS_DEAD; break; case INDEX_op_add2_i32: @@ -1463,8 +1534,8 @@ static void tcg_liveness_analysis(TCGContext *s) the low part. The result can be optimized to a simple add or sub. This happens often for x86_64 guest when the cpu mode is set to 32 bit. */ - if (dead_temps[args[1]] && !mem_temps[args[1]]) { - if (dead_temps[args[0]] && !mem_temps[args[0]]) { + if (temp_state[args[1]] == TS_DEAD) { + if (temp_state[args[0]] == TS_DEAD) { goto do_remove; } /* Replace the opcode and adjust the args in place, @@ -1501,8 +1572,8 @@ static void tcg_liveness_analysis(TCGContext *s) do_mul2: nb_iargs = 2; nb_oargs = 2; - if (dead_temps[args[1]] && !mem_temps[args[1]]) { - if (dead_temps[args[0]] && !mem_temps[args[0]]) { + if (temp_state[args[1]] == TS_DEAD) { + if (temp_state[args[0]] == TS_DEAD) { /* Both parts of the operation are dead. */ goto do_remove; } @@ -1510,8 +1581,7 @@ static void tcg_liveness_analysis(TCGContext *s) op->opc = opc = opc_new; args[1] = args[2]; args[2] = args[3]; - } else if (have_opc_new2 && dead_temps[args[0]] - && !mem_temps[args[0]]) { + } else if (temp_state[args[0]] == TS_DEAD && have_opc_new2) { /* The low part of the operation is dead; generate the high. */ op->opc = opc = opc_new2; args[0] = args[1]; @@ -1534,8 +1604,7 @@ static void tcg_liveness_analysis(TCGContext *s) implies side effects */ if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) { for (i = 0; i < nb_oargs; i++) { - arg = args[i]; - if (!dead_temps[arg] || mem_temps[arg]) { + if (temp_state[args[i]] != TS_DEAD) { goto do_not_remove; } } @@ -1544,59 +1613,203 @@ static void tcg_liveness_analysis(TCGContext *s) } else { do_not_remove: /* output args are dead */ - dead_args = 0; - sync_args = 0; for (i = 0; i < nb_oargs; i++) { arg = args[i]; - if (dead_temps[arg]) { - dead_args |= (1 << i); + if (temp_state[arg] & TS_DEAD) { + arg_life |= DEAD_ARG << i; } - if (mem_temps[arg]) { - sync_args |= (1 << i); + if (temp_state[arg] & TS_MEM) { + arg_life |= SYNC_ARG << i; } - dead_temps[arg] = 1; - mem_temps[arg] = 0; + temp_state[arg] = TS_DEAD; } /* if end of basic block, update */ if (def->flags & TCG_OPF_BB_END) { - tcg_la_bb_end(s, dead_temps, mem_temps); + tcg_la_bb_end(s, temp_state); } else if (def->flags & TCG_OPF_SIDE_EFFECTS) { /* globals should be synced to memory */ - memset(mem_temps, 1, s->nb_globals); + for (i = 0; i < nb_globals; i++) { + temp_state[i] |= TS_MEM; + } } /* record arguments that die in this opcode */ for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { arg = args[i]; - if (dead_temps[arg]) { - dead_args |= (1 << i); + if (temp_state[arg] & TS_DEAD) { + arg_life |= DEAD_ARG << i; } } /* input arguments are live for preceding opcodes */ for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { - arg = args[i]; - dead_temps[arg] = 0; + temp_state[args[i]] &= ~TS_DEAD; } - s->op_dead_args[oi] = dead_args; - s->op_sync_args[oi] = sync_args; } break; } + op->life = arg_life; } } -#else -/* dummy liveness analysis */ -static void tcg_liveness_analysis(TCGContext *s) + +/* Liveness analysis: Convert indirect regs to direct temporaries. */ +static bool liveness_pass_2(TCGContext *s, uint8_t *temp_state) { - int nb_ops = s->gen_next_op_idx; + int nb_globals = s->nb_globals; + int16_t *dir_temps; + int i, oi, oi_next; + bool changes = false; + + dir_temps = tcg_malloc(nb_globals * sizeof(int16_t)); + memset(dir_temps, 0, nb_globals * sizeof(int16_t)); + + /* Create a temporary for each indirect global. */ + for (i = 0; i < nb_globals; ++i) { + TCGTemp *its = &s->temps[i]; + if (its->indirect_reg) { + TCGTemp *dts = tcg_temp_alloc(s); + dts->type = its->type; + dts->base_type = its->base_type; + dir_temps[i] = temp_idx(s, dts); + } + } + + memset(temp_state, TS_DEAD, nb_globals); + + for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) { + TCGOp *op = &s->gen_op_buf[oi]; + TCGArg *args = &s->gen_opparam_buf[op->args]; + TCGOpcode opc = op->opc; + const TCGOpDef *def = &tcg_op_defs[opc]; + TCGLifeData arg_life = op->life; + int nb_iargs, nb_oargs, call_flags; + TCGArg arg, dir; + + oi_next = op->next; + + if (opc == INDEX_op_call) { + nb_oargs = op->callo; + nb_iargs = op->calli; + call_flags = args[nb_oargs + nb_iargs + 1]; + } else { + nb_iargs = def->nb_iargs; + nb_oargs = def->nb_oargs; + + /* Set flags similar to how calls require. */ + if (def->flags & TCG_OPF_BB_END) { + /* Like writing globals: save_globals */ + call_flags = 0; + } else if (def->flags & TCG_OPF_SIDE_EFFECTS) { + /* Like reading globals: sync_globals */ + call_flags = TCG_CALL_NO_WRITE_GLOBALS; + } else { + /* No effect on globals. */ + call_flags = (TCG_CALL_NO_READ_GLOBALS | + TCG_CALL_NO_WRITE_GLOBALS); + } + } + + /* Make sure that input arguments are available. */ + for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { + arg = args[i]; + /* Note this unsigned test catches TCG_CALL_ARG_DUMMY too. */ + if (arg < nb_globals) { + dir = dir_temps[arg]; + if (dir != 0 && temp_state[arg] == TS_DEAD) { + TCGTemp *its = &s->temps[arg]; + TCGOpcode lopc = (its->type == TCG_TYPE_I32 + ? INDEX_op_ld_i32 + : INDEX_op_ld_i64); + TCGOp *lop = tcg_op_insert_before(s, op, lopc, 3); + TCGArg *largs = &s->gen_opparam_buf[lop->args]; + + largs[0] = dir; + largs[1] = temp_idx(s, its->mem_base); + largs[2] = its->mem_offset; + + /* Loaded, but synced with memory. */ + temp_state[arg] = TS_MEM; + } + } + } + + /* Perform input replacement, and mark inputs that became dead. + No action is required except keeping temp_state up to date + so that we reload when needed. */ + for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { + arg = args[i]; + if (arg < nb_globals) { + dir = dir_temps[arg]; + if (dir != 0) { + args[i] = dir; + changes = true; + if (IS_DEAD_ARG(i)) { + temp_state[arg] = TS_DEAD; + } + } + } + } + + /* Liveness analysis should ensure that the following are + all correct, for call sites and basic block end points. */ + if (call_flags & TCG_CALL_NO_READ_GLOBALS) { + /* Nothing to do */ + } else if (call_flags & TCG_CALL_NO_WRITE_GLOBALS) { + for (i = 0; i < nb_globals; ++i) { + /* Liveness should see that globals are synced back, + that is, either TS_DEAD or TS_MEM. */ + tcg_debug_assert(dir_temps[i] == 0 + || temp_state[i] != 0); + } + } else { + for (i = 0; i < nb_globals; ++i) { + /* Liveness should see that globals are saved back, + that is, TS_DEAD, waiting to be reloaded. */ + tcg_debug_assert(dir_temps[i] == 0 + || temp_state[i] == TS_DEAD); + } + } + + /* Outputs become available. */ + for (i = 0; i < nb_oargs; i++) { + arg = args[i]; + if (arg >= nb_globals) { + continue; + } + dir = dir_temps[arg]; + if (dir == 0) { + continue; + } + args[i] = dir; + changes = true; + + /* The output is now live and modified. */ + temp_state[arg] = 0; + + /* Sync outputs upon their last write. */ + if (NEED_SYNC_ARG(i)) { + TCGTemp *its = &s->temps[arg]; + TCGOpcode sopc = (its->type == TCG_TYPE_I32 + ? INDEX_op_st_i32 + : INDEX_op_st_i64); + TCGOp *sop = tcg_op_insert_after(s, op, sopc, 3); + TCGArg *sargs = &s->gen_opparam_buf[sop->args]; + + sargs[0] = dir; + sargs[1] = temp_idx(s, its->mem_base); + sargs[2] = its->mem_offset; + + temp_state[arg] = TS_MEM; + } + /* Drop outputs that are dead. */ + if (IS_DEAD_ARG(i)) { + temp_state[arg] = TS_DEAD; + } + } + } - s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t)); - memset(s->op_dead_args, 0, nb_ops * sizeof(uint16_t)); - s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t)); - memset(s->op_sync_args, 0, nb_ops * sizeof(uint8_t)); + return changes; } -#endif #ifdef CONFIG_DEBUG_TCG static void dump_regs(TCGContext *s) @@ -1728,14 +1941,6 @@ static void temp_sync(TCGContext *s, TCGTemp *ts, if (!ts->mem_allocated) { temp_allocate_frame(s, temp_idx(s, ts)); } - if (ts->indirect_reg) { - if (ts->val_type == TEMP_VAL_REG) { - tcg_regset_set_reg(allocated_regs, ts->reg); - } - temp_load(s, ts->mem_base, - tcg_target_available_regs[TCG_TYPE_PTR], - allocated_regs); - } switch (ts->val_type) { case TEMP_VAL_CONST: /* If we're going to free the temp immediately, then we won't @@ -1826,12 +2031,6 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs, break; case TEMP_VAL_MEM: reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base); - if (ts->indirect_reg) { - tcg_regset_set_reg(allocated_regs, reg); - temp_load(s, ts->mem_base, - tcg_target_available_regs[TCG_TYPE_PTR], - allocated_regs); - } tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset); ts->mem_coherent = 1; break; @@ -1848,16 +2047,9 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs, temporary registers needs to be allocated to store a constant. */ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs) { -#ifdef USE_LIVENESS_ANALYSIS - /* ??? Liveness does not yet incorporate indirect bases. */ - if (!ts->indirect_base) { - /* The liveness analysis already ensures that globals are back - in memory. Keep an tcg_debug_assert for safety. */ - tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg); - return; - } -#endif - temp_sync(s, ts, allocated_regs, 1); + /* The liveness analysis already ensures that globals are back + in memory. Keep an tcg_debug_assert for safety. */ + tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg); } /* save globals to their canonical location and assume they can be @@ -1881,16 +2073,9 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs) for (i = 0; i < s->nb_globals; i++) { TCGTemp *ts = &s->temps[i]; -#ifdef USE_LIVENESS_ANALYSIS - /* ??? Liveness does not yet incorporate indirect bases. */ - if (!ts->indirect_base) { - tcg_debug_assert(ts->val_type != TEMP_VAL_REG - || ts->fixed_reg - || ts->mem_coherent); - continue; - } -#endif - temp_sync(s, ts, allocated_regs, 0); + tcg_debug_assert(ts->val_type != TEMP_VAL_REG + || ts->fixed_reg + || ts->mem_coherent); } } @@ -1905,27 +2090,17 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) if (ts->temp_local) { temp_save(s, ts, allocated_regs); } else { -#ifdef USE_LIVENESS_ANALYSIS - /* ??? Liveness does not yet incorporate indirect bases. */ - if (!ts->indirect_base) { - /* The liveness analysis already ensures that temps are dead. - Keep an tcg_debug_assert for safety. */ - tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD); - continue; - } -#endif - temp_dead(s, ts); + /* The liveness analysis already ensures that temps are dead. + Keep an tcg_debug_assert for safety. */ + tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD); } } save_globals(s, allocated_regs); } -#define IS_DEAD_ARG(n) ((dead_args >> (n)) & 1) -#define NEED_SYNC_ARG(n) ((sync_args >> (n)) & 1) - static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args, - uint16_t dead_args, uint8_t sync_args) + TCGLifeData arg_life) { TCGTemp *ots; tcg_target_ulong val; @@ -1954,8 +2129,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args, } static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, - const TCGArg *args, uint16_t dead_args, - uint8_t sync_args) + const TCGArg *args, TCGLifeData arg_life) { TCGRegSet allocated_regs; TCGTemp *ts, *ots; @@ -1987,12 +2161,6 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, if (!ots->mem_allocated) { temp_allocate_frame(s, args[0]); } - if (ots->indirect_reg) { - tcg_regset_set_reg(allocated_regs, ts->reg); - temp_load(s, ots->mem_base, - tcg_target_available_regs[TCG_TYPE_PTR], - allocated_regs); - } tcg_out_st(s, otype, ts->reg, ots->mem_base->reg, ots->mem_offset); if (IS_DEAD_ARG(1)) { temp_dead(s, ts); @@ -2040,8 +2208,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, static void tcg_reg_alloc_op(TCGContext *s, const TCGOpDef *def, TCGOpcode opc, - const TCGArg *args, uint16_t dead_args, - uint8_t sync_args) + const TCGArg *args, TCGLifeData arg_life) { TCGRegSet allocated_regs; int i, k, nb_iargs, nb_oargs; @@ -2206,8 +2373,7 @@ static void tcg_reg_alloc_op(TCGContext *s, #endif static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs, - const TCGArg * const args, uint16_t dead_args, - uint8_t sync_args) + const TCGArg * const args, TCGLifeData arg_life) { int flags, nb_regs, i; TCGReg reg; @@ -2363,7 +2529,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) { int n; - n = s->gen_last_op_idx + 1; + n = s->gen_op_buf[0].prev + 1; s->op_count += n; if (n > s->op_count_max) { s->op_count_max = n; @@ -2399,7 +2565,27 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) s->la_time -= profile_getclock(); #endif - tcg_liveness_analysis(s); + { + uint8_t *temp_state = tcg_malloc(s->nb_temps + s->nb_indirects); + + liveness_pass_1(s, temp_state); + + if (s->nb_indirects > 0) { +#ifdef DEBUG_DISAS + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND) + && qemu_log_in_addr_range(tb->pc))) { + qemu_log("OP before indirect lowering:\n"); + tcg_dump_ops(s); + qemu_log("\n"); + } +#endif + /* Replace indirect temps with direct temps. */ + if (liveness_pass_2(s, temp_state)) { + /* If changes were made, re-run liveness. */ + liveness_pass_1(s, temp_state); + } + } + } #ifdef CONFIG_PROFILER s->la_time += profile_getclock(); @@ -2422,13 +2608,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) tcg_out_tb_init(s); num_insns = -1; - for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) { + for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) { TCGOp * const op = &s->gen_op_buf[oi]; TCGArg * const args = &s->gen_opparam_buf[op->args]; TCGOpcode opc = op->opc; const TCGOpDef *def = &tcg_op_defs[opc]; - uint16_t dead_args = s->op_dead_args[oi]; - uint8_t sync_args = s->op_sync_args[oi]; + TCGLifeData arg_life = op->life; oi_next = op->next; #ifdef CONFIG_PROFILER @@ -2438,11 +2623,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) switch (opc) { case INDEX_op_mov_i32: case INDEX_op_mov_i64: - tcg_reg_alloc_mov(s, def, args, dead_args, sync_args); + tcg_reg_alloc_mov(s, def, args, arg_life); break; case INDEX_op_movi_i32: case INDEX_op_movi_i64: - tcg_reg_alloc_movi(s, args, dead_args, sync_args); + tcg_reg_alloc_movi(s, args, arg_life); break; case INDEX_op_insn_start: if (num_insns >= 0) { @@ -2467,8 +2652,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) tcg_out_label(s, arg_label(args[0]), s->code_ptr); break; case INDEX_op_call: - tcg_reg_alloc_call(s, op->callo, op->calli, args, - dead_args, sync_args); + tcg_reg_alloc_call(s, op->callo, op->calli, args, arg_life); break; default: /* Sanity check that we've not introduced any unhandled opcodes. */ @@ -2478,7 +2662,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) /* Note: in order to speed up the code, it would be much faster to have specialized register allocator functions for some common argument patterns */ - tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args); + tcg_reg_alloc_op(s, def, opc, args, arg_life); break; } #ifdef CONFIG_DEBUG_TCG @@ -575,24 +575,41 @@ typedef struct TCGTempSet { unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)]; } TCGTempSet; +/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding, + this imples a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands. + There are never more than 2 outputs, which means that we can store all + dead + sync data within 16 bits. */ +#define DEAD_ARG 4 +#define SYNC_ARG 1 +typedef uint16_t TCGLifeData; + +/* The layout here is designed to avoid crossing of a 32-bit boundary. + If we do so, gcc adds padding, expanding the size to 12. */ typedef struct TCGOp { - TCGOpcode opc : 8; + TCGOpcode opc : 8; /* 8 */ + + /* Index of the prev/next op, or 0 for the end of the list. */ + unsigned prev : 10; /* 18 */ + unsigned next : 10; /* 28 */ /* The number of out and in parameter for a call. */ - unsigned callo : 2; - unsigned calli : 6; + unsigned calli : 4; /* 32 */ + unsigned callo : 2; /* 34 */ - /* Index of the arguments for this op, or -1 for zero-operand ops. */ - signed args : 16; + /* Index of the arguments for this op, or 0 for zero-operand ops. */ + unsigned args : 14; /* 48 */ - /* Index of the prex/next op, or -1 for the end of the list. */ - signed prev : 16; - signed next : 16; + /* Lifetime data of the operands. */ + unsigned life : 16; /* 64 */ } TCGOp; -QEMU_BUILD_BUG_ON(NB_OPS > 0xff); -QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff); -QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff); +/* Make sure operands fit in the bitfields above. */ +QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); +QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 10)); +QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14)); + +/* Make sure that we don't overflow 64 bits without noticing. */ +QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8); struct TCGContext { uint8_t *pool_cur, *pool_end; @@ -600,6 +617,7 @@ struct TCGContext { int nb_labels; int nb_globals; int nb_temps; + int nb_indirects; /* goto_tb support */ tcg_insn_unit *code_buf; @@ -607,13 +625,6 @@ struct TCGContext { uint16_t *tb_jmp_insn_offset; /* tb->jmp_insn_offset if USE_DIRECT_JUMP */ uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */ - /* liveness analysis */ - uint16_t *op_dead_args; /* for each operation, each bit tells if the - corresponding argument is dead */ - uint8_t *op_sync_args; /* for each operation, each bit tells if the - corresponding output argument needs to be - sync to memory. */ - TCGRegSet reserved_regs; intptr_t current_frame_offset; intptr_t frame_start; @@ -649,8 +660,6 @@ struct TCGContext { int goto_tb_issue_mask; #endif - int gen_first_op_idx; - int gen_last_op_idx; int gen_next_op_idx; int gen_next_parm_idx; @@ -890,6 +899,9 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret, int nargs, TCGArg *args); void tcg_op_remove(TCGContext *s, TCGOp *op); +TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg); +TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg); + void tcg_optimize(TCGContext *s); /* only used for debugging purposes */ @@ -32,15 +32,22 @@ int qemu_loglevel; static int log_append = 0; static GArray *debug_regions; -void qemu_log(const char *fmt, ...) +/* Return the number of characters emitted. */ +int qemu_log(const char *fmt, ...) { - va_list ap; - - va_start(ap, fmt); + int ret = 0; if (qemu_logfile) { - vfprintf(qemu_logfile, fmt, ap); + va_list ap; + va_start(ap, fmt); + ret = vfprintf(qemu_logfile, fmt, ap); + va_end(ap); + + /* Don't pass back error results. */ + if (ret < 0) { + ret = 0; + } } - va_end(ap); + return ret; } static bool log_uses_own_buffers; @@ -240,8 +247,9 @@ const QEMULogItem qemu_log_items[] = { { CPU_LOG_TB_OP, "op", "show micro ops for each compiled TB" }, { CPU_LOG_TB_OP_OPT, "op_opt", - "show micro ops (x86 only: before eflags optimization) and\n" - "after liveness analysis" }, + "show micro ops after optimization" }, + { CPU_LOG_TB_OP_IND, "op_ind", + "show micro ops before indirect lowering" }, { CPU_LOG_INT, "int", "show interrupts/exceptions in short format" }, { CPU_LOG_EXEC, "exec", |