diff options
-rw-r--r-- | accel/tcg/cpu-exec.c | 111 | ||||
-rw-r--r-- | accel/tcg/debuginfo.c | 96 | ||||
-rw-r--r-- | accel/tcg/debuginfo.h | 77 | ||||
-rw-r--r-- | accel/tcg/meson.build | 2 | ||||
-rw-r--r-- | accel/tcg/perf.c | 375 | ||||
-rw-r--r-- | accel/tcg/perf.h | 49 | ||||
-rw-r--r-- | accel/tcg/translate-all.c | 7 | ||||
-rw-r--r-- | docs/devel/tcg.rst | 23 | ||||
-rw-r--r-- | hw/core/loader.c | 5 | ||||
-rw-r--r-- | linux-user/elfload.c | 3 | ||||
-rw-r--r-- | linux-user/exit.c | 2 | ||||
-rw-r--r-- | linux-user/main.c | 15 | ||||
-rw-r--r-- | linux-user/meson.build | 1 | ||||
-rw-r--r-- | linux-user/signal.c | 8 | ||||
-rw-r--r-- | meson.build | 16 | ||||
-rw-r--r-- | qemu-options.hx | 20 | ||||
-rw-r--r-- | softmmu/vl.c | 11 | ||||
-rw-r--r-- | tcg/tcg.c | 2 | ||||
-rw-r--r-- | util/bufferiszero.c | 41 |
19 files changed, 763 insertions, 101 deletions
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 356fe34..8927092 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -909,64 +909,10 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, /* main execution loop */ -int cpu_exec(CPUState *cpu) +static int __attribute__((noinline)) +cpu_exec_loop(CPUState *cpu, SyncClocks *sc) { int ret; - SyncClocks sc = { 0 }; - - /* replay_interrupt may need current_cpu */ - current_cpu = cpu; - - if (cpu_handle_halt(cpu)) { - return EXCP_HALTED; - } - - rcu_read_lock(); - - cpu_exec_enter(cpu); - - /* Calculate difference between guest clock and host clock. - * This delay includes the delay of the last cycle, so - * what we have to do is sleep until it is 0. As for the - * advance/delay we gain here, we try to fix it next time. - */ - init_delay_params(&sc, cpu); - - /* prepare setjmp context for exception handling */ - if (sigsetjmp(cpu->jmp_env, 0) != 0) { -#if defined(__clang__) - /* - * Some compilers wrongly smash all local variables after - * siglongjmp (the spec requires that only non-volatile locals - * which are changed between the sigsetjmp and siglongjmp are - * permitted to be trashed). There were bug reports for gcc - * 4.5.0 and clang. The bug is fixed in all versions of gcc - * that we support, but is still unfixed in clang: - * https://bugs.llvm.org/show_bug.cgi?id=21183 - * - * Reload an essential local variable here for those compilers. - * Newer versions of gcc would complain about this code (-Wclobbered), - * so we only perform the workaround for clang. - */ - cpu = current_cpu; -#else - /* Non-buggy compilers preserve this; assert the correct value. */ - g_assert(cpu == current_cpu); -#endif - -#ifndef CONFIG_SOFTMMU - clear_helper_retaddr(); - if (have_mmap_lock()) { - mmap_unlock(); - } -#endif - if (qemu_mutex_iothread_locked()) { - qemu_mutex_unlock_iothread(); - } - qemu_plugin_disable_mem_helpers(cpu); - - assert_no_pages_locked(); - } /* if an exception is pending, we execute it here */ while (!cpu_handle_exception(cpu, &ret)) { @@ -1033,9 +979,60 @@ int cpu_exec(CPUState *cpu) /* Try to align the host and virtual clocks if the guest is in advance */ - align_clocks(&sc, cpu); + align_clocks(sc, cpu); } } + return ret; +} + +static int cpu_exec_setjmp(CPUState *cpu, SyncClocks *sc) +{ + /* Prepare setjmp context for exception handling. */ + if (unlikely(sigsetjmp(cpu->jmp_env, 0) != 0)) { + /* Non-buggy compilers preserve this; assert the correct value. */ + g_assert(cpu == current_cpu); + +#ifndef CONFIG_SOFTMMU + clear_helper_retaddr(); + if (have_mmap_lock()) { + mmap_unlock(); + } +#endif + if (qemu_mutex_iothread_locked()) { + qemu_mutex_unlock_iothread(); + } + qemu_plugin_disable_mem_helpers(cpu); + + assert_no_pages_locked(); + } + + return cpu_exec_loop(cpu, sc); +} + +int cpu_exec(CPUState *cpu) +{ + int ret; + SyncClocks sc = { 0 }; + + /* replay_interrupt may need current_cpu */ + current_cpu = cpu; + + if (cpu_handle_halt(cpu)) { + return EXCP_HALTED; + } + + rcu_read_lock(); + cpu_exec_enter(cpu); + + /* + * Calculate difference between guest clock and host clock. + * This delay includes the delay of the last cycle, so + * what we have to do is sleep until it is 0. As for the + * advance/delay we gain here, we try to fix it next time. + */ + init_delay_params(&sc, cpu); + + ret = cpu_exec_setjmp(cpu, &sc); cpu_exec_exit(cpu); rcu_read_unlock(); diff --git a/accel/tcg/debuginfo.c b/accel/tcg/debuginfo.c new file mode 100644 index 0000000..71c66d0 --- /dev/null +++ b/accel/tcg/debuginfo.c @@ -0,0 +1,96 @@ +/* + * Debug information support. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/lockable.h" + +#include <elfutils/libdwfl.h> + +#include "debuginfo.h" + +static QemuMutex lock; +static Dwfl *dwfl; +static const Dwfl_Callbacks dwfl_callbacks = { + .find_elf = NULL, + .find_debuginfo = dwfl_standard_find_debuginfo, + .section_address = NULL, + .debuginfo_path = NULL, +}; + +__attribute__((constructor)) +static void debuginfo_init(void) +{ + qemu_mutex_init(&lock); +} + +void debuginfo_report_elf(const char *name, int fd, uint64_t bias) +{ + QEMU_LOCK_GUARD(&lock); + + if (dwfl) { + dwfl_report_begin_add(dwfl); + } else { + dwfl = dwfl_begin(&dwfl_callbacks); + } + + if (dwfl) { + dwfl_report_elf(dwfl, name, name, fd, bias, true); + dwfl_report_end(dwfl, NULL, NULL); + } +} + +void debuginfo_lock(void) +{ + qemu_mutex_lock(&lock); +} + +void debuginfo_query(struct debuginfo_query *q, size_t n) +{ + const char *symbol, *file; + Dwfl_Module *dwfl_module; + Dwfl_Line *dwfl_line; + GElf_Off dwfl_offset; + GElf_Sym dwfl_sym; + size_t i; + int line; + + if (!dwfl) { + return; + } + + for (i = 0; i < n; i++) { + dwfl_module = dwfl_addrmodule(dwfl, q[i].address); + if (!dwfl_module) { + continue; + } + + if (q[i].flags & DEBUGINFO_SYMBOL) { + symbol = dwfl_module_addrinfo(dwfl_module, q[i].address, + &dwfl_offset, &dwfl_sym, + NULL, NULL, NULL); + if (symbol) { + q[i].symbol = symbol; + q[i].offset = dwfl_offset; + } + } + + if (q[i].flags & DEBUGINFO_LINE) { + dwfl_line = dwfl_module_getsrc(dwfl_module, q[i].address); + if (dwfl_line) { + file = dwfl_lineinfo(dwfl_line, NULL, &line, 0, NULL, NULL); + if (file) { + q[i].file = file; + q[i].line = line; + } + } + } + } +} + +void debuginfo_unlock(void) +{ + qemu_mutex_unlock(&lock); +} diff --git a/accel/tcg/debuginfo.h b/accel/tcg/debuginfo.h new file mode 100644 index 0000000..7542cfe --- /dev/null +++ b/accel/tcg/debuginfo.h @@ -0,0 +1,77 @@ +/* + * Debug information support. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef ACCEL_TCG_DEBUGINFO_H +#define ACCEL_TCG_DEBUGINFO_H + +/* + * Debuginfo describing a certain address. + */ +struct debuginfo_query { + uint64_t address; /* Input: address. */ + int flags; /* Input: debuginfo subset. */ + const char *symbol; /* Symbol that the address is part of. */ + uint64_t offset; /* Offset from the symbol. */ + const char *file; /* Source file associated with the address. */ + int line; /* Line number in the source file. */ +}; + +/* + * Debuginfo subsets. + */ +#define DEBUGINFO_SYMBOL BIT(1) +#define DEBUGINFO_LINE BIT(2) + +#if defined(CONFIG_TCG) && defined(CONFIG_LIBDW) +/* + * Load debuginfo for the specified guest ELF image. + * Return true on success, false on failure. + */ +void debuginfo_report_elf(const char *name, int fd, uint64_t bias); + +/* + * Take the debuginfo lock. + */ +void debuginfo_lock(void); + +/* + * Fill each on N Qs with the debuginfo about Q->ADDRESS as specified by + * Q->FLAGS: + * + * - DEBUGINFO_SYMBOL: update Q->SYMBOL and Q->OFFSET. If symbol debuginfo is + * missing, then leave them as is. + * - DEBUINFO_LINE: update Q->FILE and Q->LINE. If line debuginfo is missing, + * then leave them as is. + * + * This function must be called under the debuginfo lock. The results can be + * accessed only until the debuginfo lock is released. + */ +void debuginfo_query(struct debuginfo_query *q, size_t n); + +/* + * Release the debuginfo lock. + */ +void debuginfo_unlock(void); +#else +static inline void debuginfo_report_elf(const char *image_name, int image_fd, + uint64_t load_bias) +{ +} + +static inline void debuginfo_lock(void) +{ +} + +static inline void debuginfo_query(struct debuginfo_query *q, size_t n) +{ +} + +static inline void debuginfo_unlock(void) +{ +} +#endif + +#endif diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build index 75e1dff..77740b1 100644 --- a/accel/tcg/meson.build +++ b/accel/tcg/meson.build @@ -12,6 +12,8 @@ tcg_ss.add(files( tcg_ss.add(when: 'CONFIG_USER_ONLY', if_true: files('user-exec.c')) tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c')) tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c')]) +tcg_ss.add(when: libdw, if_true: files('debuginfo.c')) +tcg_ss.add(when: 'CONFIG_LINUX', if_true: files('perf.c')) specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss) specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files( diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c new file mode 100644 index 0000000..ae19f6e --- /dev/null +++ b/accel/tcg/perf.c @@ -0,0 +1,375 @@ +/* + * Linux perf perf-<pid>.map and jit-<pid>.dump integration. + * + * The jitdump spec can be found at [1]. + * + * [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/perf/Documentation/jitdump-specification.txt + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "elf.h" +#include "exec/exec-all.h" +#include "qemu/timer.h" +#include "tcg/tcg.h" + +#include "debuginfo.h" +#include "perf.h" + +static FILE *safe_fopen_w(const char *path) +{ + int saved_errno; + FILE *f; + int fd; + + /* Delete the old file, if any. */ + unlink(path); + + /* Avoid symlink attacks by using O_CREAT | O_EXCL. */ + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd == -1) { + return NULL; + } + + /* Convert fd to FILE*. */ + f = fdopen(fd, "w"); + if (f == NULL) { + saved_errno = errno; + close(fd); + errno = saved_errno; + return NULL; + } + + return f; +} + +static FILE *perfmap; + +void perf_enable_perfmap(void) +{ + char map_file[32]; + + snprintf(map_file, sizeof(map_file), "/tmp/perf-%d.map", getpid()); + perfmap = safe_fopen_w(map_file); + if (perfmap == NULL) { + warn_report("Could not open %s: %s, proceeding without perfmap", + map_file, strerror(errno)); + } +} + +/* Get PC and size of code JITed for guest instruction #INSN. */ +static void get_host_pc_size(uintptr_t *host_pc, uint16_t *host_size, + const void *start, size_t insn) +{ + uint16_t start_off = insn ? tcg_ctx->gen_insn_end_off[insn - 1] : 0; + + if (host_pc) { + *host_pc = (uintptr_t)start + start_off; + } + if (host_size) { + *host_size = tcg_ctx->gen_insn_end_off[insn] - start_off; + } +} + +static const char *pretty_symbol(const struct debuginfo_query *q, size_t *len) +{ + static __thread char buf[64]; + int tmp; + + if (!q->symbol) { + tmp = snprintf(buf, sizeof(buf), "guest-0x%"PRIx64, q->address); + if (len) { + *len = MIN(tmp + 1, sizeof(buf)); + } + return buf; + } + + if (!q->offset) { + if (len) { + *len = strlen(q->symbol) + 1; + } + return q->symbol; + } + + tmp = snprintf(buf, sizeof(buf), "%s+0x%"PRIx64, q->symbol, q->offset); + if (len) { + *len = MIN(tmp + 1, sizeof(buf)); + } + return buf; +} + +static void write_perfmap_entry(const void *start, size_t insn, + const struct debuginfo_query *q) +{ + uint16_t host_size; + uintptr_t host_pc; + + get_host_pc_size(&host_pc, &host_size, start, insn); + fprintf(perfmap, "%"PRIxPTR" %"PRIx16" %s\n", + host_pc, host_size, pretty_symbol(q, NULL)); +} + +static FILE *jitdump; + +#define JITHEADER_MAGIC 0x4A695444 +#define JITHEADER_VERSION 1 + +struct jitheader { + uint32_t magic; + uint32_t version; + uint32_t total_size; + uint32_t elf_mach; + uint32_t pad1; + uint32_t pid; + uint64_t timestamp; + uint64_t flags; +}; + +enum jit_record_type { + JIT_CODE_LOAD = 0, + JIT_CODE_DEBUG_INFO = 2, +}; + +struct jr_prefix { + uint32_t id; + uint32_t total_size; + uint64_t timestamp; +}; + +struct jr_code_load { + struct jr_prefix p; + + uint32_t pid; + uint32_t tid; + uint64_t vma; + uint64_t code_addr; + uint64_t code_size; + uint64_t code_index; +}; + +struct debug_entry { + uint64_t addr; + int lineno; + int discrim; + const char name[]; +}; + +struct jr_code_debug_info { + struct jr_prefix p; + + uint64_t code_addr; + uint64_t nr_entry; + struct debug_entry entries[]; +}; + +static uint32_t get_e_machine(void) +{ + Elf64_Ehdr elf_header; + FILE *exe; + size_t n; + + QEMU_BUILD_BUG_ON(offsetof(Elf32_Ehdr, e_machine) != + offsetof(Elf64_Ehdr, e_machine)); + + exe = fopen("/proc/self/exe", "r"); + if (exe == NULL) { + return EM_NONE; + } + + n = fread(&elf_header, sizeof(elf_header), 1, exe); + fclose(exe); + if (n != 1) { + return EM_NONE; + } + + return elf_header.e_machine; +} + +void perf_enable_jitdump(void) +{ + struct jitheader header; + char jitdump_file[32]; + void *perf_marker; + + if (!use_rt_clock) { + warn_report("CLOCK_MONOTONIC is not available, proceeding without jitdump"); + return; + } + + snprintf(jitdump_file, sizeof(jitdump_file), "jit-%d.dump", getpid()); + jitdump = safe_fopen_w(jitdump_file); + if (jitdump == NULL) { + warn_report("Could not open %s: %s, proceeding without jitdump", + jitdump_file, strerror(errno)); + return; + } + + /* + * `perf inject` will see that the mapped file name in the corresponding + * PERF_RECORD_MMAP or PERF_RECORD_MMAP2 event is of the form jit-%d.dump + * and will process it as a jitdump file. + */ + perf_marker = mmap(NULL, qemu_real_host_page_size(), PROT_READ | PROT_EXEC, + MAP_PRIVATE, fileno(jitdump), 0); + if (perf_marker == MAP_FAILED) { + warn_report("Could not map %s: %s, proceeding without jitdump", + jitdump_file, strerror(errno)); + fclose(jitdump); + jitdump = NULL; + return; + } + + header.magic = JITHEADER_MAGIC; + header.version = JITHEADER_VERSION; + header.total_size = sizeof(header); + header.elf_mach = get_e_machine(); + header.pad1 = 0; + header.pid = getpid(); + header.timestamp = get_clock(); + header.flags = 0; + fwrite(&header, sizeof(header), 1, jitdump); +} + +void perf_report_prologue(const void *start, size_t size) +{ + if (perfmap) { + fprintf(perfmap, "%"PRIxPTR" %zx tcg-prologue-buffer\n", + (uintptr_t)start, size); + } +} + +/* Write a JIT_CODE_DEBUG_INFO jitdump entry. */ +static void write_jr_code_debug_info(const void *start, + const struct debuginfo_query *q, + size_t icount) +{ + struct jr_code_debug_info rec; + struct debug_entry ent; + uintptr_t host_pc; + int insn; + + /* Write the header. */ + rec.p.id = JIT_CODE_DEBUG_INFO; + rec.p.total_size = sizeof(rec) + sizeof(ent) + 1; + rec.p.timestamp = get_clock(); + rec.code_addr = (uintptr_t)start; + rec.nr_entry = 1; + for (insn = 0; insn < icount; insn++) { + if (q[insn].file) { + rec.p.total_size += sizeof(ent) + strlen(q[insn].file) + 1; + rec.nr_entry++; + } + } + fwrite(&rec, sizeof(rec), 1, jitdump); + + /* Write the main debug entries. */ + for (insn = 0; insn < icount; insn++) { + if (q[insn].file) { + get_host_pc_size(&host_pc, NULL, start, insn); + ent.addr = host_pc; + ent.lineno = q[insn].line; + ent.discrim = 0; + fwrite(&ent, sizeof(ent), 1, jitdump); + fwrite(q[insn].file, strlen(q[insn].file) + 1, 1, jitdump); + } + } + + /* Write the trailing debug_entry. */ + ent.addr = (uintptr_t)start + tcg_ctx->gen_insn_end_off[icount - 1]; + ent.lineno = 0; + ent.discrim = 0; + fwrite(&ent, sizeof(ent), 1, jitdump); + fwrite("", 1, 1, jitdump); +} + +/* Write a JIT_CODE_LOAD jitdump entry. */ +static void write_jr_code_load(const void *start, uint16_t host_size, + const struct debuginfo_query *q) +{ + static uint64_t code_index; + struct jr_code_load rec; + const char *symbol; + size_t symbol_size; + + symbol = pretty_symbol(q, &symbol_size); + rec.p.id = JIT_CODE_LOAD; + rec.p.total_size = sizeof(rec) + symbol_size + host_size; + rec.p.timestamp = get_clock(); + rec.pid = getpid(); + rec.tid = qemu_get_thread_id(); + rec.vma = (uintptr_t)start; + rec.code_addr = (uintptr_t)start; + rec.code_size = host_size; + rec.code_index = code_index++; + fwrite(&rec, sizeof(rec), 1, jitdump); + fwrite(symbol, symbol_size, 1, jitdump); + fwrite(start, host_size, 1, jitdump); +} + +void perf_report_code(uint64_t guest_pc, TranslationBlock *tb, + const void *start) +{ + struct debuginfo_query *q; + size_t insn; + + if (!perfmap && !jitdump) { + return; + } + + q = g_try_malloc0_n(tb->icount, sizeof(*q)); + if (!q) { + return; + } + + debuginfo_lock(); + + /* Query debuginfo for each guest instruction. */ + for (insn = 0; insn < tb->icount; insn++) { + /* FIXME: This replicates the restore_state_to_opc() logic. */ + q[insn].address = tcg_ctx->gen_insn_data[insn][0]; + if (TARGET_TB_PCREL) { + q[insn].address |= (guest_pc & TARGET_PAGE_MASK); + } else { +#if defined(TARGET_I386) + q[insn].address -= tb->cs_base; +#endif + } + q[insn].flags = DEBUGINFO_SYMBOL | (jitdump ? DEBUGINFO_LINE : 0); + } + debuginfo_query(q, tb->icount); + + /* Emit perfmap entries if needed. */ + if (perfmap) { + flockfile(perfmap); + for (insn = 0; insn < tb->icount; insn++) { + write_perfmap_entry(start, insn, &q[insn]); + } + funlockfile(perfmap); + } + + /* Emit jitdump entries if needed. */ + if (jitdump) { + flockfile(jitdump); + write_jr_code_debug_info(start, q, tb->icount); + write_jr_code_load(start, tcg_ctx->gen_insn_end_off[tb->icount - 1], + q); + funlockfile(jitdump); + } + + debuginfo_unlock(); + g_free(q); +} + +void perf_exit(void) +{ + if (perfmap) { + fclose(perfmap); + perfmap = NULL; + } + + if (jitdump) { + fclose(jitdump); + jitdump = NULL; + } +} diff --git a/accel/tcg/perf.h b/accel/tcg/perf.h new file mode 100644 index 0000000..f92dd52 --- /dev/null +++ b/accel/tcg/perf.h @@ -0,0 +1,49 @@ +/* + * Linux perf perf-<pid>.map and jit-<pid>.dump integration. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef ACCEL_TCG_PERF_H +#define ACCEL_TCG_PERF_H + +#if defined(CONFIG_TCG) && defined(CONFIG_LINUX) +/* Start writing perf-<pid>.map. */ +void perf_enable_perfmap(void); + +/* Start writing jit-<pid>.dump. */ +void perf_enable_jitdump(void); + +/* Add information about TCG prologue to profiler maps. */ +void perf_report_prologue(const void *start, size_t size); + +/* Add information about JITted guest code to profiler maps. */ +void perf_report_code(uint64_t guest_pc, TranslationBlock *tb, + const void *start); + +/* Stop writing perf-<pid>.map and/or jit-<pid>.dump. */ +void perf_exit(void); +#else +static inline void perf_enable_perfmap(void) +{ +} + +static inline void perf_enable_jitdump(void) +{ +} + +static inline void perf_report_prologue(const void *start, size_t size) +{ +} + +static inline void perf_report_code(uint64_t guest_pc, TranslationBlock *tb, + const void *start) +{ +} + +static inline void perf_exit(void) +{ +} +#endif + +#endif diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 51ac1f6..979f8e1 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -62,6 +62,7 @@ #include "tb-hash.h" #include "tb-context.h" #include "internal.h" +#include "perf.h" /* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */ QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS > @@ -406,6 +407,12 @@ TranslationBlock *tb_gen_code(CPUState *cpu, } tb->tc.size = gen_code_size; + /* + * For TARGET_TB_PCREL, attribute all executions of the generated + * code to its first mapping. + */ + perf_report_code(pc, tb, tcg_splitwx_to_rx(gen_code_buf)); + #ifdef CONFIG_PROFILER qatomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti); qatomic_set(&prof->code_in_len, prof->code_in_len + tb->size); diff --git a/docs/devel/tcg.rst b/docs/devel/tcg.rst index 136a7a0..b4096a1 100644 --- a/docs/devel/tcg.rst +++ b/docs/devel/tcg.rst @@ -188,3 +188,26 @@ memory areas instead calls out to C code for device emulation. Finally, the MMU helps tracking dirty pages and pages pointed to by translation blocks. +Profiling JITted code +--------------------- + +The Linux ``perf`` tool will treat all JITted code as a single block as +unlike the main code it can't use debug information to link individual +program counter samples with larger functions. To overcome this +limitation you can use the ``-perfmap`` or the ``-jitdump`` option to generate +map files. ``-perfmap`` is lightweight and produces only guest-host mappings. +``-jitdump`` additionally saves JITed code and guest debug information (if +available); its output needs to be integrated with the ``perf.data`` file +before the final report can be viewed. + +.. code:: + + perf record $QEMU -perfmap $REMAINING_ARGS + perf report + + perf record -k 1 $QEMU -jitdump $REMAINING_ARGS + DEBUGINFOD_URLS= perf inject -j -i perf.data -o perf.data.jitted + perf report -i perf.data.jitted + +Note that qemu-system generates mappings only for ``-kernel`` files in ELF +format. diff --git a/hw/core/loader.c b/hw/core/loader.c index 0548830..55dbe2e 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -61,6 +61,7 @@ #include "hw/boards.h" #include "qemu/cutils.h" #include "sysemu/runstate.h" +#include "accel/tcg/debuginfo.h" #include <zlib.h> @@ -503,6 +504,10 @@ ssize_t load_elf_ram_sym(const char *filename, clear_lsb, data_swab, as, load_rom, sym_cb); } + if (ret != ELF_LOAD_FAILED) { + debuginfo_report_elf(filename, fd, 0); + } + fail: close(fd); return ret; diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 20894b6..5928c14 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -19,6 +19,7 @@ #include "qemu/selfmap.h" #include "qapi/error.h" #include "target_signal.h" +#include "accel/tcg/debuginfo.h" #ifdef _ARCH_PPC64 #undef ARCH_DLINFO @@ -3261,6 +3262,8 @@ static void load_elf_image(const char *image_name, int image_fd, load_symbols(ehdr, image_fd, load_bias); } + debuginfo_report_elf(image_name, image_fd, load_bias); + mmap_unlock(); close(image_fd); diff --git a/linux-user/exit.c b/linux-user/exit.c index fa6ef0b..607b6da 100644 --- a/linux-user/exit.c +++ b/linux-user/exit.c @@ -17,6 +17,7 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "qemu/osdep.h" +#include "accel/tcg/perf.h" #include "exec/gdbstub.h" #include "qemu.h" #include "user-internals.h" @@ -38,4 +39,5 @@ void preexit_cleanup(CPUArchState *env, int code) #endif gdb_exit(code); qemu_plugin_user_exit(); + perf_exit(); } diff --git a/linux-user/main.c b/linux-user/main.c index a17fed0..4290651 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -53,6 +53,7 @@ #include "signal-common.h" #include "loader.h" #include "user-mmap.h" +#include "accel/tcg/perf.h" #ifdef CONFIG_SEMIHOSTING #include "semihosting/semihost.h" @@ -423,6 +424,16 @@ static void handle_arg_abi_call0(const char *arg) } #endif +static void handle_arg_perfmap(const char *arg) +{ + perf_enable_perfmap(); +} + +static void handle_arg_jitdump(const char *arg) +{ + perf_enable_jitdump(); +} + static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins); #ifdef CONFIG_PLUGIN @@ -493,6 +504,10 @@ static const struct qemu_argument arg_table[] = { {"xtensa-abi-call0", "QEMU_XTENSA_ABI_CALL0", false, handle_arg_abi_call0, "", "assume CALL0 Xtensa ABI"}, #endif + {"perfmap", "QEMU_PERFMAP", false, handle_arg_perfmap, + "", "Generate a /tmp/perf-${pid}.map file for perf"}, + {"jitdump", "QEMU_JITDUMP", false, handle_arg_jitdump, + "", "Generate a jit-${pid}.dump file for perf"}, {NULL, NULL, false, NULL, NULL, NULL} }; diff --git a/linux-user/meson.build b/linux-user/meson.build index de4320a..7171dc6 100644 --- a/linux-user/meson.build +++ b/linux-user/meson.build @@ -22,6 +22,7 @@ linux_user_ss.add(files( 'uname.c', )) linux_user_ss.add(rt) +linux_user_ss.add(libdw) linux_user_ss.add(when: 'TARGET_HAS_BFLT', if_true: files('flatload.c')) linux_user_ss.add(when: 'TARGET_I386', if_true: files('vm86.c')) diff --git a/linux-user/signal.c b/linux-user/signal.c index 61c6fa3..098f3a7 100644 --- a/linux-user/signal.c +++ b/linux-user/signal.c @@ -695,7 +695,7 @@ void cpu_loop_exit_sigbus(CPUState *cpu, target_ulong addr, /* abort execution with signal */ static G_NORETURN -void dump_core_and_abort(int target_sig) +void dump_core_and_abort(CPUArchState *cpu_env, int target_sig) { CPUState *cpu = thread_cpu; CPUArchState *env = cpu->env_ptr; @@ -724,6 +724,8 @@ void dump_core_and_abort(int target_sig) target_sig, strsignal(host_sig), "core dumped" ); } + preexit_cleanup(cpu_env, 128 + target_sig); + /* The proper exit code for dying from an uncaught signal is * -<signal>. The kernel doesn't allow exit() or _exit() to pass * a negative value. To get the proper exit code we need to @@ -1058,12 +1060,12 @@ static void handle_pending_signal(CPUArchState *cpu_env, int sig, sig != TARGET_SIGURG && sig != TARGET_SIGWINCH && sig != TARGET_SIGCONT) { - dump_core_and_abort(sig); + dump_core_and_abort(cpu_env, sig); } } else if (handler == TARGET_SIG_IGN) { /* ignore sig */ } else if (handler == TARGET_SIG_ERR) { - dump_core_and_abort(sig); + dump_core_and_abort(cpu_env, sig); } else { /* compute the blocked signals during the handler execution */ sigset_t *blocked_set; diff --git a/meson.build b/meson.build index 5d68a8f..58d8cd6 100644 --- a/meson.build +++ b/meson.build @@ -1648,6 +1648,12 @@ if libbpf.found() and not cc.links(''' endif endif +# libdw +libdw = dependency('libdw', + method: 'pkg-config', + kwargs: static_kwargs, + required: false) + ################# # config-host.h # ################# @@ -1923,6 +1929,7 @@ config_host_data.set('CONFIG_DBUS_DISPLAY', dbus_display) config_host_data.set('CONFIG_CFI', get_option('cfi')) config_host_data.set('CONFIG_SELINUX', selinux.found()) config_host_data.set('CONFIG_XEN_BACKEND', xen.found()) +config_host_data.set('CONFIG_LIBDW', libdw.found()) if xen.found() # protect from xen.version() having less than three components xen_version = xen.version().split('.') + ['0', '0'] @@ -2331,11 +2338,9 @@ config_host_data.set('CONFIG_CPUID_H', have_cpuid_h) config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX2') \ .require(cc.links(''' - #pragma GCC push_options - #pragma GCC target("avx2") #include <cpuid.h> #include <immintrin.h> - static int bar(void *a) { + static int __attribute__((target("avx2"))) bar(void *a) { __m256i x = *(__m256i *)a; return _mm256_testz_si256(x, x); } @@ -2345,11 +2350,9 @@ config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512F') \ .require(cc.links(''' - #pragma GCC push_options - #pragma GCC target("avx512f") #include <cpuid.h> #include <immintrin.h> - static int bar(void *a) { + static int __attribute__((target("avx512f"))) bar(void *a) { __m512i x = *(__m512i *)a; return _mm512_test_epi64_mask(x, x); } @@ -3976,6 +3979,7 @@ summary_info += {'libudev': libudev} # Dummy dependency, keep .found() summary_info += {'FUSE lseek': fuse_lseek.found()} summary_info += {'selinux': selinux} +summary_info += {'libdw': libdw} summary(summary_info, bool_yn: true, section: 'Dependencies') if not supported_cpus.contains(cpu) diff --git a/qemu-options.hx b/qemu-options.hx index 3aa3a2f..d59d197 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4838,6 +4838,26 @@ SRST Enable synchronization profiling. ERST +#if defined(CONFIG_TCG) && defined(CONFIG_LINUX) +DEF("perfmap", 0, QEMU_OPTION_perfmap, + "-perfmap generate a /tmp/perf-${pid}.map file for perf\n", + QEMU_ARCH_ALL) +SRST +``-perfmap`` + Generate a map file for Linux perf tools that will allow basic profiling + information to be broken down into basic blocks. +ERST + +DEF("jitdump", 0, QEMU_OPTION_jitdump, + "-jitdump generate a jit-${pid}.dump file for perf\n", + QEMU_ARCH_ALL) +SRST +``-jitdump`` + Generate a dump file for Linux perf tools that maps basic blocks to symbol + names, line numbers and JITted code. +ERST +#endif + DEFHEADING() DEFHEADING(Generic object creation:) diff --git a/softmmu/vl.c b/softmmu/vl.c index 9bd0e52..9177d95 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -96,6 +96,9 @@ #include "fsdev/qemu-fsdev.h" #endif #include "sysemu/qtest.h" +#ifdef CONFIG_TCG +#include "accel/tcg/perf.h" +#endif #include "disas/disas.h" @@ -2926,6 +2929,14 @@ void qemu_init(int argc, char **argv) case QEMU_OPTION_DFILTER: qemu_set_dfilter_ranges(optarg, &error_fatal); break; +#if defined(CONFIG_TCG) && defined(CONFIG_LINUX) + case QEMU_OPTION_perfmap: + perf_enable_perfmap(); + break; + case QEMU_OPTION_jitdump: + perf_enable_jitdump(); + break; +#endif case QEMU_OPTION_seed: qemu_guest_random_seed_main(optarg, &error_fatal); break; @@ -61,6 +61,7 @@ #include "exec/log.h" #include "tcg/tcg-ldst.h" #include "tcg-internal.h" +#include "accel/tcg/perf.h" /* Forward declarations for functions declared in tcg-target.c.inc and used here. */ @@ -913,6 +914,7 @@ void tcg_prologue_init(TCGContext *s) #endif prologue_size = tcg_current_code_size(s); + perf_report_prologue(s->code_gen_ptr, prologue_size); #ifndef CONFIG_TCG_INTERPRETER flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(s->code_buf), diff --git a/util/bufferiszero.c b/util/bufferiszero.c index ec3cd4c..1790ded 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -64,18 +64,11 @@ buffer_zero_int(const void *buf, size_t len) } #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) -/* Do not use push_options pragmas unnecessarily, because clang - * does not support them. - */ -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) -#pragma GCC push_options -#pragma GCC target("sse2") -#endif -#include <emmintrin.h> +#include <immintrin.h> /* Note that each of these vectorized functions require len >= 64. */ -static bool +static bool __attribute__((target("sse2"))) buffer_zero_sse2(const void *buf, size_t len) { __m128i t = _mm_loadu_si128(buf); @@ -104,20 +97,9 @@ buffer_zero_sse2(const void *buf, size_t len) return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; } -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) -#pragma GCC pop_options -#endif #ifdef CONFIG_AVX2_OPT -/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8, - * the includes have to be within the corresponding push_options region, and - * therefore the regions themselves have to be ordered with increasing ISA. - */ -#pragma GCC push_options -#pragma GCC target("sse4") -#include <smmintrin.h> - -static bool +static bool __attribute__((target("sse4"))) buffer_zero_sse4(const void *buf, size_t len) { __m128i t = _mm_loadu_si128(buf); @@ -145,12 +127,7 @@ buffer_zero_sse4(const void *buf, size_t len) return _mm_testz_si128(t, t); } -#pragma GCC pop_options -#pragma GCC push_options -#pragma GCC target("avx2") -#include <immintrin.h> - -static bool +static bool __attribute__((target("avx2"))) buffer_zero_avx2(const void *buf, size_t len) { /* Begin with an unaligned head of 32 bytes. */ @@ -176,15 +153,10 @@ buffer_zero_avx2(const void *buf, size_t len) return _mm256_testz_si256(t, t); } -#pragma GCC pop_options #endif /* CONFIG_AVX2_OPT */ #ifdef CONFIG_AVX512F_OPT -#pragma GCC push_options -#pragma GCC target("avx512f") -#include <immintrin.h> - -static bool +static bool __attribute__((target("avx512f"))) buffer_zero_avx512(const void *buf, size_t len) { /* Begin with an unaligned head of 64 bytes. */ @@ -210,8 +182,7 @@ buffer_zero_avx512(const void *buf, size_t len) return !_mm512_test_epi64_mask(t, t); } -#pragma GCC pop_options -#endif +#endif /* CONFIG_AVX512F_OPT */ /* Note that for test_buffer_is_zero_next_accel, the most preferred |