author     Emilio G. Cota <cota@braap.org>                     2017-07-07 19:24:20 -0400
committer  Richard Henderson <richard.henderson@linaro.org>    2017-10-24 13:53:42 -0700
commit     e8feb96fcc6c16eab8923332e86ff4ef0e2ac276 (patch)
tree       417e8eea7365ee87a136a948bd20af42a70d353b /accel/tcg
parent     f51f315a676ec913a55ac27be4ef857f9f7ddc5c (diff)
tcg: introduce regions to split code_gen_buffer
This is groundwork for supporting multiple TCG contexts.
The naive solution here is to split code_gen_buffer statically
among the TCG threads; this, however, results in poor utilization
when translation needs differ across TCG threads.
What we do here is to add an extra layer of indirection, assigning
regions that act just like pages do in virtual memory allocation.
(BTW if you are wondering about the chosen naming, I did not want
to use blocks or pages because those are already heavily used in QEMU).
We use a global lock to serialize allocations as well as statistics
reporting (we now export the size of the used code_gen_buffer with
tcg_code_size()). Note that for the allocator we could just use
a counter and atomic_inc; however, that would complicate the gathering
of tcg_code_size()-like stats. So given that the region operations are
not a fast path, a lock seems the most reasonable choice.
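To make the indirection concrete, here is a minimal sketch of a
lock-serialized region allocator in the spirit of the description above.
Everything in it is illustrative: the names (code_region_state,
region_alloc_one) are hypothetical, and the real code added by this commit
(outside accel/tcg) differs in detail, e.g. it also tracks the fill level
of each in-use region for the statistics.

/*
 * Illustrative sketch only -- hypothetical names, not the code added by
 * this commit.  Regions are handed out under a global lock; running out
 * of regions is the caller's cue to flush the whole buffer.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct code_region_state {
    pthread_mutex_t lock;   /* serializes allocation and stats reporting */
    void *start;            /* start of the page-aligned code_gen_buffer */
    size_t region_size;     /* size of each region */
    size_t n_regions;       /* number of regions in the buffer */
    size_t next;            /* index of the next free region */
};

/* Hand out the next free region; false means the buffer is exhausted. */
static bool region_alloc_one(struct code_region_state *s,
                             void **base, size_t *size)
{
    bool ok = false;

    pthread_mutex_lock(&s->lock);
    if (s->next < s->n_regions) {
        *base = (char *)s->start + s->next * s->region_size;
        *size = s->region_size;
        s->next++;
        ok = true;
    }
    pthread_mutex_unlock(&s->lock);
    return ok;
}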
The effectiveness of this approach is clear from the numbers below.
I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark.
Note that I'm evaluating this after enabling per-thread TCG (which
is done by a subsequent commit).
* -smp 1, 1 region (entire buffer):
qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357
qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363
qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364
qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373
qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373
qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360
qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370
qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367
That is, 8 flushes.
* -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]:
qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356
qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361
qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361
qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375
qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375
qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360
qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365
qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368
Again, 8 flushes. Note how buffer utilization is not 100%, but it
is close. Smaller region sizes would yield higher utilization,
but we want region allocation to be rare (it acquires a lock), so
we do not want to go too small.
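For a sense of the arithmetic behind "80/32 MB per region", a hedged sizing
sketch follows; the helper name (pick_region_size), the regions-per-thread
factor of 4 and the 1 MB floor are assumptions made for illustration, not
what this commit implements.

/*
 * Hypothetical sizing helper -- illustrative only.  With an 80 MB buffer
 * and 8 TCG threads it yields 80/32 = 2.5 MB regions, matching the
 * configuration benchmarked above (assuming a power-of-two page size).
 */
#include <stddef.h>

#define MIN_REGION_SIZE (1024 * 1024)   /* assumed floor: keep (locked) allocations rare */

static size_t pick_region_size(size_t buf_size, size_t n_threads,
                               size_t page_size)
{
    size_t n_regions = n_threads * 4;   /* assumed regions-per-thread factor */
    size_t size = buf_size / n_regions;

    size &= ~(page_size - 1);           /* round down to a page boundary */
    if (size < MIN_REGION_SIZE) {
        size = MIN_REGION_SIZE;         /* don't go too small */
    }
    return size;
}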
* -smp 8, static partitioning of 8 regions (10 MB per region):
qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354
qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370
qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365
qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377
qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358
qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367
qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364
qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358
qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362
qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372
qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374
qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376
qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374
qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372
qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359
qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362
qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368
qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378
qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367
qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364
That is, 20 flushes. Note how a static partitioning approach uses
the code buffer poorly, leading to many unnecessary flushes.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'accel/tcg')
-rw-r--r--   accel/tcg/translate-all.c   63
1 file changed, 20 insertions, 43 deletions
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 9061c05..f99bfd9 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -606,15 +606,13 @@ static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
     void *end = static_code_gen_buffer + sizeof(static_code_gen_buffer);
-    size_t full_size, size;
+    size_t size;
 
     /* page-align the beginning and end of the buffer */
     buf = QEMU_ALIGN_PTR_UP(buf, qemu_real_host_page_size);
     end = QEMU_ALIGN_PTR_DOWN(end, qemu_real_host_page_size);
 
-    /* Reserve a guard page. */
-    full_size = end - buf;
-    size = full_size - qemu_real_host_page_size;
+    size = end - buf;
 
     /* Honor a command-line option limiting the size of the buffer. */
     if (size > tcg_ctx->code_gen_buffer_size) {
@@ -633,9 +631,6 @@ static inline void *alloc_code_gen_buffer(void)
     if (qemu_mprotect_rwx(buf, size)) {
         abort();
     }
-    if (qemu_mprotect_none(buf + size, qemu_real_host_page_size)) {
-        abort();
-    }
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
     return buf;
@@ -644,22 +639,16 @@ static inline void *alloc_code_gen_buffer(void)
 static inline void *alloc_code_gen_buffer(void)
 {
     size_t size = tcg_ctx->code_gen_buffer_size;
-    void *buf1, *buf2;
-
-    /* Perform the allocation in two steps, so that the guard page
-       is reserved but uncommitted. */
-    buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
-                        MEM_RESERVE, PAGE_NOACCESS);
-    if (buf1 != NULL) {
-        buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
-        assert(buf1 == buf2);
-    }
+    void *buf;
 
-    return buf1;
+    buf = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT,
+                       PAGE_EXECUTE_READWRITE);
+    return buf;
 }
 #else
 static inline void *alloc_code_gen_buffer(void)
 {
+    int prot = PROT_WRITE | PROT_READ | PROT_EXEC;
     int flags = MAP_PRIVATE | MAP_ANONYMOUS;
     uintptr_t start = 0;
     size_t size = tcg_ctx->code_gen_buffer_size;
@@ -693,8 +682,7 @@ static inline void *alloc_code_gen_buffer(void)
 #  endif
 # endif
 
-    buf = mmap((void *)start, size + qemu_real_host_page_size,
-               PROT_NONE, flags, -1, 0);
+    buf = mmap((void *)start, size, prot, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
     }
@@ -704,24 +692,23 @@ static inline void *alloc_code_gen_buffer(void)
         /* Try again, with the original still mapped, to avoid re-acquiring
            that 256mb crossing.  This time don't specify an address.  */
         size_t size2;
-        void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
-                          PROT_NONE, flags, -1, 0);
+        void *buf2 = mmap(NULL, size, prot, flags, -1, 0);
         switch ((int)(buf2 != MAP_FAILED)) {
        case 1:
            if (!cross_256mb(buf2, size)) {
                /* Success!  Use the new buffer.  */
-                munmap(buf, size + qemu_real_host_page_size);
+                munmap(buf, size);
                break;
            }
            /* Failure.  Work with what we had.  */
-            munmap(buf2, size + qemu_real_host_page_size);
+            munmap(buf2, size);
            /* fallthru */
        default:
            /* Split the original buffer.  Free the smaller half.  */
            buf2 = split_cross_256mb(buf, size);
            size2 = tcg_ctx->code_gen_buffer_size;
            if (buf == buf2) {
-                munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+                munmap(buf + size2, size - size2);
            } else {
                munmap(buf, size - size2);
            }
@@ -732,10 +719,6 @@ static inline void *alloc_code_gen_buffer(void)
     }
 #endif
 
-    /* Make the final buffer accessible.  The guard page at the end
-       will remain inaccessible with PROT_NONE.  */
-    mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
-
     /* Request large pages for the buffer.  */
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
@@ -916,13 +899,8 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
         size_t host_size = 0;
 
         g_tree_foreach(tb_ctx.tb_tree, tb_host_size_iter, &host_size);
-        printf("qemu: flush code_size=%td nb_tbs=%zu avg_tb_size=%zu\n",
-               tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer, nb_tbs,
-               nb_tbs > 0 ? host_size / nb_tbs : 0);
-    }
-    if ((unsigned long)(tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer)
-        > tcg_ctx->code_gen_buffer_size) {
-        cpu_abort(cpu, "Internal error: code buffer overflow\n");
+        printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n",
+               tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0);
     }
 
     CPU_FOREACH(cpu) {
@@ -936,7 +914,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
     qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
-    tcg_ctx->code_gen_ptr = tcg_ctx->code_gen_buffer;
+    tcg_region_reset_all();
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     atomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
@@ -1274,9 +1252,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 
     phys_pc = get_page_addr_code(env, pc);
 
+ buffer_overflow:
     tb = tb_alloc(pc);
     if (unlikely(!tb)) {
- buffer_overflow:
         /* flush must be done */
         tb_flush(cpu);
         mmap_unlock();
@@ -1380,9 +1358,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     }
 #endif
 
-    tcg_ctx->code_gen_ptr = (void *)
+    atomic_set(&tcg_ctx->code_gen_ptr, (void *)
         ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
-                 CODE_GEN_ALIGN);
+                 CODE_GEN_ALIGN));
 
     /* init jump list */
     assert(((uintptr_t)tb & 3) == 0);
@@ -1908,9 +1886,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
      * otherwise users might think "-tb-size" is not honoured.
      * For avg host size we use the precise numbers from tb_tree_stats though.
      */
-    cpu_fprintf(f, "gen code size %td/%zd\n",
-                tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer,
-                tcg_ctx->code_gen_highwater - tcg_ctx->code_gen_buffer);
+    cpu_fprintf(f, "gen code size %zu/%zu\n",
+                tcg_code_size(), tcg_code_capacity());
     cpu_fprintf(f, "TB count %zu\n", nb_tbs);
     cpu_fprintf(f, "TB avg target size %zu max=%zu bytes\n",
                 nb_tbs ? tst.target_size / nb_tbs : 0,
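The helpers called in the hunks above -- tcg_code_size(), tcg_code_capacity()
and tcg_region_reset_all() -- are defined outside accel/tcg and therefore do
not show up in this diffstat. The sketch below is one plausible shape for such
lock-protected helpers, reusing the hypothetical code_region_state from the
earlier sketch; it ignores the partially filled current region and is not the
actual QEMU code.

/* Sketch only -- not the definitions added by this commit. */

/* Bytes of code_gen_buffer consumed so far (fully handed-out regions). */
static size_t region_code_size(struct code_region_state *s)
{
    size_t used;

    pthread_mutex_lock(&s->lock);
    used = s->next * s->region_size;
    pthread_mutex_unlock(&s->lock);
    return used;
}

/* Total bytes available across all regions; fixed at init time. */
static size_t region_code_capacity(struct code_region_state *s)
{
    return s->n_regions * s->region_size;
}

/* Called on a TB flush: make every region available again. */
static void region_reset_all(struct code_region_state *s)
{
    pthread_mutex_lock(&s->lock);
    s->next = 0;
    pthread_mutex_unlock(&s->lock);
}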