From be2cdc5e352eb28b4ff631f053a261d91e6af78e Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 26 Jul 2017 16:58:05 -0400 Subject: tcg: track TBs with per-region BST's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This paves the way for enabling scalable parallel generation of TCG code. Instead of tracking TBs with a single binary search tree (BST), use a BST for each TCG region, protecting it with a lock. This is as scalable as it gets, since each TCG thread operates on a separate region. The core of this change is the introduction of struct tcg_region_tree, which contains a pointer to a GTree and an associated lock to serialize accesses to it. We then allocate an array of tcg_region_tree's, adding the appropriate padding to avoid false sharing based on qemu_dcache_linesize. Given a tc_ptr, we first find the corresponding region_tree. This is done by special-casing the first and last regions first, since they might be of size != region.size; otherwise we just divide the offset by region.stride. I was worried about this division (several dozen cycles of latency), but profiling shows that this is not a fast path. Note that region.stride is not required to be a power of two; it is only required to be a multiple of the host's page size. Note that with this design we can also provide consistent snapshots about all region trees at once; for instance, tcg_tb_foreach acquires/releases all region_tree locks before/after iterating over them. For this reason we now drop tb_lock in dump_exec_info(). As an alternative I considered implementing a concurrent BST, but this can be tricky to get right, offers no consistent snapshots of the BST, and performance and scalability-wise I don't think it could ever beat having separate GTrees, given that our workload is insert-mostly (all concurrent BST designs I've seen focus, understandably, on making lookups fast, which comes at the expense of convoluted, non-wait-free insertions/removals). Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/exec-all.h | 1 - include/exec/tb-context.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/exec') diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 8bbea78..8d4306a 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -405,7 +405,6 @@ static inline uint32_t curr_cflags(void) | (use_icount ? CF_USE_ICOUNT : 0); } -void tb_remove(TranslationBlock *tb); void tb_flush(CPUState *cpu); void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index 1d41202..d8472c8 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -31,7 +31,6 @@ typedef struct TBContext TBContext; struct TBContext { - GTree *tb_tree; struct qht htable; /* any access to the tbs or the page table must use this lock */ QemuMutex tb_lock; -- cgit v1.1 From 128ed2278c4e6ad063f101c5dda7999b43f2d8a3 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Tue, 1 Aug 2017 15:11:12 -0400 Subject: tcg: move tb_ctx.tb_phys_invalidate_count to tcg_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thereby making it per-TCGContext. Once we remove tb_lock, this will avoid an atomic increment every time a TB is invalidated. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/tb-context.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/exec') diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index d8472c8..8c9b49c 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -37,7 +37,6 @@ struct TBContext { /* statistics */ unsigned tb_flush_count; - int tb_phys_invalidate_count; }; extern TBContext tb_ctx; -- cgit v1.1 From 1e05197f24c49d52f339de9053bb1d17082f1be3 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Thu, 3 Aug 2017 18:37:15 -0400 Subject: translate-all: iterate over TBs in a page with PAGE_FOR_EACH_TB This commit does several things, but to avoid churn I merged them all into the same commit. To wit: - Use uintptr_t instead of TranslationBlock * for the list of TBs in a page. Just like we did in (c37e6d7e "tcg: Use uintptr_t type for jmp_list_{next|first} fields of TB"), the rationale is the same: these are tagged pointers, not pointers. So use a more appropriate type. - Only check the least significant bit of the tagged pointers. Masking with 3/~3 is unnecessary and confusing. - Introduce the TB_FOR_EACH_TAGGED macro, and use it to define PAGE_FOR_EACH_TB, which improves readability. Note that TB_FOR_EACH_TAGGED will gain another user in a subsequent patch. - Update tb_page_remove to use PAGE_FOR_EACH_TB. In case there is a bug and we attempt to remove a TB that is not in the list, instead of segfaulting (since the list is NULL-terminated) we will reach g_assert_not_reached(). Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/exec-all.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/exec') diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 8d4306a..07653d3 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -360,7 +360,7 @@ struct TranslationBlock { struct TranslationBlock *orig_tb; /* first and second physical page containing code. The lower bit of the pointer tells the index in page_next[] */ - struct TranslationBlock *page_next[2]; + uintptr_t page_next[2]; tb_page_addr_t page_addr[2]; /* The following data are used to directly call another TB from -- cgit v1.1 From 0b5c91f74f3c83a36f37740969df8c775c997e69 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 26 Jul 2017 20:22:51 -0400 Subject: translate-all: use per-page locking in !user-mode Groundwork for supporting parallel TCG generation. Instead of using a global lock (tb_lock) to protect changes to pages, use fine-grained, per-page locks in !user-mode. User-mode stays with mmap_lock. Sometimes changes need to happen atomically on more than one page (e.g. when a TB that spans across two pages is added/invalidated, or when a range of pages is invalidated). We therefore introduce struct page_collection, which helps us keep track of a set of pages that have been locked in the appropriate locking order (i.e. by ascending page index). This commit first introduces the structs and the function helpers, to then convert the calling code to use per-page locking. Note that tb_lock is not removed yet. While at it, rename tb_alloc_page to tb_page_add, which pairs with tb_page_remove. Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/exec-all.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/exec') diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 07653d3..8d92e3c 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -359,7 +359,8 @@ struct TranslationBlock { /* original tb when cflags has CF_NOCACHE */ struct TranslationBlock *orig_tb; /* first and second physical page containing code. The lower bit - of the pointer tells the index in page_next[] */ + of the pointer tells the index in page_next[]. + The list is protected by the TB's page('s) lock(s) */ uintptr_t page_next[2]; tb_page_addr_t page_addr[2]; -- cgit v1.1 From faa9372c07d062eb01f9da72e3f6c0f32efffca7 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Thu, 22 Feb 2018 20:50:29 -0500 Subject: translate-all: introduce assert_no_pages_locked The appended adds assertions to make sure we do not longjmp with page locks held. Note that user-mode has nothing to check, since page_locks are !user-mode only. Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/exec-all.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/exec') diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 8d92e3c..4f07a17 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -435,6 +435,14 @@ void tb_lock(void); void tb_unlock(void); void tb_lock_reset(void); +#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG) +void assert_no_pages_locked(void); +#else +static inline void assert_no_pages_locked(void) +{ +} +#endif + #if !defined(CONFIG_USER_ONLY) /** -- cgit v1.1 From 194125e3ebd553acb02aaf3797a4f0387493fe94 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Wed, 2 Aug 2017 20:34:06 -0400 Subject: translate-all: protect TB jumps with a per-destination-TB lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This applies to both user-mode and !user-mode emulation. Instead of relying on a global lock, protect the list of incoming jumps with tb->jmp_lock. This lock also protects tb->cflags, so update all tb->cflags readers outside tb->jmp_lock to use atomic reads via tb_cflags(). In order to find the destination TB (and therefore its jmp_lock) from the origin TB, we introduce tb->jmp_dest[]. I considered not using a linked list of jumps, which simplifies code and makes the struct smaller. However, it unnecessarily increases memory usage, which results in a performance decrease. See for instance these numbers booting+shutting down debian-arm: Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%) ------------------------------------------------------------------------------ before 20.88 0.74 0.154512 0. after 20.81 0.38 0.079078 -0.33524904 GTree 21.02 0.28 0.058856 0.67049808 GHashTable + xxhash 21.63 1.08 0.233604 3.5919540 Using a hash table or a binary tree to keep track of the jumps doesn't really pay off, not only due to the increased memory usage, but also because most TBs have only 0 or 1 jumps to them. The maximum number of jumps when booting debian-arm that I measured is 35, but as we can see in the histogram below a TB with that many incoming jumps is extremely rare; the average TB has 0.80 incoming jumps. n_jumps: 379208; avg jumps/tb: 0.801099 dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0] Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/exec-all.h | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'include/exec') diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 4f07a17..3c2a0ef 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -345,7 +345,7 @@ struct TranslationBlock { #define CF_LAST_IO 0x00008000 /* Last insn may be an IO access. */ #define CF_NOCACHE 0x00010000 /* To be freed after execution */ #define CF_USE_ICOUNT 0x00020000 -#define CF_INVALID 0x00040000 /* TB is stale. Setters need tb_lock */ +#define CF_INVALID 0x00040000 /* TB is stale. Set with @jmp_lock held */ #define CF_PARALLEL 0x00080000 /* Generate code for a parallel context */ /* cflags' mask for hashing/comparison */ #define CF_HASH_MASK \ @@ -364,6 +364,9 @@ struct TranslationBlock { uintptr_t page_next[2]; tb_page_addr_t page_addr[2]; + /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */ + QemuSpin jmp_lock; + /* The following data are used to directly call another TB from * the code of this one. This can be done either by emitting direct or * indirect native jump instructions. These jumps are reset so that the TB @@ -375,20 +378,26 @@ struct TranslationBlock { #define TB_JMP_RESET_OFFSET_INVALID 0xffff /* indicates no jump generated */ uintptr_t jmp_target_arg[2]; /* target address or offset */ - /* Each TB has an associated circular list of TBs jumping to this one. - * jmp_list_first points to the first TB jumping to this one. - * jmp_list_next is used to point to the next TB in a list. - * Since each TB can have two jumps, it can participate in two lists. - * jmp_list_first and jmp_list_next are 4-byte aligned pointers to a - * TranslationBlock structure, but the two least significant bits of - * them are used to encode which data field of the pointed TB should - * be used to traverse the list further from that TB: - * 0 => jmp_list_next[0], 1 => jmp_list_next[1], 2 => jmp_list_first. - * In other words, 0/1 tells which jump is used in the pointed TB, - * and 2 means that this is a pointer back to the target TB of this list. + /* + * Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps. + * Each TB can have two outgoing jumps, and therefore can participate + * in two lists. The list entries are kept in jmp_list_next[2]. The least + * significant bit (LSB) of the pointers in these lists is used to encode + * which of the two list entries is to be used in the pointed TB. + * + * List traversals are protected by jmp_lock. The destination TB of each + * outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock + * can be acquired from any origin TB. + * + * jmp_dest[] are tagged pointers as well. The LSB is set when the TB is + * being invalidated, so that no further outgoing jumps from it can be set. + * + * jmp_lock also protects the CF_INVALID cflag; a jump must not be chained + * to a destination TB that has CF_INVALID set. */ + uintptr_t jmp_list_head; uintptr_t jmp_list_next[2]; - uintptr_t jmp_list_first; + uintptr_t jmp_dest[2]; }; extern bool parallel_cpus; -- cgit v1.1 From 0ac20318ce16f4de288969b2007ef5a654176058 Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Fri, 4 Aug 2017 23:46:31 -0400 Subject: tcg: remove tb_lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use mmap_lock in user-mode to protect TCG state and the page descriptors. In !user-mode, each vCPU has its own TCG state, so no locks needed. Per-page locks are used to protect the page descriptors. Per-TB locks are used in both modes to protect TB jumps. Some notes: - tb_lock is removed from notdirty_mem_write by passing a locked page_collection to tb_invalidate_phys_page_fast. - tcg_tb_lookup/remove/insert/etc have their own internal lock(s), so there is no need to further serialize access to them. - do_tb_flush is run in a safe async context, meaning no other vCPU threads are running. Therefore acquiring mmap_lock there is just to please tools such as thread sanitizer. - Not visible in the diff, but tb_invalidate_phys_page already has an assert_memory_lock. - cpu_io_recompile is !user-only, so no mmap_lock there. - Added mmap_unlock()'s before all siglongjmp's that could be called in user-mode while mmap_lock is held. + Added an assert for !have_mmap_lock() after returning from the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic. Performance numbers before/after: Host: AMD Opteron(tm) Processor 6376 ubuntu 17.04 ppc64 bootup+shutdown time 700 +-+--+----+------+------------+-----------+------------*--+-+ | + + + + + *B | | before ***B*** ** * | |tb lock removal ###D### *** | 600 +-+ *** +-+ | ** # | | *B* #D | | *** * ## | 500 +-+ *** ### +-+ | * *** ### | | *B* # ## | | ** * #D# | 400 +-+ ** ## +-+ | ** ### | | ** ## | | ** # ## | 300 +-+ * B* #D# +-+ | B *** ### | | * ** #### | | * *** ### | 200 +-+ B *B #D# +-+ | #B* * ## # | | #* ## | | + D##D# + + + + | 100 +-+--+----+------+------------+-----------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/HwmBHXe debian jessie aarch64 bootup+shutdown time 90 +-+--+-----+-----+------------+------------+------------+--+-+ | + + + + + + | | before ***B*** B | 80 +tb lock removal ###D### **D +-+ | **### | | **## | 70 +-+ ** # +-+ | ** ## | | ** # | 60 +-+ *B ## +-+ | ** ## | | *** #D | 50 +-+ *** ## +-+ | * ** ### | | **B* ### | 40 +-+ **** # ## +-+ | **** #D# | | ***B** ### | 30 +-+ B***B** #### +-+ | B * * # ### | | B ###D# | 20 +-+ D ##D## +-+ | D# | | + + + + + + | 10 +-+--+-----+-----+------------+------------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/iGpGFtv The gains are high for 4-8 CPUs. Beyond that point, however, unrelated lock contention significantly hurts scalability. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/cpu-common.h | 2 +- include/exec/exec-all.h | 4 ---- include/exec/memory-internal.h | 6 ++++-- include/exec/tb-context.h | 2 -- 4 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include/exec') diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 0b58e26..18b40d6 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -23,7 +23,7 @@ typedef struct CPUListState { FILE *file; } CPUListState; -/* The CPU list lock nests outside tb_lock/tb_unlock. */ +/* The CPU list lock nests outside page_(un)lock or mmap_(un)lock */ void qemu_init_cpu_list(void); void cpu_list_lock(void); void cpu_list_unlock(void); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 3c2a0ef..25a6f28 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -440,10 +440,6 @@ extern uintptr_t tci_tb_ptr; smaller than 4 bytes, so we don't worry about special-casing this. */ #define GETPC_ADJ 2 -void tb_lock(void); -void tb_unlock(void); -void tb_lock_reset(void); - #if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG) void assert_no_pages_locked(void); #else diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index 56c25c0..bb08fa4 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -49,6 +49,8 @@ void mtree_print_dispatch(fprintf_function mon, void *f, struct AddressSpaceDispatch *d, MemoryRegion *root); +struct page_collection; + /* Opaque struct for passing info from memory_notdirty_write_prepare() * to memory_notdirty_write_complete(). Callers should treat all fields * as private, with the exception of @active. @@ -60,10 +62,10 @@ void mtree_print_dispatch(fprintf_function mon, void *f, */ typedef struct { CPUState *cpu; + struct page_collection *pages; ram_addr_t ram_addr; vaddr mem_vaddr; unsigned size; - bool locked; bool active; } NotDirtyInfo; @@ -91,7 +93,7 @@ typedef struct { * * This must only be called if we are using TCG; it will assert otherwise. * - * We may take a lock in the prepare call, so callers must ensure that + * We may take locks in the prepare call, so callers must ensure that * they don't exit (via longjump or otherwise) without calling complete. * * This call must only be made inside an RCU critical section. diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index 8c9b49c..feb585e 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -32,8 +32,6 @@ typedef struct TBContext TBContext; struct TBContext { struct qht htable; - /* any access to the tbs or the page table must use this lock */ - QemuMutex tb_lock; /* statistics */ unsigned tb_flush_count; -- cgit v1.1