author     Emilio G. Cota <cota@braap.org>                    2017-08-04 23:46:31 -0400
committer  Richard Henderson <richard.henderson@linaro.org>   2018-06-15 08:18:48 -1000
commit     0ac20318ce16f4de288969b2007ef5a654176058 (patch)
tree       ba285e1540b4d875b476acd72f364d8aaa7165d5 /accel/tcg/cpu-exec.c
parent     705ad1ff0ce264475cb4c9a3aa31ba94a04869fe (diff)
download   qemu-0ac20318ce16f4de288969b2007ef5a654176058.zip
           qemu-0ac20318ce16f4de288969b2007ef5a654176058.tar.gz
           qemu-0ac20318ce16f4de288969b2007ef5a654176058.tar.bz2
tcg: remove tb_lock
Use mmap_lock in user-mode to protect TCG state and the page descriptors.
In !user-mode, each vCPU has its own TCG state, so no locks needed.
Per-page locks are used to protect the page descriptors.

Per-TB locks are used in both modes to protect TB jumps.

Some notes:

- tb_lock is removed from notdirty_mem_write by passing a
  locked page_collection to tb_invalidate_phys_page_fast.

- tcg_tb_lookup/remove/insert/etc have their own internal lock(s),
  so there is no need to further serialize access to them.

- do_tb_flush is run in a safe async context, meaning no other
  vCPU threads are running. Therefore acquiring mmap_lock there
  is just to please tools such as thread sanitizer.

- Not visible in the diff, but tb_invalidate_phys_page already
  has an assert_memory_lock.

- cpu_io_recompile is !user-only, so no mmap_lock there.

- Added mmap_unlock()'s before all siglongjmp's that could
  be called in user-mode while mmap_lock is held.
  + Added an assert for !have_mmap_lock() after returning from
    the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic.

Performance numbers before/after:

Host: AMD Opteron(tm) Processor 6376

ubuntu 17.04 ppc64 bootup+shutdown time
[ASCII plot: bootup+shutdown time (y axis 100 to 700) vs. Guest CPUs (1, 8, 16, 48, 64); series: "before" and "tb lock removal"]
png: https://imgur.com/HwmBHXe

debian jessie aarch64 bootup+shutdown time
[ASCII plot: bootup+shutdown time (y axis 10 to 90) vs. Guest CPUs (1, 8, 16, 48, 64); series: "before" and "tb lock removal"]
png: https://imgur.com/iGpGFtv

The gains are high for 4-8 CPUs. Beyond that point, however, unrelated
lock contention significantly hurts scalability.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
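To make the locking split described above concrete, here is a minimal, self-contained sketch in plain C with pthreads. It is not QEMU code: page_desc, page_find, page_add_tb, toy_tb and the fixed-size page table are hypothetical stand-ins invented for this example. It only illustrates the pattern the message describes, namely that each page descriptor carries its own lock, so threads touching different pages no longer serialize on a single global tb_lock.

/*
 * Illustrative sketch only, not QEMU code.  Each page descriptor has
 * its own mutex protecting only that page's TB list.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

#define TOY_PAGE_BITS  12
#define TOY_PAGE_COUNT 1024                 /* fixed-size toy page table */

struct toy_tb {
    struct toy_tb *page_next;               /* next TB on the same page */
    uintptr_t pc;
};

struct page_desc {
    pthread_mutex_t lock;                   /* protects only this page's TB list */
    struct toy_tb *first_tb;
};

static struct page_desc page_table[TOY_PAGE_COUNT];

static void page_table_init(void)
{
    for (size_t i = 0; i < TOY_PAGE_COUNT; i++) {
        pthread_mutex_init(&page_table[i].lock, NULL);
        page_table[i].first_tb = NULL;
    }
}

/* Map an address to its (toy) page descriptor. */
static struct page_desc *page_find(uintptr_t addr)
{
    return &page_table[(addr >> TOY_PAGE_BITS) % TOY_PAGE_COUNT];
}

/*
 * Register a TB on its page.  Only the descriptor of the affected page
 * is locked; concurrent callers working on other pages do not contend.
 */
static void page_add_tb(struct toy_tb *tb)
{
    struct page_desc *pd = page_find(tb->pc);

    pthread_mutex_lock(&pd->lock);
    tb->page_next = pd->first_tb;
    pd->first_tb = tb;
    pthread_mutex_unlock(&pd->lock);
}

In the actual series the per-page locks live in the page descriptors in accel/tcg/translate-all.c, and operations that span several pages take the locks in address order to avoid deadlock; the sketch above skips all of that.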
Diffstat (limited to 'accel/tcg/cpu-exec.c')
-rw-r--r--  accel/tcg/cpu-exec.c | 34
1 file changed, 8 insertions(+), 26 deletions(-)
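One more note from the commit message is worth illustrating before the diff: tb_lock is removed from notdirty_mem_write by passing an already-locked page_collection down to tb_invalidate_phys_page_fast. The sketch below shows that caller-locks hand-off pattern with hypothetical names (this page_collection is a toy struct, and page_lock_of, invalidate_tbs_locked and toy_notdirty_write are invented for the example); it assumes a bounded toy page table and is not the actual implementation.

/*
 * Sketch of the hand-off pattern: the caller locks every page it will
 * touch, in ascending address order to avoid deadlock, and passes the
 * locked set to the callee, which therefore takes no global lock.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

#define TOY_PAGE_BITS   12
#define TOY_PAGE_SIZE   ((uintptr_t)1 << TOY_PAGE_BITS)
#define TOY_PAGE_COUNT  1024
#define TOY_MAX_SPAN    16              /* toy limit: pages one write may span */

static pthread_mutex_t page_locks[TOY_PAGE_COUNT];

static void page_locks_init(void)
{
    for (size_t i = 0; i < TOY_PAGE_COUNT; i++) {
        pthread_mutex_init(&page_locks[i], NULL);
    }
}

static pthread_mutex_t *page_lock_of(uintptr_t addr)
{
    return &page_locks[(addr >> TOY_PAGE_BITS) % TOY_PAGE_COUNT];
}

/* A set of pages whose locks the caller already holds. */
struct page_collection {
    pthread_mutex_t *locks[TOY_MAX_SPAN];
    size_t count;
};

/* Lock every page overlapping [start, end), in ascending address order. */
static void page_collection_lock(struct page_collection *pc,
                                 uintptr_t start, uintptr_t end)
{
    pc->count = 0;
    for (uintptr_t a = start & ~(TOY_PAGE_SIZE - 1); a < end; a += TOY_PAGE_SIZE) {
        pthread_mutex_lock(page_lock_of(a));
        pc->locks[pc->count++] = page_lock_of(a);
    }
}

static void page_collection_unlock(struct page_collection *pc)
{
    for (size_t i = 0; i < pc->count; i++) {
        pthread_mutex_unlock(pc->locks[i]);
    }
}

/* Callee: relies on the caller holding the page locks; takes none itself. */
static void invalidate_tbs_locked(struct page_collection *pc,
                                  uintptr_t start, uintptr_t end)
{
    (void)pc; (void)start; (void)end;   /* walk the pages and drop stale TBs */
}

/* The write-notification path: lock once, hand the locked set down, unlock. */
static void toy_notdirty_write(uintptr_t addr, size_t len)
{
    struct page_collection pc;

    page_collection_lock(&pc, addr, addr + len);
    invalidate_tbs_locked(&pc, addr, addr + len);   /* no global lock here */
    page_collection_unlock(&pc);
}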
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c482008..c738b7f 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -212,20 +212,20 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        We only end up here when an existing TB is too long.  */
     cflags |= MIN(max_cycles, CF_COUNT_MASK);
 
-    tb_lock();
+    mmap_lock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
                      orig_tb->flags, cflags);
     tb->orig_tb = orig_tb;
-    tb_unlock();
+    mmap_unlock();
 
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
 
-    tb_lock();
+    mmap_lock();
     tb_phys_invalidate(tb, -1);
+    mmap_unlock();
     tcg_tb_remove(tb);
-    tb_unlock();
 }
 #endif
@@ -244,9 +244,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
         mmap_lock();
-        tb_lock();
         tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
-        tb_unlock();
         mmap_unlock();
     }
@@ -261,15 +259,13 @@ void cpu_exec_step_atomic(CPUState *cpu)
         cpu_tb_exec(cpu, tb);
         cc->cpu_exec_exit(cpu);
     } else {
-        /* We may have exited due to another problem here, so we need
-         * to reset any tb_locks we may have taken but didn't release.
+        /*
          * The mmap_lock is dropped by tb_gen_code if it runs out of
          * memory.
          */
 #ifndef CONFIG_SOFTMMU
         tcg_debug_assert(!have_mmap_lock());
 #endif
-        tb_lock_reset();
         assert_no_pages_locked();
     }
@@ -398,20 +394,11 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
-    bool acquired_tb_lock = false;
 
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
-        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-         * taken outside tb_lock. As system emulation is currently
-         * single threaded the locks are NOPs.
-         */
         mmap_lock();
-        tb_lock();
-        acquired_tb_lock = true;
-
         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
-
         mmap_unlock();
         /* We add the TB in the virtual pc hash table for the fast lookup */
         atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -427,15 +414,8 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-        if (!acquired_tb_lock) {
-            tb_lock();
-            acquired_tb_lock = true;
-        }
         tb_add_jump(last_tb, tb_exit, tb);
     }
-    if (acquired_tb_lock) {
-        tb_unlock();
-    }
     return tb;
 }
@@ -710,7 +690,9 @@ int cpu_exec(CPUState *cpu)
     g_assert(cpu == current_cpu);
     g_assert(cc == CPU_GET_CLASS(cpu));
 #endif /* buggy compiler */
-    tb_lock_reset();
+#ifndef CONFIG_SOFTMMU
+    tcg_debug_assert(!have_mmap_lock());
+#endif
     if (qemu_mutex_iothread_locked()) {
         qemu_mutex_unlock_iothread();
     }
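The final hunk replaces tb_lock_reset() with an assertion that the mmap lock is not held once cpu_exec returns through its sigsetjmp, mirroring cpu_exec_step_atomic. A stand-alone sketch of that contract, with lock_acquire/lock_release/have_lock as hypothetical stand-ins for mmap_lock/mmap_unlock/have_mmap_lock and fault_and_unwind standing in for the unwinding paths, might look like this:

/*
 * Sketch (not QEMU code) of the invariant the assert encodes: any path
 * that siglongjmp()s back into the execution loop must drop the lock
 * first, so the loop can assert "lock not held" instead of quietly
 * resetting lock state the way tb_lock_reset() used to.
 */
#include <assert.h>
#include <pthread.h>
#include <setjmp.h>
#include <stdbool.h>

static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
static bool lock_held;
static sigjmp_buf exec_env;

static void lock_acquire(void) { pthread_mutex_lock(&mu); lock_held = true; }
static void lock_release(void) { lock_held = false; pthread_mutex_unlock(&mu); }
static bool have_lock(void)    { return lock_held; }

/* A fault path: it must release the lock before jumping out. */
static void fault_and_unwind(void)
{
    if (have_lock()) {
        lock_release();         /* mirrors the added mmap_unlock() calls */
    }
    siglongjmp(exec_env, 1);
}

int main(void)
{
    if (sigsetjmp(exec_env, 0) != 0) {
        /* Returned via siglongjmp: the lock must no longer be held. */
        assert(!have_lock());
        return 0;
    }

    lock_acquire();             /* e.g. around code generation */
    fault_and_unwind();         /* does not return */
    return 1;                   /* not reached */
}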