diff options
author | Emilio G. Cota <cota@braap.org> | 2016-06-27 15:02:06 -0400 |
---|---|---|
committer | Richard Henderson <rth@twiddle.net> | 2016-10-26 08:29:01 -0700 |
commit | 37b995f6e7a1cb6fa378c5cd4217b9dd9e1fc98b (patch) | |
tree | c9d3061fdcd06dbf63f4220e1eaec8b4ba583b91 /target-i386 | |
parent | ea97ebe89f7a879ea9aba90140e40c29b5cbd653 (diff) | |
download | qemu-37b995f6e7a1cb6fa378c5cd4217b9dd9e1fc98b.zip qemu-37b995f6e7a1cb6fa378c5cd4217b9dd9e1fc98b.tar.gz qemu-37b995f6e7a1cb6fa378c5cd4217b9dd9e1fc98b.tar.bz2 |
target-i386: remove helper_lock()
It's been superseded by the atomic helpers.
The use of the atomic helpers provides a significant performance and scalability
improvement. Below is the result of running the atomic_add-test microbenchmark with:
$ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n
, where $n is the number of threads and $r is the allowed range for the additions.
The scenarios measured are:
- atomic: implements x86' ADDL with the atomic_add helper (i.e. this patchset)
- cmpxchg: implement x86' ADDL with a TCG loop using the cmpxchg helper
- master: before this patchset
Results sorted in ascending range, i.e. descending degree of contention.
Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64
Opteron 6376 cores.
atomic_add-bench: 5000000 ops/thread, [0,1] range
25 ++---------+----------+---------+----------+----------+----------+---++
+ atomic +-E--+ + + + + + |
|cmpxchg +-H--+ |
20 +Emaster +-N--+ ++
|| |
|++ |
|| |
15 +++ ++
|N| |
|+| |
10 ++| ++
|+|+ |
| | -+E+------ +++ ---+E+------+E+------+E+-----+E+------+E|
|+E+E+- +++ +E+------+E+-- |
5 ++|+ ++
|+N+H+--- +++ |
++++N+--+H++----+++ + +++ --++H+------+H+------+H++----+H+---+--- |
0 ++---------+-----H----+---H-----+----------+----------+----------+---H+
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 5000000 ops/thread, [0,2] range
25 ++---------+----------+---------+----------+----------+----------+---++
++atomic +-E--+ + + + + + |
|cmpxchg +-H--+ |
20 ++master +-N--+ ++
|E| |
|++ |
||E |
15 ++| ++
|N|| |
|+|| ---+E+------+E+-----+E+------+E|
10 ++| | ---+E+------+E+-----+E+--- +++ +++
||H+E+--+E+-- |
|+++++ |
| || |
5 ++|+H+-- +++ ++
|+N+ - ---+H+------+H+------ |
+ +N+--+H++----+H+---+--+H+----++H+--- + + +H+---+--+H|
0 ++---------+----------+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 5000000 ops/thread, [0,8] range
40 ++---------+----------+---------+----------+----------+----------+---++
++atomic +-E--+ + + + + + |
35 +cmpxchg +-H--+ ++
| master +-N--+ ---+E+------+E+------+E+-----+E+------+E|
30 ++| ---+E+-- +++ ++
| | -+E+--- |
25 ++E ---- +++ ++
|+++++ -+E+ |
20 +E+ E-- +++ ++
|H|+++ |
|+| +H+------- |
15 ++H+ ---+++ +H+------ ++
|N++H+-- +++--- +H+------++|
10 ++ +++ - +++ ---+H+ +++ +H+
| | +H+-----+H+------+H+-- |
5 ++| +++ ++
++N+N+--+N++ + + + + + |
0 ++---------+----------+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 5000000 ops/thread, [0,128] range
160 ++---------+---------+----------+---------+----------+----------+---++
+ atomic +-E--+ + + + + + |
140 +cmpxchg +-H--+ +++ +++ ++
| master +-N--+ E--------E------+E+------++|
120 ++ --| | +++ E+
| -- +++ +++ ++|
100 ++ - ++
| +++- +++ ++|
80 ++ -+E+ -+H+------+H+------H--------++
| ---- ---- +++ H|
| ---+E+-----+E+- ---+H+ ++|
60 ++ +E+--- +++ ---+H+--- ++
| --+++ ---+H+-- |
40 ++ +E+-+H+--- ++
| +H+ |
20 +EE+ ++
+N+ + + + + + + |
0 ++N-N---N--+---------+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 5000000 ops/thread, [0,1024] range
350 ++---------+---------+----------+---------+----------+----------+---++
+ atomic +-E--+ + + + + + |
300 +cmpxchg +-H--+ +++
| master +-N--+ +++ ||
| +++ | ----E|
250 ++ | ----E---- ++
| ----E--- | ---+H|
200 ++ -+E+--- +++ ---+H+--- ++
| ---- -+H+-- |
| +E+ +++ ---- +++ |
150 ++ ---+++ ---+H+- ++
| --- -+H+-- |
100 ++ ---+E+ ---- +++ ++
| +++ ---+E+-----+H+- |
| -+E+------+H+-- |
50 ++ +E+ ++
+EE+ + + + + + + |
0 ++N-N---N--+---------+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
hi-res: http://imgur.com/a/fMRmq
For master I stopped measuring master after 8 threads, because there is little
point in measuring the well-known performance collapse of a contended lock.
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-21-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
Diffstat (limited to 'target-i386')
-rw-r--r-- | target-i386/helper.h | 2 | ||||
-rw-r--r-- | target-i386/mem_helper.c | 33 | ||||
-rw-r--r-- | target-i386/translate.c | 15 |
3 files changed, 0 insertions, 50 deletions
diff --git a/target-i386/helper.h b/target-i386/helper.h index 729d4b6..4e859eb 100644 --- a/target-i386/helper.h +++ b/target-i386/helper.h @@ -1,8 +1,6 @@ DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) -DEF_HELPER_0(lock, void) -DEF_HELPER_0(unlock, void) DEF_HELPER_3(write_eflags, void, env, tl, i32) DEF_HELPER_1(read_eflags, tl, env) DEF_HELPER_2(divb_AL, void, env, tl) diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c index c4b5c5b..70f6766 100644 --- a/target-i386/mem_helper.c +++ b/target-i386/mem_helper.c @@ -25,39 +25,6 @@ #include "qemu/int128.h" #include "tcg.h" -/* broken thread support */ - -#if defined(CONFIG_USER_ONLY) -QemuMutex global_cpu_lock; - -void helper_lock(void) -{ - qemu_mutex_lock(&global_cpu_lock); -} - -void helper_unlock(void) -{ - qemu_mutex_unlock(&global_cpu_lock); -} - -void helper_lock_init(void) -{ - qemu_mutex_init(&global_cpu_lock); -} -#else -void helper_lock(void) -{ -} - -void helper_unlock(void) -{ -} - -void helper_lock_init(void) -{ -} -#endif - void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0) { uintptr_t ra = GETPC(); diff --git a/target-i386/translate.c b/target-i386/translate.c index cfa3956..927b366 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -4536,10 +4536,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, s->aflag = aflag; s->dflag = dflag; - /* lock generation */ - if (prefixes & PREFIX_LOCK) - gen_helper_lock(); - /* now check op code */ reswitch: switch(b) { @@ -8211,20 +8207,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, default: goto unknown_op; } - /* lock generation */ - if (s->prefix & PREFIX_LOCK) - gen_helper_unlock(); return s->pc; illegal_op: - if (s->prefix & PREFIX_LOCK) - gen_helper_unlock(); - /* XXX: ensure that no lock was generated */ gen_illegal_opcode(s); return s->pc; unknown_op: - if (s->prefix & PREFIX_LOCK) - gen_helper_unlock(); - /* XXX: ensure that no lock was generated */ gen_unknown_opcode(env, s); return s->pc; } @@ -8316,8 +8303,6 @@ void tcg_x86_init(void) offsetof(CPUX86State, bnd_regs[i].ub), bnd_regu_names[i]); } - - helper_lock_init(); } /* generate intermediate code for basic block 'tb'. */ |