diff options
49 files changed, 1157 insertions, 521 deletions
diff --git a/Makefile.objs b/Makefile.objs index 7301544..a8e0224 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -89,7 +89,7 @@ endif ####################################################################### # Target-independent parts used in system and user emulation -common-obj-y += tcg-runtime.o +common-obj-y += tcg-runtime.o cpus-common.o common-obj-y += hw/ common-obj-y += qom/ common-obj-y += disas/ diff --git a/block/blkreplay.c b/block/blkreplay.c index 30f9d5f..a741654 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -20,11 +20,6 @@ typedef struct Request { QEMUBH *bh; } Request; -/* Next request id. - This counter is global, because requests from different - block devices should not get overlapping ids. */ -static uint64_t request_id; - static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -84,7 +79,7 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs, static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -95,7 +90,7 @@ static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs, static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -106,7 +101,7 @@ static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -117,7 +112,7 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pdiscard(bs->file->bs, offset, count); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -127,7 +122,7 @@ static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_flush(bs->file->bs); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); diff --git a/bsd-user/main.c b/bsd-user/main.c index 0fb08e4..35125b7 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -67,23 +67,6 @@ int cpu_get_pic_interrupt(CPUX86State *env) } #endif -/* These are no-ops because we are not threadsafe. */ -static inline void cpu_exec_start(CPUArchState *env) -{ -} - -static inline void cpu_exec_end(CPUArchState *env) -{ -} - -static inline void start_exclusive(void) -{ -} - -static inline void end_exclusive(void) -{ -} - void fork_start(void) { } @@ -95,14 +78,6 @@ void fork_end(int child) } } -void cpu_list_lock(void) -{ -} - -void cpu_list_unlock(void) -{ -} - #ifdef TARGET_I386 /***********************************************************/ /* CPUX86 core interface */ @@ -172,7 +147,11 @@ void cpu_loop(CPUX86State *env) //target_siginfo_t info; for(;;) { + cpu_exec_start(cs); trapnr = cpu_exec(cs); + cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case 0x80: /* syscall from int $0x80 */ @@ -513,7 +492,10 @@ void cpu_loop(CPUSPARCState *env) //target_siginfo_t info; while (1) { + cpu_exec_start(cs); trapnr = cpu_exec(cs); + cpu_exec_end(cs); + process_queued_cpu_work(cs); switch (trapnr) { #ifndef TARGET_SPARC64 @@ -748,6 +730,7 @@ int main(int argc, char **argv) if (argc <= 1) usage(); + qemu_init_cpu_list(); module_call_init(MODULE_INIT_QOM); if ((envlist = envlist_create()) == NULL) { @@ -2988,7 +2988,7 @@ for i in $glib_modules; do if $pkg_config --atleast-version=$glib_req_ver $i; then glib_cflags=$($pkg_config --cflags $i) glib_libs=$($pkg_config --libs $i) - CFLAGS="$glib_cflags $CFLAGS" + QEMU_CFLAGS="$glib_cflags $QEMU_CFLAGS" LIBS="$glib_libs $LIBS" libs_qga="$glib_libs $libs_qga" else @@ -5195,7 +5195,6 @@ fi if test "$glib_subprocess" = "yes" ; then echo "CONFIG_HAS_GLIB_SUBPROCESS_TESTS=y" >> $config_host_mak fi -echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak if test "$gtk" = "yes" ; then echo "CONFIG_GTK=y" >> $config_host_mak echo "CONFIG_GTKABI=$gtkabi" >> $config_host_mak @@ -204,20 +204,16 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles, TranslationBlock *orig_tb, bool ignore_icount) { TranslationBlock *tb; - bool old_tb_flushed; /* Should never happen. We only end up here when an existing TB is too long. */ if (max_cycles > CF_COUNT_MASK) max_cycles = CF_COUNT_MASK; - old_tb_flushed = cpu->tb_flushed; - cpu->tb_flushed = false; tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags, max_cycles | CF_NOCACHE | (ignore_icount ? CF_IGNORE_ICOUNT : 0)); - tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb; - cpu->tb_flushed |= old_tb_flushed; + tb->orig_tb = orig_tb; /* execute the generated code */ trace_exec_tb_nocache(tb, tb->pc); cpu_tb_exec(cpu, tb); @@ -338,10 +334,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu, tb_lock(); have_tb_lock = true; } - /* Check if translation buffer has been flushed */ - if (cpu->tb_flushed) { - cpu->tb_flushed = false; - } else if (!tb->invalid) { + if (!tb->invalid) { tb_add_jump(last_tb, tb_exit, tb); } } @@ -606,7 +599,6 @@ int cpu_exec(CPUState *cpu) break; } - atomic_mb_set(&cpu->tb_flushed, false); /* reset before first TB lookup */ for(;;) { cpu_handle_interrupt(cpu, &last_tb); tb = tb_find(cpu, last_tb, tb_exit); diff --git a/cpus-common.c b/cpus-common.c new file mode 100644 index 0000000..3e11452 --- /dev/null +++ b/cpus-common.c @@ -0,0 +1,352 @@ +/* + * CPU thread main loop - common bits for user and system mode emulation + * + * Copyright (c) 2003-2005 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "exec/cpu-common.h" +#include "qom/cpu.h" +#include "sysemu/cpus.h" + +static QemuMutex qemu_cpu_list_lock; +static QemuCond exclusive_cond; +static QemuCond exclusive_resume; +static QemuCond qemu_work_cond; + +/* >= 1 if a thread is inside start_exclusive/end_exclusive. Written + * under qemu_cpu_list_lock, read with atomic operations. + */ +static int pending_cpus; + +void qemu_init_cpu_list(void) +{ + /* This is needed because qemu_init_cpu_list is also called by the + * child process in a fork. */ + pending_cpus = 0; + + qemu_mutex_init(&qemu_cpu_list_lock); + qemu_cond_init(&exclusive_cond); + qemu_cond_init(&exclusive_resume); + qemu_cond_init(&qemu_work_cond); +} + +void cpu_list_lock(void) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); +} + +void cpu_list_unlock(void) +{ + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +static bool cpu_index_auto_assigned; + +static int cpu_get_free_index(void) +{ + CPUState *some_cpu; + int cpu_index = 0; + + cpu_index_auto_assigned = true; + CPU_FOREACH(some_cpu) { + cpu_index++; + } + return cpu_index; +} + +static void finish_safe_work(CPUState *cpu) +{ + cpu_exec_start(cpu); + cpu_exec_end(cpu); +} + +void cpu_list_add(CPUState *cpu) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) { + cpu->cpu_index = cpu_get_free_index(); + assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); + } else { + assert(!cpu_index_auto_assigned); + } + QTAILQ_INSERT_TAIL(&cpus, cpu, node); + qemu_mutex_unlock(&qemu_cpu_list_lock); + + finish_safe_work(cpu); +} + +void cpu_list_remove(CPUState *cpu) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + if (!QTAILQ_IN_USE(cpu, node)) { + /* there is nothing to undo since cpu_exec_init() hasn't been called */ + qemu_mutex_unlock(&qemu_cpu_list_lock); + return; + } + + assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ))); + + QTAILQ_REMOVE(&cpus, cpu, node); + cpu->cpu_index = UNASSIGNED_CPU_INDEX; + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +struct qemu_work_item { + struct qemu_work_item *next; + run_on_cpu_func func; + void *data; + bool free, exclusive, done; +}; + +static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +{ + qemu_mutex_lock(&cpu->work_mutex); + if (cpu->queued_work_first == NULL) { + cpu->queued_work_first = wi; + } else { + cpu->queued_work_last->next = wi; + } + cpu->queued_work_last = wi; + wi->next = NULL; + wi->done = false; + qemu_mutex_unlock(&cpu->work_mutex); + + qemu_cpu_kick(cpu); +} + +void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data, + QemuMutex *mutex) +{ + struct qemu_work_item wi; + + if (qemu_cpu_is_self(cpu)) { + func(cpu, data); + return; + } + + wi.func = func; + wi.data = data; + wi.done = false; + wi.free = false; + wi.exclusive = false; + + queue_work_on_cpu(cpu, &wi); + while (!atomic_mb_read(&wi.done)) { + CPUState *self_cpu = current_cpu; + + qemu_cond_wait(&qemu_work_cond, mutex); + current_cpu = self_cpu; + } +} + +void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) +{ + struct qemu_work_item *wi; + + wi = g_malloc0(sizeof(struct qemu_work_item)); + wi->func = func; + wi->data = data; + wi->free = true; + + queue_work_on_cpu(cpu, wi); +} + +/* Wait for pending exclusive operations to complete. The CPU list lock + must be held. */ +static inline void exclusive_idle(void) +{ + while (pending_cpus) { + qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock); + } +} + +/* Start an exclusive operation. + Must only be called from outside cpu_exec. */ +void start_exclusive(void) +{ + CPUState *other_cpu; + int running_cpus; + + qemu_mutex_lock(&qemu_cpu_list_lock); + exclusive_idle(); + + /* Make all other cpus stop executing. */ + atomic_set(&pending_cpus, 1); + + /* Write pending_cpus before reading other_cpu->running. */ + smp_mb(); + running_cpus = 0; + CPU_FOREACH(other_cpu) { + if (atomic_read(&other_cpu->running)) { + other_cpu->has_waiter = true; + running_cpus++; + qemu_cpu_kick(other_cpu); + } + } + + atomic_set(&pending_cpus, running_cpus + 1); + while (pending_cpus > 1) { + qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock); + } + + /* Can release mutex, no one will enter another exclusive + * section until end_exclusive resets pending_cpus to 0. + */ + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +/* Finish an exclusive operation. */ +void end_exclusive(void) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + atomic_set(&pending_cpus, 0); + qemu_cond_broadcast(&exclusive_resume); + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +/* Wait for exclusive ops to finish, and begin cpu execution. */ +void cpu_exec_start(CPUState *cpu) +{ + atomic_set(&cpu->running, true); + + /* Write cpu->running before reading pending_cpus. */ + smp_mb(); + + /* 1. start_exclusive saw cpu->running == true and pending_cpus >= 1. + * After taking the lock we'll see cpu->has_waiter == true and run---not + * for long because start_exclusive kicked us. cpu_exec_end will + * decrement pending_cpus and signal the waiter. + * + * 2. start_exclusive saw cpu->running == false but pending_cpus >= 1. + * This includes the case when an exclusive item is running now. + * Then we'll see cpu->has_waiter == false and wait for the item to + * complete. + * + * 3. pending_cpus == 0. Then start_exclusive is definitely going to + * see cpu->running == true, and it will kick the CPU. + */ + if (unlikely(atomic_read(&pending_cpus))) { + qemu_mutex_lock(&qemu_cpu_list_lock); + if (!cpu->has_waiter) { + /* Not counted in pending_cpus, let the exclusive item + * run. Since we have the lock, just set cpu->running to true + * while holding it; no need to check pending_cpus again. + */ + atomic_set(&cpu->running, false); + exclusive_idle(); + /* Now pending_cpus is zero. */ + atomic_set(&cpu->running, true); + } else { + /* Counted in pending_cpus, go ahead and release the + * waiter at cpu_exec_end. + */ + } + qemu_mutex_unlock(&qemu_cpu_list_lock); + } +} + +/* Mark cpu as not executing, and release pending exclusive ops. */ +void cpu_exec_end(CPUState *cpu) +{ + atomic_set(&cpu->running, false); + + /* Write cpu->running before reading pending_cpus. */ + smp_mb(); + + /* 1. start_exclusive saw cpu->running == true. Then it will increment + * pending_cpus and wait for exclusive_cond. After taking the lock + * we'll see cpu->has_waiter == true. + * + * 2. start_exclusive saw cpu->running == false but here pending_cpus >= 1. + * This includes the case when an exclusive item started after setting + * cpu->running to false and before we read pending_cpus. Then we'll see + * cpu->has_waiter == false and not touch pending_cpus. The next call to + * cpu_exec_start will run exclusive_idle if still necessary, thus waiting + * for the item to complete. + * + * 3. pending_cpus == 0. Then start_exclusive is definitely going to + * see cpu->running == false, and it can ignore this CPU until the + * next cpu_exec_start. + */ + if (unlikely(atomic_read(&pending_cpus))) { + qemu_mutex_lock(&qemu_cpu_list_lock); + if (cpu->has_waiter) { + cpu->has_waiter = false; + atomic_set(&pending_cpus, pending_cpus - 1); + if (pending_cpus == 1) { + qemu_cond_signal(&exclusive_cond); + } + } + qemu_mutex_unlock(&qemu_cpu_list_lock); + } +} + +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) +{ + struct qemu_work_item *wi; + + wi = g_malloc0(sizeof(struct qemu_work_item)); + wi->func = func; + wi->data = data; + wi->free = true; + wi->exclusive = true; + + queue_work_on_cpu(cpu, wi); +} + +void process_queued_cpu_work(CPUState *cpu) +{ + struct qemu_work_item *wi; + + if (cpu->queued_work_first == NULL) { + return; + } + + qemu_mutex_lock(&cpu->work_mutex); + while (cpu->queued_work_first != NULL) { + wi = cpu->queued_work_first; + cpu->queued_work_first = wi->next; + if (!cpu->queued_work_first) { + cpu->queued_work_last = NULL; + } + qemu_mutex_unlock(&cpu->work_mutex); + if (wi->exclusive) { + /* Running work items outside the BQL avoids the following deadlock: + * 1) start_exclusive() is called with the BQL taken while another + * CPU is running; 2) cpu_exec in the other CPU tries to takes the + * BQL, so it goes to sleep; start_exclusive() is sleeping too, so + * neither CPU can proceed. + */ + qemu_mutex_unlock_iothread(); + start_exclusive(); + wi->func(cpu, wi->data); + end_exclusive(); + qemu_mutex_lock_iothread(); + } else { + wi->func(cpu, wi->data); + } + qemu_mutex_lock(&cpu->work_mutex); + if (wi->free) { + g_free(wi); + } else { + atomic_mb_set(&wi->done, true); + } + } + qemu_mutex_unlock(&cpu->work_mutex); + qemu_cond_broadcast(&qemu_work_cond); +} @@ -557,9 +557,8 @@ static const VMStateDescription vmstate_timers = { } }; -static void cpu_throttle_thread(void *opaque) +static void cpu_throttle_thread(CPUState *cpu, void *opaque) { - CPUState *cpu = opaque; double pct; double throttle_ratio; long sleeptime_ns; @@ -589,7 +588,7 @@ static void cpu_throttle_timer_tick(void *opaque) } CPU_FOREACH(cpu) { if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) { - async_run_on_cpu(cpu, cpu_throttle_thread, cpu); + async_run_on_cpu(cpu, cpu_throttle_thread, NULL); } } @@ -751,6 +750,7 @@ static int do_vm_stop(RunState state) } bdrv_drain_all(); + replay_disable_events(); ret = blk_flush_all(); return ret; @@ -903,79 +903,21 @@ static QemuThread io_thread; static QemuCond qemu_cpu_cond; /* system init */ static QemuCond qemu_pause_cond; -static QemuCond qemu_work_cond; void qemu_init_cpu_loop(void) { qemu_init_sigbus(); qemu_cond_init(&qemu_cpu_cond); qemu_cond_init(&qemu_pause_cond); - qemu_cond_init(&qemu_work_cond); qemu_cond_init(&qemu_io_proceeded_cond); qemu_mutex_init(&qemu_global_mutex); qemu_thread_get_self(&io_thread); } -void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) +void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) { - struct qemu_work_item wi; - - if (qemu_cpu_is_self(cpu)) { - func(data); - return; - } - - wi.func = func; - wi.data = data; - wi.free = false; - - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = &wi; - } else { - cpu->queued_work_last->next = &wi; - } - cpu->queued_work_last = &wi; - wi.next = NULL; - wi.done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); - while (!atomic_mb_read(&wi.done)) { - CPUState *self_cpu = current_cpu; - - qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex); - current_cpu = self_cpu; - } -} - -void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) -{ - struct qemu_work_item *wi; - - if (qemu_cpu_is_self(cpu)) { - func(data); - return; - } - - wi = g_malloc0(sizeof(struct qemu_work_item)); - wi->func = func; - wi->data = data; - wi->free = true; - - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = wi; - } else { - cpu->queued_work_last->next = wi; - } - cpu->queued_work_last = wi; - wi->next = NULL; - wi->done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); + do_run_on_cpu(cpu, func, data, &qemu_global_mutex); } static void qemu_kvm_destroy_vcpu(CPUState *cpu) @@ -990,34 +932,6 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu) { } -static void flush_queued_work(CPUState *cpu) -{ - struct qemu_work_item *wi; - - if (cpu->queued_work_first == NULL) { - return; - } - - qemu_mutex_lock(&cpu->work_mutex); - while (cpu->queued_work_first != NULL) { - wi = cpu->queued_work_first; - cpu->queued_work_first = wi->next; - if (!cpu->queued_work_first) { - cpu->queued_work_last = NULL; - } - qemu_mutex_unlock(&cpu->work_mutex); - wi->func(wi->data); - qemu_mutex_lock(&cpu->work_mutex); - if (wi->free) { - g_free(wi); - } else { - atomic_mb_set(&wi->done, true); - } - } - qemu_mutex_unlock(&cpu->work_mutex); - qemu_cond_broadcast(&qemu_work_cond); -} - static void qemu_wait_io_event_common(CPUState *cpu) { if (cpu->stop) { @@ -1025,7 +939,7 @@ static void qemu_wait_io_event_common(CPUState *cpu) cpu->stopped = true; qemu_cond_broadcast(&qemu_pause_cond); } - flush_queued_work(cpu); + process_queued_cpu_work(cpu); cpu->thread_kicked = false; } @@ -1544,7 +1458,9 @@ static int tcg_cpu_exec(CPUState *cpu) cpu->icount_decr.u16.low = decr; cpu->icount_extra = count; } + cpu_exec_start(cpu); ret = cpu_exec(cpu); + cpu_exec_end(cpu); #ifdef CONFIG_PROFILER tcg_time += profile_getclock() - ti; #endif diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela new file mode 100644 index 0000000..c91cfca --- /dev/null +++ b/docs/tcg-exclusive.promela @@ -0,0 +1,225 @@ +/* + * This model describes the implementation of exclusive sections in + * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start, + * cpu_exec_end). + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify it: + * spin -a docs/tcg-exclusive.promela + * gcc pan.c -O2 + * ./a.out -a + * + * Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX, + * TEST_EXPENSIVE. + */ + +// Define the missing parameters for the model +#ifndef N_CPUS +#define N_CPUS 2 +#warning defaulting to 2 CPU processes +#endif + +// the expensive test is not so expensive for <= 2 CPUs +// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs +// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM +#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX) +#define TEST_EXPENSIVE +#endif + +#ifndef N_EXCLUSIVE +# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE +# define N_EXCLUSIVE 2 +# warning defaulting to 2 concurrent exclusive sections +# else +# define N_EXCLUSIVE 1 +# warning defaulting to 1 concurrent exclusive sections +# endif +#endif +#ifndef N_CYCLES +# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE +# define N_CYCLES 2 +# warning defaulting to 2 CPU cycles +# else +# define N_CYCLES 1 +# warning defaulting to 1 CPU cycles +# endif +#endif + + +// synchronization primitives. condition variables require a +// process-local "cond_t saved;" variable. + +#define mutex_t byte +#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 } +#define MUTEX_UNLOCK(m) m = 0 + +#define cond_t int +#define COND_WAIT(c, m) { \ + saved = c; \ + MUTEX_UNLOCK(m); \ + c != saved -> MUTEX_LOCK(m); \ + } +#define COND_BROADCAST(c) c++ + +// this is the logic from cpus-common.c + +mutex_t mutex; +cond_t exclusive_cond; +cond_t exclusive_resume; +byte pending_cpus; + +byte running[N_CPUS]; +byte has_waiter[N_CPUS]; + +#define exclusive_idle() \ + do \ + :: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \ + :: else -> break; \ + od + +#define start_exclusive() \ + MUTEX_LOCK(mutex); \ + exclusive_idle(); \ + pending_cpus = 1; \ + \ + i = 0; \ + do \ + :: i < N_CPUS -> { \ + if \ + :: running[i] -> has_waiter[i] = 1; pending_cpus++; \ + :: else -> skip; \ + fi; \ + i++; \ + } \ + :: else -> break; \ + od; \ + \ + do \ + :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \ + :: else -> break; \ + od; \ + MUTEX_UNLOCK(mutex); + +#define end_exclusive() \ + MUTEX_LOCK(mutex); \ + pending_cpus = 0; \ + COND_BROADCAST(exclusive_resume); \ + MUTEX_UNLOCK(mutex); + +#ifdef USE_MUTEX +// Simple version using mutexes +#define cpu_exec_start(id) \ + MUTEX_LOCK(mutex); \ + exclusive_idle(); \ + running[id] = 1; \ + MUTEX_UNLOCK(mutex); + +#define cpu_exec_end(id) \ + MUTEX_LOCK(mutex); \ + running[id] = 0; \ + if \ + :: pending_cpus -> { \ + pending_cpus--; \ + if \ + :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ + :: else -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); +#else +// Wait-free fast path, only needs mutex when concurrent with +// an exclusive section +#define cpu_exec_start(id) \ + running[id] = 1; \ + if \ + :: pending_cpus -> { \ + MUTEX_LOCK(mutex); \ + if \ + :: !has_waiter[id] -> { \ + running[id] = 0; \ + exclusive_idle(); \ + running[id] = 1; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); \ + } \ + :: else -> skip; \ + fi; + +#define cpu_exec_end(id) \ + running[id] = 0; \ + if \ + :: pending_cpus -> { \ + MUTEX_LOCK(mutex); \ + if \ + :: has_waiter[id] -> { \ + has_waiter[id] = 0; \ + pending_cpus--; \ + if \ + :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ + :: else -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); \ + } \ + :: else -> skip; \ + fi +#endif + +// Promela processes + +byte done_cpu; +byte in_cpu; +active[N_CPUS] proctype cpu() +{ + byte id = _pid % N_CPUS; + byte cycles = 0; + cond_t saved; + + do + :: cycles == N_CYCLES -> break; + :: else -> { + cycles++; + cpu_exec_start(id) + in_cpu++; + done_cpu++; + in_cpu--; + cpu_exec_end(id) + } + od; +} + +byte done_exclusive; +byte in_exclusive; +active[N_EXCLUSIVE] proctype exclusive() +{ + cond_t saved; + byte i; + + start_exclusive(); + in_exclusive = 1; + done_exclusive++; + in_exclusive = 0; + end_exclusive(); +} + +#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE) +#define SAFETY !(in_exclusive && in_cpu) + +never { /* ! ([] SAFETY && <> [] LIVENESS) */ + do + // once the liveness property is satisfied, this is not executable + // and the never clause is not accepted + :: ! LIVENESS -> accept_liveness: skip + :: 1 -> assert(SAFETY) + od; +} @@ -598,36 +598,11 @@ AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) } #endif -static bool cpu_index_auto_assigned; - -static int cpu_get_free_index(void) -{ - CPUState *some_cpu; - int cpu_index = 0; - - cpu_index_auto_assigned = true; - CPU_FOREACH(some_cpu) { - cpu_index++; - } - return cpu_index; -} - void cpu_exec_exit(CPUState *cpu) { CPUClass *cc = CPU_GET_CLASS(cpu); - cpu_list_lock(); - if (!QTAILQ_IN_USE(cpu, node)) { - /* there is nothing to undo since cpu_exec_init() hasn't been called */ - cpu_list_unlock(); - return; - } - - assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ))); - - QTAILQ_REMOVE(&cpus, cpu, node); - cpu->cpu_index = UNASSIGNED_CPU_INDEX; - cpu_list_unlock(); + cpu_list_remove(cpu); if (cc->vmsd != NULL) { vmstate_unregister(NULL, cc->vmsd, cpu); @@ -663,15 +638,7 @@ void cpu_exec_init(CPUState *cpu, Error **errp) object_ref(OBJECT(cpu->memory)); #endif - cpu_list_lock(); - if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) { - cpu->cpu_index = cpu_get_free_index(); - assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); - } else { - assert(!cpu_index_auto_assigned); - } - QTAILQ_INSERT_TAIL(&cpus, cpu, node); - cpu_list_unlock(); + cpu_list_add(cpu); #ifndef CONFIG_USER_ONLY if (qdev_get_vmsd(DEVICE(cpu)) == NULL) { diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index a91a179..023de52 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -21,6 +21,7 @@ */ #include "qemu/osdep.h" #include "hw/i386/amd_iommu.h" +#include "qemu/error-report.h" #include "trace.h" /* used AMD-Vi MMIO registers */ @@ -1066,13 +1067,18 @@ static const MemoryRegionOps mmio_mem_ops = { } }; -static void amdvi_iommu_notify_started(MemoryRegion *iommu) +static void amdvi_iommu_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); - hw_error("device %02x.%02x.%x requires iommu notifier which is not " - "currently supported", as->bus_num, PCI_SLOT(as->devfn), - PCI_FUNC(as->devfn)); + if (new & IOMMU_NOTIFIER_MAP) { + error_report("device %02x.%02x.%x requires iommu notifier which is not " + "currently supported", as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn)); + exit(1); + } } static void amdvi_init(AMDVIState *s) @@ -1080,7 +1086,7 @@ static void amdvi_init(AMDVIState *s) amdvi_iotlb_reset(s); s->iommu_ops.translate = amdvi_translate; - s->iommu_ops.notify_started = amdvi_iommu_notify_started; + s->iommu_ops.notify_flag_changed = amdvi_iommu_notify_flag_changed; s->devtab_len = 0; s->cmdbuf_len = 0; s->cmdbuf_head = 0; diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index d6e02c8..9f4e64a 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1974,14 +1974,20 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, return ret; } -static void vtd_iommu_notify_started(MemoryRegion *iommu) +static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); - hw_error("Device at bus %s addr %02x.%d requires iommu notifier which " - "is currently not supported by intel-iommu emulation", - vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), - PCI_FUNC(vtd_as->devfn)); + if (new & IOMMU_NOTIFIER_MAP) { + error_report("Device at bus %s addr %02x.%d requires iommu " + "notifier which is currently not supported by " + "intel-iommu emulation", + vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + exit(1); + } } static const VMStateDescription vtd_vmstate = { @@ -2348,7 +2354,7 @@ static void vtd_init(IntelIOMMUState *s) memset(s->womask, 0, DMAR_REG_SIZE); s->iommu_ops.translate = vtd_iommu_translate; - s->iommu_ops.notify_started = vtd_iommu_notify_started; + s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed; s->root = 0; s->root_extended = false; s->dmar_enabled = false; diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index f57fed1..c016e63 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -125,7 +125,7 @@ static void kvm_apic_vapic_base_update(APICCommonState *s) } } -static void kvm_apic_put(void *data) +static void kvm_apic_put(CPUState *cs, void *data) { APICCommonState *s = data; struct kvm_lapic_state kapic; @@ -146,10 +146,9 @@ static void kvm_apic_post_load(APICCommonState *s) run_on_cpu(CPU(s->cpu), kvm_apic_put, s); } -static void do_inject_external_nmi(void *data) +static void do_inject_external_nmi(CPUState *cpu, void *data) { APICCommonState *s = data; - CPUState *cpu = CPU(s->cpu); uint32_t lvt; int ret; diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index a1cd9b5..74a549b 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -483,7 +483,7 @@ typedef struct VAPICEnableTPRReporting { bool enable; } VAPICEnableTPRReporting; -static void vapic_do_enable_tpr_reporting(void *data) +static void vapic_do_enable_tpr_reporting(CPUState *cpu, void *data) { VAPICEnableTPRReporting *info = data; @@ -734,10 +734,10 @@ static void vapic_realize(DeviceState *dev, Error **errp) nb_option_roms++; } -static void do_vapic_enable(void *data) +static void do_vapic_enable(CPUState *cs, void *data) { VAPICROMState *s = data; - X86CPU *cpu = X86_CPU(first_cpu); + X86CPU *cpu = X86_CPU(cs); static const uint8_t enabled = 1; cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled), diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 31791b0..fd9208f 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -416,7 +416,7 @@ static void ioapic_realize(DeviceState *dev, Error **errp) } static Property ioapic_properties[] = { - DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x11), + DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x20), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c index 22c584e..8e16f65 100644 --- a/hw/ppc/ppce500_spin.c +++ b/hw/ppc/ppce500_spin.c @@ -54,11 +54,6 @@ typedef struct SpinState { SpinInfo spin[MAX_CPUS]; } SpinState; -typedef struct spin_kick { - PowerPCCPU *cpu; - SpinInfo *spin; -} SpinKick; - static void spin_reset(void *opaque) { SpinState *s = opaque; @@ -89,16 +84,15 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env, env->tlb_dirty = true; } -static void spin_kick(void *data) +static void spin_kick(CPUState *cs, void *data) { - SpinKick *kick = data; - CPUState *cpu = CPU(kick->cpu); - CPUPPCState *env = &kick->cpu->env; - SpinInfo *curspin = kick->spin; + PowerPCCPU *cpu = POWERPC_CPU(cs); + CPUPPCState *env = &cpu->env; + SpinInfo *curspin = data; hwaddr map_size = 64 * 1024 * 1024; hwaddr map_start; - cpu_synchronize_state(cpu); + cpu_synchronize_state(cs); stl_p(&curspin->pir, env->spr[SPR_BOOKE_PIR]); env->nip = ldq_p(&curspin->addr) & (map_size - 1); env->gpr[3] = ldq_p(&curspin->r3); @@ -112,10 +106,10 @@ static void spin_kick(void *data) map_start = ldq_p(&curspin->addr) & ~(map_size - 1); mmubooke_create_initial_mapping(env, 0, map_start, map_size); - cpu->halted = 0; - cpu->exception_index = -1; - cpu->stopped = false; - qemu_cpu_kick(cpu); + cs->halted = 0; + cs->exception_index = -1; + cs->stopped = false; + qemu_cpu_kick(cs); } static void spin_write(void *opaque, hwaddr addr, uint64_t value, @@ -153,12 +147,7 @@ static void spin_write(void *opaque, hwaddr addr, uint64_t value, if (!(ldq_p(&curspin->addr) & 1)) { /* run CPU */ - SpinKick kick = { - .cpu = POWERPC_CPU(cpu), - .spin = curspin, - }; - - run_on_cpu(cpu, spin_kick, &kick); + run_on_cpu(cpu, spin_kick, curspin); } } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 648576e..14b6821 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2132,10 +2132,8 @@ static void spapr_machine_finalizefn(Object *obj) g_free(spapr->kvm_type); } -static void ppc_cpu_do_nmi_on_cpu(void *arg) +static void ppc_cpu_do_nmi_on_cpu(CPUState *cs, void *arg) { - CPUState *cs = arg; - cpu_synchronize_state(cs); ppc_cpu_do_system_reset(cs); } @@ -2145,7 +2143,7 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error **errp) CPUState *cs; CPU_FOREACH(cs) { - async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, cs); + async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, NULL); } } diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 290a712..c5e7e8c 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -13,19 +13,18 @@ #include "kvm_ppc.h" struct SPRSyncState { - CPUState *cs; int spr; target_ulong value; target_ulong mask; }; -static void do_spr_sync(void *arg) +static void do_spr_sync(CPUState *cs, void *arg) { struct SPRSyncState *s = arg; - PowerPCCPU *cpu = POWERPC_CPU(s->cs); + PowerPCCPU *cpu = POWERPC_CPU(cs); CPUPPCState *env = &cpu->env; - cpu_synchronize_state(s->cs); + cpu_synchronize_state(cs); env->spr[s->spr] &= ~s->mask; env->spr[s->spr] |= s->value; } @@ -34,7 +33,6 @@ static void set_spr(CPUState *cs, int spr, target_ulong value, target_ulong mask) { struct SPRSyncState s = { - .cs = cs, .spr = spr, .value = value, .mask = mask @@ -909,17 +907,17 @@ static target_ulong cas_get_option_vector(int vector, target_ulong table) } typedef struct { - PowerPCCPU *cpu; uint32_t cpu_version; Error *err; } SetCompatState; -static void do_set_compat(void *arg) +static void do_set_compat(CPUState *cs, void *arg) { + PowerPCCPU *cpu = POWERPC_CPU(cs); SetCompatState *s = arg; - cpu_synchronize_state(CPU(s->cpu)); - ppc_set_compat(s->cpu, s->cpu_version, &s->err); + cpu_synchronize_state(cs); + ppc_set_compat(cpu, s->cpu_version, &s->err); } #define get_compat_level(cpuver) ( \ @@ -1015,7 +1013,6 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_, if (old_cpu_version != cpu_version) { CPU_FOREACH(cs) { SetCompatState s = { - .cpu = POWERPC_CPU(cs), .cpu_version = cpu_version, .err = NULL, }; diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index f20b0b8..ae30bbe 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -156,14 +156,17 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu) return 1ULL << tcet->page_shift; } -static void spapr_tce_notify_started(MemoryRegion *iommu) +static void spapr_tce_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { - spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true); -} + struct sPAPRTCETable *tbl = container_of(iommu, sPAPRTCETable, iommu); -static void spapr_tce_notify_stopped(MemoryRegion *iommu) -{ - spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false); + if (old == IOMMU_NOTIFIER_NONE && new != IOMMU_NOTIFIER_NONE) { + spapr_tce_set_need_vfio(tbl, true); + } else if (old != IOMMU_NOTIFIER_NONE && new == IOMMU_NOTIFIER_NONE) { + spapr_tce_set_need_vfio(tbl, false); + } } static int spapr_tce_table_post_load(void *opaque, int version_id) @@ -246,8 +249,7 @@ static const VMStateDescription vmstate_spapr_tce_table = { static MemoryRegionIOMMUOps spapr_iommu_ops = { .translate = spapr_tce_translate_iommu, .get_min_page_size = spapr_tce_get_min_page_size, - .notify_started = spapr_tce_notify_started, - .notify_stopped = spapr_tce_notify_stopped, + .notify_flag_changed = spapr_tce_notify_flag_changed, }; static int spapr_tce_table_realize(DeviceState *dev) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b313e7c..29188a1 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -293,11 +293,10 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -static void vfio_iommu_map_notify(Notifier *n, void *data) +static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; - IOMMUTLBEntry *iotlb = data; hwaddr iova = iotlb->iova + giommu->iommu_offset; MemoryRegion *mr; hwaddr xlat; @@ -454,6 +453,7 @@ static void vfio_listener_region_add(MemoryListener *listener, section->offset_within_region; giommu->container = container; giommu->n.notify = vfio_iommu_map_notify; + giommu->n.notifier_flags = IOMMU_NOTIFIER_ALL; QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 952bcfe..869ba41 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -23,6 +23,11 @@ typedef struct CPUListState { FILE *file; } CPUListState; +/* The CPU list lock nests outside tb_lock/tb_unlock. */ +void qemu_init_cpu_list(void); +void cpu_list_lock(void); +void cpu_list_unlock(void); + #if !defined(CONFIG_USER_ONLY) enum device_endian { diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 008e09a..336a57c 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -56,17 +56,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc, target_ulong cs_base, uint32_t flags, int cflags); -#if defined(CONFIG_USER_ONLY) -void cpu_list_lock(void); -void cpu_list_unlock(void); -#else -static inline void cpu_list_unlock(void) -{ -} -static inline void cpu_list_lock(void) -{ -} -#endif void cpu_exec_init(CPUState *cpu, Error **errp); void QEMU_NORETURN cpu_loop_exit(CPUState *cpu); diff --git a/include/exec/memory.h b/include/exec/memory.h index 3e4d416..10d7eac 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -67,6 +67,27 @@ struct IOMMUTLBEntry { IOMMUAccessFlags perm; }; +/* + * Bitmap for different IOMMUNotifier capabilities. Each notifier can + * register with one or multiple IOMMU Notifier capability bit(s). + */ +typedef enum { + IOMMU_NOTIFIER_NONE = 0, + /* Notify cache invalidations */ + IOMMU_NOTIFIER_UNMAP = 0x1, + /* Notify entry changes (newly created entries) */ + IOMMU_NOTIFIER_MAP = 0x2, +} IOMMUNotifierFlag; + +#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) + +struct IOMMUNotifier { + void (*notify)(struct IOMMUNotifier *notifier, IOMMUTLBEntry *data); + IOMMUNotifierFlag notifier_flags; + QLIST_ENTRY(IOMMUNotifier) node; +}; +typedef struct IOMMUNotifier IOMMUNotifier; + /* New-style MMIO accessors can indicate that the transaction failed. * A zero (MEMTX_OK) response means success; anything else is a failure * of some kind. The memory subsystem will bitwise-OR together results @@ -153,10 +174,10 @@ struct MemoryRegionIOMMUOps { IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write); /* Returns minimum supported page size */ uint64_t (*get_min_page_size)(MemoryRegion *iommu); - /* Called when the first notifier is set */ - void (*notify_started)(MemoryRegion *iommu); - /* Called when the last notifier is removed */ - void (*notify_stopped)(MemoryRegion *iommu); + /* Called when IOMMU Notifier flag changed */ + void (*notify_flag_changed)(MemoryRegion *iommu, + IOMMUNotifierFlag old_flags, + IOMMUNotifierFlag new_flags); }; typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -201,7 +222,8 @@ struct MemoryRegion { const char *name; unsigned ioeventfd_nb; MemoryRegionIoeventfd *ioeventfds; - NotifierList iommu_notify; + QLIST_HEAD(, IOMMUNotifier) iommu_notify; + IOMMUNotifierFlag iommu_notify_flags; }; /** @@ -607,6 +629,15 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr); /** * memory_region_notify_iommu: notify a change in an IOMMU translation entry. * + * The notification type will be decided by entry.perm bits: + * + * - For UNMAP (cache invalidation) notifies: set entry.perm to IOMMU_NONE. + * - For MAP (newly added entry) notifies: set entry.perm to the + * permission of the page (which is definitely !IOMMU_NONE). + * + * Note: for any IOMMU implementation, an in-place mapping change + * should be notified with an UNMAP followed by a MAP. + * * @mr: the memory region that was changed * @entry: the new entry in the IOMMU translation table. The entry * replaces all old entries for the same virtual I/O address range. @@ -620,11 +651,12 @@ void memory_region_notify_iommu(MemoryRegion *mr, * IOMMU translation entries. * * @mr: the memory region to observe - * @n: the notifier to be added; the notifier receives a pointer to an - * #IOMMUTLBEntry as the opaque value; the pointer ceases to be - * valid on exit from the notifier. + * @n: the IOMMUNotifier to be added; the notify callback receives a + * pointer to an #IOMMUTLBEntry as the opaque value; the pointer + * ceases to be valid on exit from the notifier. */ -void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n); +void memory_region_register_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n); /** * memory_region_iommu_replay: replay existing IOMMU translations to @@ -636,7 +668,8 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n); * @is_write: Whether to treat the replay as a translate "write" * through the iommu */ -void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write); +void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, + bool is_write); /** * memory_region_unregister_iommu_notifier: unregister a notifier for @@ -646,7 +679,8 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write); * needs to be called * @n: the notifier to be removed. */ -void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n); +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n); /** * memory_region_name: get a memory region's name @@ -1154,12 +1188,11 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr, hwaddr addr, uint64_t size); /** - * address_space_sync_dirty_bitmap: synchronize the dirty log for all memory + * memory_global_dirty_log_sync: synchronize the dirty log for all memory * - * Synchronizes the dirty page log for an entire address space. - * @as: the address space that contains the memory being synchronized + * Synchronizes the dirty page log for all address spaces. */ -void address_space_sync_dirty_bitmap(AddressSpace *as); +void memory_global_dirty_log_sync(void); /** * memory_region_transaction_begin: Start a transaction. diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index dce95d9..c7f17f2 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -38,7 +38,7 @@ struct TBContext { QemuMutex tb_lock; /* statistics */ - int tb_flush_count; + unsigned tb_flush_count; int tb_phys_invalidate_count; }; diff --git a/include/hw/compat.h b/include/hw/compat.h index a1d6694..46412b2 100644 --- a/include/hw/compat.h +++ b/include/hw/compat.h @@ -6,6 +6,10 @@ .driver = "virtio-pci",\ .property = "page-per-vq",\ .value = "on",\ + },{\ + .driver = "ioapic",\ + .property = "version",\ + .value = "0x11",\ }, #define HW_COMPAT_2_6 \ diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 94dfae3..c17602e 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -93,7 +93,7 @@ typedef struct VFIOGuestIOMMU { VFIOContainer *container; MemoryRegion *iommu; hwaddr iommu_offset; - Notifier n; + IOMMUNotifier n; QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h index 338d3a6..157698b 100644 --- a/include/qemu/compiler.h +++ b/include/qemu/compiler.h @@ -1,4 +1,8 @@ -/* public domain */ +/* compiler.h: macros to abstract away compiler specifics + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ #ifndef COMPILER_H #define COMPILER_H diff --git a/include/qom/cpu.h b/include/qom/cpu.h index ce0c406..22b54d6 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -232,13 +232,8 @@ struct kvm_run; #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS) /* work queue */ -struct qemu_work_item { - struct qemu_work_item *next; - void (*func)(void *data); - void *data; - int done; - bool free; -}; +typedef void (*run_on_cpu_func)(CPUState *cpu, void *data); +struct qemu_work_item; /** * CPUState: @@ -247,7 +242,9 @@ struct qemu_work_item { * @nr_threads: Number of threads within this CPU. * @numa_node: NUMA node this CPU is belonging to. * @host_tid: Host thread ID. - * @running: #true if CPU is currently running (usermode). + * @running: #true if CPU is currently running (lockless). + * @has_waiter: #true if a CPU is currently waiting for the cpu_exec_end; + * valid under cpu_list_lock. * @created: Indicates whether the CPU thread has been successfully created. * @interrupt_request: Indicates a pending interrupt request. * @halted: Nonzero if the CPU is in suspended state. @@ -257,7 +254,6 @@ struct qemu_work_item { * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this * CPU and return to its top level loop. - * @tb_flushed: Indicates the translation buffer has been flushed. * @singlestep_enabled: Flags for single-stepping. * @icount_extra: Instructions until next timer event. * @icount_decr: Number of cycles left, with interrupt flag in high bit. @@ -301,7 +297,7 @@ struct CPUState { #endif int thread_id; uint32_t host_tid; - bool running; + bool running, has_waiter; struct QemuCond *halt_cond; bool thread_kicked; bool created; @@ -310,7 +306,6 @@ struct CPUState { bool unplug; bool crash_occurred; bool exit_request; - bool tb_flushed; uint32_t interrupt_request; int singlestep_enabled; int64_t icount_extra; @@ -543,6 +538,18 @@ static inline int cpu_asidx_from_attrs(CPUState *cpu, MemTxAttrs attrs) #endif /** + * cpu_list_add: + * @cpu: The CPU to be added to the list of CPUs. + */ +void cpu_list_add(CPUState *cpu); + +/** + * cpu_list_remove: + * @cpu: The CPU to be removed from the list of CPUs. + */ +void cpu_list_remove(CPUState *cpu); + +/** * cpu_reset: * @cpu: The CPU whose state is to be reset. */ @@ -616,6 +623,18 @@ void qemu_cpu_kick(CPUState *cpu); bool cpu_is_stopped(CPUState *cpu); /** + * do_run_on_cpu: + * @cpu: The vCPU to run on. + * @func: The function to be executed. + * @data: Data to pass to the function. + * @mutex: Mutex to release while waiting for @func to run. + * + * Used internally in the implementation of run_on_cpu. + */ +void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data, + QemuMutex *mutex); + +/** * run_on_cpu: * @cpu: The vCPU to run on. * @func: The function to be executed. @@ -623,7 +642,7 @@ bool cpu_is_stopped(CPUState *cpu); * * Schedules the function @func for execution on the vCPU @cpu. */ -void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); +void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); /** * async_run_on_cpu: @@ -633,7 +652,21 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); * * Schedules the function @func for execution on the vCPU @cpu asynchronously. */ -void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); +void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); + +/** + * async_safe_run_on_cpu: + * @cpu: The vCPU to run on. + * @func: The function to be executed. + * @data: Data to pass to the function. + * + * Schedules the function @func for execution on the vCPU @cpu asynchronously, + * while all other vCPUs are sleeping. + * + * Unlike run_on_cpu and async_run_on_cpu, the function is run outside the + * BQL. + */ +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); /** * qemu_get_cpu: @@ -794,6 +827,49 @@ void cpu_remove(CPUState *cpu); void cpu_remove_sync(CPUState *cpu); /** + * process_queued_cpu_work() - process all items on CPU work queue + * @cpu: The CPU which work queue to process. + */ +void process_queued_cpu_work(CPUState *cpu); + +/** + * cpu_exec_start: + * @cpu: The CPU for the current thread. + * + * Record that a CPU has started execution and can be interrupted with + * cpu_exit. + */ +void cpu_exec_start(CPUState *cpu); + +/** + * cpu_exec_end: + * @cpu: The CPU for the current thread. + * + * Record that a CPU has stopped execution and exclusive sections + * can be executed without interrupting it. + */ +void cpu_exec_end(CPUState *cpu); + +/** + * start_exclusive: + * + * Wait for a concurrent exclusive section to end, and then start + * a section of work that is run while other CPUs are not running + * between cpu_exec_start and cpu_exec_end. CPUs that are running + * cpu_exec are exited immediately. CPUs that call cpu_exec_start + * during the exclusive section go to sleep until this CPU calls + * end_exclusive. + */ +void start_exclusive(void); + +/** + * end_exclusive: + * + * Concludes an exclusive execution section started by start_exclusive. + */ +void end_exclusive(void); + +/** * qemu_init_vcpu: * @cpu: The vCPU to initialize. * diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h index 0a88393..f80d6d2 100644 --- a/include/sysemu/replay.h +++ b/include/sysemu/replay.h @@ -105,6 +105,8 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint); /*! Disables storing events in the queue */ void replay_disable_events(void); +/*! Enables storing events in the queue */ +void replay_enable_events(void); /*! Returns true when saving events is enabled */ bool replay_events_enabled(void); /*! Adds bottom half event to the queue */ @@ -115,6 +117,8 @@ void replay_input_event(QemuConsole *src, InputEvent *evt); void replay_input_sync_event(void); /*! Adds block layer event to the queue */ void replay_block_event(QEMUBH *bh, uint64_t id); +/*! Returns ID for the next block event */ +uint64_t blkreplay_next_id(void); /* Character device */ @@ -1847,10 +1847,8 @@ void kvm_flush_coalesced_mmio_buffer(void) s->coalesced_flush_in_progress = false; } -static void do_kvm_cpu_synchronize_state(void *arg) +static void do_kvm_cpu_synchronize_state(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - if (!cpu->kvm_vcpu_dirty) { kvm_arch_get_registers(cpu); cpu->kvm_vcpu_dirty = true; @@ -1860,34 +1858,30 @@ static void do_kvm_cpu_synchronize_state(void *arg) void kvm_cpu_synchronize_state(CPUState *cpu) { if (!cpu->kvm_vcpu_dirty) { - run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu); + run_on_cpu(cpu, do_kvm_cpu_synchronize_state, NULL); } } -static void do_kvm_cpu_synchronize_post_reset(void *arg) +static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); cpu->kvm_vcpu_dirty = false; } void kvm_cpu_synchronize_post_reset(CPUState *cpu) { - run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, cpu); + run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, NULL); } -static void do_kvm_cpu_synchronize_post_init(void *arg) +static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); cpu->kvm_vcpu_dirty = false; } void kvm_cpu_synchronize_post_init(CPUState *cpu) { - run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, cpu); + run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, NULL); } int kvm_cpu_exec(CPUState *cpu) @@ -2216,7 +2210,7 @@ struct kvm_set_guest_debug_data { int err; }; -static void kvm_invoke_set_guest_debug(void *data) +static void kvm_invoke_set_guest_debug(CPUState *unused_cpu, void *data) { struct kvm_set_guest_debug_data *dbg_data = data; @@ -2234,7 +2228,6 @@ int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; } kvm_arch_update_guest_debug(cpu, &data.dbg); - data.cpu = cpu; run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data); return data.err; diff --git a/linux-user/main.c b/linux-user/main.c index 8daebe0..c8f8573 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -107,21 +107,11 @@ int cpu_get_pic_interrupt(CPUX86State *env) /***********************************************************/ /* Helper routines for implementing atomic operations. */ -/* To implement exclusive operations we force all cpus to syncronise. - We don't require a full sync, only that no cpus are executing guest code. - The alternative is to map target atomic ops onto host equivalents, - which requires quite a lot of per host/target work. */ -static pthread_mutex_t cpu_list_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_mutex_t exclusive_lock = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t exclusive_cond = PTHREAD_COND_INITIALIZER; -static pthread_cond_t exclusive_resume = PTHREAD_COND_INITIALIZER; -static int pending_cpus; - /* Make sure everything is in a consistent state for calling fork(). */ void fork_start(void) { + cpu_list_lock(); qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); - pthread_mutex_lock(&exclusive_lock); mmap_fork_start(); } @@ -137,93 +127,15 @@ void fork_end(int child) QTAILQ_REMOVE(&cpus, cpu, node); } } - pending_cpus = 0; - pthread_mutex_init(&exclusive_lock, NULL); - pthread_mutex_init(&cpu_list_mutex, NULL); - pthread_cond_init(&exclusive_cond, NULL); - pthread_cond_init(&exclusive_resume, NULL); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); + qemu_init_cpu_list(); gdbserver_fork(thread_cpu); } else { - pthread_mutex_unlock(&exclusive_lock); qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); + cpu_list_unlock(); } } -/* Wait for pending exclusive operations to complete. The exclusive lock - must be held. */ -static inline void exclusive_idle(void) -{ - while (pending_cpus) { - pthread_cond_wait(&exclusive_resume, &exclusive_lock); - } -} - -/* Start an exclusive operation. - Must only be called from outside cpu_exec. */ -static inline void start_exclusive(void) -{ - CPUState *other_cpu; - - pthread_mutex_lock(&exclusive_lock); - exclusive_idle(); - - pending_cpus = 1; - /* Make all other cpus stop executing. */ - CPU_FOREACH(other_cpu) { - if (other_cpu->running) { - pending_cpus++; - cpu_exit(other_cpu); - } - } - if (pending_cpus > 1) { - pthread_cond_wait(&exclusive_cond, &exclusive_lock); - } -} - -/* Finish an exclusive operation. */ -static inline void __attribute__((unused)) end_exclusive(void) -{ - pending_cpus = 0; - pthread_cond_broadcast(&exclusive_resume); - pthread_mutex_unlock(&exclusive_lock); -} - -/* Wait for exclusive ops to finish, and begin cpu execution. */ -static inline void cpu_exec_start(CPUState *cpu) -{ - pthread_mutex_lock(&exclusive_lock); - exclusive_idle(); - cpu->running = true; - pthread_mutex_unlock(&exclusive_lock); -} - -/* Mark cpu as not executing, and release pending exclusive ops. */ -static inline void cpu_exec_end(CPUState *cpu) -{ - pthread_mutex_lock(&exclusive_lock); - cpu->running = false; - if (pending_cpus > 1) { - pending_cpus--; - if (pending_cpus == 1) { - pthread_cond_signal(&exclusive_cond); - } - } - exclusive_idle(); - pthread_mutex_unlock(&exclusive_lock); -} - -void cpu_list_lock(void) -{ - pthread_mutex_lock(&cpu_list_mutex); -} - -void cpu_list_unlock(void) -{ - pthread_mutex_unlock(&cpu_list_mutex); -} - - #ifdef TARGET_I386 /***********************************************************/ /* CPUX86 core interface */ @@ -296,6 +208,8 @@ void cpu_loop(CPUX86State *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case 0x80: /* linux syscall from int $0x80 */ @@ -737,6 +651,8 @@ void cpu_loop(CPUARMState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case EXCP_UDEF: { @@ -1073,6 +989,7 @@ void cpu_loop(CPUARMState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); switch (trapnr) { case EXCP_SWI: @@ -1161,6 +1078,8 @@ void cpu_loop(CPUUniCore32State *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case UC32_EXCP_PRIV: { @@ -1366,6 +1285,7 @@ void cpu_loop (CPUSPARCState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); /* Compute PSR before exposing state. */ if (env->cc_op != CC_OP_FLAGS) { @@ -1638,6 +1558,8 @@ void cpu_loop(CPUPPCState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case POWERPC_EXCP_NONE: /* Just go on */ @@ -2484,6 +2406,8 @@ void cpu_loop(CPUMIPSState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case EXCP_SYSCALL: env->active_tc.PC += 4; @@ -2724,6 +2648,7 @@ void cpu_loop(CPUOpenRISCState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); gdbsig = 0; switch (trapnr) { @@ -2818,6 +2743,7 @@ void cpu_loop(CPUSH4State *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); switch (trapnr) { case 0x160: @@ -2884,6 +2810,8 @@ void cpu_loop(CPUCRISState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case 0xaa: { @@ -2949,6 +2877,8 @@ void cpu_loop(CPUMBState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case 0xaa: { @@ -3066,6 +2996,8 @@ void cpu_loop(CPUM68KState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case EXCP_ILLEGAL: { @@ -3209,6 +3141,7 @@ void cpu_loop(CPUAlphaState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); /* All of the traps imply a transition through PALcode, which implies an REI instruction has been executed. Which means @@ -3401,6 +3334,8 @@ void cpu_loop(CPUS390XState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case EXCP_INTERRUPT: /* Just indicate that signals should be handled asap. */ @@ -3710,6 +3645,8 @@ void cpu_loop(CPUTLGState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case TILEGX_EXCP_SYSCALL: { @@ -3769,6 +3706,16 @@ void cpu_loop(CPUTLGState *env) THREAD CPUState *thread_cpu; +bool qemu_cpu_is_self(CPUState *cpu) +{ + return thread_cpu == cpu; +} + +void qemu_cpu_kick(CPUState *cpu) +{ + cpu_exit(cpu); +} + void task_settid(TaskState *ts) { if (ts->ts_tid == 0) { @@ -4211,6 +4158,7 @@ int main(int argc, char **argv, char **envp) int ret; int execfd; + qemu_init_cpu_list(); module_call_init(MODULE_INIT_QOM); if ((envlist = envlist_create()) == NULL) { @@ -158,14 +158,10 @@ static bool memory_listener_match(MemoryListener *listener, /* No need to ref/unref .mr, the FlatRange keeps it alive. */ #define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \ - MEMORY_LISTENER_CALL(callback, dir, (&(MemoryRegionSection) { \ - .mr = (fr)->mr, \ - .address_space = (as), \ - .offset_within_region = (fr)->offset_in_region, \ - .size = (fr)->addr.size, \ - .offset_within_address_space = int128_get64((fr)->addr.start), \ - .readonly = (fr)->readonly, \ - }), ##_args) + do { \ + MemoryRegionSection mrs = section_from_flat_range(fr, as); \ + MEMORY_LISTENER_CALL(callback, dir, &mrs, ##_args); \ + } while(0) struct CoalescedMemoryRange { AddrRange addr; @@ -245,6 +241,19 @@ typedef struct AddressSpaceOps AddressSpaceOps; #define FOR_EACH_FLAT_RANGE(var, view) \ for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) +static inline MemoryRegionSection +section_from_flat_range(FlatRange *fr, AddressSpace *as) +{ + return (MemoryRegionSection) { + .mr = fr->mr, + .address_space = as, + .offset_within_region = fr->offset_in_region, + .size = fr->addr.size, + .offset_within_address_space = int128_get64(fr->addr.start), + .readonly = fr->readonly, + }; +} + static bool flatrange_equal(FlatRange *a, FlatRange *b) { return a->mr == b->mr @@ -1413,7 +1422,8 @@ void memory_region_init_iommu(MemoryRegion *mr, memory_region_init(mr, owner, name, size); mr->iommu_ops = ops, mr->terminates = true; /* then re-forwards */ - notifier_list_init(&mr->iommu_notify); + QLIST_INIT(&mr->iommu_notify); + mr->iommu_notify_flags = IOMMU_NOTIFIER_NONE; } static void memory_region_finalize(Object *obj) @@ -1508,13 +1518,31 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client) return memory_region_get_dirty_log_mask(mr) & (1 << client); } -void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) +static void memory_region_update_iommu_notify_flags(MemoryRegion *mr) { - if (mr->iommu_ops->notify_started && - QLIST_EMPTY(&mr->iommu_notify.notifiers)) { - mr->iommu_ops->notify_started(mr); + IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE; + IOMMUNotifier *iommu_notifier; + + QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + flags |= iommu_notifier->notifier_flags; + } + + if (flags != mr->iommu_notify_flags && + mr->iommu_ops->notify_flag_changed) { + mr->iommu_ops->notify_flag_changed(mr, mr->iommu_notify_flags, + flags); } - notifier_list_add(&mr->iommu_notify, n); + + mr->iommu_notify_flags = flags; +} + +void memory_region_register_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n) +{ + /* We need to register for at least one bitfield */ + assert(n->notifier_flags != IOMMU_NOTIFIER_NONE); + QLIST_INSERT_HEAD(&mr->iommu_notify, n, node); + memory_region_update_iommu_notify_flags(mr); } uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) @@ -1526,7 +1554,8 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) return TARGET_PAGE_SIZE; } -void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) +void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, + bool is_write) { hwaddr addr, granularity; IOMMUTLBEntry iotlb; @@ -1547,20 +1576,32 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) } } -void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n) +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n) { - notifier_remove(n); - if (mr->iommu_ops->notify_stopped && - QLIST_EMPTY(&mr->iommu_notify.notifiers)) { - mr->iommu_ops->notify_stopped(mr); - } + QLIST_REMOVE(n, node); + memory_region_update_iommu_notify_flags(mr); } void memory_region_notify_iommu(MemoryRegion *mr, IOMMUTLBEntry entry) { + IOMMUNotifier *iommu_notifier; + IOMMUNotifierFlag request_flags; + assert(memory_region_is_iommu(mr)); - notifier_list_notify(&mr->iommu_notify, &entry); + + if (entry.perm & IOMMU_RW) { + request_flags = IOMMU_NOTIFIER_MAP; + } else { + request_flags = IOMMU_NOTIFIER_UNMAP; + } + + QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + if (iommu_notifier->notifier_flags & request_flags) { + iommu_notifier->notify(iommu_notifier, &entry); + } + } } void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) @@ -2124,16 +2165,27 @@ bool memory_region_present(MemoryRegion *container, hwaddr addr) return mr && mr != container; } -void address_space_sync_dirty_bitmap(AddressSpace *as) +void memory_global_dirty_log_sync(void) { + MemoryListener *listener; + AddressSpace *as; FlatView *view; FlatRange *fr; - view = address_space_get_flatview(as); - FOR_EACH_FLAT_RANGE(fr, view) { - MEMORY_LISTENER_UPDATE_REGION(fr, as, Forward, log_sync); + QTAILQ_FOREACH(listener, &memory_listeners, link) { + if (!listener->log_sync) { + continue; + } + /* Global listeners are being phased out. */ + assert(listener->address_space_filter); + as = listener->address_space_filter; + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + MemoryRegionSection mrs = section_from_flat_range(fr, as); + listener->log_sync(listener, &mrs); + } + flatview_unref(view); } - flatview_unref(view); } void memory_global_dirty_log_start(void) diff --git a/migration/ram.c b/migration/ram.c index a6e1c63..c8ec9f2 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -626,7 +626,7 @@ static void migration_bitmap_sync(void) } trace_migration_bitmap_sync_start(); - address_space_sync_dirty_bitmap(&address_space_memory); + memory_global_dirty_log_sync(); qemu_mutex_lock(&migration_bitmap_mutex); rcu_read_lock(); diff --git a/replay/Makefile.objs b/replay/Makefile.objs index fcb3f74..c8ad3eb 100644 --- a/replay/Makefile.objs +++ b/replay/Makefile.objs @@ -4,3 +4,4 @@ common-obj-y += replay-events.o common-obj-y += replay-time.o common-obj-y += replay-input.o common-obj-y += replay-char.o +common-obj-y += replay-snapshot.o diff --git a/replay/replay-events.c b/replay/replay-events.c index 3807245..c513913 100644 --- a/replay/replay-events.c +++ b/replay/replay-events.c @@ -279,7 +279,7 @@ static Event *replay_read_event(int checkpoint) /* Called with replay mutex locked */ void replay_read_events(int checkpoint) { - while (replay_data_kind == EVENT_ASYNC) { + while (replay_state.data_kind == EVENT_ASYNC) { Event *event = replay_read_event(checkpoint); if (!event) { break; @@ -309,3 +309,11 @@ bool replay_events_enabled(void) { return events_enabled; } + +uint64_t blkreplay_next_id(void) +{ + if (replay_events_enabled()) { + return replay_state.block_request_id++; + } + return 0; +} diff --git a/replay/replay-internal.c b/replay/replay-internal.c index 5835e8d..bea7b4a 100644 --- a/replay/replay-internal.c +++ b/replay/replay-internal.c @@ -16,11 +16,8 @@ #include "qemu/error-report.h" #include "sysemu/sysemu.h" -unsigned int replay_data_kind = -1; -static unsigned int replay_has_unread_data; - /* Mutex to protect reading and writing events to the log. - replay_data_kind and replay_has_unread_data are also protected + data_kind and has_unread_data are also protected by this mutex. It also protects replay events queue which stores events to be written or read to the log. */ @@ -150,15 +147,16 @@ void replay_check_error(void) void replay_fetch_data_kind(void) { if (replay_file) { - if (!replay_has_unread_data) { - replay_data_kind = replay_get_byte(); - if (replay_data_kind == EVENT_INSTRUCTION) { + if (!replay_state.has_unread_data) { + replay_state.data_kind = replay_get_byte(); + if (replay_state.data_kind == EVENT_INSTRUCTION) { replay_state.instructions_count = replay_get_dword(); } replay_check_error(); - replay_has_unread_data = 1; - if (replay_data_kind >= EVENT_COUNT) { - error_report("Replay: unknown event kind %d", replay_data_kind); + replay_state.has_unread_data = 1; + if (replay_state.data_kind >= EVENT_COUNT) { + error_report("Replay: unknown event kind %d", + replay_state.data_kind); exit(1); } } @@ -167,7 +165,7 @@ void replay_fetch_data_kind(void) void replay_finish_event(void) { - replay_has_unread_data = 0; + replay_state.has_unread_data = 0; replay_fetch_data_kind(); } diff --git a/replay/replay-internal.h b/replay/replay-internal.h index efbf14c..9117e44 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -62,11 +62,19 @@ typedef struct ReplayState { uint64_t current_step; /*! Number of instructions to be executed before other events happen. */ int instructions_count; + /*! Type of the currently executed event. */ + unsigned int data_kind; + /*! Flag which indicates that event is not processed yet. */ + unsigned int has_unread_data; + /*! Temporary variable for saving current log offset. */ + uint64_t file_offset; + /*! Next block operation id. + This counter is global, because requests from different + block devices should not get overlapping ids. */ + uint64_t block_request_id; } ReplayState; extern ReplayState replay_state; -extern unsigned int replay_data_kind; - /* File for replay writing */ extern FILE *replay_file; @@ -98,7 +106,7 @@ void replay_check_error(void); the next event from the log. */ void replay_finish_event(void); /*! Reads data type from the file and stores it in the - replay_data_kind variable. */ + data_kind variable. */ void replay_fetch_data_kind(void); /*! Saves queued events (like instructions and sound). */ @@ -119,8 +127,6 @@ void replay_read_next_clock(unsigned int kind); void replay_init_events(void); /*! Clears internal data structures for events handling */ void replay_finish_events(void); -/*! Enables storing events in the queue */ -void replay_enable_events(void); /*! Flushes events queue */ void replay_flush_events(void); /*! Clears events list before loading new VM state */ @@ -155,4 +161,11 @@ void replay_event_char_read_save(void *opaque); /*! Reads char event read from the file. */ void *replay_event_char_read_load(void); +/* VMState-related functions */ + +/* Registers replay VMState. + Should be called before virtual devices initialization + to make cached timers available for post_load functions. */ +void replay_vmstate_register(void); + #endif diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c new file mode 100644 index 0000000..4980597 --- /dev/null +++ b/replay/replay-snapshot.c @@ -0,0 +1,61 @@ +/* + * replay-snapshot.c + * + * Copyright (c) 2010-2016 Institute for System Programming + * of the Russian Academy of Sciences. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "sysemu/replay.h" +#include "replay-internal.h" +#include "sysemu/sysemu.h" +#include "monitor/monitor.h" +#include "qapi/qmp/qstring.h" +#include "qemu/error-report.h" +#include "migration/vmstate.h" + +static void replay_pre_save(void *opaque) +{ + ReplayState *state = opaque; + state->file_offset = ftell(replay_file); +} + +static int replay_post_load(void *opaque, int version_id) +{ + ReplayState *state = opaque; + fseek(replay_file, state->file_offset, SEEK_SET); + /* If this was a vmstate, saved in recording mode, + we need to initialize replay data fields. */ + replay_fetch_data_kind(); + + return 0; +} + +static const VMStateDescription vmstate_replay = { + .name = "replay", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = replay_pre_save, + .post_load = replay_post_load, + .fields = (VMStateField[]) { + VMSTATE_INT64_ARRAY(cached_clock, ReplayState, REPLAY_CLOCK_COUNT), + VMSTATE_UINT64(current_step, ReplayState), + VMSTATE_INT32(instructions_count, ReplayState), + VMSTATE_UINT32(data_kind, ReplayState), + VMSTATE_UINT32(has_unread_data, ReplayState), + VMSTATE_UINT64(file_offset, ReplayState), + VMSTATE_UINT64(block_request_id, ReplayState), + VMSTATE_END_OF_LIST() + }, +}; + +void replay_vmstate_register(void) +{ + vmstate_register(NULL, 0, &vmstate_replay, &replay_state); +} diff --git a/replay/replay-time.c b/replay/replay-time.c index fffe072..f70382a 100644 --- a/replay/replay-time.c +++ b/replay/replay-time.c @@ -31,7 +31,7 @@ int64_t replay_save_clock(ReplayClockKind kind, int64_t clock) void replay_read_next_clock(ReplayClockKind kind) { - unsigned int read_kind = replay_data_kind - EVENT_CLOCK; + unsigned int read_kind = replay_state.data_kind - EVENT_CLOCK; assert(read_kind == kind); diff --git a/replay/replay.c b/replay/replay.c index 167fd29..c797aea 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -38,15 +38,15 @@ bool replay_next_event_is(int event) /* nothing to skip - not all instructions used */ if (replay_state.instructions_count != 0) { - assert(replay_data_kind == EVENT_INSTRUCTION); + assert(replay_state.data_kind == EVENT_INSTRUCTION); return event == EVENT_INSTRUCTION; } while (true) { - if (event == replay_data_kind) { + if (event == replay_state.data_kind) { res = true; } - switch (replay_data_kind) { + switch (replay_state.data_kind) { case EVENT_SHUTDOWN: replay_finish_event(); qemu_system_shutdown_request(); @@ -85,7 +85,7 @@ void replay_account_executed_instructions(void) replay_state.instructions_count -= count; replay_state.current_step += count; if (replay_state.instructions_count == 0) { - assert(replay_data_kind == EVENT_INSTRUCTION); + assert(replay_state.data_kind == EVENT_INSTRUCTION); replay_finish_event(); /* Wake up iothread. This is required because timers will not expire until clock counters @@ -188,7 +188,7 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint) if (replay_mode == REPLAY_MODE_PLAY) { if (replay_next_event_is(EVENT_CHECKPOINT + checkpoint)) { replay_finish_event(); - } else if (replay_data_kind != EVENT_ASYNC) { + } else if (replay_state.data_kind != EVENT_ASYNC) { res = false; goto out; } @@ -196,7 +196,7 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint) /* replay_read_events may leave some unread events. Return false if not all of the events associated with checkpoint were processed */ - res = replay_data_kind != EVENT_ASYNC; + res = replay_state.data_kind != EVENT_ASYNC; } else if (replay_mode == REPLAY_MODE_RECORD) { replay_put_event(EVENT_CHECKPOINT + checkpoint); replay_save_events(checkpoint); @@ -237,9 +237,10 @@ static void replay_enable(const char *fname, int mode) replay_filename = g_strdup(fname); replay_mode = mode; - replay_data_kind = -1; + replay_state.data_kind = -1; replay_state.instructions_count = 0; replay_state.current_step = 0; + replay_state.has_unread_data = 0; /* skip file header for RECORD and check it for PLAY */ if (replay_mode == REPLAY_MODE_RECORD) { @@ -291,6 +292,7 @@ void replay_configure(QemuOpts *opts) exit(1); } + replay_vmstate_register(); replay_enable(fname, mode); out: diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index dde3f5f..3afa19a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2407,7 +2407,7 @@ sub process { # we have e.g. CONFIG_LINUX and CONFIG_WIN32 for common cases # where they might be necessary. if ($line =~ m@^.\s*\#\s*if.*\b__@) { - ERROR("architecture specific defines should be avoided\n" . $herecurr); + WARN("architecture specific defines should be avoided\n" . $herecurr); } # Check that the storage class is at the beginning of a declaration diff --git a/stubs/replay.c b/stubs/replay.c index de9fa1e..d9a6da9 100644 --- a/stubs/replay.c +++ b/stubs/replay.c @@ -67,3 +67,8 @@ void replay_char_read_all_save_buf(uint8_t *buf, int offset) void replay_block_event(QEMUBH *bh, uint64_t id) { } + +uint64_t blkreplay_next_id(void) +{ + return 0; +} diff --git a/target-i386/helper.c b/target-i386/helper.c index 1c250b8..9bc961b 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -1113,7 +1113,6 @@ out: typedef struct MCEInjectionParams { Monitor *mon; - X86CPU *cpu; int bank; uint64_t status; uint64_t mcg_status; @@ -1122,14 +1121,14 @@ typedef struct MCEInjectionParams { int flags; } MCEInjectionParams; -static void do_inject_x86_mce(void *data) +static void do_inject_x86_mce(CPUState *cs, void *data) { MCEInjectionParams *params = data; - CPUX86State *cenv = ¶ms->cpu->env; - CPUState *cpu = CPU(params->cpu); + X86CPU *cpu = X86_CPU(cs); + CPUX86State *cenv = &cpu->env; uint64_t *banks = cenv->mce_banks + 4 * params->bank; - cpu_synchronize_state(cpu); + cpu_synchronize_state(cs); /* * If there is an MCE exception being processed, ignore this SRAO MCE @@ -1149,7 +1148,7 @@ static void do_inject_x86_mce(void *data) if ((cenv->mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) { monitor_printf(params->mon, "CPU %d: Uncorrected error reporting disabled\n", - cpu->cpu_index); + cs->cpu_index); return; } @@ -1161,7 +1160,7 @@ static void do_inject_x86_mce(void *data) monitor_printf(params->mon, "CPU %d: Uncorrected error reporting disabled for" " bank %d\n", - cpu->cpu_index, params->bank); + cs->cpu_index, params->bank); return; } @@ -1170,7 +1169,7 @@ static void do_inject_x86_mce(void *data) monitor_printf(params->mon, "CPU %d: Previous MCE still in progress, raising" " triple fault\n", - cpu->cpu_index); + cs->cpu_index); qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); qemu_system_reset_request(); return; @@ -1182,7 +1181,7 @@ static void do_inject_x86_mce(void *data) banks[3] = params->misc; cenv->mcg_status = params->mcg_status; banks[1] = params->status; - cpu_interrupt(cpu, CPU_INTERRUPT_MCE); + cpu_interrupt(cs, CPU_INTERRUPT_MCE); } else if (!(banks[1] & MCI_STATUS_VAL) || !(banks[1] & MCI_STATUS_UC)) { if (banks[1] & MCI_STATUS_VAL) { @@ -1204,7 +1203,6 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank, CPUX86State *cenv = &cpu->env; MCEInjectionParams params = { .mon = mon, - .cpu = cpu, .bank = bank, .status = status, .mcg_status = mcg_status, @@ -1245,7 +1243,6 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank, if (other_cs == cs) { continue; } - params.cpu = X86_CPU(other_cs); run_on_cpu(other_cs, do_inject_x86_mce, ¶ms); } } diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 30b63b7..ee1f53e 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -150,10 +150,8 @@ static int kvm_get_tsc(CPUState *cs) return 0; } -static inline void do_kvm_synchronize_tsc(void *arg) +static inline void do_kvm_synchronize_tsc(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - kvm_get_tsc(cpu); } @@ -163,7 +161,7 @@ void kvm_synchronize_all_tsc(void) if (kvm_enabled()) { CPU_FOREACH(cpu) { - run_on_cpu(cpu, do_kvm_synchronize_tsc, cpu); + run_on_cpu(cpu, do_kvm_synchronize_tsc, NULL); } } } diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c index 2f3c8e2..35ae2ce 100644 --- a/target-s390x/cpu.c +++ b/target-s390x/cpu.c @@ -164,7 +164,7 @@ static void s390_cpu_machine_reset_cb(void *opaque) { S390CPU *cpu = opaque; - run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, CPU(cpu)); + run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, NULL); } #endif @@ -220,7 +220,7 @@ static void s390_cpu_realizefn(DeviceState *dev, Error **errp) s390_cpu_gdb_init(cs); qemu_init_vcpu(cs); #if !defined(CONFIG_USER_ONLY) - run_on_cpu(cs, s390_do_cpu_full_reset, cs); + run_on_cpu(cs, s390_do_cpu_full_reset, NULL); #else cpu_reset(cs); #endif diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h index 5645e06..4fb34b5 100644 --- a/target-s390x/cpu.h +++ b/target-s390x/cpu.h @@ -502,17 +502,14 @@ static inline hwaddr decode_basedisp_s(CPUS390XState *env, uint32_t ipb, #define decode_basedisp_rs decode_basedisp_s /* helper functions for run_on_cpu() */ -static inline void s390_do_cpu_reset(void *arg) +static inline void s390_do_cpu_reset(CPUState *cs, void *arg) { - CPUState *cs = arg; S390CPUClass *scc = S390_CPU_GET_CLASS(cs); scc->cpu_reset(cs); } -static inline void s390_do_cpu_full_reset(void *arg) +static inline void s390_do_cpu_full_reset(CPUState *cs, void *arg) { - CPUState *cs = arg; - cpu_reset(cs); } diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 4b847a3..fd929e8 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -1385,7 +1385,6 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) } typedef struct SigpInfo { - S390CPU *cpu; uint64_t param; int cc; uint64_t *status_reg; @@ -1398,38 +1397,40 @@ static void set_sigp_status(SigpInfo *si, uint64_t status) si->cc = SIGP_CC_STATUS_STORED; } -static void sigp_start(void *arg) +static void sigp_start(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; return; } - s390_cpu_set_state(CPU_STATE_OPERATING, si->cpu); + s390_cpu_set_state(CPU_STATE_OPERATING, cpu); si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_stop(void *arg) +static void sigp_stop(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; struct kvm_s390_irq irq = { .type = KVM_S390_SIGP_STOP, }; - if (s390_cpu_get_state(si->cpu) != CPU_STATE_OPERATING) { + if (s390_cpu_get_state(cpu) != CPU_STATE_OPERATING) { si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; return; } /* disabled wait - sleeping in user space */ - if (CPU(si->cpu)->halted) { - s390_cpu_set_state(CPU_STATE_STOPPED, si->cpu); + if (cs->halted) { + s390_cpu_set_state(CPU_STATE_STOPPED, cpu); } else { /* execute the stop function */ - si->cpu->env.sigp_order = SIGP_STOP; - kvm_s390_vcpu_interrupt(si->cpu, &irq); + cpu->env.sigp_order = SIGP_STOP; + kvm_s390_vcpu_interrupt(cpu, &irq); } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } @@ -1496,56 +1497,58 @@ static int kvm_s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch) return 0; } -static void sigp_stop_and_store_status(void *arg) +static void sigp_stop_and_store_status(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; struct kvm_s390_irq irq = { .type = KVM_S390_SIGP_STOP, }; /* disabled wait - sleeping in user space */ - if (s390_cpu_get_state(si->cpu) == CPU_STATE_OPERATING && - CPU(si->cpu)->halted) { - s390_cpu_set_state(CPU_STATE_STOPPED, si->cpu); + if (s390_cpu_get_state(cpu) == CPU_STATE_OPERATING && cs->halted) { + s390_cpu_set_state(CPU_STATE_STOPPED, cpu); } - switch (s390_cpu_get_state(si->cpu)) { + switch (s390_cpu_get_state(cpu)) { case CPU_STATE_OPERATING: - si->cpu->env.sigp_order = SIGP_STOP_STORE_STATUS; - kvm_s390_vcpu_interrupt(si->cpu, &irq); + cpu->env.sigp_order = SIGP_STOP_STORE_STATUS; + kvm_s390_vcpu_interrupt(cpu, &irq); /* store will be performed when handling the stop intercept */ break; case CPU_STATE_STOPPED: /* already stopped, just store the status */ - cpu_synchronize_state(CPU(si->cpu)); - kvm_s390_store_status(si->cpu, KVM_S390_STORE_STATUS_DEF_ADDR, true); + cpu_synchronize_state(cs); + kvm_s390_store_status(cpu, KVM_S390_STORE_STATUS_DEF_ADDR, true); break; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_store_status_at_address(void *arg) +static void sigp_store_status_at_address(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; uint32_t address = si->param & 0x7ffffe00u; /* cpu has to be stopped */ - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); return; } - cpu_synchronize_state(CPU(si->cpu)); + cpu_synchronize_state(cs); - if (kvm_s390_store_status(si->cpu, address, false)) { + if (kvm_s390_store_status(cpu, address, false)) { set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER); return; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_store_adtl_status(void *arg) +static void sigp_store_adtl_status(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; if (!s390_has_feat(S390_FEAT_VECTOR)) { @@ -1554,7 +1557,7 @@ static void sigp_store_adtl_status(void *arg) } /* cpu has to be stopped */ - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); return; } @@ -1565,31 +1568,32 @@ static void sigp_store_adtl_status(void *arg) return; } - cpu_synchronize_state(CPU(si->cpu)); + cpu_synchronize_state(cs); - if (kvm_s390_store_adtl_status(si->cpu, si->param)) { + if (kvm_s390_store_adtl_status(cpu, si->param)) { set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER); return; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_restart(void *arg) +static void sigp_restart(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; struct kvm_s390_irq irq = { .type = KVM_S390_RESTART, }; - switch (s390_cpu_get_state(si->cpu)) { + switch (s390_cpu_get_state(cpu)) { case CPU_STATE_STOPPED: /* the restart irq has to be delivered prior to any other pending irq */ - cpu_synchronize_state(CPU(si->cpu)); - do_restart_interrupt(&si->cpu->env); - s390_cpu_set_state(CPU_STATE_OPERATING, si->cpu); + cpu_synchronize_state(cs); + do_restart_interrupt(&cpu->env); + s390_cpu_set_state(CPU_STATE_OPERATING, cpu); break; case CPU_STATE_OPERATING: - kvm_s390_vcpu_interrupt(si->cpu, &irq); + kvm_s390_vcpu_interrupt(cpu, &irq); break; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; @@ -1597,20 +1601,18 @@ static void sigp_restart(void *arg) int kvm_s390_cpu_restart(S390CPU *cpu) { - SigpInfo si = { - .cpu = cpu, - }; + SigpInfo si = {}; run_on_cpu(CPU(cpu), sigp_restart, &si); DPRINTF("DONE: KVM cpu restart: %p\n", &cpu->env); return 0; } -static void sigp_initial_cpu_reset(void *arg) +static void sigp_initial_cpu_reset(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); SigpInfo *si = arg; - CPUState *cs = CPU(si->cpu); - S390CPUClass *scc = S390_CPU_GET_CLASS(si->cpu); cpu_synchronize_state(cs); scc->initial_cpu_reset(cs); @@ -1618,11 +1620,11 @@ static void sigp_initial_cpu_reset(void *arg) si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_cpu_reset(void *arg) +static void sigp_cpu_reset(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); SigpInfo *si = arg; - CPUState *cs = CPU(si->cpu); - S390CPUClass *scc = S390_CPU_GET_CLASS(si->cpu); cpu_synchronize_state(cs); scc->cpu_reset(cs); @@ -1630,12 +1632,13 @@ static void sigp_cpu_reset(void *arg) si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_set_prefix(void *arg) +static void sigp_set_prefix(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; uint32_t addr = si->param & 0x7fffe000u; - cpu_synchronize_state(CPU(si->cpu)); + cpu_synchronize_state(cs); if (!address_space_access_valid(&address_space_memory, addr, sizeof(struct LowCore), false)) { @@ -1644,13 +1647,13 @@ static void sigp_set_prefix(void *arg) } /* cpu has to be stopped */ - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); return; } - si->cpu->env.psa = addr; - cpu_synchronize_post_init(CPU(si->cpu)); + cpu->env.psa = addr; + cpu_synchronize_post_init(cs); si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } @@ -1658,7 +1661,6 @@ static int handle_sigp_single_dst(S390CPU *dst_cpu, uint8_t order, uint64_t param, uint64_t *status_reg) { SigpInfo si = { - .cpu = dst_cpu, .param = param, .status_reg = status_reg, }; diff --git a/target-s390x/misc_helper.c b/target-s390x/misc_helper.c index 86da194..4df2ec6 100644 --- a/target-s390x/misc_helper.c +++ b/target-s390x/misc_helper.c @@ -126,7 +126,7 @@ static int modified_clear_reset(S390CPU *cpu) pause_all_vcpus(); cpu_synchronize_all_states(); CPU_FOREACH(t) { - run_on_cpu(t, s390_do_cpu_full_reset, t); + run_on_cpu(t, s390_do_cpu_full_reset, NULL); } s390_cmma_reset(); subsystem_reset(); @@ -145,7 +145,7 @@ static int load_normal_reset(S390CPU *cpu) pause_all_vcpus(); cpu_synchronize_all_states(); CPU_FOREACH(t) { - run_on_cpu(t, s390_do_cpu_reset, t); + run_on_cpu(t, s390_do_cpu_reset, NULL); } s390_cmma_reset(); subsystem_reset(); diff --git a/translate-all.c b/translate-all.c index e9bc90c..8ca393c 100644 --- a/translate-all.c +++ b/translate-all.c @@ -834,12 +834,19 @@ static void page_flush_tb(void) } /* flush all the translation blocks */ -/* XXX: tb_flush is currently not thread safe */ -void tb_flush(CPUState *cpu) +static void do_tb_flush(CPUState *cpu, void *data) { - if (!tcg_enabled()) { - return; + unsigned tb_flush_req = (unsigned) (uintptr_t) data; + + tb_lock(); + + /* If it's already been done on request of another CPU, + * just retry. + */ + if (tcg_ctx.tb_ctx.tb_flush_count != tb_flush_req) { + goto done; } + #if defined(DEBUG_FLUSH) printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n", (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer), @@ -858,7 +865,6 @@ void tb_flush(CPUState *cpu) for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) { atomic_set(&cpu->tb_jmp_cache[i], NULL); } - atomic_mb_set(&cpu->tb_flushed, true); } tcg_ctx.tb_ctx.nb_tbs = 0; @@ -868,7 +874,19 @@ void tb_flush(CPUState *cpu) tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; /* XXX: flush processor icache at this point if cache flush is expensive */ - tcg_ctx.tb_ctx.tb_flush_count++; + atomic_mb_set(&tcg_ctx.tb_ctx.tb_flush_count, + tcg_ctx.tb_ctx.tb_flush_count + 1); + +done: + tb_unlock(); +} + +void tb_flush(CPUState *cpu) +{ + if (tcg_enabled()) { + uintptr_t tb_flush_req = atomic_mb_read(&tcg_ctx.tb_ctx.tb_flush_count); + async_safe_run_on_cpu(cpu, do_tb_flush, (void *) tb_flush_req); + } } #ifdef DEBUG_TB_CHECK @@ -1175,9 +1193,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu, buffer_overflow: /* flush must be done */ tb_flush(cpu); - /* cannot fail at this point */ - tb = tb_alloc(pc); - assert(tb != NULL); + mmap_unlock(); + cpu_loop_exit(cpu); } gen_code_buf = tcg_ctx.code_gen_ptr; @@ -1775,7 +1792,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) qht_statistics_destroy(&hst); cpu_fprintf(f, "\nStatistics:\n"); - cpu_fprintf(f, "TB flush count %d\n", tcg_ctx.tb_ctx.tb_flush_count); + cpu_fprintf(f, "TB flush count %u\n", + atomic_read(&tcg_ctx.tb_ctx.tb_flush_count)); cpu_fprintf(f, "TB invalidate count %d\n", tcg_ctx.tb_ctx.tb_phys_invalidate_count); cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count); @@ -784,6 +784,7 @@ void vm_start(void) if (runstate_is_running()) { qapi_event_send_stop(&error_abort); } else { + replay_enable_events(); cpu_enable_ticks(); runstate_set(RUN_STATE_RUNNING); vm_state_notify(1, RUN_STATE_RUNNING); @@ -3019,6 +3020,7 @@ int main(int argc, char **argv, char **envp) Error *err = NULL; bool list_data_dirs = false; + qemu_init_cpu_list(); qemu_init_cpu_loop(); qemu_mutex_lock_iothread(); |