author     Peter Maydell <peter.maydell@linaro.org>   2022-12-21 14:15:18 +0000
committer  Peter Maydell <peter.maydell@linaro.org>   2022-12-21 14:15:18 +0000
commit     700ce3b1bb52da4acbbf1ad8f6256baaf52c7953 (patch)
tree       0a797fa30cf4e5b8df6c4c0bb0d0ffb7ab9e72f9
parent     6394578984da00564d6a3515940732ff9b83cd10 (diff)
parent     811242654934bd4613634235ef6a8219792ab088 (diff)
Merge tag 'pull-tcg-20221220' of https://gitlab.com/rth7680/qemu into staging
Use interval trees for user-only vma mappings.
Assorted cleanups to page locking.

# gpg: Signature made Wed 21 Dec 2022 05:00:30 GMT
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F

* tag 'pull-tcg-20221220' of https://gitlab.com/rth7680/qemu:
  accel/tcg: Restrict page_collection structure to system TB maintainance
  accel/tcg: Factor tb_invalidate_phys_range_fast() out
  accel/tcg: Rename tb_invalidate_phys_page_fast{,__locked}()
  accel/tcg: Remove trace events from trace-root.h
  accel/tcg: Restrict cpu_io_recompile() to system emulation
  accel/tcg: Move remainder of page locking to tb-maint.c
  accel/tcg: Move PageDesc tree into tb-maint.c for system
  accel/tcg: Use interval tree for user-only page tracking
  accel/tcg: Move page_{get,set}_flags to user-exec.c
  accel/tcg: Drop PAGE_RESERVED for CONFIG_BSD
  accel/tcg: Use interval tree for TARGET_PAGE_DATA_SIZE
  accel/tcg: Use interval tree for TBs in user-only mode
  accel/tcg: Rename page_flush_tb
  util: Add interval-tree.c

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--  accel/tcg/cputlb.c                 7
-rw-r--r--  accel/tcg/internal.h              81
-rw-r--r--  accel/tcg/tb-maint.c             994
-rw-r--r--  accel/tcg/trace-events             4
-rw-r--r--  accel/tcg/translate-all.c        811
-rw-r--r--  accel/tcg/user-exec.c            658
-rw-r--r--  include/exec/exec-all.h           43
-rw-r--r--  include/exec/translate-all.h       6
-rw-r--r--  include/qemu/interval-tree.h      99
-rw-r--r--  tests/tcg/multiarch/test-vma.c    22
-rw-r--r--  tests/unit/meson.build             1
-rw-r--r--  tests/unit/test-interval-tree.c  209
-rw-r--r--  trace-events                       4
-rw-r--r--  util/interval-tree.c             882
-rw-r--r--  util/meson.build                   1
15 files changed, 2661 insertions, 1161 deletions
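
The headline of this pull is the new interval-tree utility (include/qemu/interval-tree.h, util/interval-tree.c, exercised by tests/unit/test-interval-tree.c). A minimal editorial sketch of how the API is used throughout the patch follows; the IntervalTreeNode/IntervalTreeRoot types and interval_tree_* calls are the ones added here, while the TrackedRange container and its payload field are illustrative.

/* Editorial sketch, not part of the patch. */
#include "qemu/osdep.h"
#include "qemu/interval-tree.h"

typedef struct TrackedRange {
    IntervalTreeNode itree;      /* embedded node, holds [start, last] */
    int payload;                 /* illustrative per-range data */
} TrackedRange;

static IntervalTreeRoot root;    /* zero-initialized empty tree */

static void track(TrackedRange *r, uint64_t start, uint64_t last, int v)
{
    r->itree.start = start;      /* bounds are inclusive */
    r->itree.last = last;
    r->payload = v;
    interval_tree_insert(&r->itree, &root);
}

/* Visit every tracked range overlapping [start, last]. */
static void visit(uint64_t start, uint64_t last)
{
    for (IntervalTreeNode *n = interval_tree_iter_first(&root, start, last);
         n != NULL;
         n = interval_tree_iter_next(n, start, last)) {
        TrackedRange *r = container_of(n, TrackedRange, itree);
        (void)r->payload;        /* use the overlapping entry */
    }
}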
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 6f1c006..03674d5 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -33,7 +33,7 @@
#include "qemu/atomic.h"
#include "qemu/atomic128.h"
#include "exec/translate-all.h"
-#include "trace/trace-root.h"
+#include "trace.h"
#include "tb-hash.h"
#include "internal.h"
#ifdef CONFIG_PLUGIN
@@ -1508,10 +1508,7 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
- struct page_collection *pages
- = page_collection_lock(ram_addr, ram_addr + size);
- tb_invalidate_phys_page_fast(pages, ram_addr, size, retaddr);
- page_collection_unlock(pages);
+ tb_invalidate_phys_range_fast(ram_addr, size, retaddr);
}
/*
diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
index cb13bad..6edff16 100644
--- a/accel/tcg/internal.h
+++ b/accel/tcg/internal.h
@@ -23,83 +23,28 @@
#define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
#endif
-typedef struct PageDesc {
- /* list of TBs intersecting this ram page */
- uintptr_t first_tb;
-#ifdef CONFIG_USER_ONLY
- unsigned long flags;
- void *target_data;
-#endif
-#ifdef CONFIG_SOFTMMU
- QemuSpin lock;
+#if defined(CONFIG_SOFTMMU) && defined(CONFIG_DEBUG_TCG)
+void assert_no_pages_locked(void);
+#else
+static inline void assert_no_pages_locked(void) { }
#endif
-} PageDesc;
-
-/* Size of the L2 (and L3, etc) page tables. */
-#define V_L2_BITS 10
-#define V_L2_SIZE (1 << V_L2_BITS)
-
-/*
- * L1 Mapping properties
- */
-extern int v_l1_size;
-extern int v_l1_shift;
-extern int v_l2_levels;
-
-/*
- * The bottom level has pointers to PageDesc, and is indexed by
- * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
- */
-#define V_L1_MIN_BITS 4
-#define V_L1_MAX_BITS (V_L2_BITS + 3)
-#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
-
-extern void *l1_map[V_L1_MAX_SIZE];
-PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc);
-
-static inline PageDesc *page_find(tb_page_addr_t index)
-{
- return page_find_alloc(index, false);
-}
-
-/* list iterators for lists of tagged pointers in TranslationBlock */
-#define TB_FOR_EACH_TAGGED(head, tb, n, field) \
- for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \
- tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
- tb = (TranslationBlock *)((uintptr_t)tb & ~1))
-
-#define PAGE_FOR_EACH_TB(pagedesc, tb, n) \
- TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
-
-#define TB_FOR_EACH_JMP(head_tb, tb, n) \
- TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
-
-/* In user-mode page locks aren't used; mmap_lock is enough */
#ifdef CONFIG_USER_ONLY
-#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
-static inline void page_lock(PageDesc *pd) { }
-static inline void page_unlock(PageDesc *pd) { }
-#else
-#ifdef CONFIG_DEBUG_TCG
-void do_assert_page_locked(const PageDesc *pd, const char *file, int line);
-#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
+static inline void page_table_config_init(void) { }
#else
-#define assert_page_locked(pd)
-#endif
-void page_lock(PageDesc *pd);
-void page_unlock(PageDesc *pd);
-#endif
-#if !defined(CONFIG_USER_ONLY) && defined(CONFIG_DEBUG_TCG)
-void assert_no_pages_locked(void);
-#else
-static inline void assert_no_pages_locked(void) { }
+void page_table_config_init(void);
#endif
+#ifdef CONFIG_SOFTMMU
+void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
+ unsigned size,
+ uintptr_t retaddr);
+G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
+#endif /* CONFIG_SOFTMMU */
+
TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc,
target_ulong cs_base, uint32_t flags,
int cflags);
-G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
void page_init(void);
void tb_htable_init(void);
void tb_reset_jump(TranslationBlock *tb, int n);
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index 0cdb355..1b8e860 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -18,6 +18,7 @@
*/
#include "qemu/osdep.h"
+#include "qemu/interval-tree.h"
#include "exec/cputlb.h"
#include "exec/log.h"
#include "exec/exec-all.h"
@@ -29,6 +30,15 @@
#include "internal.h"
+/* List iterators for lists of tagged pointers in TranslationBlock. */
+#define TB_FOR_EACH_TAGGED(head, tb, n, field) \
+ for (n = (head) & 1, tb = (TranslationBlock *)((head) & ~1); \
+ tb; tb = (TranslationBlock *)tb->field[n], n = (uintptr_t)tb & 1, \
+ tb = (TranslationBlock *)((uintptr_t)tb & ~1))
+
+#define TB_FOR_EACH_JMP(head_tb, tb, n) \
+ TB_FOR_EACH_TAGGED((head_tb)->jmp_list_head, tb, n, jmp_list_next)
+
static bool tb_cmp(const void *ap, const void *bp)
{
const TranslationBlock *a = ap;
@@ -50,8 +60,513 @@ void tb_htable_init(void)
qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
}
+typedef struct PageDesc PageDesc;
+
+#ifdef CONFIG_USER_ONLY
+
+/*
+ * In user-mode page locks aren't used; mmap_lock is enough.
+ */
+#define assert_page_locked(pd) tcg_debug_assert(have_mmap_lock())
+
+static inline void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
+ PageDesc **ret_p2, tb_page_addr_t phys2,
+ bool alloc)
+{
+ *ret_p1 = NULL;
+ *ret_p2 = NULL;
+}
+
+static inline void page_unlock(PageDesc *pd) { }
+static inline void page_lock_tb(const TranslationBlock *tb) { }
+static inline void page_unlock_tb(const TranslationBlock *tb) { }
+
+/*
+ * For user-only, since we are protecting all of memory with a single lock,
+ * and because the two pages of a TranslationBlock are always contiguous,
+ * use a single data structure to record all TranslationBlocks.
+ */
+static IntervalTreeRoot tb_root;
+
+static void tb_remove_all(void)
+{
+ assert_memory_lock();
+ memset(&tb_root, 0, sizeof(tb_root));
+}
+
+/* Call with mmap_lock held. */
+static void tb_record(TranslationBlock *tb, PageDesc *p1, PageDesc *p2)
+{
+ target_ulong addr;
+ int flags;
+
+ assert_memory_lock();
+ tb->itree.last = tb->itree.start + tb->size - 1;
+
+ /* translator_loop() must have made all TB pages non-writable */
+ addr = tb_page_addr0(tb);
+ flags = page_get_flags(addr);
+ assert(!(flags & PAGE_WRITE));
+
+ addr = tb_page_addr1(tb);
+ if (addr != -1) {
+ flags = page_get_flags(addr);
+ assert(!(flags & PAGE_WRITE));
+ }
+
+ interval_tree_insert(&tb->itree, &tb_root);
+}
+
+/* Call with mmap_lock held. */
+static void tb_remove(TranslationBlock *tb)
+{
+ assert_memory_lock();
+ interval_tree_remove(&tb->itree, &tb_root);
+}
+
+/* TODO: For now, still shared with translate-all.c for system mode. */
+#define PAGE_FOR_EACH_TB(start, end, pagedesc, T, N) \
+ for (T = foreach_tb_first(start, end), \
+ N = foreach_tb_next(T, start, end); \
+ T != NULL; \
+ T = N, N = foreach_tb_next(N, start, end))
+
+typedef TranslationBlock *PageForEachNext;
+
+static PageForEachNext foreach_tb_first(tb_page_addr_t start,
+ tb_page_addr_t end)
+{
+ IntervalTreeNode *n = interval_tree_iter_first(&tb_root, start, end - 1);
+ return n ? container_of(n, TranslationBlock, itree) : NULL;
+}
+
+static PageForEachNext foreach_tb_next(PageForEachNext tb,
+ tb_page_addr_t start,
+ tb_page_addr_t end)
+{
+ IntervalTreeNode *n;
+
+ if (tb) {
+ n = interval_tree_iter_next(&tb->itree, start, end - 1);
+ if (n) {
+ return container_of(n, TranslationBlock, itree);
+ }
+ }
+ return NULL;
+}
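
A minimal editorial sketch of how a caller walks the new tree with this iterator, mirroring tb_invalidate_phys_range() later in this file. Because the N lookahead is computed before the loop body runs, the body may remove the current TB from the tree; the literal `unused` argument is how callers in this patch fill the pagedesc slot, which the user-only macro never expands.

/* Editorial sketch, not part of the patch: user-only iteration. */
static void for_each_tb_in_range(tb_page_addr_t start, tb_page_addr_t end)
{
    TranslationBlock *tb;
    PageForEachNext n;

    assert_memory_lock();           /* tb_root is protected by mmap_lock */
    PAGE_FOR_EACH_TB(start, end, unused, tb, n) {
        /* tb intersects [start, end); e.g. tb_phys_invalidate__locked(tb) */
    }
}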
+
+#else
+/*
+ * In system mode we want L1_MAP to be based on ram offsets.
+ */
+#if HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS
+# define L1_MAP_ADDR_SPACE_BITS HOST_LONG_BITS
+#else
+# define L1_MAP_ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
+#endif
+
+/* Size of the L2 (and L3, etc) page tables. */
+#define V_L2_BITS 10
+#define V_L2_SIZE (1 << V_L2_BITS)
+
+/*
+ * L1 Mapping properties
+ */
+static int v_l1_size;
+static int v_l1_shift;
+static int v_l2_levels;
+
+/*
+ * The bottom level has pointers to PageDesc, and is indexed by
+ * anything from 4 to (V_L2_BITS + 3) bits, depending on target page size.
+ */
+#define V_L1_MIN_BITS 4
+#define V_L1_MAX_BITS (V_L2_BITS + 3)
+#define V_L1_MAX_SIZE (1 << V_L1_MAX_BITS)
+
+static void *l1_map[V_L1_MAX_SIZE];
+
+struct PageDesc {
+ QemuSpin lock;
+ /* list of TBs intersecting this ram page */
+ uintptr_t first_tb;
+};
+
+void page_table_config_init(void)
+{
+ uint32_t v_l1_bits;
+
+ assert(TARGET_PAGE_BITS);
+ /* The bits remaining after N lower levels of page tables. */
+ v_l1_bits = (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS;
+ if (v_l1_bits < V_L1_MIN_BITS) {
+ v_l1_bits += V_L2_BITS;
+ }
+
+ v_l1_size = 1 << v_l1_bits;
+ v_l1_shift = L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - v_l1_bits;
+ v_l2_levels = v_l1_shift / V_L2_BITS - 1;
+
+ assert(v_l1_bits <= V_L1_MAX_BITS);
+ assert(v_l1_shift % V_L2_BITS == 0);
+ assert(v_l2_levels >= 0);
+}
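
A worked editorial example of the geometry this computes, assuming L1_MAP_ADDR_SPACE_BITS = 36 and TARGET_PAGE_BITS = 12 (both values illustrative; the real ones depend on host and target):

/*
 * Editorial example, not part of the patch:
 *   bits to map  = 36 - 12        = 24
 *   v_l1_bits    = 24 % 10        = 4    (not below V_L1_MIN_BITS, so kept)
 *   v_l1_size    = 1 << 4         = 16 entries in l1_map
 *   v_l1_shift   = 36 - 12 - 4    = 20
 *   v_l2_levels  = 20 / 10 - 1    = 1 intermediate level
 * A page index therefore splits 4 + 10 + 10: a 16-entry L1 table, one
 * 1024-entry intermediate table, then a 1024-entry leaf array of PageDesc,
 * which is exactly how page_find_alloc() below walks the bits.
 */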
+
+static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
+{
+ PageDesc *pd;
+ void **lp;
+ int i;
+
+ /* Level 1. Always allocated. */
+ lp = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));
+
+ /* Level 2..N-1. */
+ for (i = v_l2_levels; i > 0; i--) {
+ void **p = qatomic_rcu_read(lp);
+
+ if (p == NULL) {
+ void *existing;
+
+ if (!alloc) {
+ return NULL;
+ }
+ p = g_new0(void *, V_L2_SIZE);
+ existing = qatomic_cmpxchg(lp, NULL, p);
+ if (unlikely(existing)) {
+ g_free(p);
+ p = existing;
+ }
+ }
+
+ lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
+ }
+
+ pd = qatomic_rcu_read(lp);
+ if (pd == NULL) {
+ void *existing;
+
+ if (!alloc) {
+ return NULL;
+ }
+
+ pd = g_new0(PageDesc, V_L2_SIZE);
+ for (int i = 0; i < V_L2_SIZE; i++) {
+ qemu_spin_init(&pd[i].lock);
+ }
+
+ existing = qatomic_cmpxchg(lp, NULL, pd);
+ if (unlikely(existing)) {
+ for (int i = 0; i < V_L2_SIZE; i++) {
+ qemu_spin_destroy(&pd[i].lock);
+ }
+ g_free(pd);
+ pd = existing;
+ }
+ }
+
+ return pd + (index & (V_L2_SIZE - 1));
+}
+
+static inline PageDesc *page_find(tb_page_addr_t index)
+{
+ return page_find_alloc(index, false);
+}
+
+/**
+ * struct page_entry - page descriptor entry
+ * @pd: pointer to the &struct PageDesc of the page this entry represents
+ * @index: page index of the page
+ * @locked: whether the page is locked
+ *
+ * This struct helps us keep track of the locked state of a page, without
+ * bloating &struct PageDesc.
+ *
+ * A page lock protects accesses to all fields of &struct PageDesc.
+ *
+ * See also: &struct page_collection.
+ */
+struct page_entry {
+ PageDesc *pd;
+ tb_page_addr_t index;
+ bool locked;
+};
+
+/**
+ * struct page_collection - tracks a set of pages (i.e. &struct page_entry's)
+ * @tree: Binary search tree (BST) of the pages, with key == page index
+ * @max: Pointer to the page in @tree with the highest page index
+ *
+ * To avoid deadlock we lock pages in ascending order of page index.
+ * When operating on a set of pages, we need to keep track of them so that
+ * we can lock them in order and also unlock them later. For this we collect
+ * pages (i.e. &struct page_entry's) in a binary search @tree. Given that the
+ * @tree implementation we use does not provide an O(1) operation to obtain the
+ * highest-ranked element, we use @max to keep track of the inserted page
+ * with the highest index. This is valuable because if a page is not in
+ * the tree and its index is higher than @max's, then we can lock it
+ * without breaking the locking order rule.
+ *
+ * Note on naming: 'struct page_set' would be shorter, but we already have a few
+ * page_set_*() helpers, so page_collection is used instead to avoid confusion.
+ *
+ * See also: page_collection_lock().
+ */
+struct page_collection {
+ GTree *tree;
+ struct page_entry *max;
+};
+
+typedef int PageForEachNext;
+#define PAGE_FOR_EACH_TB(start, end, pagedesc, tb, n) \
+ TB_FOR_EACH_TAGGED((pagedesc)->first_tb, tb, n, page_next)
+
+#ifdef CONFIG_DEBUG_TCG
+
+static __thread GHashTable *ht_pages_locked_debug;
+
+static void ht_pages_locked_debug_init(void)
+{
+ if (ht_pages_locked_debug) {
+ return;
+ }
+ ht_pages_locked_debug = g_hash_table_new(NULL, NULL);
+}
+
+static bool page_is_locked(const PageDesc *pd)
+{
+ PageDesc *found;
+
+ ht_pages_locked_debug_init();
+ found = g_hash_table_lookup(ht_pages_locked_debug, pd);
+ return !!found;
+}
+
+static void page_lock__debug(PageDesc *pd)
+{
+ ht_pages_locked_debug_init();
+ g_assert(!page_is_locked(pd));
+ g_hash_table_insert(ht_pages_locked_debug, pd, pd);
+}
+
+static void page_unlock__debug(const PageDesc *pd)
+{
+ bool removed;
+
+ ht_pages_locked_debug_init();
+ g_assert(page_is_locked(pd));
+ removed = g_hash_table_remove(ht_pages_locked_debug, pd);
+ g_assert(removed);
+}
+
+static void do_assert_page_locked(const PageDesc *pd,
+ const char *file, int line)
+{
+ if (unlikely(!page_is_locked(pd))) {
+ error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
+ pd, file, line);
+ abort();
+ }
+}
+#define assert_page_locked(pd) do_assert_page_locked(pd, __FILE__, __LINE__)
+
+void assert_no_pages_locked(void)
+{
+ ht_pages_locked_debug_init();
+ g_assert(g_hash_table_size(ht_pages_locked_debug) == 0);
+}
+
+#else /* !CONFIG_DEBUG_TCG */
+
+static inline void page_lock__debug(const PageDesc *pd) { }
+static inline void page_unlock__debug(const PageDesc *pd) { }
+static inline void assert_page_locked(const PageDesc *pd) { }
+
+#endif /* CONFIG_DEBUG_TCG */
+
+static void page_lock(PageDesc *pd)
+{
+ page_lock__debug(pd);
+ qemu_spin_lock(&pd->lock);
+}
+
+static void page_unlock(PageDesc *pd)
+{
+ qemu_spin_unlock(&pd->lock);
+ page_unlock__debug(pd);
+}
+
+static inline struct page_entry *
+page_entry_new(PageDesc *pd, tb_page_addr_t index)
+{
+ struct page_entry *pe = g_malloc(sizeof(*pe));
+
+ pe->index = index;
+ pe->pd = pd;
+ pe->locked = false;
+ return pe;
+}
+
+static void page_entry_destroy(gpointer p)
+{
+ struct page_entry *pe = p;
+
+ g_assert(pe->locked);
+ page_unlock(pe->pd);
+ g_free(pe);
+}
+
+/* returns false on success */
+static bool page_entry_trylock(struct page_entry *pe)
+{
+ bool busy;
+
+ busy = qemu_spin_trylock(&pe->pd->lock);
+ if (!busy) {
+ g_assert(!pe->locked);
+ pe->locked = true;
+ page_lock__debug(pe->pd);
+ }
+ return busy;
+}
+
+static void do_page_entry_lock(struct page_entry *pe)
+{
+ page_lock(pe->pd);
+ g_assert(!pe->locked);
+ pe->locked = true;
+}
+
+static gboolean page_entry_lock(gpointer key, gpointer value, gpointer data)
+{
+ struct page_entry *pe = value;
+
+ do_page_entry_lock(pe);
+ return FALSE;
+}
+
+static gboolean page_entry_unlock(gpointer key, gpointer value, gpointer data)
+{
+ struct page_entry *pe = value;
+
+ if (pe->locked) {
+ pe->locked = false;
+ page_unlock(pe->pd);
+ }
+ return FALSE;
+}
+
+/*
+ * Trylock a page, and if successful, add the page to a collection.
+ * Returns true ("busy") if the page could not be locked; false otherwise.
+ */
+static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
+{
+ tb_page_addr_t index = addr >> TARGET_PAGE_BITS;
+ struct page_entry *pe;
+ PageDesc *pd;
+
+ pe = g_tree_lookup(set->tree, &index);
+ if (pe) {
+ return false;
+ }
+
+ pd = page_find(index);
+ if (pd == NULL) {
+ return false;
+ }
+
+ pe = page_entry_new(pd, index);
+ g_tree_insert(set->tree, &pe->index, pe);
+
+ /*
+ * If this is either (1) the first insertion or (2) a page whose index
+ * is higher than any other so far, just lock the page and move on.
+ */
+ if (set->max == NULL || pe->index > set->max->index) {
+ set->max = pe;
+ do_page_entry_lock(pe);
+ return false;
+ }
+ /*
+ * Try to acquire out-of-order lock; if busy, return busy so that we acquire
+ * locks in order.
+ */
+ return page_entry_trylock(pe);
+}
+
+static gint tb_page_addr_cmp(gconstpointer ap, gconstpointer bp, gpointer udata)
+{
+ tb_page_addr_t a = *(const tb_page_addr_t *)ap;
+ tb_page_addr_t b = *(const tb_page_addr_t *)bp;
+
+ if (a == b) {
+ return 0;
+ } else if (a < b) {
+ return -1;
+ }
+ return 1;
+}
+
+/*
+ * Lock a range of pages ([@start,@end[) as well as the pages of all
+ * intersecting TBs.
+ * Locking order: acquire locks in ascending order of page index.
+ */
+static struct page_collection *page_collection_lock(tb_page_addr_t start,
+ tb_page_addr_t end)
+{
+ struct page_collection *set = g_malloc(sizeof(*set));
+ tb_page_addr_t index;
+ PageDesc *pd;
+
+ start >>= TARGET_PAGE_BITS;
+ end >>= TARGET_PAGE_BITS;
+ g_assert(start <= end);
+
+ set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
+ page_entry_destroy);
+ set->max = NULL;
+ assert_no_pages_locked();
+
+ retry:
+ g_tree_foreach(set->tree, page_entry_lock, NULL);
+
+ for (index = start; index <= end; index++) {
+ TranslationBlock *tb;
+ PageForEachNext n;
+
+ pd = page_find(index);
+ if (pd == NULL) {
+ continue;
+ }
+ if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
+ g_tree_foreach(set->tree, page_entry_unlock, NULL);
+ goto retry;
+ }
+ assert_page_locked(pd);
+ PAGE_FOR_EACH_TB(unused, unused, pd, tb, n) {
+ if (page_trylock_add(set, tb_page_addr0(tb)) ||
+ (tb_page_addr1(tb) != -1 &&
+ page_trylock_add(set, tb_page_addr1(tb)))) {
+ /* drop all locks, and reacquire in order */
+ g_tree_foreach(set->tree, page_entry_unlock, NULL);
+ goto retry;
+ }
+ }
+ }
+ return set;
+}
+
+static void page_collection_unlock(struct page_collection *set)
+{
+ /* entries are unlocked and freed via page_entry_destroy */
+ g_tree_destroy(set->tree);
+ g_free(set);
+}
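
A minimal editorial sketch of the caller protocol for the collection above, as used by tb_invalidate_phys_range_fast() at the end of this file; the function name and the elided work are illustrative:

/* Editorial sketch, not part of the patch. */
static void with_locked_range(tb_page_addr_t start, tb_page_addr_t end)
{
    struct page_collection *pages = page_collection_lock(start, end);

    /* ... invalidate TBs in [start, end) with every needed page locked ... */

    page_collection_unlock(pages);  /* unlocks and frees via page_entry_destroy */
}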
+
/* Set to NULL all the 'first_tb' fields in all PageDescs. */
-static void page_flush_tb_1(int level, void **lp)
+static void tb_remove_all_1(int level, void **lp)
{
int i;
@@ -70,20 +585,144 @@ static void page_flush_tb_1(int level, void **lp)
void **pp = *lp;
for (i = 0; i < V_L2_SIZE; ++i) {
- page_flush_tb_1(level - 1, pp + i);
+ tb_remove_all_1(level - 1, pp + i);
}
}
}
-static void page_flush_tb(void)
+static void tb_remove_all(void)
{
int i, l1_sz = v_l1_size;
for (i = 0; i < l1_sz; i++) {
- page_flush_tb_1(v_l2_levels, l1_map + i);
+ tb_remove_all_1(v_l2_levels, l1_map + i);
+ }
+}
+
+/*
+ * Add the tb in the target page and protect it if necessary.
+ * Called with @p->lock held.
+ */
+static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
+ unsigned int n)
+{
+ bool page_already_protected;
+
+ assert_page_locked(p);
+
+ tb->page_next[n] = p->first_tb;
+ page_already_protected = p->first_tb != 0;
+ p->first_tb = (uintptr_t)tb | n;
+
+ /*
+ * If some code is already present, then the pages are already
+ * protected. So we handle the case where only the first TB is
+ * allocated in a physical page.
+ */
+ if (!page_already_protected) {
+ tlb_protect_code(tb->page_addr[n] & TARGET_PAGE_MASK);
+ }
+}
+
+static void tb_record(TranslationBlock *tb, PageDesc *p1, PageDesc *p2)
+{
+ tb_page_add(p1, tb, 0);
+ if (unlikely(p2)) {
+ tb_page_add(p2, tb, 1);
+ }
+}
+
+static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
+{
+ TranslationBlock *tb1;
+ uintptr_t *pprev;
+ PageForEachNext n1;
+
+ assert_page_locked(pd);
+ pprev = &pd->first_tb;
+ PAGE_FOR_EACH_TB(unused, unused, pd, tb1, n1) {
+ if (tb1 == tb) {
+ *pprev = tb1->page_next[n1];
+ return;
+ }
+ pprev = &tb1->page_next[n1];
}
+ g_assert_not_reached();
}
+static void tb_remove(TranslationBlock *tb)
+{
+ PageDesc *pd;
+
+ pd = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
+ tb_page_remove(pd, tb);
+ if (unlikely(tb->page_addr[1] != -1)) {
+ pd = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
+ tb_page_remove(pd, tb);
+ }
+}
+
+static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
+ PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
+{
+ PageDesc *p1, *p2;
+ tb_page_addr_t page1;
+ tb_page_addr_t page2;
+
+ assert_memory_lock();
+ g_assert(phys1 != -1);
+
+ page1 = phys1 >> TARGET_PAGE_BITS;
+ page2 = phys2 >> TARGET_PAGE_BITS;
+
+ p1 = page_find_alloc(page1, alloc);
+ if (ret_p1) {
+ *ret_p1 = p1;
+ }
+ if (likely(phys2 == -1)) {
+ page_lock(p1);
+ return;
+ } else if (page1 == page2) {
+ page_lock(p1);
+ if (ret_p2) {
+ *ret_p2 = p1;
+ }
+ return;
+ }
+ p2 = page_find_alloc(page2, alloc);
+ if (ret_p2) {
+ *ret_p2 = p2;
+ }
+ if (page1 < page2) {
+ page_lock(p1);
+ page_lock(p2);
+ } else {
+ page_lock(p2);
+ page_lock(p1);
+ }
+}
+
+/* lock the page(s) of a TB in the correct acquisition order */
+static void page_lock_tb(const TranslationBlock *tb)
+{
+ page_lock_pair(NULL, tb_page_addr0(tb), NULL, tb_page_addr1(tb), false);
+}
+
+static void page_unlock_tb(const TranslationBlock *tb)
+{
+ PageDesc *p1 = page_find(tb_page_addr0(tb) >> TARGET_PAGE_BITS);
+
+ page_unlock(p1);
+ if (unlikely(tb_page_addr1(tb) != -1)) {
+ PageDesc *p2 = page_find(tb_page_addr1(tb) >> TARGET_PAGE_BITS);
+
+ if (p2 != p1) {
+ page_unlock(p2);
+ }
+ }
+}
+#endif /* CONFIG_USER_ONLY */
+
/* flush all the translation blocks */
static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
{
@@ -101,7 +740,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
}
qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
- page_flush_tb();
+ tb_remove_all();
tcg_region_reset_all();
/* XXX: flush processor icache at this point if cache flush is expensive */
@@ -128,28 +767,6 @@ void tb_flush(CPUState *cpu)
}
}
-/*
- * user-mode: call with mmap_lock held
- * !user-mode: call with @pd->lock held
- */
-static inline void tb_page_remove(PageDesc *pd, TranslationBlock *tb)
-{
- TranslationBlock *tb1;
- uintptr_t *pprev;
- unsigned int n1;
-
- assert_page_locked(pd);
- pprev = &pd->first_tb;
- PAGE_FOR_EACH_TB(pd, tb1, n1) {
- if (tb1 == tb) {
- *pprev = tb1->page_next[n1];
- return;
- }
- pprev = &tb1->page_next[n1];
- }
- g_assert_not_reached();
-}
-
/* remove @orig from its @n_orig-th jump list */
static inline void tb_remove_from_jmp_list(TranslationBlock *orig, int n_orig)
{
@@ -255,7 +872,6 @@ static void tb_jmp_cache_inval_tb(TranslationBlock *tb)
*/
static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
{
- PageDesc *p;
uint32_t h;
tb_page_addr_t phys_pc;
uint32_t orig_cflags = tb_cflags(tb);
@@ -277,13 +893,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
/* remove the TB from the page list */
if (rm_from_page_list) {
- p = page_find(phys_pc >> TARGET_PAGE_BITS);
- tb_page_remove(p, tb);
- phys_pc = tb_page_addr1(tb);
- if (phys_pc != -1) {
- p = page_find(phys_pc >> TARGET_PAGE_BITS);
- tb_page_remove(p, tb);
- }
+ tb_remove(tb);
}
/* remove the TB from the hash list */
@@ -307,71 +917,6 @@ static void tb_phys_invalidate__locked(TranslationBlock *tb)
qemu_thread_jit_execute();
}
-static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
- PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
-{
- PageDesc *p1, *p2;
- tb_page_addr_t page1;
- tb_page_addr_t page2;
-
- assert_memory_lock();
- g_assert(phys1 != -1);
-
- page1 = phys1 >> TARGET_PAGE_BITS;
- page2 = phys2 >> TARGET_PAGE_BITS;
-
- p1 = page_find_alloc(page1, alloc);
- if (ret_p1) {
- *ret_p1 = p1;
- }
- if (likely(phys2 == -1)) {
- page_lock(p1);
- return;
- } else if (page1 == page2) {
- page_lock(p1);
- if (ret_p2) {
- *ret_p2 = p1;
- }
- return;
- }
- p2 = page_find_alloc(page2, alloc);
- if (ret_p2) {
- *ret_p2 = p2;
- }
- if (page1 < page2) {
- page_lock(p1);
- page_lock(p2);
- } else {
- page_lock(p2);
- page_lock(p1);
- }
-}
-
-#ifdef CONFIG_USER_ONLY
-static inline void page_lock_tb(const TranslationBlock *tb) { }
-static inline void page_unlock_tb(const TranslationBlock *tb) { }
-#else
-/* lock the page(s) of a TB in the correct acquisition order */
-static void page_lock_tb(const TranslationBlock *tb)
-{
- page_lock_pair(NULL, tb_page_addr0(tb), NULL, tb_page_addr1(tb), false);
-}
-
-static void page_unlock_tb(const TranslationBlock *tb)
-{
- PageDesc *p1 = page_find(tb_page_addr0(tb) >> TARGET_PAGE_BITS);
-
- page_unlock(p1);
- if (unlikely(tb_page_addr1(tb) != -1)) {
- PageDesc *p2 = page_find(tb_page_addr1(tb) >> TARGET_PAGE_BITS);
-
- if (p2 != p1) {
- page_unlock(p2);
- }
- }
-}
-#endif
-
/*
* Invalidate one TB.
* Called with mmap_lock held in user-mode.
@@ -388,41 +933,6 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
}
/*
- * Add the tb in the target page and protect it if necessary.
- * Called with mmap_lock held for user-mode emulation.
- * Called with @p->lock held in !user-mode.
- */
-static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
- unsigned int n, tb_page_addr_t page_addr)
-{
-#ifndef CONFIG_USER_ONLY
- bool page_already_protected;
-#endif
-
- assert_page_locked(p);
-
- tb->page_next[n] = p->first_tb;
-#ifndef CONFIG_USER_ONLY
- page_already_protected = p->first_tb != (uintptr_t)NULL;
-#endif
- p->first_tb = (uintptr_t)tb | n;
-
-#if defined(CONFIG_USER_ONLY)
- /* translator_loop() must have made all TB pages non-writable */
- assert(!(p->flags & PAGE_WRITE));
-#else
- /*
- * If some code is already present, then the pages are already
- * protected. So we handle the case where only the first TB is
- * allocated in a physical page.
- */
- if (!page_already_protected) {
- tlb_protect_code(page_addr);
- }
-#endif
-}
-
-/*
* Add a new TB and link it to the physical page tables. phys_page2 is
* (-1) to indicate that only one page contains the TB.
*
@@ -453,10 +963,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
* we can only insert TBs that are fully initialized.
*/
page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
- tb_page_add(p, tb, 0, phys_pc);
- if (p2) {
- tb_page_add(p2, tb, 1, phys_page2);
- }
+ tb_record(tb, p, p2);
/* add in the hash table */
h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
@@ -465,10 +972,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
/* remove TB from the page(s) if we couldn't insert it */
if (unlikely(existing_tb)) {
- tb_page_remove(p, tb);
- if (p2) {
- tb_page_remove(p2, tb);
- }
+ tb_remove(tb);
tb = existing_tb;
}
@@ -479,10 +983,90 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
return tb;
}
+#ifdef CONFIG_USER_ONLY
+/*
+ * Invalidate all TBs which intersect with the target address range.
+ * Called with mmap_lock held for user-mode emulation.
+ * NOTE: this function must not be called while a TB is running.
+ */
+void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
+{
+ TranslationBlock *tb;
+ PageForEachNext n;
+
+ assert_memory_lock();
+
+ PAGE_FOR_EACH_TB(start, end, unused, tb, n) {
+ tb_phys_invalidate__locked(tb);
+ }
+}
+
+/*
+ * Invalidate all TBs which intersect with the target address page @addr.
+ * Called with mmap_lock held for user-mode emulation
+ * NOTE: this function must not be called while a TB is running.
+ */
+void tb_invalidate_phys_page(tb_page_addr_t addr)
+{
+ tb_page_addr_t start, end;
+
+ start = addr & TARGET_PAGE_MASK;
+ end = start + TARGET_PAGE_SIZE;
+ tb_invalidate_phys_range(start, end);
+}
+
+/*
+ * Called with mmap_lock held. If pc is not 0 then it indicates the
+ * host PC of the faulting store instruction that caused this invalidate.
+ * Returns true if the caller needs to abort execution of the current
+ * TB (because it was modified by this store and the guest CPU has
+ * precise-SMC semantics).
+ */
+bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc)
+{
+ assert(pc != 0);
+#ifdef TARGET_HAS_PRECISE_SMC
+ assert_memory_lock();
+ {
+ TranslationBlock *current_tb = tcg_tb_lookup(pc);
+ bool current_tb_modified = false;
+ TranslationBlock *tb;
+ PageForEachNext n;
+
+ addr &= TARGET_PAGE_MASK;
+
+ PAGE_FOR_EACH_TB(addr, addr + TARGET_PAGE_SIZE, unused, tb, n) {
+ if (current_tb == tb &&
+ (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
+ /*
+ * If we are modifying the current TB, we must stop its
+ * execution. We could be more precise by checking that
+ * the modification is after the current PC, but it would
+ * require a specialized function to partially restore
+ * the CPU state.
+ */
+ current_tb_modified = true;
+ cpu_restore_state_from_tb(current_cpu, current_tb, pc);
+ }
+ tb_phys_invalidate__locked(tb);
+ }
+
+ if (current_tb_modified) {
+ /* Force execution of one insn next time. */
+ CPUState *cpu = current_cpu;
+ cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(current_cpu);
+ return true;
+ }
+ }
+#else
+ tb_invalidate_phys_page(addr);
+#endif /* TARGET_HAS_PRECISE_SMC */
+ return false;
+}
+#else
/*
* @p must be non-NULL.
- * user-mode: call with mmap_lock held.
- * !user-mode: call with all @pages locked.
+ * Call with all @pages locked.
*/
static void
tb_invalidate_phys_page_range__locked(struct page_collection *pages,
@@ -492,22 +1076,17 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
{
TranslationBlock *tb;
tb_page_addr_t tb_start, tb_end;
- int n;
+ PageForEachNext n;
#ifdef TARGET_HAS_PRECISE_SMC
- CPUState *cpu = current_cpu;
- bool current_tb_not_found = retaddr != 0;
bool current_tb_modified = false;
- TranslationBlock *current_tb = NULL;
+ TranslationBlock *current_tb = retaddr ? tcg_tb_lookup(retaddr) : NULL;
#endif /* TARGET_HAS_PRECISE_SMC */
- assert_page_locked(p);
-
/*
* We remove all the TBs in the range [start, end[.
* XXX: see if in some cases it could be faster to invalidate all the code
*/
- PAGE_FOR_EACH_TB(p, tb, n) {
- assert_page_locked(p);
+ PAGE_FOR_EACH_TB(start, end, p, tb, n) {
/* NOTE: this is subtle as a TB may span two physical pages */
if (n == 0) {
/* NOTE: tb_end may be after the end of the page, but
@@ -521,11 +1100,6 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
}
if (!(tb_end <= start || tb_start >= end)) {
#ifdef TARGET_HAS_PRECISE_SMC
- if (current_tb_not_found) {
- current_tb_not_found = false;
- /* now we have a real cpu fault */
- current_tb = tcg_tb_lookup(retaddr);
- }
if (current_tb == tb &&
(tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
/*
@@ -536,25 +1110,25 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
* restore the CPU state.
*/
current_tb_modified = true;
- cpu_restore_state_from_tb(cpu, current_tb, retaddr);
+ cpu_restore_state_from_tb(current_cpu, current_tb, retaddr);
}
#endif /* TARGET_HAS_PRECISE_SMC */
tb_phys_invalidate__locked(tb);
}
}
-#if !defined(CONFIG_USER_ONLY)
+
/* if no code remaining, no need to continue to use slow writes */
if (!p->first_tb) {
tlb_unprotect_code(start);
}
-#endif
+
#ifdef TARGET_HAS_PRECISE_SMC
if (current_tb_modified) {
page_collection_unlock(pages);
/* Force execution of one insn next time. */
- cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
+ current_cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(current_cpu);
mmap_unlock();
- cpu_loop_exit_noexc(cpu);
+ cpu_loop_exit_noexc(current_cpu);
}
#endif
}
@@ -562,8 +1136,6 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
/*
* Invalidate all TBs which intersect with the target physical
* address page @addr.
- *
- * Called with mmap_lock held for user-mode emulation
*/
void tb_invalidate_phys_page(tb_page_addr_t addr)
{
@@ -571,8 +1143,6 @@ void tb_invalidate_phys_page(tb_page_addr_t addr)
tb_page_addr_t start, end;
PageDesc *p;
- assert_memory_lock();
-
p = page_find(addr >> TARGET_PAGE_BITS);
if (p == NULL) {
return;
@@ -591,16 +1161,12 @@ void tb_invalidate_phys_page(tb_page_addr_t addr)
* 'is_cpu_write_access' should be true if called from a real cpu write
* access: the virtual CPU will exit the current TB if code is modified inside
* this TB.
- *
- * Called with mmap_lock held for user-mode emulation.
*/
void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
{
struct page_collection *pages;
tb_page_addr_t next;
- assert_memory_lock();
-
pages = page_collection_lock(start, end);
for (next = (start & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
start < end;
@@ -611,94 +1177,44 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
if (pd == NULL) {
continue;
}
+ assert_page_locked(pd);
tb_invalidate_phys_page_range__locked(pages, pd, start, bound, 0);
}
page_collection_unlock(pages);
}
-#ifdef CONFIG_SOFTMMU
/*
- * len must be <= 8 and start must be a multiple of len.
- * Called via softmmu_template.h when code areas are written to with
- * iothread mutex not held.
- *
* Call with all @pages in the range [@start, @start + len[ locked.
*/
-void tb_invalidate_phys_page_fast(struct page_collection *pages,
- tb_page_addr_t start, int len,
- uintptr_t retaddr)
+static void tb_invalidate_phys_page_fast__locked(struct page_collection *pages,
+ tb_page_addr_t start,
+ unsigned len, uintptr_t ra)
{
PageDesc *p;
- assert_memory_lock();
-
p = page_find(start >> TARGET_PAGE_BITS);
if (!p) {
return;
}
assert_page_locked(p);
- tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
- retaddr);
+ tb_invalidate_phys_page_range__locked(pages, p, start, start + len, ra);
}
-#else
+
/*
- * Called with mmap_lock held. If pc is not 0 then it indicates the
- * host PC of the faulting store instruction that caused this invalidate.
- * Returns true if the caller needs to abort execution of the current
- * TB (because it was modified by this store and the guest CPU has
- * precise-SMC semantics).
+ * len must be <= 8 and start must be a multiple of len.
+ * Called via softmmu_template.h when code areas are written to with
+ * iothread mutex not held.
*/
-bool tb_invalidate_phys_page_unwind(tb_page_addr_t addr, uintptr_t pc)
+void tb_invalidate_phys_range_fast(ram_addr_t ram_addr,
+ unsigned size,
+ uintptr_t retaddr)
{
- TranslationBlock *tb;
- PageDesc *p;
- int n;
-#ifdef TARGET_HAS_PRECISE_SMC
- TranslationBlock *current_tb = NULL;
- CPUState *cpu = current_cpu;
- bool current_tb_modified = false;
-#endif
-
- assert_memory_lock();
-
- addr &= TARGET_PAGE_MASK;
- p = page_find(addr >> TARGET_PAGE_BITS);
- if (!p) {
- return false;
- }
-
-#ifdef TARGET_HAS_PRECISE_SMC
- if (p->first_tb && pc != 0) {
- current_tb = tcg_tb_lookup(pc);
- }
-#endif
- assert_page_locked(p);
- PAGE_FOR_EACH_TB(p, tb, n) {
-#ifdef TARGET_HAS_PRECISE_SMC
- if (current_tb == tb &&
- (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
- /*
- * If we are modifying the current TB, we must stop its execution.
- * We could be more precise by checking that the modification is
- * after the current PC, but it would require a specialized
- * function to partially restore the CPU state.
- */
- current_tb_modified = true;
- cpu_restore_state_from_tb(cpu, current_tb, pc);
- }
-#endif /* TARGET_HAS_PRECISE_SMC */
- tb_phys_invalidate(tb, addr);
- }
- p->first_tb = (uintptr_t)NULL;
-#ifdef TARGET_HAS_PRECISE_SMC
- if (current_tb_modified) {
- /* Force execution of one insn next time. */
- cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu);
- return true;
- }
-#endif
+ struct page_collection *pages;
- return false;
+ pages = page_collection_lock(ram_addr, ram_addr + size);
+ tb_invalidate_phys_page_fast__locked(pages, ram_addr, size, retaddr);
+ page_collection_unlock(pages);
}
-#endif
+
+#endif /* CONFIG_USER_ONLY */
diff --git a/accel/tcg/trace-events b/accel/tcg/trace-events
index 59eab96..4e9b450 100644
--- a/accel/tcg/trace-events
+++ b/accel/tcg/trace-events
@@ -6,5 +6,9 @@ exec_tb(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR
exec_tb_nocache(void *tb, uintptr_t pc) "tb:%p pc=0x%"PRIxPTR
exec_tb_exit(void *last_tb, unsigned int flags) "tb:%p flags=0x%x"
+# cputlb.c
+memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
+memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
+
# translate-all.c
translate_block(void *tb, uintptr_t pc, const void *tb_code) "tb:%p, pc:0x%"PRIxPTR", tb_code:%p"
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index ac3ee37..51ac1f6 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -63,105 +63,13 @@
#include "tb-context.h"
#include "internal.h"
-/* make various TB consistency checks */
-
-/**
- * struct page_entry - page descriptor entry
- * @pd: pointer to the &struct PageDesc of the page this entry represents
- * @index: page index of the page
- * @locked: whether the page is locked
- *
- * This struct helps us keep track of the locked state of a page, without
- * bloating &struct PageDesc.
- *
- * A page lock protects accesses to all fields of &struct PageDesc.
- *
- * See also: &struct page_collection.
- */
-struct page_entry {
- PageDesc *pd;
- tb_page_addr_t index;
- bool locked;
-};
-
-/**
- * struct page_collection - tracks a set of pages (i.e. &struct page_entry's)
- * @tree: Binary search tree (BST) of the pages, with key == page index
- * @max: Pointer to the page in @tree with the highest page index
- *
- * To avoid deadlock we lock pages in ascending order of page index.
- * When operating on a set of pages, we need to keep track of them so that
- * we can lock them in order and also unlock them later. For this we collect
- * pages (i.e. &struct page_entry's) in a binary search @tree. Given that the
- * @tree implementation we use does not provide an O(1) operation to obtain the
- * highest-ranked element, we use @max to keep track of the inserted page
- * with the highest index. This is valuable because if a page is not in
- * the tree and its index is higher than @max's, then we can lock it
- * without breaking the locking order rule.
- *
- * Note on naming: 'struct page_set' would be shorter, but we already have a few
- * page_set_*() helpers, so page_collection is used instead to avoid confusion.
- *
- * See also: page_collection_lock().
- */
-struct page_collection {
- GTree *tree;
- struct page_entry *max;
-};
-
-/*
- * In system mode we want L1_MAP to be based on ram offsets,
- * while in user mode we want it to be based on virtual addresses.
- *
- * TODO: For user mode, see the caveat re host vs guest virtual
- * address spaces near GUEST_ADDR_MAX.
- */
-#if !defined(CONFIG_USER_ONLY)
-#if HOST_LONG_BITS < TARGET_PHYS_ADDR_SPACE_BITS
-# define L1_MAP_ADDR_SPACE_BITS HOST_LONG_BITS
-#else
-# define L1_MAP_ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
-#endif
-#else
-# define L1_MAP_ADDR_SPACE_BITS MIN(HOST_LONG_BITS, TARGET_ABI_BITS)
-#endif
-
/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
sizeof_field(TranslationBlock, trace_vcpu_dstate)
* BITS_PER_BYTE);
-/*
- * L1 Mapping properties
- */
-int v_l1_size;
-int v_l1_shift;
-int v_l2_levels;
-
-void *l1_map[V_L1_MAX_SIZE];
-
TBContext tb_ctx;
-static void page_table_config_init(void)
-{
- uint32_t v_l1_bits;
-
- assert(TARGET_PAGE_BITS);
- /* The bits remaining after N lower levels of page tables. */
- v_l1_bits = (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS;
- if (v_l1_bits < V_L1_MIN_BITS) {
- v_l1_bits += V_L2_BITS;
- }
-
- v_l1_size = 1 << v_l1_bits;
- v_l1_shift = L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - v_l1_bits;
- v_l2_levels = v_l1_shift / V_L2_BITS - 1;
-
- assert(v_l1_bits <= V_L1_MAX_BITS);
- assert(v_l1_shift % V_L2_BITS == 0);
- assert(v_l2_levels >= 0);
-}
-
/* Encode VAL as a signed leb128 sequence at P.
Return P incremented past the encoded value. */
static uint8_t *encode_sleb128(uint8_t *p, target_long val)
@@ -354,395 +262,9 @@ void page_init(void)
{
page_size_init();
page_table_config_init();
-
-#if defined(CONFIG_BSD) && defined(CONFIG_USER_ONLY)
- {
-#ifdef HAVE_KINFO_GETVMMAP
- struct kinfo_vmentry *freep;
- int i, cnt;
-
- freep = kinfo_getvmmap(getpid(), &cnt);
- if (freep) {
- mmap_lock();
- for (i = 0; i < cnt; i++) {
- unsigned long startaddr, endaddr;
-
- startaddr = freep[i].kve_start;
- endaddr = freep[i].kve_end;
- if (h2g_valid(startaddr)) {
- startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
-
- if (h2g_valid(endaddr)) {
- endaddr = h2g(endaddr);
- page_set_flags(startaddr, endaddr, PAGE_RESERVED);
- } else {
-#if TARGET_ABI_BITS <= L1_MAP_ADDR_SPACE_BITS
- endaddr = ~0ul;
- page_set_flags(startaddr, endaddr, PAGE_RESERVED);
-#endif
- }
- }
- }
- free(freep);
- mmap_unlock();
- }
-#else
- FILE *f;
-
- last_brk = (unsigned long)sbrk(0);
-
- f = fopen("/compat/linux/proc/self/maps", "r");
- if (f) {
- mmap_lock();
-
- do {
- unsigned long startaddr, endaddr;
- int n;
-
- n = fscanf(f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
-
- if (n == 2 && h2g_valid(startaddr)) {
- startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
-
- if (h2g_valid(endaddr)) {
- endaddr = h2g(endaddr);
- } else {
- endaddr = ~0ul;
- }
- page_set_flags(startaddr, endaddr, PAGE_RESERVED);
- }
- } while (!feof(f));
-
- fclose(f);
- mmap_unlock();
- }
-#endif
- }
-#endif
-}
-
-PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
-{
- PageDesc *pd;
- void **lp;
- int i;
-
- /* Level 1. Always allocated. */
- lp = l1_map + ((index >> v_l1_shift) & (v_l1_size - 1));
-
- /* Level 2..N-1. */
- for (i = v_l2_levels; i > 0; i--) {
- void **p = qatomic_rcu_read(lp);
-
- if (p == NULL) {
- void *existing;
-
- if (!alloc) {
- return NULL;
- }
- p = g_new0(void *, V_L2_SIZE);
- existing = qatomic_cmpxchg(lp, NULL, p);
- if (unlikely(existing)) {
- g_free(p);
- p = existing;
- }
- }
-
- lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
- }
-
- pd = qatomic_rcu_read(lp);
- if (pd == NULL) {
- void *existing;
-
- if (!alloc) {
- return NULL;
- }
- pd = g_new0(PageDesc, V_L2_SIZE);
-#ifndef CONFIG_USER_ONLY
- {
- int i;
-
- for (i = 0; i < V_L2_SIZE; i++) {
- qemu_spin_init(&pd[i].lock);
- }
- }
-#endif
- existing = qatomic_cmpxchg(lp, NULL, pd);
- if (unlikely(existing)) {
-#ifndef CONFIG_USER_ONLY
- {
- int i;
-
- for (i = 0; i < V_L2_SIZE; i++) {
- qemu_spin_destroy(&pd[i].lock);
- }
- }
-#endif
- g_free(pd);
- pd = existing;
- }
- }
-
- return pd + (index & (V_L2_SIZE - 1));
-}
-
-/* In user-mode page locks aren't used; mmap_lock is enough */
-#ifdef CONFIG_USER_ONLY
-struct page_collection *
-page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
-{
- return NULL;
-}
-
-void page_collection_unlock(struct page_collection *set)
-{ }
-#else /* !CONFIG_USER_ONLY */
-
-#ifdef CONFIG_DEBUG_TCG
-
-static __thread GHashTable *ht_pages_locked_debug;
-
-static void ht_pages_locked_debug_init(void)
-{
- if (ht_pages_locked_debug) {
- return;
- }
- ht_pages_locked_debug = g_hash_table_new(NULL, NULL);
-}
-
-static bool page_is_locked(const PageDesc *pd)
-{
- PageDesc *found;
-
- ht_pages_locked_debug_init();
- found = g_hash_table_lookup(ht_pages_locked_debug, pd);
- return !!found;
-}
-
-static void page_lock__debug(PageDesc *pd)
-{
- ht_pages_locked_debug_init();
- g_assert(!page_is_locked(pd));
- g_hash_table_insert(ht_pages_locked_debug, pd, pd);
-}
-
-static void page_unlock__debug(const PageDesc *pd)
-{
- bool removed;
-
- ht_pages_locked_debug_init();
- g_assert(page_is_locked(pd));
- removed = g_hash_table_remove(ht_pages_locked_debug, pd);
- g_assert(removed);
-}
-
-void do_assert_page_locked(const PageDesc *pd, const char *file, int line)
-{
- if (unlikely(!page_is_locked(pd))) {
- error_report("assert_page_lock: PageDesc %p not locked @ %s:%d",
- pd, file, line);
- abort();
- }
-}
-
-void assert_no_pages_locked(void)
-{
- ht_pages_locked_debug_init();
- g_assert(g_hash_table_size(ht_pages_locked_debug) == 0);
-}
-
-#else /* !CONFIG_DEBUG_TCG */
-
-static inline void page_lock__debug(const PageDesc *pd) { }
-static inline void page_unlock__debug(const PageDesc *pd) { }
-
-#endif /* CONFIG_DEBUG_TCG */
-
-void page_lock(PageDesc *pd)
-{
- page_lock__debug(pd);
- qemu_spin_lock(&pd->lock);
-}
-
-void page_unlock(PageDesc *pd)
-{
- qemu_spin_unlock(&pd->lock);
- page_unlock__debug(pd);
-}
-
-static inline struct page_entry *
-page_entry_new(PageDesc *pd, tb_page_addr_t index)
-{
- struct page_entry *pe = g_malloc(sizeof(*pe));
-
- pe->index = index;
- pe->pd = pd;
- pe->locked = false;
- return pe;
-}
-
-static void page_entry_destroy(gpointer p)
-{
- struct page_entry *pe = p;
-
- g_assert(pe->locked);
- page_unlock(pe->pd);
- g_free(pe);
-}
-
-/* returns false on success */
-static bool page_entry_trylock(struct page_entry *pe)
-{
- bool busy;
-
- busy = qemu_spin_trylock(&pe->pd->lock);
- if (!busy) {
- g_assert(!pe->locked);
- pe->locked = true;
- page_lock__debug(pe->pd);
- }
- return busy;
-}
-
-static void do_page_entry_lock(struct page_entry *pe)
-{
- page_lock(pe->pd);
- g_assert(!pe->locked);
- pe->locked = true;
-}
-
-static gboolean page_entry_lock(gpointer key, gpointer value, gpointer data)
-{
- struct page_entry *pe = value;
-
- do_page_entry_lock(pe);
- return FALSE;
-}
-
-static gboolean page_entry_unlock(gpointer key, gpointer value, gpointer data)
-{
- struct page_entry *pe = value;
-
- if (pe->locked) {
- pe->locked = false;
- page_unlock(pe->pd);
- }
- return FALSE;
}
/*
- * Trylock a page, and if successful, add the page to a collection.
- * Returns true ("busy") if the page could not be locked; false otherwise.
- */
-static bool page_trylock_add(struct page_collection *set, tb_page_addr_t addr)
-{
- tb_page_addr_t index = addr >> TARGET_PAGE_BITS;
- struct page_entry *pe;
- PageDesc *pd;
-
- pe = g_tree_lookup(set->tree, &index);
- if (pe) {
- return false;
- }
-
- pd = page_find(index);
- if (pd == NULL) {
- return false;
- }
-
- pe = page_entry_new(pd, index);
- g_tree_insert(set->tree, &pe->index, pe);
-
- /*
- * If this is either (1) the first insertion or (2) a page whose index
- * is higher than any other so far, just lock the page and move on.
- */
- if (set->max == NULL || pe->index > set->max->index) {
- set->max = pe;
- do_page_entry_lock(pe);
- return false;
- }
- /*
- * Try to acquire out-of-order lock; if busy, return busy so that we acquire
- * locks in order.
- */
- return page_entry_trylock(pe);
-}
-
-static gint tb_page_addr_cmp(gconstpointer ap, gconstpointer bp, gpointer udata)
-{
- tb_page_addr_t a = *(const tb_page_addr_t *)ap;
- tb_page_addr_t b = *(const tb_page_addr_t *)bp;
-
- if (a == b) {
- return 0;
- } else if (a < b) {
- return -1;
- }
- return 1;
-}
-
-/*
- * Lock a range of pages ([@start,@end[) as well as the pages of all
- * intersecting TBs.
- * Locking order: acquire locks in ascending order of page index.
- */
-struct page_collection *
-page_collection_lock(tb_page_addr_t start, tb_page_addr_t end)
-{
- struct page_collection *set = g_malloc(sizeof(*set));
- tb_page_addr_t index;
- PageDesc *pd;
-
- start >>= TARGET_PAGE_BITS;
- end >>= TARGET_PAGE_BITS;
- g_assert(start <= end);
-
- set->tree = g_tree_new_full(tb_page_addr_cmp, NULL, NULL,
- page_entry_destroy);
- set->max = NULL;
- assert_no_pages_locked();
-
- retry:
- g_tree_foreach(set->tree, page_entry_lock, NULL);
-
- for (index = start; index <= end; index++) {
- TranslationBlock *tb;
- int n;
-
- pd = page_find(index);
- if (pd == NULL) {
- continue;
- }
- if (page_trylock_add(set, index << TARGET_PAGE_BITS)) {
- g_tree_foreach(set->tree, page_entry_unlock, NULL);
- goto retry;
- }
- assert_page_locked(pd);
- PAGE_FOR_EACH_TB(pd, tb, n) {
- if (page_trylock_add(set, tb_page_addr0(tb)) ||
- (tb_page_addr1(tb) != -1 &&
- page_trylock_add(set, tb_page_addr1(tb)))) {
- /* drop all locks, and reacquire in order */
- g_tree_foreach(set->tree, page_entry_unlock, NULL);
- goto retry;
- }
- }
- }
- return set;
-}
-
-void page_collection_unlock(struct page_collection *set)
-{
- /* entries are unlocked and freed via page_entry_destroy */
- g_tree_destroy(set->tree);
- g_free(set);
-}
-
-#endif /* !CONFIG_USER_ONLY */
-
-/*
* Isolate the portion of code gen which can setjmp/longjmp.
* Return the size of the generated code, or negative on error.
*/
@@ -1235,339 +757,6 @@ void cpu_interrupt(CPUState *cpu, int mask)
qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
}
-/*
- * Walks guest process memory "regions" one by one
- * and calls callback function 'fn' for each region.
- */
-struct walk_memory_regions_data {
- walk_memory_regions_fn fn;
- void *priv;
- target_ulong start;
- int prot;
-};
-
-static int walk_memory_regions_end(struct walk_memory_regions_data *data,
- target_ulong end, int new_prot)
-{
- if (data->start != -1u) {
- int rc = data->fn(data->priv, data->start, end, data->prot);
- if (rc != 0) {
- return rc;
- }
- }
-
- data->start = (new_prot ? end : -1u);
- data->prot = new_prot;
-
- return 0;
-}
-
-static int walk_memory_regions_1(struct walk_memory_regions_data *data,
- target_ulong base, int level, void **lp)
-{
- target_ulong pa;
- int i, rc;
-
- if (*lp == NULL) {
- return walk_memory_regions_end(data, base, 0);
- }
-
- if (level == 0) {
- PageDesc *pd = *lp;
-
- for (i = 0; i < V_L2_SIZE; ++i) {
- int prot = pd[i].flags;
-
- pa = base | (i << TARGET_PAGE_BITS);
- if (prot != data->prot) {
- rc = walk_memory_regions_end(data, pa, prot);
- if (rc != 0) {
- return rc;
- }
- }
- }
- } else {
- void **pp = *lp;
-
- for (i = 0; i < V_L2_SIZE; ++i) {
- pa = base | ((target_ulong)i <<
- (TARGET_PAGE_BITS + V_L2_BITS * level));
- rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
- if (rc != 0) {
- return rc;
- }
- }
- }
-
- return 0;
-}
-
-int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
-{
- struct walk_memory_regions_data data;
- uintptr_t i, l1_sz = v_l1_size;
-
- data.fn = fn;
- data.priv = priv;
- data.start = -1u;
- data.prot = 0;
-
- for (i = 0; i < l1_sz; i++) {
- target_ulong base = i << (v_l1_shift + TARGET_PAGE_BITS);
- int rc = walk_memory_regions_1(&data, base, v_l2_levels, l1_map + i);
- if (rc != 0) {
- return rc;
- }
- }
-
- return walk_memory_regions_end(&data, 0, 0);
-}
-
-static int dump_region(void *priv, target_ulong start,
- target_ulong end, unsigned long prot)
-{
- FILE *f = (FILE *)priv;
-
- (void) fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx
- " "TARGET_FMT_lx" %c%c%c\n",
- start, end, end - start,
- ((prot & PAGE_READ) ? 'r' : '-'),
- ((prot & PAGE_WRITE) ? 'w' : '-'),
- ((prot & PAGE_EXEC) ? 'x' : '-'));
-
- return 0;
-}
-
-/* dump memory mappings */
-void page_dump(FILE *f)
-{
- const int length = sizeof(target_ulong) * 2;
- (void) fprintf(f, "%-*s %-*s %-*s %s\n",
- length, "start", length, "end", length, "size", "prot");
- walk_memory_regions(f, dump_region);
-}
-
-int page_get_flags(target_ulong address)
-{
- PageDesc *p;
-
- p = page_find(address >> TARGET_PAGE_BITS);
- if (!p) {
- return 0;
- }
- return p->flags;
-}
-
-/*
- * Allow the target to decide if PAGE_TARGET_[12] may be reset.
- * By default, they are not kept.
- */
-#ifndef PAGE_TARGET_STICKY
-#define PAGE_TARGET_STICKY 0
-#endif
-#define PAGE_STICKY (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
-
-/* Modify the flags of a page and invalidate the code if necessary.
- The flag PAGE_WRITE_ORG is positioned automatically depending
- on PAGE_WRITE. The mmap_lock should already be held. */
-void page_set_flags(target_ulong start, target_ulong end, int flags)
-{
- target_ulong addr, len;
- bool reset, inval_tb = false;
-
- /* This function should never be called with addresses outside the
- guest address space. If this assert fires, it probably indicates
- a missing call to h2g_valid. */
- assert(end - 1 <= GUEST_ADDR_MAX);
- assert(start < end);
- /* Only set PAGE_ANON with new mappings. */
- assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
- assert_memory_lock();
-
- start = start & TARGET_PAGE_MASK;
- end = TARGET_PAGE_ALIGN(end);
-
- if (flags & PAGE_WRITE) {
- flags |= PAGE_WRITE_ORG;
- }
- reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
- if (reset) {
- page_reset_target_data(start, end);
- }
- flags &= ~PAGE_RESET;
-
- for (addr = start, len = end - start;
- len != 0;
- len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
- PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
-
- /*
- * If the page was executable, but is reset, or is no longer
- * executable, or has become writable, then invalidate any code.
- */
- if ((p->flags & PAGE_EXEC)
- && (reset ||
- !(flags & PAGE_EXEC) ||
- (flags & ~p->flags & PAGE_WRITE))) {
- inval_tb = true;
- }
- /* Using mprotect on a page does not change sticky bits. */
- p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
- }
-
- if (inval_tb) {
- tb_invalidate_phys_range(start, end);
- }
-}
-
-int page_check_range(target_ulong start, target_ulong len, int flags)
-{
- PageDesc *p;
- target_ulong end;
- target_ulong addr;
-
- /* This function should never be called with addresses outside the
- guest address space. If this assert fires, it probably indicates
- a missing call to h2g_valid. */
- if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
- assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
- }
-
- if (len == 0) {
- return 0;
- }
- if (start + len - 1 < start) {
- /* We've wrapped around. */
- return -1;
- }
-
- /* must do before we loose bits in the next step */
- end = TARGET_PAGE_ALIGN(start + len);
- start = start & TARGET_PAGE_MASK;
-
- for (addr = start, len = end - start;
- len != 0;
- len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
- p = page_find(addr >> TARGET_PAGE_BITS);
- if (!p) {
- return -1;
- }
- if (!(p->flags & PAGE_VALID)) {
- return -1;
- }
-
- if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
- return -1;
- }
- if (flags & PAGE_WRITE) {
- if (!(p->flags & PAGE_WRITE_ORG)) {
- return -1;
- }
- /* unprotect the page if it was put read-only because it
- contains translated code */
- if (!(p->flags & PAGE_WRITE)) {
- if (!page_unprotect(addr, 0)) {
- return -1;
- }
- }
- }
- }
- return 0;
-}
-
-void page_protect(tb_page_addr_t page_addr)
-{
- target_ulong addr;
- PageDesc *p;
- int prot;
-
- p = page_find(page_addr >> TARGET_PAGE_BITS);
- if (p && (p->flags & PAGE_WRITE)) {
- /*
- * Force the host page as non writable (writes will have a page fault +
- * mprotect overhead).
- */
- page_addr &= qemu_host_page_mask;
- prot = 0;
- for (addr = page_addr; addr < page_addr + qemu_host_page_size;
- addr += TARGET_PAGE_SIZE) {
-
- p = page_find(addr >> TARGET_PAGE_BITS);
- if (!p) {
- continue;
- }
- prot |= p->flags;
- p->flags &= ~PAGE_WRITE;
- }
- mprotect(g2h_untagged(page_addr), qemu_host_page_size,
- (prot & PAGE_BITS) & ~PAGE_WRITE);
- }
-}
-
-/* called from signal handler: invalidate the code and unprotect the
- * page. Return 0 if the fault was not handled, 1 if it was handled,
- * and 2 if it was handled but the caller must cause the TB to be
- * immediately exited. (We can only return 2 if the 'pc' argument is
- * non-zero.)
- */
-int page_unprotect(target_ulong address, uintptr_t pc)
-{
- unsigned int prot;
- bool current_tb_invalidated;
- PageDesc *p;
- target_ulong host_start, host_end, addr;
-
- /* Technically this isn't safe inside a signal handler. However we
- know this only ever happens in a synchronous SEGV handler, so in
- practice it seems to be ok. */
- mmap_lock();
-
- p = page_find(address >> TARGET_PAGE_BITS);
- if (!p) {
- mmap_unlock();
- return 0;
- }
-
- /* if the page was really writable, then we change its
- protection back to writable */
- if (p->flags & PAGE_WRITE_ORG) {
- current_tb_invalidated = false;
- if (p->flags & PAGE_WRITE) {
- /* If the page is actually marked WRITE then assume this is because
- * this thread raced with another one which got here first and
- * set the page to PAGE_WRITE and did the TB invalidate for us.
- */
-#ifdef TARGET_HAS_PRECISE_SMC
- TranslationBlock *current_tb = tcg_tb_lookup(pc);
- if (current_tb) {
- current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
- }
-#endif
- } else {
- host_start = address & qemu_host_page_mask;
- host_end = host_start + qemu_host_page_size;
-
- prot = 0;
- for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
- p = page_find(addr >> TARGET_PAGE_BITS);
- p->flags |= PAGE_WRITE;
- prot |= p->flags;
-
- /* and since the content will be modified, we must invalidate
- the corresponding translated code. */
- current_tb_invalidated |=
- tb_invalidate_phys_page_unwind(addr, pc);
- }
- mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
- prot & PAGE_BITS);
- }
- mmap_unlock();
- /* If current TB was invalidated return to main loop */
- return current_tb_invalidated ? 2 : 1;
- }
- mmap_unlock();
- return 0;
-}
#endif /* CONFIG_USER_ONLY */
/*
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index fb7d6ee..a3cecda 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -135,6 +135,565 @@ bool handle_sigsegv_accerr_write(CPUState *cpu, sigset_t *old_set,
}
}
+typedef struct PageFlagsNode {
+ IntervalTreeNode itree;
+ int flags;
+} PageFlagsNode;
+
+static IntervalTreeRoot pageflags_root;
+
+static PageFlagsNode *pageflags_find(target_ulong start, target_ulong last)
+{
+ IntervalTreeNode *n;
+
+ n = interval_tree_iter_first(&pageflags_root, start, last);
+ return n ? container_of(n, PageFlagsNode, itree) : NULL;
+}
+
+static PageFlagsNode *pageflags_next(PageFlagsNode *p, target_ulong start,
+                                     target_ulong last)
+{
+ IntervalTreeNode *n;
+
+ n = interval_tree_iter_next(&p->itree, start, last);
+ return n ? container_of(n, PageFlagsNode, itree) : NULL;
+}
+
+int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
+{
+ IntervalTreeNode *n;
+ int rc = 0;
+
+ mmap_lock();
+ for (n = interval_tree_iter_first(&pageflags_root, 0, -1);
+ n != NULL;
+ n = interval_tree_iter_next(n, 0, -1)) {
+ PageFlagsNode *p = container_of(n, PageFlagsNode, itree);
+
+ rc = fn(priv, n->start, n->last + 1, p->flags);
+ if (rc != 0) {
+ break;
+ }
+ }
+ mmap_unlock();
+
+ return rc;
+}
+
+static int dump_region(void *priv, target_ulong start,
+ target_ulong end, unsigned long prot)
+{
+ FILE *f = (FILE *)priv;
+
+ fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx" "TARGET_FMT_lx" %c%c%c\n",
+ start, end, end - start,
+ ((prot & PAGE_READ) ? 'r' : '-'),
+ ((prot & PAGE_WRITE) ? 'w' : '-'),
+ ((prot & PAGE_EXEC) ? 'x' : '-'));
+ return 0;
+}
+
+/* dump memory mappings */
+void page_dump(FILE *f)
+{
+ const int length = sizeof(target_ulong) * 2;
+
+ fprintf(f, "%-*s %-*s %-*s %s\n",
+ length, "start", length, "end", length, "size", "prot");
+ walk_memory_regions(f, dump_region);
+}
+
+int page_get_flags(target_ulong address)
+{
+ PageFlagsNode *p = pageflags_find(address, address);
+
+ /*
+ * See util/interval-tree.c re lockless lookups: no false positives but
+ * there are false negatives. If we find nothing, retry with the mmap
+ * lock acquired.
+ */
+ if (p) {
+ return p->flags;
+ }
+ if (have_mmap_lock()) {
+ return 0;
+ }
+
+ mmap_lock();
+ p = pageflags_find(address, address);
+ mmap_unlock();
+ return p ? p->flags : 0;
+}
+
+/* A subroutine of page_set_flags: insert a new node for [start,last]. */
+static void pageflags_create(target_ulong start, target_ulong last, int flags)
+{
+ PageFlagsNode *p = g_new(PageFlagsNode, 1);
+
+ p->itree.start = start;
+ p->itree.last = last;
+ p->flags = flags;
+ interval_tree_insert(&p->itree, &pageflags_root);
+}
+
+/* A subroutine of page_set_flags: remove everything in [start,last]. */
+static bool pageflags_unset(target_ulong start, target_ulong last)
+{
+ bool inval_tb = false;
+
+ while (true) {
+ PageFlagsNode *p = pageflags_find(start, last);
+ target_ulong p_last;
+
+ if (!p) {
+ break;
+ }
+
+ if (p->flags & PAGE_EXEC) {
+ inval_tb = true;
+ }
+
+ interval_tree_remove(&p->itree, &pageflags_root);
+ p_last = p->itree.last;
+
+ if (p->itree.start < start) {
+ /* Truncate the node from the end, or split out the middle. */
+ p->itree.last = start - 1;
+ interval_tree_insert(&p->itree, &pageflags_root);
+ if (last < p_last) {
+ pageflags_create(last + 1, p_last, p->flags);
+ break;
+ }
+ } else if (p_last <= last) {
+ /* Range completely covers node -- remove it. */
+ g_free(p);
+ } else {
+ /* Truncate the node from the start. */
+ p->itree.start = last + 1;
+ interval_tree_insert(&p->itree, &pageflags_root);
+ break;
+ }
+ }
+
+ return inval_tb;
+}
+
+/*
+ * A subroutine of page_set_flags: nothing overlaps [start,last],
+ * but check adjacent mappings and maybe merge into a single range.
+ */
+static void pageflags_create_merge(target_ulong start, target_ulong last,
+ int flags)
+{
+ PageFlagsNode *next = NULL, *prev = NULL;
+
+ if (start > 0) {
+ prev = pageflags_find(start - 1, start - 1);
+ if (prev) {
+ if (prev->flags == flags) {
+ interval_tree_remove(&prev->itree, &pageflags_root);
+ } else {
+ prev = NULL;
+ }
+ }
+ }
+ if (last + 1 != 0) {
+ next = pageflags_find(last + 1, last + 1);
+ if (next) {
+ if (next->flags == flags) {
+ interval_tree_remove(&next->itree, &pageflags_root);
+ } else {
+ next = NULL;
+ }
+ }
+ }
+
+ if (prev) {
+ if (next) {
+ prev->itree.last = next->itree.last;
+ g_free(next);
+ } else {
+ prev->itree.last = last;
+ }
+ interval_tree_insert(&prev->itree, &pageflags_root);
+ } else if (next) {
+ next->itree.start = start;
+ interval_tree_insert(&next->itree, &pageflags_root);
+ } else {
+ pageflags_create(start, last, flags);
+ }
+}
+
+/*
+ * Allow the target to decide if PAGE_TARGET_[12] may be reset.
+ * By default, they are not kept.
+ */
+#ifndef PAGE_TARGET_STICKY
+#define PAGE_TARGET_STICKY 0
+#endif
+#define PAGE_STICKY (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
+
+/* A subroutine of page_set_flags: add flags to [start,last]. */
+static bool pageflags_set_clear(target_ulong start, target_ulong last,
+ int set_flags, int clear_flags)
+{
+ PageFlagsNode *p;
+ target_ulong p_start, p_last;
+ int p_flags, merge_flags;
+ bool inval_tb = false;
+
+ restart:
+ p = pageflags_find(start, last);
+ if (!p) {
+ if (set_flags) {
+ pageflags_create_merge(start, last, set_flags);
+ }
+ goto done;
+ }
+
+ p_start = p->itree.start;
+ p_last = p->itree.last;
+ p_flags = p->flags;
+ /* Using mprotect on a page does not change sticky bits. */
+ merge_flags = (p_flags & ~clear_flags) | set_flags;
+
+ /*
+ * Need to flush if an overlapping executable region
+ * removes exec, or adds write.
+ */
+ if ((p_flags & PAGE_EXEC)
+ && (!(merge_flags & PAGE_EXEC)
+ || (merge_flags & ~p_flags & PAGE_WRITE))) {
+ inval_tb = true;
+ }
+
+ /*
+ * If there is an exact range match, update and return without
+ * attempting to merge with adjacent regions.
+ */
+ if (start == p_start && last == p_last) {
+ if (merge_flags) {
+ p->flags = merge_flags;
+ } else {
+ interval_tree_remove(&p->itree, &pageflags_root);
+ g_free(p);
+ }
+ goto done;
+ }
+
+ /*
+ * If sticky bits affect the original mapping, then we must be more
+ * careful about the existing intervals and the separate flags.
+ */
+ if (set_flags != merge_flags) {
+ if (p_start < start) {
+ interval_tree_remove(&p->itree, &pageflags_root);
+ p->itree.last = start - 1;
+ interval_tree_insert(&p->itree, &pageflags_root);
+
+ if (last < p_last) {
+ if (merge_flags) {
+ pageflags_create(start, last, merge_flags);
+ }
+ pageflags_create(last + 1, p_last, p_flags);
+ } else {
+ if (merge_flags) {
+ pageflags_create(start, p_last, merge_flags);
+ }
+ if (p_last < last) {
+ start = p_last + 1;
+ goto restart;
+ }
+ }
+ } else {
+ if (start < p_start && set_flags) {
+ pageflags_create(start, p_start - 1, set_flags);
+ }
+ if (last < p_last) {
+ interval_tree_remove(&p->itree, &pageflags_root);
+ p->itree.start = last + 1;
+ interval_tree_insert(&p->itree, &pageflags_root);
+ if (merge_flags) {
+ pageflags_create(start, last, merge_flags);
+ }
+ } else {
+ if (merge_flags) {
+ p->flags = merge_flags;
+ } else {
+ interval_tree_remove(&p->itree, &pageflags_root);
+ g_free(p);
+ }
+ if (p_last < last) {
+ start = p_last + 1;
+ goto restart;
+ }
+ }
+ }
+ goto done;
+ }
+
+ /* If flags are not changing for this range, incorporate it. */
+ if (set_flags == p_flags) {
+ if (start < p_start) {
+ interval_tree_remove(&p->itree, &pageflags_root);
+ p->itree.start = start;
+ interval_tree_insert(&p->itree, &pageflags_root);
+ }
+ if (p_last < last) {
+ start = p_last + 1;
+ goto restart;
+ }
+ goto done;
+ }
+
+ /* Maybe split out head and/or tail ranges with the original flags. */
+ interval_tree_remove(&p->itree, &pageflags_root);
+ if (p_start < start) {
+ p->itree.last = start - 1;
+ interval_tree_insert(&p->itree, &pageflags_root);
+
+ if (p_last < last) {
+ goto restart;
+ }
+ if (last < p_last) {
+ pageflags_create(last + 1, p_last, p_flags);
+ }
+ } else if (last < p_last) {
+ p->itree.start = last + 1;
+ interval_tree_insert(&p->itree, &pageflags_root);
+ } else {
+ g_free(p);
+ goto restart;
+ }
+ if (set_flags) {
+ pageflags_create(start, last, set_flags);
+ }
+
+ done:
+ return inval_tb;
+}
+
+/*
+ * Modify the flags of a page and invalidate the code if necessary.
+ * The flag PAGE_WRITE_ORG is positioned automatically depending
+ * on PAGE_WRITE. The mmap_lock should already be held.
+ */
+void page_set_flags(target_ulong start, target_ulong end, int flags)
+{
+ target_ulong last;
+ bool reset = false;
+ bool inval_tb = false;
+
+    /*
+     * This function should never be called with addresses outside the
+     * guest address space.  If this assert fires, it probably indicates
+     * a missing call to h2g_valid.
+     */
+ assert(start < end);
+ assert(end - 1 <= GUEST_ADDR_MAX);
+ /* Only set PAGE_ANON with new mappings. */
+ assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
+ assert_memory_lock();
+
+ start = start & TARGET_PAGE_MASK;
+ end = TARGET_PAGE_ALIGN(end);
+ last = end - 1;
+
+ if (!(flags & PAGE_VALID)) {
+ flags = 0;
+ } else {
+ reset = flags & PAGE_RESET;
+ flags &= ~PAGE_RESET;
+ if (flags & PAGE_WRITE) {
+ flags |= PAGE_WRITE_ORG;
+ }
+ }
+
+ if (!flags || reset) {
+ page_reset_target_data(start, end);
+ inval_tb |= pageflags_unset(start, last);
+ }
+ if (flags) {
+ inval_tb |= pageflags_set_clear(start, last, flags,
+ ~(reset ? 0 : PAGE_STICKY));
+ }
+ if (inval_tb) {
+ tb_invalidate_phys_range(start, end);
+ }
+}
+
+int page_check_range(target_ulong start, target_ulong len, int flags)
+{
+ target_ulong last;
+
+ if (len == 0) {
+ return 0; /* trivial length */
+ }
+
+ last = start + len - 1;
+ if (last < start) {
+ return -1; /* wrap around */
+ }
+
+ while (true) {
+ PageFlagsNode *p = pageflags_find(start, last);
+ int missing;
+
+ if (!p) {
+ return -1; /* entire region invalid */
+ }
+ if (start < p->itree.start) {
+ return -1; /* initial bytes invalid */
+ }
+
+ missing = flags & ~p->flags;
+ if (missing & PAGE_READ) {
+ return -1; /* page not readable */
+ }
+ if (missing & PAGE_WRITE) {
+ if (!(p->flags & PAGE_WRITE_ORG)) {
+ return -1; /* page not writable */
+ }
+ /* Asking about writable, but has been protected: undo. */
+ if (!page_unprotect(start, 0)) {
+ return -1;
+ }
+ /* TODO: page_unprotect should take a range, not a single page. */
+ if (last - start < TARGET_PAGE_SIZE) {
+ return 0; /* ok */
+ }
+ start += TARGET_PAGE_SIZE;
+ continue;
+ }
+
+ if (last <= p->itree.last) {
+ return 0; /* ok */
+ }
+ start = p->itree.last + 1;
+ }
+}
+
+void page_protect(tb_page_addr_t address)
+{
+ PageFlagsNode *p;
+ target_ulong start, last;
+ int prot;
+
+ assert_memory_lock();
+
+ if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+ start = address & TARGET_PAGE_MASK;
+ last = start + TARGET_PAGE_SIZE - 1;
+ } else {
+ start = address & qemu_host_page_mask;
+ last = start + qemu_host_page_size - 1;
+ }
+
+ p = pageflags_find(start, last);
+ if (!p) {
+ return;
+ }
+ prot = p->flags;
+
+ if (unlikely(p->itree.last < last)) {
+ /* More than one protection region covers the one host page. */
+ assert(TARGET_PAGE_SIZE < qemu_host_page_size);
+ while ((p = pageflags_next(p, start, last)) != NULL) {
+ prot |= p->flags;
+ }
+ }
+
+ if (prot & PAGE_WRITE) {
+ pageflags_set_clear(start, last, 0, PAGE_WRITE);
+ mprotect(g2h_untagged(start), qemu_host_page_size,
+ prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
+ }
+}
+
+/*
+ * Called from signal handler: invalidate the code and unprotect the
+ * page. Return 0 if the fault was not handled, 1 if it was handled,
+ * and 2 if it was handled but the caller must cause the TB to be
+ * immediately exited. (We can only return 2 if the 'pc' argument is
+ * non-zero.)
+ */
+int page_unprotect(target_ulong address, uintptr_t pc)
+{
+ PageFlagsNode *p;
+ bool current_tb_invalidated;
+
+ /*
+ * Technically this isn't safe inside a signal handler. However we
+ * know this only ever happens in a synchronous SEGV handler, so in
+ * practice it seems to be ok.
+ */
+ mmap_lock();
+
+ p = pageflags_find(address, address);
+
+ /* If this address was not really writable, nothing to do. */
+ if (!p || !(p->flags & PAGE_WRITE_ORG)) {
+ mmap_unlock();
+ return 0;
+ }
+
+ current_tb_invalidated = false;
+ if (p->flags & PAGE_WRITE) {
+ /*
+ * If the page is actually marked WRITE then assume this is because
+ * this thread raced with another one which got here first and
+ * set the page to PAGE_WRITE and did the TB invalidate for us.
+ */
+#ifdef TARGET_HAS_PRECISE_SMC
+ TranslationBlock *current_tb = tcg_tb_lookup(pc);
+ if (current_tb) {
+ current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
+ }
+#endif
+ } else {
+ target_ulong start, len, i;
+ int prot;
+
+ if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+ start = address & TARGET_PAGE_MASK;
+ len = TARGET_PAGE_SIZE;
+ prot = p->flags | PAGE_WRITE;
+ pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
+ current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
+ } else {
+ start = address & qemu_host_page_mask;
+ len = qemu_host_page_size;
+ prot = 0;
+
+ for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
+ target_ulong addr = start + i;
+
+ p = pageflags_find(addr, addr);
+ if (p) {
+ prot |= p->flags;
+ if (p->flags & PAGE_WRITE_ORG) {
+ prot |= PAGE_WRITE;
+ pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - 1,
+ PAGE_WRITE, 0);
+ }
+ }
+ /*
+ * Since the content will be modified, we must invalidate
+ * the corresponding translated code.
+ */
+ current_tb_invalidated |=
+ tb_invalidate_phys_page_unwind(addr, pc);
+ }
+ }
+ if (prot & PAGE_EXEC) {
+ prot = (prot & ~PAGE_EXEC) | PAGE_READ;
+ }
+ mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS);
+ }
+ mmap_unlock();
+
+ /* If current TB was invalidated return to main loop */
+ return current_tb_invalidated ? 2 : 1;
+}
+
static int probe_access_internal(CPUArchState *env, target_ulong addr,
int fault_size, MMUAccessType access_type,
bool nonfault, uintptr_t ra)
@@ -210,47 +769,96 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
return addr;
}
+#ifdef TARGET_PAGE_DATA_SIZE
+/*
+ * Allocate chunks of target data together.  For the only current user,
+ * allocating one chunk per page gives an overhead of 40/128, roughly 31%.
+ * Therefore, allocate memory for 64 pages at a time to keep that overhead
+ * below 1%.
+ */
+#define TPD_PAGES 64
+#define TBD_MASK (TARGET_PAGE_MASK * TPD_PAGES)
+
+typedef struct TargetPageDataNode {
+ IntervalTreeNode itree;
+ char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
+} TargetPageDataNode;
+
+static IntervalTreeRoot targetdata_root;
+
void page_reset_target_data(target_ulong start, target_ulong end)
{
-#ifdef TARGET_PAGE_DATA_SIZE
- target_ulong addr, len;
+ IntervalTreeNode *n, *next;
+ target_ulong last;
- /*
- * This function should never be called with addresses outside the
- * guest address space. If this assert fires, it probably indicates
- * a missing call to h2g_valid.
- */
- assert(end - 1 <= GUEST_ADDR_MAX);
- assert(start < end);
assert_memory_lock();
start = start & TARGET_PAGE_MASK;
- end = TARGET_PAGE_ALIGN(end);
+ last = TARGET_PAGE_ALIGN(end) - 1;
+
+ for (n = interval_tree_iter_first(&targetdata_root, start, last),
+ next = n ? interval_tree_iter_next(n, start, last) : NULL;
+ n != NULL;
+ n = next,
+ next = next ? interval_tree_iter_next(n, start, last) : NULL) {
+ target_ulong n_start, n_last, p_ofs, p_len;
+ TargetPageDataNode *t;
+
+ if (n->start >= start && n->last <= last) {
+ interval_tree_remove(n, &targetdata_root);
+ g_free(n);
+ continue;
+ }
- for (addr = start, len = end - start;
- len != 0;
- len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
- PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
+ if (n->start < start) {
+ n_start = start;
+ p_ofs = (start - n->start) >> TARGET_PAGE_BITS;
+ } else {
+ n_start = n->start;
+ p_ofs = 0;
+ }
+ n_last = MIN(last, n->last);
+ p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
- g_free(p->target_data);
- p->target_data = NULL;
+ t = container_of(n, TargetPageDataNode, itree);
+ memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
}
-#endif
}
-#ifdef TARGET_PAGE_DATA_SIZE
void *page_get_target_data(target_ulong address)
{
- PageDesc *p = page_find(address >> TARGET_PAGE_BITS);
- void *ret = p->target_data;
+ IntervalTreeNode *n;
+ TargetPageDataNode *t;
+ target_ulong page, region;
+
+ page = address & TARGET_PAGE_MASK;
+ region = address & TBD_MASK;
- if (!ret) {
- ret = g_malloc0(TARGET_PAGE_DATA_SIZE);
- p->target_data = ret;
+ n = interval_tree_iter_first(&targetdata_root, page, page);
+ if (!n) {
+ /*
+ * See util/interval-tree.c re lockless lookups: no false positives
+ * but there are false negatives. If we find nothing, retry with
+ * the mmap lock acquired. We also need the lock for the
+ * allocation + insert.
+ */
+ mmap_lock();
+ n = interval_tree_iter_first(&targetdata_root, page, page);
+ if (!n) {
+ t = g_new0(TargetPageDataNode, 1);
+ n = &t->itree;
+ n->start = region;
+ n->last = region | ~TBD_MASK;
+ interval_tree_insert(n, &targetdata_root);
+ }
+ mmap_unlock();
}
- return ret;
+
+ t = container_of(n, TargetPageDataNode, itree);
+ return t->data[(page - region) >> TARGET_PAGE_BITS];
}
-#endif
+#else
+void page_reset_target_data(target_ulong start, target_ulong end) { }
+#endif /* TARGET_PAGE_DATA_SIZE */
/* The softmmu versions of these helpers are in cputlb.c. */
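As a rough illustration of the user-only API added above (the caller below is hypothetical; only page_set_flags(), page_get_flags(), page_check_range() and the PAGE_* flags come from this patch), a fresh anonymous mapping could be registered and then queried like so:

/* Hypothetical caller: register a new anonymous mapping, then verify it. */
static void register_anon_mapping(target_ulong start, target_ulong len)
{
    /* New mappings pass PAGE_RESET, dropping any stale per-page state. */
    int flags = PAGE_VALID | PAGE_READ | PAGE_WRITE | PAGE_ANON | PAGE_RESET;

    mmap_lock();
    page_set_flags(start, start + len, flags);
    mmap_unlock();

    /* Lookups may be lockless; a miss is retried internally under the lock. */
    assert(page_get_flags(start) & PAGE_VALID);

    /* Whole-range permission check, e.g. before copying data to the guest. */
    assert(page_check_range(start, len, PAGE_READ | PAGE_WRITE) == 0);
}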
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 9b7bfbf..25e11b0 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -24,6 +24,7 @@
#ifdef CONFIG_TCG
#include "exec/cpu_ldst.h"
#endif
+#include "qemu/interval-tree.h"
/* allow to see translation results - the slowdown should be negligible, so we leave it */
#define DEBUG_DISAS
@@ -559,11 +560,20 @@ struct TranslationBlock {
struct tb_tc tc;
- /* first and second physical page containing code. The lower bit
- of the pointer tells the index in page_next[].
- The list is protected by the TB's page('s) lock(s) */
+ /*
+ * Track tb_page_addr_t intervals that intersect this TB.
+ * For user-only, the virtual addresses are always contiguous,
+ * and we use a unified interval tree. For system, we use a
+ * linked list headed in each PageDesc. Within the list, the lsb
+ * of the previous pointer tells the index of page_next[], and the
+ * list is protected by the PageDesc lock(s).
+ */
+#ifdef CONFIG_USER_ONLY
+ IntervalTreeNode itree;
+#else
uintptr_t page_next[2];
tb_page_addr_t page_addr[2];
+#endif
/* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
QemuSpin jmp_lock;
@@ -619,24 +629,51 @@ static inline uint32_t tb_cflags(const TranslationBlock *tb)
static inline tb_page_addr_t tb_page_addr0(const TranslationBlock *tb)
{
+#ifdef CONFIG_USER_ONLY
+ return tb->itree.start;
+#else
return tb->page_addr[0];
+#endif
}
static inline tb_page_addr_t tb_page_addr1(const TranslationBlock *tb)
{
+#ifdef CONFIG_USER_ONLY
+ tb_page_addr_t next = tb->itree.last & TARGET_PAGE_MASK;
+ return next == (tb->itree.start & TARGET_PAGE_MASK) ? -1 : next;
+#else
return tb->page_addr[1];
+#endif
}
static inline void tb_set_page_addr0(TranslationBlock *tb,
tb_page_addr_t addr)
{
+#ifdef CONFIG_USER_ONLY
+ tb->itree.start = addr;
+ /*
+ * To begin, we record an interval of one byte. When the translation
+ * loop encounters a second page, the interval will be extended to
+ * include the first byte of the second page, which is sufficient to
+ * allow tb_page_addr1() above to work properly. The final corrected
+ * interval will be set by tb_page_add() from tb->size before the
+ * node is added to the interval tree.
+ */
+ tb->itree.last = addr;
+#else
tb->page_addr[0] = addr;
+#endif
}
static inline void tb_set_page_addr1(TranslationBlock *tb,
tb_page_addr_t addr)
{
+#ifdef CONFIG_USER_ONLY
+ /* Extend the interval to the first byte of the second page. See above. */
+ tb->itree.last = addr;
+#else
tb->page_addr[1] = addr;
+#endif
}
/* current cflags for hashing/comparison */
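To make the user-only encoding above concrete, a brief sketch of how the accessors behave (the values are illustrative only; real TBs are built by the translator, not by hand):

/* Illustrative only: user-only build, 4KiB target pages. */
TranslationBlock tb = { 0 };

/* A TB that starts near the end of one page and runs into the next. */
tb_set_page_addr0(&tb, 0x1ff8);        /* interval begins as [0x1ff8, 0x1ff8] */
tb_set_page_addr1(&tb, 0x2000);        /* extended into the second page */
assert(tb_page_addr0(&tb) == 0x1ff8);
assert(tb_page_addr1(&tb) == 0x2000);  /* base address of the second page */

/* A TB confined to a single page never calls tb_set_page_addr1(). */
tb_set_page_addr0(&tb, 0x3000);
assert(tb_page_addr1(&tb) == -1);      /* no second page */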
diff --git a/include/exec/translate-all.h b/include/exec/translate-all.h
index 3e9cb91..88602ae 100644
--- a/include/exec/translate-all.h
+++ b/include/exec/translate-all.h
@@ -23,12 +23,6 @@
/* translate-all.c */
-struct page_collection *page_collection_lock(tb_page_addr_t start,
- tb_page_addr_t end);
-void page_collection_unlock(struct page_collection *set);
-void tb_invalidate_phys_page_fast(struct page_collection *pages,
- tb_page_addr_t start, int len,
- uintptr_t retaddr);
void tb_invalidate_phys_page(tb_page_addr_t addr);
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);
diff --git a/include/qemu/interval-tree.h b/include/qemu/interval-tree.h
new file mode 100644
index 0000000..25006de
--- /dev/null
+++ b/include/qemu/interval-tree.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Interval trees.
+ *
+ * Derived from include/linux/interval_tree.h and its dependencies.
+ */
+
+#ifndef QEMU_INTERVAL_TREE_H
+#define QEMU_INTERVAL_TREE_H
+
+/*
+ * For now, don't expose Linux Red-Black Trees separately, but retain the
+ * separate type definitions to keep the implementation sane, and allow
+ * the possibility of disentangling them later.
+ */
+typedef struct RBNode
+{
+ /* Encodes parent with color in the lsb. */
+ uintptr_t rb_parent_color;
+ struct RBNode *rb_right;
+ struct RBNode *rb_left;
+} RBNode;
+
+typedef struct RBRoot
+{
+ RBNode *rb_node;
+} RBRoot;
+
+typedef struct RBRootLeftCached {
+ RBRoot rb_root;
+ RBNode *rb_leftmost;
+} RBRootLeftCached;
+
+typedef struct IntervalTreeNode
+{
+ RBNode rb;
+
+ uint64_t start; /* Start of interval */
+ uint64_t last; /* Last location _in_ interval */
+ uint64_t subtree_last;
+} IntervalTreeNode;
+
+typedef RBRootLeftCached IntervalTreeRoot;
+
+/**
+ * interval_tree_is_empty
+ * @root: root of the tree.
+ *
+ * Returns true if the tree contains no nodes.
+ */
+static inline bool interval_tree_is_empty(const IntervalTreeRoot *root)
+{
+ return root->rb_root.rb_node == NULL;
+}
+
+/**
+ * interval_tree_insert
+ * @node: node to insert,
+ * @root: root of the tree.
+ *
+ * Insert @node into @root, and rebalance.
+ */
+void interval_tree_insert(IntervalTreeNode *node, IntervalTreeRoot *root);
+
+/**
+ * interval_tree_remove
+ * @node: node to remove,
+ * @root: root of the tree.
+ *
+ * Remove @node from @root, and rebalance.
+ */
+void interval_tree_remove(IntervalTreeNode *node, IntervalTreeRoot *root);
+
+/**
+ * interval_tree_iter_first:
+ * @root: root of the tree,
+ * @start, @last: the inclusive interval [start, last].
+ *
+ * Locate the "first" of a set of nodes within the tree at @root
+ * that overlap the interval, where "first" is sorted by start.
+ * Returns NULL if no overlap found.
+ */
+IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
+ uint64_t start, uint64_t last);
+
+/**
+ * interval_tree_iter_next:
+ * @node: previous search result
+ * @start, @last: the inclusive interval [start, last].
+ *
+ * Locate the "next" of a set of nodes within the tree that overlap the
+ * interval; @node is the result of a previous call to
+ * interval_tree_iter_{first,next}.  Returns NULL if @node was the last
+ * node in the set.
+ */
+IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
+ uint64_t start, uint64_t last);
+
+#endif /* QEMU_INTERVAL_TREE_H */
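A minimal usage sketch of this API (nodes are caller-allocated and must stay live while linked into the tree; the addresses below are arbitrary):

#include "qemu/osdep.h"
#include "qemu/interval-tree.h"

static IntervalTreeRoot root;           /* zero-initialized == empty tree */
static IntervalTreeNode nodes[3];

static void example(void)
{
    IntervalTreeNode *n;

    nodes[0].start = 0x1000; nodes[0].last = 0x1fff;
    nodes[1].start = 0x3000; nodes[1].last = 0x3fff;
    nodes[2].start = 0x1800; nodes[2].last = 0x27ff;

    for (int i = 0; i < 3; i++) {
        interval_tree_insert(&nodes[i], &root);
    }

    /*
     * Visits every node overlapping [0x1900, 0x30ff], in order of
     * 'start': nodes[0], nodes[2], then nodes[1].
     */
    for (n = interval_tree_iter_first(&root, 0x1900, 0x30ff);
         n != NULL;
         n = interval_tree_iter_next(n, 0x1900, 0x30ff)) {
        /* [n->start, n->last] overlaps the query interval. */
    }

    interval_tree_remove(&nodes[2], &root);
}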
diff --git a/tests/tcg/multiarch/test-vma.c b/tests/tcg/multiarch/test-vma.c
new file mode 100644
index 0000000..2893d60
--- /dev/null
+++ b/tests/tcg/multiarch/test-vma.c
@@ -0,0 +1,22 @@
+/*
+ * Test very large vma allocations.
+ * The qemu out-of-memory condition was within the mmap syscall itself.
+ * If the syscall actually returns with MAP_FAILED, the test succeeded.
+ */
+#include <sys/mman.h>
+
+int main()
+{
+ int n = sizeof(size_t) == 4 ? 32 : 45;
+
+ for (int i = 28; i < n; i++) {
+ size_t l = (size_t)1 << i;
+ void *p = mmap(0, l, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+ if (p == MAP_FAILED) {
+ break;
+ }
+ munmap(p, l);
+ }
+ return 0;
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index b497a41..ffa444f 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -47,6 +47,7 @@ tests = {
'ptimer-test': ['ptimer-test-stubs.c', meson.project_source_root() / 'hw/core/ptimer.c'],
'test-qapi-util': [],
'test-smp-parse': [qom, meson.project_source_root() / 'hw/core/machine-smp.c'],
+ 'test-interval-tree': [],
}
if have_system or have_tools
diff --git a/tests/unit/test-interval-tree.c b/tests/unit/test-interval-tree.c
new file mode 100644
index 0000000..119817a
--- /dev/null
+++ b/tests/unit/test-interval-tree.c
@@ -0,0 +1,209 @@
+/*
+ * Test interval trees
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/interval-tree.h"
+
+static IntervalTreeNode nodes[20];
+static IntervalTreeRoot root;
+
+static void rand_interval(IntervalTreeNode *n, uint64_t start, uint64_t last)
+{
+ gint32 s_ofs, l_ofs, l_max;
+
+ if (last - start > INT32_MAX) {
+ l_max = INT32_MAX;
+ } else {
+ l_max = last - start;
+ }
+ s_ofs = g_test_rand_int_range(0, l_max);
+ l_ofs = g_test_rand_int_range(s_ofs, l_max);
+
+ n->start = start + s_ofs;
+ n->last = start + l_ofs;
+}
+
+static void test_empty(void)
+{
+ g_assert(root.rb_root.rb_node == NULL);
+ g_assert(root.rb_leftmost == NULL);
+ g_assert(interval_tree_iter_first(&root, 0, UINT64_MAX) == NULL);
+}
+
+static void test_find_one_point(void)
+{
+ /* Create a tree of a single node, which is the point [1,1]. */
+ nodes[0].start = 1;
+ nodes[0].last = 1;
+
+ interval_tree_insert(&nodes[0], &root);
+
+ g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
+ g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
+ g_assert(interval_tree_iter_first(&root, 0, 0) == NULL);
+ g_assert(interval_tree_iter_next(&nodes[0], 0, 0) == NULL);
+ g_assert(interval_tree_iter_first(&root, 0, 1) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 1, 1) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 1, 2) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 2, 2) == NULL);
+
+ interval_tree_remove(&nodes[0], &root);
+ g_assert(root.rb_root.rb_node == NULL);
+ g_assert(root.rb_leftmost == NULL);
+}
+
+static void test_find_two_point(void)
+{
+ IntervalTreeNode *find0, *find1;
+
+    /* Create a tree of two nodes, both of which are the point [1,1].  */
+ nodes[0].start = 1;
+ nodes[0].last = 1;
+ nodes[1] = nodes[0];
+
+ interval_tree_insert(&nodes[0], &root);
+ interval_tree_insert(&nodes[1], &root);
+
+ find0 = interval_tree_iter_first(&root, 0, 9);
+ g_assert(find0 == &nodes[0] || find0 == &nodes[1]);
+
+ find1 = interval_tree_iter_next(find0, 0, 9);
+ g_assert(find1 == &nodes[0] || find1 == &nodes[1]);
+ g_assert(find0 != find1);
+
+ interval_tree_remove(&nodes[1], &root);
+
+ g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
+ g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
+
+ interval_tree_remove(&nodes[0], &root);
+}
+
+static void test_find_one_range(void)
+{
+ /* Create a tree of a single node, which is the range [1,8]. */
+ nodes[0].start = 1;
+ nodes[0].last = 8;
+
+ interval_tree_insert(&nodes[0], &root);
+
+ g_assert(interval_tree_iter_first(&root, 0, 9) == &nodes[0]);
+ g_assert(interval_tree_iter_next(&nodes[0], 0, 9) == NULL);
+ g_assert(interval_tree_iter_first(&root, 0, 0) == NULL);
+ g_assert(interval_tree_iter_first(&root, 0, 1) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 1, 1) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 4, 6) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 8, 8) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 9, 9) == NULL);
+
+ interval_tree_remove(&nodes[0], &root);
+}
+
+static void test_find_one_range_many(void)
+{
+ int i;
+
+ /*
+ * Create a tree of many nodes in [0,99] and [200,299],
+ * but only one node with exactly [110,190].
+ */
+ nodes[0].start = 110;
+ nodes[0].last = 190;
+
+ for (i = 1; i < ARRAY_SIZE(nodes) / 2; ++i) {
+ rand_interval(&nodes[i], 0, 99);
+ }
+ for (; i < ARRAY_SIZE(nodes); ++i) {
+ rand_interval(&nodes[i], 200, 299);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
+ interval_tree_insert(&nodes[i], &root);
+ }
+
+ /* Test that we find exactly the one node. */
+ g_assert(interval_tree_iter_first(&root, 100, 199) == &nodes[0]);
+ g_assert(interval_tree_iter_next(&nodes[0], 100, 199) == NULL);
+ g_assert(interval_tree_iter_first(&root, 100, 109) == NULL);
+ g_assert(interval_tree_iter_first(&root, 100, 110) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 111, 120) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 111, 199) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 190, 199) == &nodes[0]);
+ g_assert(interval_tree_iter_first(&root, 192, 199) == NULL);
+
+ /*
+ * Test that if there are multiple matches, we return the one
+ * with the minimal start.
+ */
+ g_assert(interval_tree_iter_first(&root, 100, 300) == &nodes[0]);
+
+ /* Test that we don't find it after it is removed. */
+ interval_tree_remove(&nodes[0], &root);
+ g_assert(interval_tree_iter_first(&root, 100, 199) == NULL);
+
+ for (i = 1; i < ARRAY_SIZE(nodes); ++i) {
+ interval_tree_remove(&nodes[i], &root);
+ }
+}
+
+static void test_find_many_range(void)
+{
+ IntervalTreeNode *find;
+ int i, n;
+
+ n = g_test_rand_int_range(ARRAY_SIZE(nodes) / 3, ARRAY_SIZE(nodes) / 2);
+
+ /*
+ * Create a fair few nodes in [2000,2999], with the others
+ * distributed around.
+ */
+ for (i = 0; i < n; ++i) {
+ rand_interval(&nodes[i], 2000, 2999);
+ }
+ for (; i < ARRAY_SIZE(nodes) * 2 / 3; ++i) {
+ rand_interval(&nodes[i], 1000, 1899);
+ }
+ for (; i < ARRAY_SIZE(nodes); ++i) {
+ rand_interval(&nodes[i], 3100, 3999);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
+ interval_tree_insert(&nodes[i], &root);
+ }
+
+ /* Test that we find all of the nodes. */
+ find = interval_tree_iter_first(&root, 2000, 2999);
+ for (i = 0; find != NULL; i++) {
+ find = interval_tree_iter_next(find, 2000, 2999);
+ }
+ g_assert_cmpint(i, ==, n);
+
+ g_assert(interval_tree_iter_first(&root, 0, 999) == NULL);
+ g_assert(interval_tree_iter_first(&root, 1900, 1999) == NULL);
+ g_assert(interval_tree_iter_first(&root, 3000, 3099) == NULL);
+ g_assert(interval_tree_iter_first(&root, 4000, UINT64_MAX) == NULL);
+
+ for (i = 0; i < ARRAY_SIZE(nodes); ++i) {
+ interval_tree_remove(&nodes[i], &root);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ g_test_init(&argc, &argv, NULL);
+
+ g_test_add_func("/interval-tree/empty", test_empty);
+ g_test_add_func("/interval-tree/find-one-point", test_find_one_point);
+ g_test_add_func("/interval-tree/find-two-point", test_find_two_point);
+ g_test_add_func("/interval-tree/find-one-range", test_find_one_range);
+ g_test_add_func("/interval-tree/find-one-range-many",
+ test_find_one_range_many);
+ g_test_add_func("/interval-tree/find-many-range", test_find_many_range);
+
+ return g_test_run();
+}
diff --git a/trace-events b/trace-events
index 035f3d5..b6b84b1 100644
--- a/trace-events
+++ b/trace-events
@@ -42,10 +42,6 @@ find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx
find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
-# accel/tcg/cputlb.c
-memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
-memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
-
# job.c
job_state_transition(void *job, int ret, const char *legal, const char *s0, const char *s1) "job %p (ret: %d) attempting %s transition (%s-->%s)"
job_apply_verb(void *job, const char *state, const char *verb, const char *legal) "job %p in state %s; applying verb %s (%s)"
diff --git a/util/interval-tree.c b/util/interval-tree.c
new file mode 100644
index 0000000..4c0baf1
--- /dev/null
+++ b/util/interval-tree.c
@@ -0,0 +1,882 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "qemu/osdep.h"
+#include "qemu/interval-tree.h"
+#include "qemu/atomic.h"
+
+/*
+ * Red Black Trees.
+ *
+ * For now, don't expose Linux Red-Black Trees separately, but retain the
+ * separate type definitions to keep the implementation sane, and allow
+ * the possibility of separating them later.
+ *
+ * Derived from include/linux/rbtree_augmented.h and its dependencies.
+ */
+
+/*
+ * red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
+ *
+ * 1) A node is either red or black
+ * 2) The root is black
+ * 3) All leaves (NULL) are black
+ * 4) Both children of every red node are black
+ * 5) Every simple path from root to leaves contains the same number
+ * of black nodes.
+ *
+ * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ * consecutive red nodes in a path and every red node is therefore followed by
+ * a black. So if B is the number of black nodes on every simple path (as per
+ * 5), then the longest possible path due to 4 is 2B.
+ *
+ * We shall indicate color with case, where black nodes are uppercase and red
+ * nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ * parentheses and have some accompanying text comment.
+ *
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE [qatomic_set for QEMU]. And we must not inadvertently cause
+ * (temporary) loops in the tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
+typedef enum RBColor
+{
+ RB_RED,
+ RB_BLACK,
+} RBColor;
+
+typedef struct RBAugmentCallbacks {
+ void (*propagate)(RBNode *node, RBNode *stop);
+ void (*copy)(RBNode *old, RBNode *new);
+ void (*rotate)(RBNode *old, RBNode *new);
+} RBAugmentCallbacks;
+
+static inline RBNode *rb_parent(const RBNode *n)
+{
+ return (RBNode *)(n->rb_parent_color & ~1);
+}
+
+static inline RBNode *rb_red_parent(const RBNode *n)
+{
+ return (RBNode *)n->rb_parent_color;
+}
+
+static inline RBColor pc_color(uintptr_t pc)
+{
+ return (RBColor)(pc & 1);
+}
+
+static inline bool pc_is_red(uintptr_t pc)
+{
+ return pc_color(pc) == RB_RED;
+}
+
+static inline bool pc_is_black(uintptr_t pc)
+{
+ return !pc_is_red(pc);
+}
+
+static inline RBColor rb_color(const RBNode *n)
+{
+ return pc_color(n->rb_parent_color);
+}
+
+static inline bool rb_is_red(const RBNode *n)
+{
+ return pc_is_red(n->rb_parent_color);
+}
+
+static inline bool rb_is_black(const RBNode *n)
+{
+ return pc_is_black(n->rb_parent_color);
+}
+
+static inline void rb_set_black(RBNode *n)
+{
+ n->rb_parent_color |= RB_BLACK;
+}
+
+static inline void rb_set_parent_color(RBNode *n, RBNode *p, RBColor color)
+{
+ n->rb_parent_color = (uintptr_t)p | color;
+}
+
+static inline void rb_set_parent(RBNode *n, RBNode *p)
+{
+ rb_set_parent_color(n, p, rb_color(n));
+}
+
+static inline void rb_link_node(RBNode *node, RBNode *parent, RBNode **rb_link)
+{
+ node->rb_parent_color = (uintptr_t)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ qatomic_set(rb_link, node);
+}
+
+static RBNode *rb_next(RBNode *node)
+{
+ RBNode *parent;
+
+ /* OMIT: if empty node, return null. */
+
+ /*
+ * If we have a right-hand child, go down and then left as far as we can.
+ */
+ if (node->rb_right) {
+ node = node->rb_right;
+ while (node->rb_left) {
+ node = node->rb_left;
+ }
+ return node;
+ }
+
+ /*
+ * No right-hand children. Everything down and left is smaller than us,
+ * so any 'next' node must be in the general direction of our parent.
+ * Go up the tree; any time the ancestor is a right-hand child of its
+ * parent, keep going up. First time it's a left-hand child of its
+ * parent, said parent is our 'next' node.
+ */
+ while ((parent = rb_parent(node)) && node == parent->rb_right) {
+ node = parent;
+ }
+
+ return parent;
+}
+
+static inline void rb_change_child(RBNode *old, RBNode *new,
+ RBNode *parent, RBRoot *root)
+{
+ if (!parent) {
+ qatomic_set(&root->rb_node, new);
+ } else if (parent->rb_left == old) {
+ qatomic_set(&parent->rb_left, new);
+ } else {
+ qatomic_set(&parent->rb_right, new);
+ }
+}
+
+static inline void rb_rotate_set_parents(RBNode *old, RBNode *new,
+ RBRoot *root, RBColor color)
+{
+ RBNode *parent = rb_parent(old);
+
+ new->rb_parent_color = old->rb_parent_color;
+ rb_set_parent_color(old, new, color);
+ rb_change_child(old, new, parent, root);
+}
+
+static void rb_insert_augmented(RBNode *node, RBRoot *root,
+ const RBAugmentCallbacks *augment)
+{
+ RBNode *parent = rb_red_parent(node), *gparent, *tmp;
+
+ while (true) {
+ /*
+ * Loop invariant: node is red.
+ */
+ if (unlikely(!parent)) {
+ /*
+ * The inserted node is root. Either this is the first node, or
+ * we recursed at Case 1 below and are no longer violating 4).
+ */
+ rb_set_parent_color(node, NULL, RB_BLACK);
+ break;
+ }
+
+ /*
+ * If there is a black parent, we are done. Otherwise, take some
+ * corrective action as, per 4), we don't want a red root or two
+ * consecutive red nodes.
+ */
+ if (rb_is_black(parent)) {
+ break;
+ }
+
+ gparent = rb_red_parent(parent);
+
+ tmp = gparent->rb_right;
+ if (parent != tmp) { /* parent == gparent->rb_left */
+ if (tmp && rb_is_red(tmp)) {
+ /*
+ * Case 1 - node's uncle is red (color flips).
+ *
+ * G g
+ * / \ / \
+ * p u --> P U
+ * / /
+ * n n
+ *
+ * However, since g's parent might be red, and 4) does not
+ * allow this, we need to recurse at g.
+ */
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ rb_set_parent_color(parent, gparent, RB_BLACK);
+ node = gparent;
+ parent = rb_parent(node);
+ rb_set_parent_color(node, parent, RB_RED);
+ continue;
+ }
+
+ tmp = parent->rb_right;
+ if (node == tmp) {
+ /*
+ * Case 2 - node's uncle is black and node is
+ * the parent's right child (left rotate at parent).
+ *
+ * G G
+ * / \ / \
+ * p U --> n U
+ * \ /
+ * n p
+ *
+ * This still leaves us in violation of 4), the
+ * continuation into Case 3 will fix that.
+ */
+ tmp = node->rb_left;
+ qatomic_set(&parent->rb_right, tmp);
+ qatomic_set(&node->rb_left, parent);
+ if (tmp) {
+ rb_set_parent_color(tmp, parent, RB_BLACK);
+ }
+ rb_set_parent_color(parent, node, RB_RED);
+ augment->rotate(parent, node);
+ parent = node;
+ tmp = node->rb_right;
+ }
+
+ /*
+ * Case 3 - node's uncle is black and node is
+ * the parent's left child (right rotate at gparent).
+ *
+ * G P
+ * / \ / \
+ * p U --> n g
+ * / \
+ * n U
+ */
+ qatomic_set(&gparent->rb_left, tmp); /* == parent->rb_right */
+ qatomic_set(&parent->rb_right, gparent);
+ if (tmp) {
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ }
+ rb_rotate_set_parents(gparent, parent, root, RB_RED);
+ augment->rotate(gparent, parent);
+ break;
+ } else {
+ tmp = gparent->rb_left;
+ if (tmp && rb_is_red(tmp)) {
+ /* Case 1 - color flips */
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ rb_set_parent_color(parent, gparent, RB_BLACK);
+ node = gparent;
+ parent = rb_parent(node);
+ rb_set_parent_color(node, parent, RB_RED);
+ continue;
+ }
+
+ tmp = parent->rb_left;
+ if (node == tmp) {
+ /* Case 2 - right rotate at parent */
+ tmp = node->rb_right;
+ qatomic_set(&parent->rb_left, tmp);
+ qatomic_set(&node->rb_right, parent);
+ if (tmp) {
+ rb_set_parent_color(tmp, parent, RB_BLACK);
+ }
+ rb_set_parent_color(parent, node, RB_RED);
+ augment->rotate(parent, node);
+ parent = node;
+ tmp = node->rb_left;
+ }
+
+ /* Case 3 - left rotate at gparent */
+ qatomic_set(&gparent->rb_right, tmp); /* == parent->rb_left */
+ qatomic_set(&parent->rb_left, gparent);
+ if (tmp) {
+ rb_set_parent_color(tmp, gparent, RB_BLACK);
+ }
+ rb_rotate_set_parents(gparent, parent, root, RB_RED);
+ augment->rotate(gparent, parent);
+ break;
+ }
+ }
+}
+
+static void rb_insert_augmented_cached(RBNode *node,
+ RBRootLeftCached *root, bool newleft,
+ const RBAugmentCallbacks *augment)
+{
+ if (newleft) {
+ root->rb_leftmost = node;
+ }
+ rb_insert_augmented(node, &root->rb_root, augment);
+}
+
+static void rb_erase_color(RBNode *parent, RBRoot *root,
+ const RBAugmentCallbacks *augment)
+{
+ RBNode *node = NULL, *sibling, *tmp1, *tmp2;
+
+ while (true) {
+ /*
+ * Loop invariants:
+ * - node is black (or NULL on first iteration)
+ * - node is not the root (parent is not NULL)
+ * - All leaf paths going through parent and node have a
+ * black node count that is 1 lower than other leaf paths.
+ */
+ sibling = parent->rb_right;
+ if (node != sibling) { /* node == parent->rb_left */
+ if (rb_is_red(sibling)) {
+ /*
+ * Case 1 - left rotate at parent
+ *
+ * P S
+ * / \ / \
+ * N s --> p Sr
+ * / \ / \
+ * Sl Sr N Sl
+ */
+ tmp1 = sibling->rb_left;
+ qatomic_set(&parent->rb_right, tmp1);
+ qatomic_set(&sibling->rb_left, parent);
+ rb_set_parent_color(tmp1, parent, RB_BLACK);
+ rb_rotate_set_parents(parent, sibling, root, RB_RED);
+ augment->rotate(parent, sibling);
+ sibling = tmp1;
+ }
+ tmp1 = sibling->rb_right;
+ if (!tmp1 || rb_is_black(tmp1)) {
+ tmp2 = sibling->rb_left;
+ if (!tmp2 || rb_is_black(tmp2)) {
+ /*
+ * Case 2 - sibling color flip
+ * (p could be either color here)
+ *
+ * (p) (p)
+ * / \ / \
+ * N S --> N s
+ * / \ / \
+ * Sl Sr Sl Sr
+ *
+ * This leaves us violating 5) which
+ * can be fixed by flipping p to black
+ * if it was red, or by recursing at p.
+ * p is red when coming from Case 1.
+ */
+ rb_set_parent_color(sibling, parent, RB_RED);
+ if (rb_is_red(parent)) {
+ rb_set_black(parent);
+ } else {
+ node = parent;
+ parent = rb_parent(node);
+ if (parent) {
+ continue;
+ }
+ }
+ break;
+ }
+ /*
+ * Case 3 - right rotate at sibling
+ * (p could be either color here)
+ *
+ * (p) (p)
+ * / \ / \
+ * N S --> N sl
+ * / \ \
+ * sl Sr S
+ * \
+ * Sr
+ *
+                     * Note: p might be red, and then both
+                     * p and sl are red after rotation (which
+                     * breaks property 4).  This is fixed in
+                     * Case 4 (in rb_rotate_set_parents(),
+                     * which sets sl to the color of p
+                     * and sets p RB_BLACK).
+ *
+ * (p) (sl)
+ * / \ / \
+ * N sl --> P S
+ * \ / \
+ * S N Sr
+ * \
+ * Sr
+ */
+ tmp1 = tmp2->rb_right;
+ qatomic_set(&sibling->rb_left, tmp1);
+ qatomic_set(&tmp2->rb_right, sibling);
+ qatomic_set(&parent->rb_right, tmp2);
+ if (tmp1) {
+ rb_set_parent_color(tmp1, sibling, RB_BLACK);
+ }
+ augment->rotate(sibling, tmp2);
+ tmp1 = sibling;
+ sibling = tmp2;
+ }
+ /*
+ * Case 4 - left rotate at parent + color flips
+ * (p and sl could be either color here.
+ * After rotation, p becomes black, s acquires
+ * p's color, and sl keeps its color)
+ *
+ * (p) (s)
+ * / \ / \
+ * N S --> P Sr
+ * / \ / \
+ * (sl) sr N (sl)
+ */
+ tmp2 = sibling->rb_left;
+ qatomic_set(&parent->rb_right, tmp2);
+ qatomic_set(&sibling->rb_left, parent);
+ rb_set_parent_color(tmp1, sibling, RB_BLACK);
+ if (tmp2) {
+ rb_set_parent(tmp2, parent);
+ }
+ rb_rotate_set_parents(parent, sibling, root, RB_BLACK);
+ augment->rotate(parent, sibling);
+ break;
+ } else {
+ sibling = parent->rb_left;
+ if (rb_is_red(sibling)) {
+ /* Case 1 - right rotate at parent */
+ tmp1 = sibling->rb_right;
+ qatomic_set(&parent->rb_left, tmp1);
+ qatomic_set(&sibling->rb_right, parent);
+ rb_set_parent_color(tmp1, parent, RB_BLACK);
+ rb_rotate_set_parents(parent, sibling, root, RB_RED);
+ augment->rotate(parent, sibling);
+ sibling = tmp1;
+ }
+ tmp1 = sibling->rb_left;
+ if (!tmp1 || rb_is_black(tmp1)) {
+ tmp2 = sibling->rb_right;
+ if (!tmp2 || rb_is_black(tmp2)) {
+ /* Case 2 - sibling color flip */
+ rb_set_parent_color(sibling, parent, RB_RED);
+ if (rb_is_red(parent)) {
+ rb_set_black(parent);
+ } else {
+ node = parent;
+ parent = rb_parent(node);
+ if (parent) {
+ continue;
+ }
+ }
+ break;
+ }
+ /* Case 3 - left rotate at sibling */
+ tmp1 = tmp2->rb_left;
+ qatomic_set(&sibling->rb_right, tmp1);
+ qatomic_set(&tmp2->rb_left, sibling);
+ qatomic_set(&parent->rb_left, tmp2);
+ if (tmp1) {
+ rb_set_parent_color(tmp1, sibling, RB_BLACK);
+ }
+ augment->rotate(sibling, tmp2);
+ tmp1 = sibling;
+ sibling = tmp2;
+ }
+ /* Case 4 - right rotate at parent + color flips */
+ tmp2 = sibling->rb_right;
+ qatomic_set(&parent->rb_left, tmp2);
+ qatomic_set(&sibling->rb_right, parent);
+ rb_set_parent_color(tmp1, sibling, RB_BLACK);
+ if (tmp2) {
+ rb_set_parent(tmp2, parent);
+ }
+ rb_rotate_set_parents(parent, sibling, root, RB_BLACK);
+ augment->rotate(parent, sibling);
+ break;
+ }
+ }
+}
+
+static void rb_erase_augmented(RBNode *node, RBRoot *root,
+ const RBAugmentCallbacks *augment)
+{
+ RBNode *child = node->rb_right;
+ RBNode *tmp = node->rb_left;
+ RBNode *parent, *rebalance;
+ uintptr_t pc;
+
+ if (!tmp) {
+ /*
+ * Case 1: node to erase has no more than 1 child (easy!)
+ *
+ * Note that if there is one child it must be red due to 5)
+ * and node must be black due to 4). We adjust colors locally
+ * so as to bypass rb_erase_color() later on.
+ */
+ pc = node->rb_parent_color;
+ parent = rb_parent(node);
+ rb_change_child(node, child, parent, root);
+ if (child) {
+ child->rb_parent_color = pc;
+ rebalance = NULL;
+ } else {
+ rebalance = pc_is_black(pc) ? parent : NULL;
+ }
+ tmp = parent;
+ } else if (!child) {
+ /* Still case 1, but this time the child is node->rb_left */
+ pc = node->rb_parent_color;
+ parent = rb_parent(node);
+ tmp->rb_parent_color = pc;
+ rb_change_child(node, tmp, parent, root);
+ rebalance = NULL;
+ tmp = parent;
+ } else {
+ RBNode *successor = child, *child2;
+ tmp = child->rb_left;
+ if (!tmp) {
+ /*
+ * Case 2: node's successor is its right child
+ *
+ * (n) (s)
+ * / \ / \
+ * (x) (s) -> (x) (c)
+ * \
+ * (c)
+ */
+ parent = successor;
+ child2 = successor->rb_right;
+
+ augment->copy(node, successor);
+ } else {
+ /*
+ * Case 3: node's successor is leftmost under
+ * node's right child subtree
+ *
+ * (n) (s)
+ * / \ / \
+ * (x) (y) -> (x) (y)
+ * / /
+ * (p) (p)
+ * / /
+ * (s) (c)
+ * \
+ * (c)
+ */
+ do {
+ parent = successor;
+ successor = tmp;
+ tmp = tmp->rb_left;
+ } while (tmp);
+ child2 = successor->rb_right;
+ qatomic_set(&parent->rb_left, child2);
+ qatomic_set(&successor->rb_right, child);
+ rb_set_parent(child, successor);
+
+ augment->copy(node, successor);
+ augment->propagate(parent, successor);
+ }
+
+ tmp = node->rb_left;
+ qatomic_set(&successor->rb_left, tmp);
+ rb_set_parent(tmp, successor);
+
+ pc = node->rb_parent_color;
+ tmp = rb_parent(node);
+ rb_change_child(node, successor, tmp, root);
+
+ if (child2) {
+ rb_set_parent_color(child2, parent, RB_BLACK);
+ rebalance = NULL;
+ } else {
+ rebalance = rb_is_black(successor) ? parent : NULL;
+ }
+ successor->rb_parent_color = pc;
+ tmp = successor;
+ }
+
+ augment->propagate(tmp, NULL);
+
+ if (rebalance) {
+ rb_erase_color(rebalance, root, augment);
+ }
+}
+
+static void rb_erase_augmented_cached(RBNode *node, RBRootLeftCached *root,
+ const RBAugmentCallbacks *augment)
+{
+ if (root->rb_leftmost == node) {
+ root->rb_leftmost = rb_next(node);
+ }
+ rb_erase_augmented(node, &root->rb_root, augment);
+}
+
+
+/*
+ * Interval trees.
+ *
+ * Derived from lib/interval_tree.c and its dependencies,
+ * especially include/linux/interval_tree_generic.h.
+ */
+
+#define rb_to_itree(N) container_of(N, IntervalTreeNode, rb)
+
+static bool interval_tree_compute_max(IntervalTreeNode *node, bool exit)
+{
+ IntervalTreeNode *child;
+ uint64_t max = node->last;
+
+ if (node->rb.rb_left) {
+ child = rb_to_itree(node->rb.rb_left);
+ if (child->subtree_last > max) {
+ max = child->subtree_last;
+ }
+ }
+ if (node->rb.rb_right) {
+ child = rb_to_itree(node->rb.rb_right);
+ if (child->subtree_last > max) {
+ max = child->subtree_last;
+ }
+ }
+ if (exit && node->subtree_last == max) {
+ return true;
+ }
+ node->subtree_last = max;
+ return false;
+}
+
+static void interval_tree_propagate(RBNode *rb, RBNode *stop)
+{
+ while (rb != stop) {
+ IntervalTreeNode *node = rb_to_itree(rb);
+ if (interval_tree_compute_max(node, true)) {
+ break;
+ }
+ rb = rb_parent(&node->rb);
+ }
+}
+
+static void interval_tree_copy(RBNode *rb_old, RBNode *rb_new)
+{
+ IntervalTreeNode *old = rb_to_itree(rb_old);
+ IntervalTreeNode *new = rb_to_itree(rb_new);
+
+ new->subtree_last = old->subtree_last;
+}
+
+static void interval_tree_rotate(RBNode *rb_old, RBNode *rb_new)
+{
+ IntervalTreeNode *old = rb_to_itree(rb_old);
+ IntervalTreeNode *new = rb_to_itree(rb_new);
+
+ new->subtree_last = old->subtree_last;
+ interval_tree_compute_max(old, false);
+}
+
+static const RBAugmentCallbacks interval_tree_augment = {
+ .propagate = interval_tree_propagate,
+ .copy = interval_tree_copy,
+ .rotate = interval_tree_rotate,
+};
+
+/* Insert / remove interval nodes from the tree */
+void interval_tree_insert(IntervalTreeNode *node, IntervalTreeRoot *root)
+{
+ RBNode **link = &root->rb_root.rb_node, *rb_parent = NULL;
+ uint64_t start = node->start, last = node->last;
+ IntervalTreeNode *parent;
+ bool leftmost = true;
+
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_to_itree(rb_parent);
+
+ if (parent->subtree_last < last) {
+ parent->subtree_last = last;
+ }
+ if (start < parent->start) {
+ link = &parent->rb.rb_left;
+ } else {
+ link = &parent->rb.rb_right;
+ leftmost = false;
+ }
+ }
+
+ node->subtree_last = last;
+ rb_link_node(&node->rb, rb_parent, link);
+ rb_insert_augmented_cached(&node->rb, root, leftmost,
+ &interval_tree_augment);
+}
+
+void interval_tree_remove(IntervalTreeNode *node, IntervalTreeRoot *root)
+{
+ rb_erase_augmented_cached(&node->rb, root, &interval_tree_augment);
+}
+
+/*
+ * Iterate over intervals intersecting [start;last]
+ *
+ * Note that a node's interval intersects [start;last] iff:
+ * Cond1: node->start <= last
+ * and
+ * Cond2: start <= node->last
+ */
+
+static IntervalTreeNode *interval_tree_subtree_search(IntervalTreeNode *node,
+ uint64_t start,
+ uint64_t last)
+{
+ while (true) {
+ /*
+ * Loop invariant: start <= node->subtree_last
+ * (Cond2 is satisfied by one of the subtree nodes)
+ */
+ if (node->rb.rb_left) {
+ IntervalTreeNode *left = rb_to_itree(node->rb.rb_left);
+
+ if (start <= left->subtree_last) {
+ /*
+ * Some nodes in left subtree satisfy Cond2.
+ * Iterate to find the leftmost such node N.
+ * If it also satisfies Cond1, that's the
+ * match we are looking for. Otherwise, there
+ * is no matching interval as nodes to the
+ * right of N can't satisfy Cond1 either.
+ */
+ node = left;
+ continue;
+ }
+ }
+ if (node->start <= last) { /* Cond1 */
+ if (start <= node->last) { /* Cond2 */
+ return node; /* node is leftmost match */
+ }
+ if (node->rb.rb_right) {
+ node = rb_to_itree(node->rb.rb_right);
+ if (start <= node->subtree_last) {
+ continue;
+ }
+ }
+ }
+ return NULL; /* no match */
+ }
+}
+
+IntervalTreeNode *interval_tree_iter_first(IntervalTreeRoot *root,
+ uint64_t start, uint64_t last)
+{
+ IntervalTreeNode *node, *leftmost;
+
+ if (!root->rb_root.rb_node) {
+ return NULL;
+ }
+
+ /*
+ * Fastpath range intersection/overlap between A: [a0, a1] and
+ * B: [b0, b1] is given by:
+ *
+ * a0 <= b1 && b0 <= a1
+ *
+     * ... where A is the query range and B spans the smallest
+     * 'start' and largest 'last' in the tree.  For the latter, we
+     * rely on the root node, which by the augmented interval tree
+     * property holds the largest 'last' value in its subtree.
+     * This mitigates some of the tree walk overhead for
+     * non-intersecting ranges, maintained and consulted in O(1).
+ */
+ node = rb_to_itree(root->rb_root.rb_node);
+ if (node->subtree_last < start) {
+ return NULL;
+ }
+
+ leftmost = rb_to_itree(root->rb_leftmost);
+ if (leftmost->start > last) {
+ return NULL;
+ }
+
+ return interval_tree_subtree_search(node, start, last);
+}
+
+IntervalTreeNode *interval_tree_iter_next(IntervalTreeNode *node,
+ uint64_t start, uint64_t last)
+{
+ RBNode *rb = node->rb.rb_right, *prev;
+
+ while (true) {
+ /*
+ * Loop invariants:
+ * Cond1: node->start <= last
+ * rb == node->rb.rb_right
+ *
+ * First, search right subtree if suitable
+ */
+ if (rb) {
+ IntervalTreeNode *right = rb_to_itree(rb);
+
+ if (start <= right->subtree_last) {
+ return interval_tree_subtree_search(right, start, last);
+ }
+ }
+
+ /* Move up the tree until we come from a node's left child */
+ do {
+ rb = rb_parent(&node->rb);
+ if (!rb) {
+ return NULL;
+ }
+ prev = &node->rb;
+ node = rb_to_itree(rb);
+ rb = node->rb.rb_right;
+ } while (prev == rb);
+
+ /* Check if the node intersects [start;last] */
+ if (last < node->start) { /* !Cond1 */
+ return NULL;
+ }
+ if (start <= node->last) { /* Cond2 */
+ return node;
+ }
+ }
+}
+
+/* Occasionally useful for calling from within the debugger. */
+#if 0
+static void debug_interval_tree_int(IntervalTreeNode *node,
+ const char *dir, int level)
+{
+ printf("%4d %*s %s [%" PRIu64 ",%" PRIu64 "] subtree_last:%" PRIu64 "\n",
+ level, level + 1, dir, rb_is_red(&node->rb) ? "r" : "b",
+ node->start, node->last, node->subtree_last);
+
+ if (node->rb.rb_left) {
+ debug_interval_tree_int(rb_to_itree(node->rb.rb_left), "<", level + 1);
+ }
+ if (node->rb.rb_right) {
+ debug_interval_tree_int(rb_to_itree(node->rb.rb_right), ">", level + 1);
+ }
+}
+
+void debug_interval_tree(IntervalTreeNode *node);
+void debug_interval_tree(IntervalTreeNode *node)
+{
+ if (node) {
+ debug_interval_tree_int(node, "*", 0);
+ } else {
+ printf("null\n");
+ }
+}
+#endif
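Tying the lockless-lookup notes above to a concrete pattern: a reader may probe the tree without holding the writer's lock, but since rotations are not atomic a miss is inconclusive and must be retried under the lock. The lock functions below are placeholders; page_get_flags() in user-exec.c follows this same shape using the mmap lock.

/*
 * Sketch of a lockless lookup with a locked fallback, per the notes above.
 * tree_lock()/tree_unlock() stand in for whatever lock serializes updates.
 */
static IntervalTreeNode *lookup_with_retry(IntervalTreeRoot *root,
                                           uint64_t start, uint64_t last)
{
    IntervalTreeNode *n = interval_tree_iter_first(root, start, last);

    if (n) {
        return n;            /* a lockless hit is always a valid element */
    }
    tree_lock();             /* a miss may be a false negative: retry */
    n = interval_tree_iter_first(root, start, last);
    tree_unlock();
    return n;
}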
diff --git a/util/meson.build b/util/meson.build
index 25b9b61..d8d109f 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -57,6 +57,7 @@ util_ss.add(files('guest-random.c'))
util_ss.add(files('yank.c'))
util_ss.add(files('int128.c'))
util_ss.add(files('memalign.c'))
+util_ss.add(files('interval-tree.c'))
if have_user
util_ss.add(files('selfmap.c'))