diff options
author | Richard Henderson <rth@twiddle.net> | 2010-03-10 15:53:37 -0800 |
---|---|---|
committer | Paul Brook <paul@codesourcery.com> | 2010-03-12 16:31:09 +0000 |
commit | 5cd2c5b6ad75c46d40118ac67c0c09d4e7930a65 (patch) | |
tree | 6d024eefeab2a13c4b8a39335a028aa7a151c623 /exec.c | |
parent | 14f24e1465edc44b9b4d89fbbea66e06088154e1 (diff) | |
download | qemu-5cd2c5b6ad75c46d40118ac67c0c09d4e7930a65.zip qemu-5cd2c5b6ad75c46d40118ac67c0c09d4e7930a65.tar.gz qemu-5cd2c5b6ad75c46d40118ac67c0c09d4e7930a65.tar.bz2 |
Implement multi-level page tables.
Define L1_MAP_ADDR_SPACE_BITS to be either the virtual address size
(in user mode) or physical address size (in system mode), and use
that to size l1_map. This rewrites page_find_alloc, page_flush_tb,
and walk_memory_regions.
Use TARGET_PHYS_ADDR_SPACE_BITS for the physical memory map based
off of l1_phys_map. This rewrites page_phys_find_alloc and
phys_page_for_each.
Signed-off-by: Richard Henderson <rth@twiddle.net>
Diffstat (limited to 'exec.c')
-rw-r--r-- | exec.c | 445 |
1 files changed, 270 insertions, 175 deletions
@@ -141,30 +141,56 @@ typedef struct PhysPageDesc { ram_addr_t region_offset; } PhysPageDesc; -#define L2_BITS 10 -#if defined(CONFIG_USER_ONLY) && defined(TARGET_VIRT_ADDR_SPACE_BITS) -/* XXX: this is a temporary hack for alpha target. - * In the future, this is to be replaced by a multi-level table - * to actually be able to handle the complete 64 bits address space. - */ -#define L1_BITS (TARGET_VIRT_ADDR_SPACE_BITS - L2_BITS - TARGET_PAGE_BITS) +/* In system mode we want L1_MAP to be based on physical addresses, + while in user mode we want it to be based on virtual addresses. */ +#if !defined(CONFIG_USER_ONLY) +# define L1_MAP_ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS #else -#define L1_BITS (32 - L2_BITS - TARGET_PAGE_BITS) +# define L1_MAP_ADDR_SPACE_BITS TARGET_VIRT_ADDR_SPACE_BITS #endif -#define L1_SIZE (1 << L1_BITS) +/* Size of the L2 (and L3, etc) page tables. */ +#define L2_BITS 10 #define L2_SIZE (1 << L2_BITS) +/* The bits remaining after N lower levels of page tables. */ +#define P_L1_BITS_REM \ + ((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS) +#define V_L1_BITS_REM \ + ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS) + +/* Size of the L1 page table. Avoid silly small sizes. */ +#if P_L1_BITS_REM < 4 +#define P_L1_BITS (P_L1_BITS_REM + L2_BITS) +#else +#define P_L1_BITS P_L1_BITS_REM +#endif + +#if V_L1_BITS_REM < 4 +#define V_L1_BITS (V_L1_BITS_REM + L2_BITS) +#else +#define V_L1_BITS V_L1_BITS_REM +#endif + +#define P_L1_SIZE ((target_phys_addr_t)1 << P_L1_BITS) +#define V_L1_SIZE ((target_ulong)1 << V_L1_BITS) + +#define P_L1_SHIFT (TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - P_L1_BITS) +#define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS) + unsigned long qemu_real_host_page_size; unsigned long qemu_host_page_bits; unsigned long qemu_host_page_size; unsigned long qemu_host_page_mask; -/* XXX: for system emulation, it could just be an array */ -static PageDesc *l1_map[L1_SIZE]; +/* This is a multi-level map on the virtual address space. + The bottom level has pointers to PageDesc. */ +static void *l1_map[V_L1_SIZE]; #if !defined(CONFIG_USER_ONLY) -static PhysPageDesc **l1_phys_map; +/* This is a multi-level map on the physical address space. + The bottom level has pointers to PhysPageDesc. */ +static void *l1_phys_map[P_L1_SIZE]; static void io_mem_init(void); @@ -239,133 +265,159 @@ static void page_init(void) while ((1 << qemu_host_page_bits) < qemu_host_page_size) qemu_host_page_bits++; qemu_host_page_mask = ~(qemu_host_page_size - 1); -#if !defined(CONFIG_USER_ONLY) - l1_phys_map = qemu_vmalloc(L1_SIZE * sizeof(void *)); - memset(l1_phys_map, 0, L1_SIZE * sizeof(void *)); -#endif #if !defined(_WIN32) && defined(CONFIG_USER_ONLY) { - long long startaddr, endaddr; FILE *f; - int n; - mmap_lock(); last_brk = (unsigned long)sbrk(0); + f = fopen("/proc/self/maps", "r"); if (f) { + mmap_lock(); + do { - n = fscanf (f, "%llx-%llx %*[^\n]\n", &startaddr, &endaddr); - if (n == 2) { - startaddr = MIN(startaddr, - (1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1); - endaddr = MIN(endaddr, - (1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1); - page_set_flags(startaddr & TARGET_PAGE_MASK, - TARGET_PAGE_ALIGN(endaddr), - PAGE_RESERVED); + unsigned long startaddr, endaddr; + int n; + + n = fscanf (f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr); + + if (n == 2 && h2g_valid(startaddr)) { + startaddr = h2g(startaddr) & TARGET_PAGE_MASK; + + if (h2g_valid(endaddr)) { + endaddr = h2g(endaddr); + } else { + endaddr = ~0ul; + } + page_set_flags(startaddr, endaddr, PAGE_RESERVED); } } while (!feof(f)); + fclose(f); + mmap_unlock(); } - mmap_unlock(); } #endif } -static inline PageDesc **page_l1_map(target_ulong index) +static PageDesc *page_find_alloc(target_ulong index, int alloc) { -#if TARGET_LONG_BITS > 32 - /* Host memory outside guest VM. For 32-bit targets we have already - excluded high addresses. */ - if (index > ((target_ulong)L2_SIZE * L1_SIZE)) - return NULL; +#if defined(CONFIG_USER_ONLY) + /* We can't use qemu_malloc because it may recurse into a locked mutex. + Neither can we record the new pages we reserve while allocating a + given page because that may recurse into an unallocated page table + entry. Stuff the allocations we do make into a queue and process + them after having completed one entire page table allocation. */ + + unsigned long reserve[2 * (V_L1_SHIFT / L2_BITS)]; + int reserve_idx = 0; + +# define ALLOC(P, SIZE) \ + do { \ + P = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, \ + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); \ + if (h2g_valid(P)) { \ + reserve[reserve_idx] = h2g(P); \ + reserve[reserve_idx + 1] = SIZE; \ + reserve_idx += 2; \ + } \ + } while (0) +#else +# define ALLOC(P, SIZE) \ + do { P = qemu_mallocz(SIZE); } while (0) #endif - return &l1_map[index >> L2_BITS]; -} -static inline PageDesc *page_find_alloc(target_ulong index) -{ - PageDesc **lp, *p; - lp = page_l1_map(index); - if (!lp) - return NULL; + PageDesc *pd; + void **lp; + int i; - p = *lp; - if (!p) { - /* allocate if not found */ -#if defined(CONFIG_USER_ONLY) - size_t len = sizeof(PageDesc) * L2_SIZE; - /* Don't use qemu_malloc because it may recurse. */ - p = mmap(NULL, len, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - *lp = p; - if (h2g_valid(p)) { - unsigned long addr = h2g(p); - page_set_flags(addr & TARGET_PAGE_MASK, - TARGET_PAGE_ALIGN(addr + len), - PAGE_RESERVED); + /* Level 1. Always allocated. */ + lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1)); + + /* Level 2..N-1. */ + for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) { + void **p = *lp; + + if (p == NULL) { + if (!alloc) { + return NULL; + } + ALLOC(p, sizeof(void *) * L2_SIZE); + *lp = p; } -#else - p = qemu_mallocz(sizeof(PageDesc) * L2_SIZE); - *lp = p; -#endif + + lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1)); } - return p + (index & (L2_SIZE - 1)); + + pd = *lp; + if (pd == NULL) { + if (!alloc) { + return NULL; + } + ALLOC(pd, sizeof(PageDesc) * L2_SIZE); + *lp = pd; + } + +#undef ALLOC +#if defined(CONFIG_USER_ONLY) + for (i = 0; i < reserve_idx; i += 2) { + unsigned long addr = reserve[i]; + unsigned long len = reserve[i + 1]; + + page_set_flags(addr & TARGET_PAGE_MASK, + TARGET_PAGE_ALIGN(addr + len), + PAGE_RESERVED); + } +#endif + + return pd + (index & (L2_SIZE - 1)); } static inline PageDesc *page_find(target_ulong index) { - PageDesc **lp, *p; - lp = page_l1_map(index); - if (!lp) - return NULL; - - p = *lp; - if (!p) { - return NULL; - } - return p + (index & (L2_SIZE - 1)); + return page_find_alloc(index, 0); } #if !defined(CONFIG_USER_ONLY) static PhysPageDesc *phys_page_find_alloc(target_phys_addr_t index, int alloc) { - void **lp, **p; PhysPageDesc *pd; + void **lp; + int i; - p = (void **)l1_phys_map; -#if TARGET_PHYS_ADDR_SPACE_BITS > 32 + /* Level 1. Always allocated. */ + lp = l1_phys_map + ((index >> P_L1_SHIFT) & (P_L1_SIZE - 1)); -#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS) -#error unsupported TARGET_PHYS_ADDR_SPACE_BITS -#endif - lp = p + ((index >> (L1_BITS + L2_BITS)) & (L1_SIZE - 1)); - p = *lp; - if (!p) { - /* allocate if not found */ - if (!alloc) - return NULL; - p = qemu_vmalloc(sizeof(void *) * L1_SIZE); - memset(p, 0, sizeof(void *) * L1_SIZE); - *lp = p; + /* Level 2..N-1. */ + for (i = P_L1_SHIFT / L2_BITS - 1; i > 0; i--) { + void **p = *lp; + if (p == NULL) { + if (!alloc) { + return NULL; + } + *lp = p = qemu_mallocz(sizeof(void *) * L2_SIZE); + } + lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1)); } -#endif - lp = p + ((index >> L2_BITS) & (L1_SIZE - 1)); + pd = *lp; - if (!pd) { + if (pd == NULL) { int i; - /* allocate if not found */ - if (!alloc) + + if (!alloc) { return NULL; - pd = qemu_vmalloc(sizeof(PhysPageDesc) * L2_SIZE); - *lp = pd; + } + + *lp = pd = qemu_malloc(sizeof(PhysPageDesc) * L2_SIZE); + for (i = 0; i < L2_SIZE; i++) { - pd[i].phys_offset = IO_MEM_UNASSIGNED; - pd[i].region_offset = (index + i) << TARGET_PAGE_BITS; + pd[i].phys_offset = IO_MEM_UNASSIGNED; + pd[i].region_offset = (index + i) << TARGET_PAGE_BITS; } } - return ((PhysPageDesc *)pd) + (index & (L2_SIZE - 1)); + + return pd + (index & (L2_SIZE - 1)); } static inline PhysPageDesc *phys_page_find(target_phys_addr_t index) @@ -573,24 +625,37 @@ static inline void invalidate_page_bitmap(PageDesc *p) p->code_write_count = 0; } -/* set to NULL all the 'first_tb' fields in all PageDescs */ -static void page_flush_tb(void) +/* Set to NULL all the 'first_tb' fields in all PageDescs. */ + +static void page_flush_tb_1 (int level, void **lp) { - int i, j; - PageDesc *p; + int i; - for(i = 0; i < L1_SIZE; i++) { - p = l1_map[i]; - if (p) { - for(j = 0; j < L2_SIZE; j++) { - p->first_tb = NULL; - invalidate_page_bitmap(p); - p++; - } + if (*lp == NULL) { + return; + } + if (level == 0) { + PageDesc *pd = *lp; + for (i = 0; i < L2_BITS; ++i) { + pd[i].first_tb = NULL; + invalidate_page_bitmap(pd + i); + } + } else { + void **pp = *lp; + for (i = 0; i < L2_BITS; ++i) { + page_flush_tb_1 (level - 1, pp + i); } } } +static void page_flush_tb(void) +{ + int i; + for (i = 0; i < V_L1_SIZE; i++) { + page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i); + } +} + /* flush all the translation blocks */ /* XXX: tb_flush is currently not thread safe */ void tb_flush(CPUState *env1) @@ -1081,7 +1146,7 @@ static inline void tb_alloc_page(TranslationBlock *tb, TranslationBlock *last_first_tb; tb->page_addr[n] = page_addr; - p = page_find_alloc(page_addr >> TARGET_PAGE_BITS); + p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1); tb->page_next[n] = p->first_tb; last_first_tb = p->first_tb; p->first_tb = (TranslationBlock *)((long)tb | n); @@ -1641,50 +1706,37 @@ static int cpu_notify_migration_log(int enable) return 0; } -static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map, - CPUPhysMemoryClient *client) +static void phys_page_for_each_1(CPUPhysMemoryClient *client, + int level, void **lp) { - PhysPageDesc *pd; - int l1, l2; + int i; - for (l1 = 0; l1 < L1_SIZE; ++l1) { - pd = phys_map[l1]; - if (!pd) { - continue; - } - for (l2 = 0; l2 < L2_SIZE; ++l2) { - if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) { - continue; + if (*lp == NULL) { + return; + } + if (level == 0) { + PhysPageDesc *pd = *lp; + for (i = 0; i < L2_BITS; ++i) { + if (pd[i].phys_offset != IO_MEM_UNASSIGNED) { + client->set_memory(client, pd[i].region_offset, + TARGET_PAGE_SIZE, pd[i].phys_offset); } - client->set_memory(client, pd[l2].region_offset, - TARGET_PAGE_SIZE, pd[l2].phys_offset); + } + } else { + void **pp = *lp; + for (i = 0; i < L2_BITS; ++i) { + phys_page_for_each_1(client, level - 1, pp + i); } } } static void phys_page_for_each(CPUPhysMemoryClient *client) { -#if TARGET_PHYS_ADDR_SPACE_BITS > 32 - -#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS) -#error unsupported TARGET_PHYS_ADDR_SPACE_BITS -#endif - void **phys_map = (void **)l1_phys_map; - int l1; - if (!l1_phys_map) { - return; - } - for (l1 = 0; l1 < L1_SIZE; ++l1) { - if (phys_map[l1]) { - phys_page_for_each_in_l1_map(phys_map[l1], client); - } - } -#else - if (!l1_phys_map) { - return; + int i; + for (i = 0; i < P_L1_SIZE; ++i) { + phys_page_for_each_1(client, P_L1_SHIFT / L2_BITS - 1, + l1_phys_map + 1); } - phys_page_for_each_in_l1_map(l1_phys_map, client); -#endif } void cpu_register_phys_memory_client(CPUPhysMemoryClient *client) @@ -2148,44 +2200,87 @@ void tlb_flush_page(CPUState *env, target_ulong addr) * Walks guest process memory "regions" one by one * and calls callback function 'fn' for each region. */ -int walk_memory_regions(void *priv, - int (*fn)(void *, unsigned long, unsigned long, unsigned long)) + +struct walk_memory_regions_data { - unsigned long start, end; - PageDesc *p = NULL; - int i, j, prot, prot1; - int rc = 0; + walk_memory_regions_fn fn; + void *priv; + unsigned long start; + int prot; +}; - start = end = -1; - prot = 0; +static int walk_memory_regions_end(struct walk_memory_regions_data *data, + unsigned long end, int new_prot) +{ + if (data->start != -1ul) { + int rc = data->fn(data->priv, data->start, end, data->prot); + if (rc != 0) { + return rc; + } + } + + data->start = (new_prot ? end : -1ul); + data->prot = new_prot; + + return 0; +} + +static int walk_memory_regions_1(struct walk_memory_regions_data *data, + unsigned long base, int level, void **lp) +{ + unsigned long pa; + int i, rc; - for (i = 0; i <= L1_SIZE; i++) { - p = (i < L1_SIZE) ? l1_map[i] : NULL; - for (j = 0; j < L2_SIZE; j++) { - prot1 = (p == NULL) ? 0 : p[j].flags; - /* - * "region" is one continuous chunk of memory - * that has same protection flags set. - */ - if (prot1 != prot) { - end = (i << (32 - L1_BITS)) | (j << TARGET_PAGE_BITS); - if (start != -1) { - rc = (*fn)(priv, start, end, prot); - /* callback can stop iteration by returning != 0 */ - if (rc != 0) - return (rc); + if (*lp == NULL) { + return walk_memory_regions_end(data, base, 0); + } + + if (level == 0) { + PageDesc *pd = *lp; + for (i = 0; i < L2_BITS; ++i) { + int prot = pd[i].flags; + + pa = base | (i << TARGET_PAGE_BITS); + if (prot != data->prot) { + rc = walk_memory_regions_end(data, pa, prot); + if (rc != 0) { + return rc; } - if (prot1 != 0) - start = end; - else - start = -1; - prot = prot1; } - if (p == NULL) - break; + } + } else { + void **pp = *lp; + for (i = 0; i < L2_BITS; ++i) { + pa = base | (i << (TARGET_PAGE_BITS + L2_BITS * level)); + rc = walk_memory_regions_1(data, pa, level - 1, pp + i); + if (rc != 0) { + return rc; + } + } + } + + return 0; +} + +int walk_memory_regions(void *priv, walk_memory_regions_fn fn) +{ + struct walk_memory_regions_data data; + unsigned long i; + + data.fn = fn; + data.priv = priv; + data.start = -1ul; + data.prot = 0; + + for (i = 0; i < V_L1_SIZE; i++) { + int rc = walk_memory_regions_1(&data, i << V_L1_SHIFT, + V_L1_SHIFT / L2_BITS - 1, l1_map + i); + if (rc != 0) { + return rc; } } - return (rc); + + return walk_memory_regions_end(&data, 0, 0); } static int dump_region(void *priv, unsigned long start, |