diff options
48 files changed, 3732 insertions, 2138 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 24b71a4..7d134a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4318,6 +4318,7 @@ F: docs/system/devices/vfio-user.rst F: hw/vfio-user/* F: include/hw/vfio-user/* F: subprojects/libvfio-user +F: tests/functional/x86_64/test_vfio_user_client.py EBPF: M: Jason Wang <jasowang@redhat.com> diff --git a/contrib/plugins/execlog.c b/contrib/plugins/execlog.c index 06ec76d..811f320 100644 --- a/contrib/plugins/execlog.c +++ b/contrib/plugins/execlog.c @@ -95,6 +95,7 @@ static void insn_check_regs(CPU *cpu) g_byte_array_set_size(reg->new, 0); sz = qemu_plugin_read_register(reg->handle, reg->new); + g_assert(sz > 0); g_assert(sz == reg->last->len); if (memcmp(reg->last->data, reg->new->data, sz)) { diff --git a/contrib/plugins/meson.build b/contrib/plugins/meson.build index 1876bc7..7eb3629 100644 --- a/contrib/plugins/meson.build +++ b/contrib/plugins/meson.build @@ -1,5 +1,6 @@ contrib_plugins = ['bbv', 'cache', 'cflow', 'drcov', 'execlog', 'hotblocks', - 'hotpages', 'howvec', 'hwprofile', 'ips', 'stoptrigger'] + 'hotpages', 'howvec', 'hwprofile', 'ips', 'stoptrigger', + 'uftrace'] if host_os != 'windows' # lockstep uses socket.h contrib_plugins += 'lockstep' diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c new file mode 100644 index 0000000..b7d6124 --- /dev/null +++ b/contrib/plugins/uftrace.c @@ -0,0 +1,878 @@ +/* + * Copyright (C) 2025, Pierrick Bouvier <pierrick.bouvier@linaro.org> + * + * Generates a trace compatible with uftrace (similar to uftrace record). + * https://github.com/namhyung/uftrace + * + * See docs/about/emulation.rst|Uftrace for details and examples. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <qemu-plugin.h> +#include <glib.h> +#include <stdio.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> + +#define MiB (INT64_C(1) << 20) +#define NANOSECONDS_PER_SECOND 1000000000LL +#define TRACE_FLUSH_SIZE (32 * MiB) +#define TRACE_ID_SCALE 100 + +QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION; + +typedef struct { + GArray *s; +} Callstack; + +typedef struct { + uint64_t pc; + uint64_t frame_pointer; +} CallstackEntry; + +typedef struct { + GArray *t; + GString *path; + GString *name; + uint32_t id; +} Trace; + +typedef struct Cpu Cpu; + +typedef struct { + void (*init)(Cpu *cpu); + void (*end)(Cpu *cpu); + uint64_t (*get_frame_pointer)(Cpu *cpu); + uint8_t (*get_privilege_level)(Cpu *cpu); + uint8_t (*num_privilege_levels)(void); + const char *(*get_privilege_level_name)(uint8_t pl); + bool (*does_insn_modify_frame_pointer)(const char *disas); +} CpuOps; + +typedef struct Cpu { + Trace *trace; + Callstack *cs; + uint8_t privilege_level; + GArray *traces; /* Trace *traces [] */ + GByteArray *buf; + CpuOps ops; + void *arch; +} Cpu; + +typedef enum { + AARCH64_EL0_SECURE, + AARCH64_EL0_NONSECURE, + AARCH64_EL0_REALM, + AARCH64_EL1_SECURE, + AARCH64_EL1_NONSECURE, + AARCH64_EL1_REALM, + AARCH64_EL2_SECURE, + AARCH64_EL2_NONSECURE, + AARCH64_EL2_REALM, + AARCH64_EL3, + AARCH64_PRIVILEGE_LEVEL_MAX, +} Aarch64PrivilegeLevel; + +typedef struct { + struct qemu_plugin_register *reg_fp; + struct qemu_plugin_register *reg_cpsr; + struct qemu_plugin_register *reg_scr_el3; +} Aarch64Cpu; + +typedef enum { + X64_RING0, + X64_RING1, + X64_RING2, + X64_RING3, + X64_REAL_MODE, + X64_PRIVILEGE_LEVEL_MAX, +} X64PrivilegeLevel; + +typedef struct { + struct qemu_plugin_register *reg_rbp; + struct qemu_plugin_register *reg_cs; + struct qemu_plugin_register *reg_cr0; +} X64Cpu; + +typedef struct { + uint64_t timestamp; + uint64_t data; +} UftraceEntry; + +typedef enum { + UFTRACE_ENTRY, + UFTRACE_EXIT, + UFTRACE_LOST, + UFTRACE_EVENT, +} UftraceRecordType; + +static struct qemu_plugin_scoreboard *score; +static bool trace_privilege_level; +static CpuOps arch_ops; + +static uint64_t gettime_ns(void) +{ +#ifdef _WIN32 + /* + * On Windows, timespec_get is available only with UCRT, but not with + * MinGW64 environment. Simplify by using only gettimeofday on this + * platform. This may result in a precision loss. + */ + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000; +#else + /* We need nanosecond precision for short lived functions. */ + struct timespec ts; + timespec_get(&ts, TIME_UTC); + uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec; +#endif + return now_ns; +} + +static void uftrace_write_map(bool system_emulation) +{ + const char *path = "./uftrace.data/sid-0.map"; + + if (system_emulation && access(path, F_OK) == 0) { + /* do not erase existing map in system emulation, as a custom one might + * already have been generated by uftrace_symbols.py */ + return; + } + + FILE *sid_map = fopen(path, "w"); + g_assert(sid_map); + + if (system_emulation) { + fprintf(sid_map, + "# map stack on highest address possible, to prevent uftrace\n" + "# from considering any kernel address\n"); + fprintf(sid_map, + "ffffffffffff-ffffffffffff rw-p 00000000 00:00 0 [stack]\n"); + } else { + /* in user mode, copy /proc/self/maps instead */ + FILE *self_map = fopen("/proc/self/maps", "r"); + g_assert(self_map); + for (;;) { + int c = fgetc(self_map); + if (c == EOF) { + break; + } + fputc(c, sid_map); + } + fclose(self_map); + } + fclose(sid_map); +} + +static void uftrace_write_task(const GArray *traces) +{ + FILE *task = fopen("./uftrace.data/task.txt", "w"); + g_assert(task); + for (int i = 0; i < traces->len; ++i) { + Trace *t = g_array_index(traces, Trace*, i); + fprintf(task, "SESS timestamp=0.0 pid=%"PRIu32" sid=0 exename=\"%s\"\n", + t->id, t->name->str); + fprintf(task, "TASK timestamp=0.0 tid=%"PRIu32" pid=%"PRIu32"\n", + t->id, t->id); + } + fclose(task); +} + +static void uftrace_write_info(const GArray *traces) +{ + g_autoptr(GString) taskinfo_tids = g_string_new("taskinfo:tids="); + for (int i = 0; i < traces->len; ++i) { + Trace *t = g_array_index(traces, Trace*, i); + const char *delim = i > 0 ? "," : ""; + g_string_append_printf(taskinfo_tids, "%s%"PRIu32, delim, t->id); + } + + g_autoptr(GString) taskinfo_nr_tid = g_string_new("taskinfo:nr_tid="); + g_string_append_printf(taskinfo_nr_tid, "%d", traces->len); + + FILE *info = fopen("./uftrace.data/info", "w"); + g_assert(info); + /* + * $ uftrace dump --debug + * uftrace file header: magic = 4674726163652100 + * uftrace file header: version = 4 + * uftrace file header: header size = 40 + * uftrace file header: endian = 1 (little) + * uftrace file header: class = 2 (64 bit) + * uftrace file header: features = 0x1263 (PLTHOOK | ... + * uftrace file header: info = 0x7bff (EXE_NAME | ... + * <0000000000000000>: 46 74 72 61 63 65 21 00 04 00 00 00 28 00 01 02 + * <0000000000000010>: 63 12 00 00 00 00 00 00 ff 7b 00 00 00 00 00 00 + * <0000000000000020>: 00 04 00 00 00 00 00 00 + */ + const uint8_t header[] = {0x46, 0x74, 0x72, 0x61, 0x63, 0x65, 0x21, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x28, 0x00, 0x01, 0x02, + 0x63, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + size_t wrote = fwrite(header, sizeof(header), 1, info); + g_assert(wrote == 1); + const char *info_data[] = { + "exename:", + "build_id:0000000000000000000000000000000000000000", + "exit_status:", + "cmdline:", + "cpuinfo:lines=2", + "cpuinfo:nr_cpus=", + "cpuinfo:desc=", + "meminfo:", + "osinfo:lines=3", + "osinfo:kernel=", + "osinfo:hostname=", + "osinfo:distro=", + "taskinfo:lines=2", + taskinfo_nr_tid->str, + taskinfo_tids->str, + "usageinfo:lines=6", + "usageinfo:systime=", + "usageinfo:usrtime=", + "usageinfo:ctxsw=", + "usageinfo:maxrss=", + "usageinfo:pagefault=", + "usageinfo:iops=", + "loadinfo:", + "record_date:", + "elapsed_time:", + "pattern_type:regex", + "uftrace_version:", + "utc_offset:", + 0}; + const char **info_data_it = info_data; + while (*(info_data_it)) { + fprintf(info, "%s\n", *info_data_it); + ++info_data_it; + } + fclose(info); +} + +static Callstack *callstack_new(void) +{ + Callstack *cs = g_new0(Callstack, 1); + cs->s = g_array_new(false, false, sizeof(CallstackEntry)); + return cs; +} + +static void callstack_free(Callstack *cs) +{ + g_array_free(cs->s, true); + cs->s = NULL; + g_free(cs); +} + +static size_t callstack_depth(const Callstack *cs) +{ + return cs->s->len; +} + +static size_t callstack_empty(const Callstack *cs) +{ + return callstack_depth(cs) == 0; +} + +static void callstack_clear(Callstack *cs) +{ + g_array_set_size(cs->s, 0); +} + +static const CallstackEntry *callstack_at(const Callstack *cs, size_t depth) +{ + g_assert(depth > 0); + g_assert(depth <= callstack_depth(cs)); + return &g_array_index(cs->s, CallstackEntry, depth - 1); +} + +static CallstackEntry callstack_top(const Callstack *cs) +{ + if (callstack_depth(cs) >= 1) { + return *callstack_at(cs, callstack_depth(cs)); + } + return (CallstackEntry){}; +} + +static CallstackEntry callstack_caller(const Callstack *cs) +{ + if (callstack_depth(cs) >= 2) { + return *callstack_at(cs, callstack_depth(cs) - 1); + } + return (CallstackEntry){}; +} + +static void callstack_push(Callstack *cs, CallstackEntry e) +{ + g_array_append_val(cs->s, e); +} + +static CallstackEntry callstack_pop(Callstack *cs) +{ + g_assert(!callstack_empty(cs)); + CallstackEntry e = callstack_top(cs); + g_array_set_size(cs->s, callstack_depth(cs) - 1); + return e; +} + +static Trace *trace_new(uint32_t id, GString *name) +{ + Trace *t = g_new0(Trace, 1); + t->t = g_array_new(false, false, sizeof(UftraceEntry)); + t->path = g_string_new(NULL); + g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id); + t->name = g_string_new(name->str); + t->id = id; + return t; +} + +static void trace_free(Trace *t) +{ + g_assert(t->t->len == 0); + g_array_free(t->t, true); + t->t = NULL; + g_string_free(t->path, true); + t->path = NULL; + g_string_free(t->name, true); + t->name = NULL; + g_free(t); +} + +static void trace_flush(Trace *t, bool append) +{ + int create_dir = g_mkdir_with_parents("./uftrace.data", + S_IRWXU | S_IRWXG | S_IRWXO); + g_assert(create_dir == 0); + FILE *dat = fopen(t->path->str, append ? "a" : "w"); + g_assert(dat); + GArray *data = t->t; + if (data->len) { + size_t wrote = fwrite(data->data, sizeof(UftraceEntry), data->len, dat); + g_assert(wrote == data->len); + } + fclose(dat); + g_array_set_size(data, 0); +} + +static void trace_add_entry(Trace *t, uint64_t timestamp, uint64_t pc, + size_t depth, UftraceRecordType type) +{ + /* https://github.com/namhyung/uftrace/blob/v0.18/libmcount/record.c#L909 */ + const uint64_t record_magic = 0x5; + uint64_t data = type | (record_magic << 3); + data += depth << 6; + data += pc << 16; + UftraceEntry e = {.timestamp = timestamp, .data = data}; + g_array_append_val(t->t, e); + if (t->t->len * sizeof(UftraceEntry) > TRACE_FLUSH_SIZE) { + trace_flush(t, true); + } +} + +static void trace_enter_function(Trace *t, uint64_t timestamp, + uint64_t pc, size_t depth) +{ + trace_add_entry(t, timestamp, pc, depth, UFTRACE_ENTRY); +} + +static void trace_exit_function(Trace *t, uint64_t timestamp, + uint64_t pc, size_t depth) +{ + trace_add_entry(t, timestamp, pc, depth, UFTRACE_EXIT); +} + +static void trace_enter_stack(Trace *t, Callstack *cs, uint64_t timestamp) +{ + for (size_t depth = 1; depth <= callstack_depth(cs); ++depth) { + trace_enter_function(t, timestamp, callstack_at(cs, depth)->pc, depth); + } +} + +static void trace_exit_stack(Trace *t, Callstack *cs, uint64_t timestamp) +{ + for (size_t depth = callstack_depth(cs); depth > 0; --depth) { + trace_exit_function(t, timestamp, callstack_at(cs, depth)->pc, depth); + } +} + +static uint64_t cpu_read_register64(Cpu *cpu, struct qemu_plugin_register *reg) +{ + GByteArray *buf = cpu->buf; + g_byte_array_set_size(buf, 0); + size_t sz = qemu_plugin_read_register(reg, buf); + g_assert(sz == 8); + g_assert(buf->len == 8); + return *((uint64_t *) buf->data); +} + +static uint32_t cpu_read_register32(Cpu *cpu, struct qemu_plugin_register *reg) +{ + GByteArray *buf = cpu->buf; + g_byte_array_set_size(buf, 0); + size_t sz = qemu_plugin_read_register(reg, buf); + g_assert(sz == 4); + g_assert(buf->len == 4); + return *((uint32_t *) buf->data); +} + +static uint64_t cpu_read_memory64(Cpu *cpu, uint64_t addr) +{ + g_assert(addr); + GByteArray *buf = cpu->buf; + g_byte_array_set_size(buf, 0); + bool read = qemu_plugin_read_memory_vaddr(addr, buf, 8); + if (!read) { + return 0; + } + g_assert(buf->len == 8); + return *((uint64_t *) buf->data); +} + +static void cpu_unwind_stack(Cpu *cpu, uint64_t frame_pointer, uint64_t pc) +{ + g_assert(callstack_empty(cpu->cs)); + + #define UNWIND_STACK_MAX_DEPTH 1024 + CallstackEntry unwind[UNWIND_STACK_MAX_DEPTH]; + size_t depth = 0; + do { + /* check we don't have an infinite stack */ + for (size_t i = 0; i < depth; ++i) { + if (frame_pointer == unwind[i].frame_pointer) { + break; + } + } + CallstackEntry e = {.frame_pointer = frame_pointer, .pc = pc}; + unwind[depth] = e; + depth++; + if (frame_pointer) { + frame_pointer = cpu_read_memory64(cpu, frame_pointer); + } + pc = cpu_read_memory64(cpu, frame_pointer + 8); /* read previous lr */ + } while (frame_pointer && pc && depth < UNWIND_STACK_MAX_DEPTH); + #undef UNWIND_STACK_MAX_DEPTH + + /* push it from bottom to top */ + while (depth) { + callstack_push(cpu->cs, unwind[depth - 1]); + --depth; + } +} + +static struct qemu_plugin_register *plugin_find_register(const char *name) +{ + g_autoptr(GArray) regs = qemu_plugin_get_registers(); + for (int i = 0; i < regs->len; ++i) { + qemu_plugin_reg_descriptor *reg; + reg = &g_array_index(regs, qemu_plugin_reg_descriptor, i); + if (!strcmp(reg->name, name)) { + return reg->handle; + } + } + return NULL; +} + +static uint8_t aarch64_num_privilege_levels(void) +{ + return AARCH64_PRIVILEGE_LEVEL_MAX; +} + +static const char *aarch64_get_privilege_level_name(uint8_t pl) +{ + switch (pl) { + case AARCH64_EL0_SECURE: return "S-EL0"; + case AARCH64_EL0_NONSECURE: return "NS-EL0"; + case AARCH64_EL0_REALM: return "R-EL0"; + case AARCH64_EL1_SECURE: return "S-EL1"; + case AARCH64_EL1_NONSECURE: return "NS-EL1"; + case AARCH64_EL1_REALM: return "R-EL1"; + case AARCH64_EL2_SECURE: return "S-EL2"; + case AARCH64_EL2_NONSECURE: return "NS-EL2"; + case AARCH64_EL2_REALM: return "R-EL2"; + case AARCH64_EL3: return "EL3"; + default: + g_assert_not_reached(); + } +} + +static uint8_t aarch64_get_privilege_level(Cpu *cpu_) +{ + Aarch64Cpu *cpu = cpu_->arch; + /* + * QEMU gdbstub does not provide access to CurrentEL, + * so we use CPSR instead. + */ + uint8_t el = cpu_read_register32(cpu_, cpu->reg_cpsr) >> 2 & 0b11; + + if (el == 3) { + return AARCH64_EL3; + } + + uint8_t ss = AARCH64_EL0_SECURE; + if (!cpu->reg_scr_el3) { + ss = AARCH64_EL0_NONSECURE; + } + uint64_t scr_el3 = cpu_read_register64(cpu_, cpu->reg_scr_el3); + uint64_t ns = (scr_el3 >> 0) & 0b1; + uint64_t nse = (scr_el3 >> 62) & 0b1; + switch (nse << 1 | ns) { + case 0b00: + ss = AARCH64_EL0_SECURE; + break; + case 0b01: + ss = AARCH64_EL0_NONSECURE; + break; + case 0b11: + ss = AARCH64_EL0_REALM; + break; + default: + g_assert_not_reached(); + } + + const uint8_t num_ss = 3; + Aarch64PrivilegeLevel pl = el * num_ss + ss; + return pl; +} + +static uint64_t aarch64_get_frame_pointer(Cpu *cpu_) +{ + Aarch64Cpu *cpu = cpu_->arch; + return cpu_read_register64(cpu_, cpu->reg_fp); +} + +static void aarch64_init(Cpu *cpu_) +{ + Aarch64Cpu *cpu = g_new0(Aarch64Cpu, 1); + cpu_->arch = cpu; + cpu->reg_fp = plugin_find_register("x29"); + if (!cpu->reg_fp) { + fprintf(stderr, "uftrace plugin: frame pointer register (x29) is not " + "available. Please use an AArch64 cpu (or -cpu max).\n"); + g_abort(); + } + cpu->reg_cpsr = plugin_find_register("cpsr"); + g_assert(cpu->reg_cpsr); + cpu->reg_scr_el3 = plugin_find_register("SCR_EL3"); + /* scr_el3 is optional */ +} + +static void aarch64_end(Cpu *cpu) +{ + g_free(cpu->arch); +} + +static bool aarch64_does_insn_modify_frame_pointer(const char *disas) +{ + /* + * Check if current instruction concerns fp register "x29". + * We add a prefix space to make sure we don't match addresses dump + * in disassembly. + */ + return strstr(disas, " x29"); +} + +static CpuOps aarch64_ops = { + .init = aarch64_init, + .end = aarch64_end, + .get_frame_pointer = aarch64_get_frame_pointer, + .get_privilege_level = aarch64_get_privilege_level, + .num_privilege_levels = aarch64_num_privilege_levels, + .get_privilege_level_name = aarch64_get_privilege_level_name, + .does_insn_modify_frame_pointer = aarch64_does_insn_modify_frame_pointer, +}; + +static uint8_t x64_num_privilege_levels(void) +{ + return X64_PRIVILEGE_LEVEL_MAX; +} + +static const char *x64_get_privilege_level_name(uint8_t pl) +{ + switch (pl) { + case X64_RING0: return "Ring0"; + case X64_RING1: return "Ring1"; + case X64_RING2: return "Ring2"; + case X64_RING3: return "Ring3"; + case X64_REAL_MODE: return "RealMode"; + default: + g_assert_not_reached(); + } +} + +static uint8_t x64_get_privilege_level(Cpu *cpu_) +{ + X64Cpu *cpu = cpu_->arch; + uint64_t cr0 = cpu_read_register64(cpu_, cpu->reg_cr0); + uint64_t protected_mode = (cr0 >> 0) & 0b1; + if (!protected_mode) { + return X64_REAL_MODE; + } + uint32_t cs = cpu_read_register32(cpu_, cpu->reg_cs); + uint32_t ring_level = (cs >> 0) & 0b11; + return ring_level; +} + +static uint64_t x64_get_frame_pointer(Cpu *cpu_) +{ + X64Cpu *cpu = cpu_->arch; + return cpu_read_register64(cpu_, cpu->reg_rbp); +} + +static void x64_init(Cpu *cpu_) +{ + X64Cpu *cpu = g_new0(X64Cpu, 1); + cpu_->arch = cpu; + cpu->reg_rbp = plugin_find_register("rbp"); + g_assert(cpu->reg_rbp); + cpu->reg_cs = plugin_find_register("cs"); + g_assert(cpu->reg_cs); + cpu->reg_cr0 = plugin_find_register("cr0"); + g_assert(cpu->reg_cr0); +} + +static void x64_end(Cpu *cpu) +{ + g_free(cpu->arch); +} + +static bool x64_does_insn_modify_frame_pointer(const char *disas) +{ + return strstr(disas, "rbp"); +} + +static CpuOps x64_ops = { + .init = x64_init, + .end = x64_end, + .get_frame_pointer = x64_get_frame_pointer, + .get_privilege_level = x64_get_privilege_level, + .num_privilege_levels = x64_num_privilege_levels, + .get_privilege_level_name = x64_get_privilege_level_name, + .does_insn_modify_frame_pointer = x64_does_insn_modify_frame_pointer, +}; + +static void track_privilege_change(unsigned int cpu_index, void *udata) +{ + Cpu *cpu = qemu_plugin_scoreboard_find(score, cpu_index); + uint8_t new_pl = cpu->ops.get_privilege_level(cpu); + + if (new_pl == cpu->privilege_level) { + return; + } + + uint64_t pc = (uintptr_t) udata; + uint64_t timestamp = gettime_ns(); + + trace_exit_stack(cpu->trace, cpu->cs, timestamp); + callstack_clear(cpu->cs); + + cpu->privilege_level = new_pl; + cpu->trace = g_array_index(cpu->traces, Trace*, new_pl); + + cpu_unwind_stack(cpu, cpu->ops.get_frame_pointer(cpu), pc); + trace_enter_stack(cpu->trace, cpu->cs, timestamp); +} + +static void track_callstack(unsigned int cpu_index, void *udata) +{ + uint64_t pc = (uintptr_t) udata; + Cpu *cpu = qemu_plugin_scoreboard_find(score, cpu_index); + uint64_t timestamp = gettime_ns(); + Callstack *cs = cpu->cs; + Trace *t = cpu->trace; + + uint64_t fp = cpu->ops.get_frame_pointer(cpu); + if (!fp && callstack_empty(cs)) { + /* + * We simply push current pc. Note that we won't detect symbol change as + * long as a proper call does not happen. + */ + callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc}); + trace_enter_function(t, timestamp, pc, callstack_depth(cs)); + return; + } + + CallstackEntry top = callstack_top(cs); + if (fp == top.frame_pointer) { + /* same function */ + return; + } + + CallstackEntry caller = callstack_caller(cs); + if (fp == caller.frame_pointer) { + /* return */ + CallstackEntry e = callstack_pop(cs); + trace_exit_function(t, timestamp, e.pc, callstack_depth(cs)); + return; + } + + uint64_t caller_fp = fp ? cpu_read_memory64(cpu, fp) : 0; + if (caller_fp == top.frame_pointer) { + /* call */ + callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc}); + trace_enter_function(t, timestamp, pc, callstack_depth(cs)); + return; + } + + /* discontinuity, exit current stack and unwind new one */ + trace_exit_stack(t, cs, timestamp); + callstack_clear(cs); + + cpu_unwind_stack(cpu, fp, pc); + trace_enter_stack(t, cs, timestamp); +} + +static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb) +{ + size_t n_insns = qemu_plugin_tb_n_insns(tb); + uintptr_t tb_pc = qemu_plugin_tb_vaddr(tb); + + if (trace_privilege_level) { + qemu_plugin_register_vcpu_tb_exec_cb(tb, track_privilege_change, + QEMU_PLUGIN_CB_R_REGS, + (void *) tb_pc); + } + + /* + * Callbacks and inline instrumentation are inserted before an instruction. + * Thus, to see instruction effect, we need to wait for next one. + * Potentially, the last instruction of a block could modify the frame + * pointer. Thus, we need to always instrument first instruction in a tb. + */ + bool instrument_insn = true; + for (size_t i = 0; i < n_insns; i++) { + struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i); + + if (instrument_insn) { + uintptr_t pc = qemu_plugin_insn_vaddr(insn); + qemu_plugin_register_vcpu_insn_exec_cb(insn, track_callstack, + QEMU_PLUGIN_CB_R_REGS, + (void *) pc); + instrument_insn = false; + } + + char *disas = qemu_plugin_insn_disas(insn); + if (arch_ops.does_insn_modify_frame_pointer(disas)) { + instrument_insn = true; + } + } +} + +static void vcpu_init(qemu_plugin_id_t id, unsigned int vcpu_index) +{ + Cpu *cpu = qemu_plugin_scoreboard_find(score, vcpu_index); + cpu->ops = arch_ops; + + cpu->ops.init(cpu); + cpu->buf = g_byte_array_new(); + cpu->traces = g_array_new(0, 0, sizeof(Trace *)); + + g_assert(vcpu_index < UINT32_MAX / TRACE_ID_SCALE); + g_assert(cpu->ops.num_privilege_levels() < TRACE_ID_SCALE); + /* trace_id is: cpu_number * TRACE_ID_SCALE + privilege_level */ + uint32_t trace_id = (vcpu_index + 1) * TRACE_ID_SCALE; + + if (trace_privilege_level) { + for (uint8_t pl = 0; pl < cpu->ops.num_privilege_levels(); ++pl) { + g_autoptr(GString) trace_name = g_string_new(NULL); + g_string_append_printf(trace_name, "cpu%u %s", vcpu_index, + cpu->ops.get_privilege_level_name(pl)); + Trace *t = trace_new(trace_id + pl, trace_name); + g_array_append_val(cpu->traces, t); + } + } else { + g_autoptr(GString) trace_name = g_string_new(NULL); + g_string_append_printf(trace_name, "cpu%u", vcpu_index); + Trace *t = trace_new(trace_id, trace_name); + g_array_append_val(cpu->traces, t); + } + + for (size_t i = 0; i < cpu->traces->len; ++i) { + /* create/truncate trace files */ + Trace *t = g_array_index(cpu->traces, Trace*, i); + trace_flush(t, false); + } + + cpu->cs = callstack_new(); + cpu->trace = g_array_index(cpu->traces, Trace*, cpu->privilege_level); +} + +static void vcpu_end(unsigned int vcpu_index) +{ + Cpu *cpu = qemu_plugin_scoreboard_find(score, vcpu_index); + g_byte_array_free(cpu->buf, true); + + for (size_t i = 0; i < cpu->traces->len; ++i) { + Trace *t = g_array_index(cpu->traces, Trace*, i); + trace_free(t); + } + + g_array_free(cpu->traces, true); + callstack_free(cpu->cs); + memset(cpu, 0, sizeof(Cpu)); +} + +static void at_exit(qemu_plugin_id_t id, void *data) +{ + bool system_emulation = (bool) data; + g_autoptr(GArray) traces = g_array_new(0, 0, sizeof(Trace *)); + + for (size_t i = 0; i < qemu_plugin_num_vcpus(); ++i) { + Cpu *cpu = qemu_plugin_scoreboard_find(score, i); + for (size_t j = 0; j < cpu->traces->len; ++j) { + Trace *t = g_array_index(cpu->traces, Trace*, j); + trace_flush(t, true); + g_array_append_val(traces, t); + } + } + + uftrace_write_map(system_emulation); + uftrace_write_info(traces); + uftrace_write_task(traces); + + for (size_t i = 0; i < qemu_plugin_num_vcpus(); ++i) { + vcpu_end(i); + } + + qemu_plugin_scoreboard_free(score); +} + +QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id, + const qemu_info_t *info, + int argc, char **argv) +{ + for (int i = 0; i < argc; i++) { + char *opt = argv[i]; + g_auto(GStrv) tokens = g_strsplit(opt, "=", 2); + if (g_strcmp0(tokens[0], "trace-privilege-level") == 0) { + if (!qemu_plugin_bool_parse(tokens[0], tokens[1], + &trace_privilege_level)) { + fprintf(stderr, "boolean argument parsing failed: %s\n", opt); + return -1; + } + } else { + fprintf(stderr, "option parsing failed: %s\n", opt); + return -1; + } + } + + if (!strcmp(info->target_name, "aarch64")) { + arch_ops = aarch64_ops; + } else if (!strcmp(info->target_name, "x86_64")) { + arch_ops = x64_ops; + } else { + fprintf(stderr, "plugin uftrace: %s target is not supported\n", + info->target_name); + return 1; + } + + score = qemu_plugin_scoreboard_new(sizeof(Cpu)); + qemu_plugin_register_vcpu_init_cb(id, vcpu_init); + qemu_plugin_register_atexit_cb(id, at_exit, (void *) info->system_emulation); + qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans); + + return 0; +} diff --git a/contrib/plugins/uftrace_symbols.py b/contrib/plugins/uftrace_symbols.py new file mode 100755 index 0000000..b49e032 --- /dev/null +++ b/contrib/plugins/uftrace_symbols.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Create symbols and mapping files for uftrace. +# +# Copyright 2025 Linaro Ltd +# Author: Pierrick Bouvier <pierrick.bouvier@linaro.org> +# +# SPDX-License-Identifier: GPL-2.0-or-later + +import argparse +import elftools # pip install pyelftools +import os + +from elftools.elf.elffile import ELFFile +from elftools.elf.sections import SymbolTableSection + +def elf_func_symbols(elf): + symbol_tables = [(idx, s) for idx, s in enumerate(elf.iter_sections()) + if isinstance(s, SymbolTableSection)] + symbols = [] + for _, section in symbol_tables: + for _, symbol in enumerate(section.iter_symbols()): + if symbol_size(symbol) == 0: + continue + type = symbol['st_info']['type'] + if type == 'STT_FUNC' or type == 'STT_NOTYPE': + symbols.append(symbol) + symbols.sort(key = lambda x: symbol_addr(x)) + return symbols + +def symbol_size(symbol): + return symbol['st_size'] + +def symbol_addr(symbol): + addr = symbol['st_value'] + # clamp addr to 48 bits, like uftrace entries + return addr & 0xffffffffffff + +def symbol_name(symbol): + return symbol.name + +class BinaryFile: + def __init__(self, path, map_offset): + self.fullpath = os.path.realpath(path) + self.map_offset = map_offset + with open(path, 'rb') as f: + self.elf = ELFFile(f) + self.symbols = elf_func_symbols(self.elf) + + def path(self): + return self.fullpath + + def addr_start(self): + return self.map_offset + + def addr_end(self): + last_sym = self.symbols[-1] + return symbol_addr(last_sym) + symbol_size(last_sym) + self.map_offset + + def generate_symbol_file(self, prefix_symbols): + binary_name = os.path.basename(self.fullpath) + sym_file_path = f'./uftrace.data/{binary_name}.sym' + print(f'{sym_file_path} ({len(self.symbols)} symbols)') + with open(sym_file_path, 'w') as sym_file: + # print hexadecimal addresses on 48 bits + addrx = "0>12x" + for s in self.symbols: + addr = symbol_addr(s) + addr = f'{addr:{addrx}}' + size = f'{symbol_size(s):{addrx}}' + name = symbol_name(s) + if prefix_symbols: + name = f'{binary_name}:{name}' + print(addr, size, 'T', name, file=sym_file) + +def parse_parameter(p): + s = p.split(":") + path = s[0] + if len(s) == 1: + return path, 0 + if len(s) > 2: + raise ValueError('only one offset can be set') + offset = s[1] + if not offset.startswith('0x'): + err = f'offset "{offset}" is not an hexadecimal constant. ' + err += 'It should starts with "0x".' + raise ValueError(err) + offset = int(offset, 16) + return path, offset + +def is_from_user_mode(map_file_path): + if os.path.exists(map_file_path): + with open(map_file_path, 'r') as map_file: + if not map_file.readline().startswith('# map stack on'): + return True + return False + +def generate_map(binaries): + map_file_path = './uftrace.data/sid-0.map' + + if is_from_user_mode(map_file_path): + print(f'do not overwrite {map_file_path} generated from qemu-user') + return + + mappings = [] + + # print hexadecimal addresses on 48 bits + addrx = "0>12x" + + mappings += ['# map stack on highest address possible, to prevent uftrace'] + mappings += ['# from considering any kernel address'] + mappings += ['ffffffffffff-ffffffffffff rw-p 00000000 00:00 0 [stack]'] + + for b in binaries: + m = f'{b.addr_start():{addrx}}-{b.addr_end():{addrx}}' + m += f' r--p 00000000 00:00 0 {b.path()}' + mappings.append(m) + + with open(map_file_path, 'w') as map_file: + print('\n'.join(mappings), file=map_file) + print(f'{map_file_path}') + print('\n'.join(mappings)) + +def main(): + parser = argparse.ArgumentParser(description= + 'generate symbol files for uftrace') + parser.add_argument('elf_file', nargs='+', + help='path to an ELF file. ' + 'Use /path/to/file:0xdeadbeef to add a mapping offset.') + parser.add_argument('--prefix-symbols', + help='prepend binary name to symbols', + action=argparse.BooleanOptionalAction) + args = parser.parse_args() + + if not os.path.exists('./uftrace.data'): + os.mkdir('./uftrace.data') + + binaries = [] + for file in args.elf_file: + path, offset = parse_parameter(file) + b = BinaryFile(path, offset) + binaries.append(b) + binaries.sort(key = lambda b: b.addr_end()); + + for b in binaries: + b.generate_symbol_file(args.prefix_symbols) + + generate_map(binaries) + +if __name__ == '__main__': + main() diff --git a/docs/about/emulation.rst b/docs/about/emulation.rst index 456d01d..8a5e128 100644 --- a/docs/about/emulation.rst +++ b/docs/about/emulation.rst @@ -816,6 +816,205 @@ This plugin can limit the number of Instructions Per Second that are executed:: The lower the number the more accurate time will be, but the less efficient the plugin. Defaults to ips/10 +Uftrace +....... + +``contrib/plugins/uftrace.c`` + +This plugin generates a binary trace compatible with +`uftrace <https://github.com/namhyung/uftrace>`_. + +Plugin supports aarch64 and x64, and works in user and system mode, allowing to +trace a system boot, which is not something possible usually. + +In user mode, the memory mapping is directly copied from ``/proc/self/maps`` at +the end of execution. Uftrace should be able to retrieve symbols by itself, +without any additional step. +In system mode, the default memory mapping is empty, and you can generate +one (and associated symbols) using ``contrib/plugins/uftrace_symbols.py``. +Symbols must be present in ELF binaries. + +It tracks the call stack (based on frame pointer analysis). Thus, your program +and its dependencies must be compiled using ``-fno-omit-frame-pointer +-mno-omit-leaf-frame-pointer``. In 2024, `Ubuntu and Fedora enabled it by +default again on x64 +<https://www.brendangregg.com/blog/2024-03-17/the-return-of-the-frame-pointers.html>`_. +On aarch64, this is less of a problem, as they are usually part of the ABI, +except for leaf functions. That's true for user space applications, but not +necessarily for bare metal code. You can read this `section +<uftrace_build_system_example>` to easily build a system with frame pointers. + +When tracing long scenarios (> 1 min), the generated trace can become very long, +making it hard to extract data from it. In this case, a simple solution is to +trace execution while generating a timestamped output log using +``qemu-system-aarch64 ... | ts "%s"``. Then, ``uftrace --time-range=start~end`` +can be used to reduce trace for only this part of execution. + +Performance wise, overhead compared to normal tcg execution is around x5-x15. + +.. list-table:: Uftrace plugin arguments + :widths: 20 80 + :header-rows: 1 + + * - Option + - Description + * - trace-privilege-level=[on|off] + - Generate separate traces for each privilege level (Exception Level + + Security State on aarch64, Rings on x64). + +.. list-table:: uftrace_symbols.py arguments + :widths: 20 80 + :header-rows: 1 + + * - Option + - Description + * - elf_file [elf_file ...] + - path to an ELF file. Use /path/to/file:0xdeadbeef to add a mapping offset. + * - --prefix-symbols + - prepend binary name to symbols + +Example user trace +++++++++++++++++++ + +As an example, we can trace qemu itself running git:: + + $ ./build/qemu-aarch64 -plugin \ + build/contrib/plugins/libuftrace.so \ + ./build/qemu-aarch64 /usr/bin/git --help + + # and generate a chrome trace directly + $ uftrace dump --chrome | gzip > ~/qemu_aarch64_git_help.json.gz + +For convenience, you can download this trace `qemu_aarch64_git_help.json.gz +<https://fileserver.linaro.org/s/N8X8fnZ5yGRZLsT/download/qemu_aarch64_git_help.json.gz>`_. +Download it and open this trace on https://ui.perfetto.dev/. You can zoom in/out +using :kbd:`W`, :kbd:`A`, :kbd:`S`, :kbd:`D` keys. +Some sequences taken from this trace: + +- Loading program and its interpreter + +.. image:: https://fileserver.linaro.org/s/fie8JgX76yyL5cq/preview + :height: 200px + +- open syscall + +.. image:: https://fileserver.linaro.org/s/rsXPTeZZPza4PcE/preview + :height: 200px + +- TB creation + +.. image:: https://fileserver.linaro.org/s/GXY6NKMw5EeRCew/preview + :height: 200px + +It's usually better to use ``uftrace record`` directly. However, tracing +binaries through qemu-user can be convenient when you don't want to recompile +them (``uftrace record`` requires instrumentation), as long as symbols are +present. + +Example system trace +++++++++++++++++++++ + +A full trace example (chrome trace, from instructions below) generated from a +system boot can be found `here +<https://fileserver.linaro.org/s/WsemLboPEzo24nw/download/aarch64_boot.json.gz>`_. +Download it and open this trace on https://ui.perfetto.dev/. You can see code +executed for all privilege levels, and zoom in/out using +:kbd:`W`, :kbd:`A`, :kbd:`S`, :kbd:`D` keys. You can find below some sequences +taken from this trace: + +- Two first stages of boot sequence in Arm Trusted Firmware (EL3 and S-EL1) + +.. image:: https://fileserver.linaro.org/s/kkxBS552W7nYESX/preview + :height: 200px + +- U-boot initialization (until code relocation, after which we can't track it) + +.. image:: https://fileserver.linaro.org/s/LKTgsXNZFi5GFNC/preview + :height: 200px + +- Stat and open syscalls in kernel + +.. image:: https://fileserver.linaro.org/s/dXe4MfraKg2F476/preview + :height: 200px + +- Timer interrupt + +.. image:: https://fileserver.linaro.org/s/TM5yobYzJtP7P3C/preview + :height: 200px + +- Poweroff sequence (from kernel back to firmware, NS-EL2 to EL3) + +.. image:: https://fileserver.linaro.org/s/oR2PtyGKJrqnfRf/preview + :height: 200px + +Build and run system example +++++++++++++++++++++++++++++ + +.. _uftrace_build_system_example: + +Building a full system image with frame pointers is not trivial. + +We provide a `simple way <https://github.com/pbo-linaro/qemu-linux-stack>`_ to +build an aarch64 system, combining Arm Trusted firmware, U-boot, Linux kernel +and debian userland. It's based on containers (``podman`` only) and +``qemu-user-static (binfmt)`` to make sure it's easily reproducible and does not depend +on machine where you build it. + +You can follow the exact same instructions for a x64 system, combining edk2, +Linux, and Ubuntu, simply by switching to +`x86_64 <https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64>`_ branch. + +To build the system:: + + # Install dependencies + $ sudo apt install -y podman qemu-user-static + + $ git clone https://github.com/pbo-linaro/qemu-linux-stack + $ cd qemu-linux-stack + $ ./build.sh + + # system can be started using: + $ ./run.sh /path/to/qemu-system-aarch64 + +To generate a uftrace for a system boot from that:: + + # run true and poweroff the system + $ env INIT=true ./run.sh path/to/qemu-system-aarch64 \ + -plugin path/to/contrib/plugins/libuftrace.so,trace-privilege-level=on + + # generate symbols and memory mapping + $ path/to/contrib/plugins/uftrace_symbols.py \ + --prefix-symbols \ + arm-trusted-firmware/build/qemu/debug/bl1/bl1.elf \ + arm-trusted-firmware/build/qemu/debug/bl2/bl2.elf \ + arm-trusted-firmware/build/qemu/debug/bl31/bl31.elf \ + u-boot/u-boot:0x60000000 \ + linux/vmlinux + + # inspect trace with + $ uftrace replay + +Uftrace allows to filter the trace, and dump flamegraphs, or a chrome trace. +This last one is very interesting to see visually the boot process:: + + $ uftrace dump --chrome > boot.json + # Open your browser, and load boot.json on https://ui.perfetto.dev/. + +Long visual chrome traces can't be easily opened, thus, it might be +interesting to generate them around a particular point of execution:: + + # execute qemu and timestamp output log + $ env INIT=true ./run.sh path/to/qemu-system-aarch64 \ + -plugin path/to/contrib/plugins/libuftrace.so,trace-privilege-level=on |& + ts "%s" | tee exec.log + + $ cat exec.log | grep 'Run /init' + 1753122320 [ 11.834391] Run /init as init process + # init was launched at 1753122320 + + # generate trace around init execution (2 seconds): + $ uftrace dump --chrome --time-range=1753122320~1753122322 > init.json + Other emulation features ------------------------ diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index 7e1c71e..a748a0b 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -24,7 +24,7 @@ #include "hw/pci-host/spapr.h" #include "hw/pci/msix.h" #include "hw/pci/pci_device.h" -#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-container-legacy.h" #include "qemu/error-report.h" #include CONFIG_DEVICES /* CONFIG_VFIO_PCI */ @@ -32,7 +32,7 @@ * Interfaces for IBM EEH (Enhanced Error Handling) */ #ifdef CONFIG_VFIO_PCI -static bool vfio_eeh_container_ok(VFIOContainer *container) +static bool vfio_eeh_container_ok(VFIOLegacyContainer *container) { /* * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO @@ -60,7 +60,7 @@ static bool vfio_eeh_container_ok(VFIOContainer *container) return true; } -static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) +static int vfio_eeh_container_op(VFIOLegacyContainer *container, uint32_t op) { struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op), @@ -83,10 +83,10 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) return ret; } -static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) +static VFIOLegacyContainer *vfio_eeh_as_container(AddressSpace *as) { VFIOAddressSpace *space = vfio_address_space_get(as); - VFIOContainerBase *bcontainer = NULL; + VFIOContainer *bcontainer = NULL; if (QLIST_EMPTY(&space->containers)) { /* No containers to act on */ @@ -111,14 +111,14 @@ out: static bool vfio_eeh_as_ok(AddressSpace *as) { - VFIOContainer *container = vfio_eeh_as_container(as); + VFIOLegacyContainer *container = vfio_eeh_as_container(as); return (container != NULL) && vfio_eeh_container_ok(container); } static int vfio_eeh_as_op(AddressSpace *as, uint32_t op) { - VFIOContainer *container = vfio_eeh_as_container(as); + VFIOLegacyContainer *container = vfio_eeh_as_container(as); if (!container) { return -ENODEV; diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index 938a551..9e31029 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -20,7 +20,7 @@ #include "hw/s390x/s390-pci-clp.h" #include "hw/s390x/s390-pci-vfio.h" #include "hw/vfio/pci.h" -#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-container-legacy.h" #include "hw/vfio/vfio-helpers.h" /* @@ -62,7 +62,7 @@ S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, { S390PCIDMACount *cnt; uint32_t avail; - VFIOPCIDevice *vpdev = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vpdev = VFIO_PCI_DEVICE(pbdev->pdev); int id; assert(vpdev); @@ -108,7 +108,7 @@ static void s390_pci_read_base(S390PCIBusDevice *pbdev, { struct vfio_info_cap_header *hdr; struct vfio_device_info_cap_zpci_base *cap; - VFIOPCIDevice *vpci = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vpci = VFIO_PCI_DEVICE(pbdev->pdev); uint64_t vfio_size; hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE); @@ -162,7 +162,7 @@ static bool get_host_fh(S390PCIBusDevice *pbdev, struct vfio_device_info *info, { struct vfio_info_cap_header *hdr; struct vfio_device_info_cap_zpci_base *cap; - VFIOPCIDevice *vpci = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vpci = VFIO_PCI_DEVICE(pbdev->pdev); hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE); @@ -185,7 +185,7 @@ static void s390_pci_read_group(S390PCIBusDevice *pbdev, struct vfio_device_info_cap_zpci_group *cap; S390pciState *s = s390_get_phb(); ClpRspQueryPciGrp *resgrp; - VFIOPCIDevice *vpci = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vpci = VFIO_PCI_DEVICE(pbdev->pdev); uint8_t start_gid = pbdev->zpci_fn.pfgid; hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); @@ -264,7 +264,7 @@ static void s390_pci_read_util(S390PCIBusDevice *pbdev, { struct vfio_info_cap_header *hdr; struct vfio_device_info_cap_zpci_util *cap; - VFIOPCIDevice *vpci = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vpci = VFIO_PCI_DEVICE(pbdev->pdev); hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); @@ -291,7 +291,7 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev, { struct vfio_info_cap_header *hdr; struct vfio_device_info_cap_zpci_pfip *cap; - VFIOPCIDevice *vpci = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vpci = VFIO_PCI_DEVICE(pbdev->pdev); hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); @@ -314,7 +314,7 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev, static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev) { - VFIOPCIDevice *vfio_pci = VFIO_PCI_BASE(pbdev->pdev); + VFIOPCIDevice *vfio_pci = VFIO_PCI_DEVICE(pbdev->pdev); return vfio_get_device_info(vfio_pci->vbasedev.fd); } diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c index 3cdbd44..411eb7b 100644 --- a/hw/vfio-user/container.c +++ b/hw/vfio-user/container.c @@ -22,14 +22,14 @@ * will fire during memory update transactions. These depend on BQL being held, * so do any resulting map/demap ops async while keeping BQL. */ -static void vfio_user_listener_begin(VFIOContainerBase *bcontainer) +static void vfio_user_listener_begin(VFIOContainer *bcontainer) { VFIOUserContainer *container = VFIO_IOMMU_USER(bcontainer); container->proxy->async_ops = true; } -static void vfio_user_listener_commit(VFIOContainerBase *bcontainer) +static void vfio_user_listener_commit(VFIOContainer *bcontainer) { VFIOUserContainer *container = VFIO_IOMMU_USER(bcontainer); @@ -38,7 +38,7 @@ static void vfio_user_listener_commit(VFIOContainerBase *bcontainer) vfio_user_wait_reqs(container->proxy); } -static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer, +static int vfio_user_dma_unmap(const VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all) { @@ -80,7 +80,7 @@ static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer, return ret; } -static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, +static int vfio_user_dma_map(const VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mrp) { @@ -154,14 +154,14 @@ static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, } static int -vfio_user_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, +vfio_user_set_dirty_page_tracking(const VFIOContainer *bcontainer, bool start, Error **errp) { error_setg_errno(errp, ENOTSUP, "Not supported"); return -ENOTSUP; } -static int vfio_user_query_dirty_bitmap(const VFIOContainerBase *bcontainer, +static int vfio_user_query_dirty_bitmap(const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) { @@ -169,7 +169,7 @@ static int vfio_user_query_dirty_bitmap(const VFIOContainerBase *bcontainer, return -ENOTSUP; } -static bool vfio_user_setup(VFIOContainerBase *bcontainer, Error **errp) +static bool vfio_user_setup(VFIOContainer *bcontainer, Error **errp) { VFIOUserContainer *container = VFIO_IOMMU_USER(bcontainer); @@ -202,7 +202,7 @@ static VFIOUserContainer * vfio_user_container_connect(AddressSpace *as, VFIODevice *vbasedev, Error **errp) { - VFIOContainerBase *bcontainer; + VFIOContainer *bcontainer; VFIOUserContainer *container; VFIOAddressSpace *space; VFIOIOMMUClass *vioc; @@ -260,7 +260,7 @@ put_space_exit: static void vfio_user_container_disconnect(VFIOUserContainer *container) { - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOContainer *bcontainer = VFIO_IOMMU(container); VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); VFIOAddressSpace *space = bcontainer->space; diff --git a/hw/vfio-user/container.h b/hw/vfio-user/container.h index 96aa678..a2b42e3 100644 --- a/hw/vfio-user/container.h +++ b/hw/vfio-user/container.h @@ -9,12 +9,12 @@ #include "qemu/osdep.h" -#include "hw/vfio/vfio-container-base.h" +#include "hw/vfio/vfio-container.h" #include "hw/vfio-user/proxy.h" /* MMU container sub-class for vfio-user. */ struct VFIOUserContainer { - VFIOContainerBase parent_obj; + VFIOContainer parent_obj; VFIOUserProxy *proxy; }; diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c index e2c3097..b53ed3b 100644 --- a/hw/vfio-user/pci.c +++ b/hw/vfio-user/pci.c @@ -234,9 +234,10 @@ static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev); - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; const char *sock_name; + AddressSpace *as; SocketAddress addr; VFIOUserProxy *proxy; @@ -343,10 +344,10 @@ error: vfio_pci_put_device(vdev); } -static void vfio_user_instance_init(Object *obj) +static void vfio_user_pci_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -369,9 +370,9 @@ static void vfio_user_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } -static void vfio_user_instance_finalize(Object *obj) +static void vfio_user_pci_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); VFIODevice *vbasedev = &vdev->vbasedev; if (vdev->msix != NULL) { @@ -387,7 +388,7 @@ static void vfio_user_instance_finalize(Object *obj) static void vfio_user_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(dev); VFIODevice *vbasedev = &vdev->vbasedev; vfio_pci_pre_reset(vdev); @@ -399,7 +400,7 @@ static void vfio_user_pci_reset(DeviceState *dev) vfio_pci_post_reset(vdev); } -static const Property vfio_user_pci_dev_properties[] = { +static const Property vfio_user_pci_properties[] = { DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, @@ -421,7 +422,7 @@ static void vfio_user_pci_set_socket(Object *obj, Visitor *v, const char *name, VFIOUserPCIDevice *udev = VFIO_USER_PCI(obj); bool success; - if (VFIO_PCI_BASE(udev)->vbasedev.proxy) { + if (VFIO_PCI_DEVICE(udev)->vbasedev.proxy) { error_setg(errp, "Proxy is connected"); return; } @@ -445,13 +446,13 @@ static void vfio_user_pci_set_socket(Object *obj, Visitor *v, const char *name, } } -static void vfio_user_pci_dev_class_init(ObjectClass *klass, const void *data) +static void vfio_user_pci_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); device_class_set_legacy_reset(dc, vfio_user_pci_reset); - device_class_set_props(dc, vfio_user_pci_dev_properties); + device_class_set_props(dc, vfio_user_pci_properties); object_class_property_add(klass, "socket", "SocketAddress", NULL, vfio_user_pci_set_socket, NULL, NULL); @@ -462,18 +463,18 @@ static void vfio_user_pci_dev_class_init(ObjectClass *klass, const void *data) pdc->realize = vfio_user_pci_realize; } -static const TypeInfo vfio_user_pci_dev_info = { +static const TypeInfo vfio_user_pci_info = { .name = TYPE_VFIO_USER_PCI, - .parent = TYPE_VFIO_PCI_BASE, + .parent = TYPE_VFIO_PCI_DEVICE, .instance_size = sizeof(VFIOUserPCIDevice), - .class_init = vfio_user_pci_dev_class_init, - .instance_init = vfio_user_instance_init, - .instance_finalize = vfio_user_instance_finalize, + .class_init = vfio_user_pci_class_init, + .instance_init = vfio_user_pci_init, + .instance_finalize = vfio_user_pci_finalize, }; static void register_vfio_user_dev_type(void) { - type_register_static(&vfio_user_pci_dev_info); + type_register_static(&vfio_user_pci_info); } - type_init(register_vfio_user_dev_type) +type_init(register_vfio_user_dev_type) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c deleted file mode 100644 index 5630497..0000000 --- a/hw/vfio/container-base.c +++ /dev/null @@ -1,347 +0,0 @@ -/* - * VFIO BASE CONTAINER - * - * Copyright (C) 2023 Intel Corporation. - * Copyright Red Hat, Inc. 2023 - * - * Authors: Yi Liu <yi.l.liu@intel.com> - * Eric Auger <eric.auger@redhat.com> - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#include <sys/ioctl.h> -#include <linux/vfio.h> - -#include "qemu/osdep.h" -#include "system/tcg.h" -#include "system/ram_addr.h" -#include "qapi/error.h" -#include "qemu/error-report.h" -#include "hw/vfio/vfio-container-base.h" -#include "hw/vfio/vfio-device.h" /* vfio_device_reset_handler */ -#include "system/reset.h" -#include "vfio-helpers.h" - -#include "trace.h" - -static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = - QLIST_HEAD_INITIALIZER(vfio_address_spaces); - -VFIOAddressSpace *vfio_address_space_get(AddressSpace *as) -{ - VFIOAddressSpace *space; - - QLIST_FOREACH(space, &vfio_address_spaces, list) { - if (space->as == as) { - return space; - } - } - - /* No suitable VFIOAddressSpace, create a new one */ - space = g_malloc0(sizeof(*space)); - space->as = as; - QLIST_INIT(&space->containers); - - if (QLIST_EMPTY(&vfio_address_spaces)) { - qemu_register_reset(vfio_device_reset_handler, NULL); - } - - QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); - - return space; -} - -void vfio_address_space_put(VFIOAddressSpace *space) -{ - if (!QLIST_EMPTY(&space->containers)) { - return; - } - - QLIST_REMOVE(space, list); - g_free(space); - - if (QLIST_EMPTY(&vfio_address_spaces)) { - qemu_unregister_reset(vfio_device_reset_handler, NULL); - } -} - -void vfio_address_space_insert(VFIOAddressSpace *space, - VFIOContainerBase *bcontainer) -{ - QLIST_INSERT_HEAD(&space->containers, bcontainer, next); - bcontainer->space = space; -} - -int vfio_container_dma_map(VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly, MemoryRegion *mr) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - RAMBlock *rb = mr->ram_block; - int mfd = rb ? qemu_ram_get_fd(rb) : -1; - - if (mfd >= 0 && vioc->dma_map_file) { - unsigned long start = vaddr - qemu_ram_get_host_addr(rb); - unsigned long offset = qemu_ram_get_fd_offset(rb); - - return vioc->dma_map_file(bcontainer, iova, size, mfd, start + offset, - readonly); - } - g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); -} - -int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb, bool unmap_all) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - g_assert(vioc->dma_unmap); - return vioc->dma_unmap(bcontainer, iova, size, iotlb, unmap_all); -} - -bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, - Error **errp) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - if (!vioc->add_window) { - return true; - } - - return vioc->add_window(bcontainer, section, errp); -} - -void vfio_container_del_section_window(VFIOContainerBase *bcontainer, - MemoryRegionSection *section) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - if (!vioc->del_window) { - return; - } - - return vioc->del_window(bcontainer, section); -} - -int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, - bool start, Error **errp) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - int ret; - - if (!bcontainer->dirty_pages_supported) { - return 0; - } - - g_assert(vioc->set_dirty_page_tracking); - if (bcontainer->dirty_pages_started == start) { - return 0; - } - - ret = vioc->set_dirty_page_tracking(bcontainer, start, errp); - if (!ret) { - bcontainer->dirty_pages_started = start; - } - - return ret; -} - -static bool vfio_container_devices_dirty_tracking_is_started( - const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (!vbasedev->dirty_tracking) { - return false; - } - } - - return true; -} - -bool vfio_container_dirty_tracking_is_started( - const VFIOContainerBase *bcontainer) -{ - return vfio_container_devices_dirty_tracking_is_started(bcontainer) || - bcontainer->dirty_pages_started; -} - -bool vfio_container_devices_dirty_tracking_is_supported( - const VFIOContainerBase *bcontainer) -{ - VFIODevice *vbasedev; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { - return false; - } - if (!vbasedev->dirty_pages_supported) { - return false; - } - } - - return true; -} - -static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, - hwaddr size, void *bitmap) -{ - uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + - sizeof(struct vfio_device_feature_dma_logging_report), - sizeof(uint64_t))] = {}; - struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; - struct vfio_device_feature_dma_logging_report *report = - (struct vfio_device_feature_dma_logging_report *)feature->data; - - report->iova = iova; - report->length = size; - report->page_size = qemu_real_host_page_size(); - report->bitmap = (uintptr_t)bitmap; - - feature->argsz = sizeof(buf); - feature->flags = VFIO_DEVICE_FEATURE_GET | - VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - - return vbasedev->io_ops->device_feature(vbasedev, feature); -} - -static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - g_assert(vioc->query_dirty_bitmap); - return vioc->query_dirty_bitmap(bcontainer, vbmap, iova, size, - errp); -} - -static int vfio_container_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - VFIODevice *vbasedev; - int ret; - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - ret = vfio_device_dma_logging_report(vbasedev, iova, size, - vbmap->bitmap); - if (ret) { - error_setg_errno(errp, -ret, - "%s: Failed to get DMA logging report, iova: " - "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, - vbasedev->name, iova, size); - - return ret; - } - } - - return 0; -} - -int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, - uint64_t size, ram_addr_t ram_addr, Error **errp) -{ - bool all_device_dirty_tracking = - vfio_container_devices_dirty_tracking_is_supported(bcontainer); - uint64_t dirty_pages; - VFIOBitmap vbmap; - int ret; - - if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { - cpu_physical_memory_set_dirty_range(ram_addr, size, - tcg_enabled() ? DIRTY_CLIENTS_ALL : - DIRTY_CLIENTS_NOCODE); - return 0; - } - - ret = vfio_bitmap_alloc(&vbmap, size); - if (ret) { - error_setg_errno(errp, -ret, - "Failed to allocate dirty tracking bitmap"); - return ret; - } - - if (all_device_dirty_tracking) { - ret = vfio_container_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, - errp); - } else { - ret = vfio_container_iommu_query_dirty_bitmap(bcontainer, &vbmap, iova, size, - errp); - } - - if (ret) { - goto out; - } - - dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, - vbmap.pages); - - trace_vfio_container_query_dirty_bitmap(iova, size, vbmap.size, ram_addr, - dirty_pages); -out: - g_free(vbmap.bitmap); - - return ret; -} - -static gpointer copy_iova_range(gconstpointer src, gpointer data) -{ - Range *source = (Range *)src; - Range *dest = g_new(Range, 1); - - range_set_bounds(dest, range_lob(source), range_upb(source)); - return dest; -} - -GList *vfio_container_get_iova_ranges(const VFIOContainerBase *bcontainer) -{ - assert(bcontainer); - return g_list_copy_deep(bcontainer->iova_ranges, copy_iova_range, NULL); -} - -static void vfio_container_instance_finalize(Object *obj) -{ - VFIOContainerBase *bcontainer = VFIO_IOMMU(obj); - VFIOGuestIOMMU *giommu, *tmp; - - QLIST_SAFE_REMOVE(bcontainer, next); - - QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier( - MEMORY_REGION(giommu->iommu_mr), &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - } - - g_list_free_full(bcontainer->iova_ranges, g_free); -} - -static void vfio_container_instance_init(Object *obj) -{ - VFIOContainerBase *bcontainer = VFIO_IOMMU(obj); - - bcontainer->error = NULL; - bcontainer->dirty_pages_supported = false; - bcontainer->dma_max_mappings = 0; - bcontainer->iova_ranges = NULL; - QLIST_INIT(&bcontainer->giommu_list); - QLIST_INIT(&bcontainer->vrdl_list); -} - -static const TypeInfo types[] = { - { - .name = TYPE_VFIO_IOMMU, - .parent = TYPE_OBJECT, - .instance_init = vfio_container_instance_init, - .instance_finalize = vfio_container_instance_finalize, - .instance_size = sizeof(VFIOContainerBase), - .class_size = sizeof(VFIOIOMMUClass), - .abstract = true, - }, -}; - -DEFINE_TYPES(types) diff --git a/hw/vfio/container-legacy.c b/hw/vfio/container-legacy.c new file mode 100644 index 0000000..c0f87f7 --- /dev/null +++ b/hw/vfio/container-legacy.c @@ -0,0 +1,1277 @@ +/* + * generic functions used by VFIO devices + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on qemu-kvm device-assignment: + * Adapted for KVM by Qumranet. + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include <linux/vfio.h> + +#include "hw/vfio/vfio-device.h" +#include "system/address-spaces.h" +#include "system/memory.h" +#include "system/ram_addr.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "system/reset.h" +#include "trace.h" +#include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/blocker.h" +#include "pci.h" +#include "hw/vfio/vfio-container-legacy.h" +#include "vfio-helpers.h" +#include "vfio-listener.h" + +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" + +typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; +static VFIOGroupList vfio_group_list = + QLIST_HEAD_INITIALIZER(vfio_group_list); + +static int vfio_ram_block_discard_disable(VFIOLegacyContainer *container, + bool state) +{ + switch (container->iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); + default: + /* + * VFIO_SPAPR_TCE_IOMMU most probably works just fine with + * RamDiscardManager, however, it is completely untested. + * + * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does + * completely the opposite of managing mapping/pinning dynamically as + * required by RamDiscardManager. We would have to special-case sections + * with a RamDiscardManager. + */ + return ram_block_discard_disable(state); + } +} + +static int vfio_dma_unmap_bitmap(const VFIOLegacyContainer *container, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + const VFIOContainer *bcontainer = VFIO_IOMMU(container); + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; + VFIOBitmap vbmap; + int ret; + + ret = vfio_bitmap_alloc(&vbmap, size); + if (ret) { + return ret; + } + + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); + + unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); + unmap->iova = iova; + unmap->size = size; + unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; + bitmap = (struct vfio_bitmap *)&unmap->data; + + /* + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize + * to qemu_real_host_page_size. + */ + bitmap->pgsize = qemu_real_host_page_size(); + bitmap->size = vbmap.size; + bitmap->data = (__u64 *)vbmap.bitmap; + + if (vbmap.size > bcontainer->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); + ret = -E2BIG; + goto unmap_exit; + } + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + if (!ret) { + cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, + iotlb->translated_addr, vbmap.pages); + } else { + error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); + } + +unmap_exit: + g_free(unmap); + g_free(vbmap.bitmap); + + return ret; +} + +static int vfio_legacy_dma_unmap_one(const VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = iova, + .size = size, + }; + bool need_dirty_sync = false; + int ret; + Error *local_err = NULL; + + g_assert(!cpr_is_incoming()); + + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { + if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && + bcontainer->dirty_pages_supported) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + + need_dirty_sync = true; + } + + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + /* + * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c + * v4.15) where an overflow in its wrap-around check prevents us from + * unmapping the last page of the address space. Test for the error + * condition and re-try the unmap excluding the last page. The + * expectation is that we've never mapped the last page anyway and this + * unmap request comes via vIOMMU support which also makes it unlikely + * that this page is used. This bug was introduced well after type1 v2 + * support was introduced, so we shouldn't need to test for v1. A fix + * is queued for kernel v5.0 so this workaround can be removed once + * affected kernels are sufficiently deprecated. + */ + if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && + container->iommu_type == VFIO_TYPE1v2_IOMMU) { + trace_vfio_legacy_dma_unmap_overflow_workaround(); + unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); + continue; + } + return -errno; + } + + if (need_dirty_sync) { + ret = vfio_container_query_dirty_bitmap(bcontainer, iova, size, + iotlb->translated_addr, &local_err); + if (ret) { + error_report_err(local_err); + return ret; + } + } + + return 0; +} + +/* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +static int vfio_legacy_dma_unmap(const VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) +{ + int ret; + + if (unmap_all) { + /* The unmap ioctl doesn't accept a full 64-bit span. */ + Int128 llsize = int128_rshift(int128_2_64(), 1); + + ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize), + iotlb); + + if (ret == 0) { + ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize), + int128_get64(llsize), iotlb); + } + + } else { + ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb); + } + + return ret; +} + +static int vfio_legacy_dma_map(const VFIOContainer *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_READ, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + if (!readonly) { + map.flags |= VFIO_DMA_MAP_FLAG_WRITE; + } + + /* + * Try the mapping, if it fails with EBUSY, unmap the region and try + * again. This shouldn't be necessary, but we sometimes see it in + * the VGA ROM space. + */ + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || + (errno == EBUSY && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, false) == 0 && + ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { + return 0; + } + + return -errno; +} + +static int +vfio_legacy_set_dirty_page_tracking(const VFIOContainer *bcontainer, + bool start, Error **errp) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + int ret; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + ret = -errno; + error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x", + dirty.flags); + } + + return ret; +} + +static int vfio_legacy_query_dirty_bitmap(const VFIOContainer *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) +{ + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + int ret; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; + + /* + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize + * to qemu_real_host_page_size. + */ + range->bitmap.pgsize = qemu_real_host_page_size(); + range->bitmap.size = vbmap->size; + range->bitmap.data = (__u64 *)vbmap->bitmap; + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + ret = -errno; + error_setg_errno(errp, errno, + "Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64, (uint64_t)range->iova, + (uint64_t)range->size); + } + + g_free(dbitmap); + + return ret; +} + +static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, + VFIOContainer *bcontainer) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_iova_range *cap; + + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + if (!hdr) { + return false; + } + + cap = (void *)hdr; + + for (int i = 0; i < cap->nr_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, cap->iova_ranges[i].start, + cap->iova_ranges[i].end); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); + } + + return true; +} + +static void vfio_group_add_kvm_device(VFIOGroup *group) +{ + Error *err = NULL; + + if (vfio_kvm_device_add_fd(group->fd, &err)) { + error_reportf_err(err, "group ID %d: ", group->groupid); + } +} + +static void vfio_group_del_kvm_device(VFIOGroup *group) +{ + Error *err = NULL; + + if (vfio_kvm_device_del_fd(group->fd, &err)) { + error_reportf_err(err, "group ID %d: ", group->groupid); + } +} + +/* + * vfio_get_iommu_type - selects the richest iommu_type (v2 first) + */ +static int vfio_get_iommu_type(int container_fd, + Error **errp) +{ + int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, + VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; + int i; + + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { + if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { + return iommu_types[i]; + } + } + error_setg(errp, "No available IOMMU models"); + return -EINVAL; +} + +/* + * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type + */ +static const char *vfio_get_iommu_class_name(int iommu_type) +{ + switch (iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + return TYPE_VFIO_IOMMU_LEGACY; + break; + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + return TYPE_VFIO_IOMMU_SPAPR; + break; + default: + g_assert_not_reached(); + }; +} + +static bool vfio_set_iommu(int container_fd, int group_fd, + int *iommu_type, Error **errp) +{ + if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) { + error_setg_errno(errp, errno, "Failed to set group container"); + return false; + } + + while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) { + if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + /* + * On sPAPR, despite the IOMMU subdriver always advertises v1 and + * v2, the running platform may not support v2 and there is no + * way to guess it until an IOMMU group gets added to the container. + * So in case it fails with v2, try v1 as a fallback. + */ + *iommu_type = VFIO_SPAPR_TCE_IOMMU; + continue; + } + error_setg_errno(errp, errno, "Failed to set iommu for container"); + return false; + } + + return true; +} + +static VFIOLegacyContainer *vfio_create_container(int fd, VFIOGroup *group, + Error **errp) +{ + int iommu_type; + const char *vioc_name; + VFIOLegacyContainer *container; + + iommu_type = vfio_get_iommu_type(fd, errp); + if (iommu_type < 0) { + return NULL; + } + + /* + * During CPR, just set the container type and skip the ioctls, as the + * container and group are already configured in the kernel. + */ + if (!cpr_is_incoming() && + !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { + return NULL; + } + + vioc_name = vfio_get_iommu_class_name(iommu_type); + + container = VFIO_IOMMU_LEGACY(object_new(vioc_name)); + container->fd = fd; + container->iommu_type = iommu_type; + return container; +} + +static int vfio_get_iommu_info(VFIOLegacyContainer *container, + struct vfio_iommu_type1_info **info) +{ + + size_t argsz = sizeof(struct vfio_iommu_type1_info); + + *info = g_new0(struct vfio_iommu_type1_info, 1); +again: + (*info)->argsz = argsz; + + if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { + g_free(*info); + *info = NULL; + return -errno; + } + + if (((*info)->argsz > argsz)) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + goto again; + } + + return 0; +} + +static struct vfio_info_cap_header * +vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_get_iommu_info_migration(VFIOLegacyContainer *container, + struct vfio_iommu_type1_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_migration *cap_mig; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + + hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); + if (!hdr) { + return; + } + + cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, + header); + + /* + * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of + * qemu_real_host_page_size to mark those dirty. + */ + if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { + bcontainer->dirty_pages_supported = true; + bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +} + +static bool vfio_legacy_setup(VFIOContainer *bcontainer, Error **errp) +{ + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + g_autofree struct vfio_iommu_type1_info *info = NULL; + int ret; + + ret = vfio_get_iommu_info(container, &info); + if (ret) { + error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); + return false; + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { + bcontainer->pgsizes = info->iova_pgsizes; + } else { + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + return true; +} + +static bool vfio_container_attach_discard_disable( + VFIOLegacyContainer *container, VFIOGroup *group, Error **errp) +{ + int ret; + + /* + * VFIO is currently incompatible with discarding of RAM insofar as the + * madvise to purge (zap) the page from QEMU's address space does not + * interact with the memory API and therefore leaves stale virtual to + * physical mappings in the IOMMU if the page was previously pinned. We + * therefore set discarding broken for each group added to a container, + * whether the container is used individually or shared. This provides + * us with options to allow devices within a group to opt-in and allow + * discarding, so long as it is done consistently for a group (for instance + * if the device is an mdev device where it is known that the host vendor + * driver will never pin pages outside of the working set of the guest + * driver, which would thus not be discarding candidates). + * + * The first opportunity to induce pinning occurs here where we attempt to + * attach the group to existing containers within the AddressSpace. If any + * pages are already zapped from the virtual address space, such as from + * previous discards, new pinning will cause valid mappings to be + * re-established. Likewise, when the overall MemoryListener for a new + * container is registered, a replay of mappings within the AddressSpace + * will occur, re-establishing any previously zapped pages as well. + * + * Especially virtio-balloon is currently only prevented from discarding + * new memory, it will not yet set ram_block_discard_set_required() and + * therefore, neither stops us here or deals with the sudden memory + * consumption of inflated memory. + * + * We do support discarding of memory coordinated via the RamDiscardManager + * with some IOMMU types. vfio_ram_block_discard_disable() handles the + * details once we know which type of IOMMU we are using. + */ + + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + } + return !ret; +} + +static bool vfio_container_group_add(VFIOLegacyContainer *container, + VFIOGroup *group, Error **errp) +{ + if (!vfio_container_attach_discard_disable(container, group, errp)) { + return false; + } + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + vfio_group_add_kvm_device(group); + /* + * Remember the container fd for each group, so we can attach to the same + * container after CPR. + */ + cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} + +static void vfio_container_group_del(VFIOLegacyContainer *container, + VFIOGroup *group) +{ + QLIST_REMOVE(group, container_next); + group->container = NULL; + vfio_group_del_kvm_device(group); + vfio_ram_block_discard_disable(container, false); + cpr_delete_fd("vfio_container_for_group", group->groupid); +} + +static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, + Error **errp) +{ + VFIOLegacyContainer *container; + VFIOContainer *bcontainer; + int ret, fd = -1; + VFIOAddressSpace *space; + VFIOIOMMUClass *vioc = NULL; + bool new_container = false; + bool group_was_added = false; + + space = vfio_address_space_get(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); + + if (!cpr_is_incoming()) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = VFIO_IOMMU_LEGACY(bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + return vfio_container_group_add(container, group, errp); + } + } + + fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); + if (fd < 0) { + goto fail; + } + } else { + /* + * For incoming CPR, the group is already attached in the kernel. + * If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, + * create the container struct and group list. + */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = VFIO_IOMMU_LEGACY(bcontainer); + + if (vfio_cpr_container_match(container, group, fd)) { + return vfio_container_group_add(container, group, errp); + } + } + } + + ret = ioctl(fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + error_setg(errp, "supported vfio version: %d, " + "reported version: %d", VFIO_API_VERSION, ret); + goto fail; + } + + container = vfio_create_container(fd, group, errp); + if (!container) { + goto fail; + } + new_container = true; + bcontainer = VFIO_IOMMU(container); + + if (!vfio_legacy_cpr_register_container(container, errp)) { + goto fail; + } + + vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + assert(vioc->setup); + + if (!vioc->setup(bcontainer, errp)) { + goto fail; + } + + vfio_address_space_insert(space, bcontainer); + + if (!vfio_container_group_add(container, group, errp)) { + goto fail; + } + group_was_added = true; + + /* + * If CPR, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * dma_map to supply the new vaddr, and the calls will match the mappings + * remembered by the kernel. + */ + if (!cpr_is_incoming()) { + if (!vfio_listener_register(bcontainer, errp)) { + goto fail; + } + } + + bcontainer->initialized = true; + + return true; + +fail: + if (new_container) { + vfio_listener_unregister(bcontainer); + } + + if (group_was_added) { + vfio_container_group_del(container, group); + } + if (vioc && vioc->release) { + vioc->release(bcontainer); + } + if (new_container) { + vfio_legacy_cpr_unregister_container(container); + object_unref(container); + } + if (fd >= 0) { + close(fd); + } + vfio_address_space_put(space); + + return false; +} + +static void vfio_container_disconnect(VFIOGroup *group) +{ + VFIOLegacyContainer *container = group->container; + VFIOContainer *bcontainer = VFIO_IOMMU(container); + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + + QLIST_REMOVE(group, container_next); + group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); + + /* + * Explicitly release the listener first before unset container, + * since unset may destroy the backend container if it's the last + * group. + */ + if (QLIST_EMPTY(&container->group_list)) { + vfio_listener_unregister(bcontainer); + if (vioc->release) { + vioc->release(bcontainer); + } + } + + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from container", + group->groupid); + } + + if (QLIST_EMPTY(&container->group_list)) { + VFIOAddressSpace *space = bcontainer->space; + + trace_vfio_container_disconnect(container->fd); + vfio_legacy_cpr_unregister_container(container); + close(container->fd); + object_unref(container); + + vfio_address_space_put(space); + } +} + +static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) +{ + ERRP_GUARD(); + VFIOGroup *group; + char path[32]; + struct vfio_group_status status = { .argsz = sizeof(status) }; + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == groupid) { + /* Found it. Now is it already in the right context? */ + if (VFIO_IOMMU(group->container)->space->as == as) { + return group; + } else { + error_setg(errp, "group %d used in multiple address spaces", + group->groupid); + return NULL; + } + } + } + + group = g_malloc0(sizeof(*group)); + + snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); + group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); + if (group->fd < 0) { + goto free_group_exit; + } + + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { + error_setg_errno(errp, errno, "failed to get group %d status", groupid); + goto close_fd_exit; + } + + if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + error_setg(errp, "group %d is not viable", groupid); + error_append_hint(errp, + "Please ensure all devices within the iommu_group " + "are bound to their vfio bus driver.\n"); + goto close_fd_exit; + } + + group->groupid = groupid; + QLIST_INIT(&group->device_list); + + if (!vfio_container_connect(group, as, errp)) { + error_prepend(errp, "failed to setup container for group %d: ", + groupid); + goto close_fd_exit; + } + + QLIST_INSERT_HEAD(&vfio_group_list, group, next); + + return group; + +close_fd_exit: + cpr_delete_fd("vfio_group", groupid); + close(group->fd); + +free_group_exit: + g_free(group); + + return NULL; +} + +static void vfio_group_put(VFIOGroup *group) +{ + if (!group || !QLIST_EMPTY(&group->device_list)) { + return; + } + + if (!group->ram_block_discard_allowed) { + vfio_ram_block_discard_disable(group->container, false); + } + vfio_group_del_kvm_device(group); + vfio_container_disconnect(group); + QLIST_REMOVE(group, next); + trace_vfio_group_put(group->fd); + cpr_delete_fd("vfio_group", group->groupid); + close(group->fd); + g_free(group); +} + +static bool vfio_device_get(VFIOGroup *group, const char *name, + VFIODevice *vbasedev, Error **errp) +{ + g_autofree struct vfio_device_info *info = NULL; + int fd; + + fd = vfio_cpr_group_get_device_fd(group->fd, name); + if (fd < 0) { + error_setg_errno(errp, errno, "error getting device from group %d", + group->groupid); + error_append_hint(errp, + "Verify all devices in group %d are bound to vfio-<bus> " + "or pci-stub and not already in use\n", group->groupid); + return false; + } + + info = vfio_get_device_info(fd); + if (!info) { + error_setg_errno(errp, errno, "error getting device info"); + goto fail; + } + + /* + * Set discarding of RAM as not broken for this group if the driver knows + * the device operates compatibly with discarding. Setting must be + * consistent per group, but since compatibility is really only possible + * with mdev currently, we expect singleton groups. + */ + if (vbasedev->ram_block_discard_allowed != + group->ram_block_discard_allowed) { + if (!QLIST_EMPTY(&group->device_list)) { + error_setg(errp, "Inconsistent setting of support for discarding " + "RAM (e.g., balloon) within group"); + goto fail; + } + + if (!group->ram_block_discard_allowed) { + group->ram_block_discard_allowed = true; + vfio_ram_block_discard_disable(group->container, false); + } + } + + vfio_device_prepare(vbasedev, VFIO_IOMMU(group->container), info); + + vbasedev->fd = fd; + vbasedev->group = group; + QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); + + trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); + + return true; + +fail: + close(fd); + cpr_delete_fd(name, 0); + return false; +} + +static void vfio_device_put(VFIODevice *vbasedev) +{ + if (!vbasedev->group) { + return; + } + QLIST_REMOVE(vbasedev, next); + vbasedev->group = NULL; + trace_vfio_device_put(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); + close(vbasedev->fd); +} + +static int vfio_device_get_groupid(VFIODevice *vbasedev, Error **errp) +{ + char *tmp, group_path[PATH_MAX]; + g_autofree char *group_name = NULL; + int ret, groupid; + ssize_t len; + + tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev); + len = readlink(tmp, group_path, sizeof(group_path)); + g_free(tmp); + + if (len <= 0 || len >= sizeof(group_path)) { + ret = len < 0 ? -errno : -ENAMETOOLONG; + error_setg_errno(errp, -ret, "no iommu_group found"); + return ret; + } + + group_path[len] = 0; + + group_name = g_path_get_basename(group_path); + if (sscanf(group_name, "%d", &groupid) != 1) { + error_setg_errno(errp, errno, "failed to read %s", group_path); + return -errno; + } + return groupid; +} + +/* + * vfio_device_attach: attach a device to a security context + * @name and @vbasedev->name are likely to be different depending + * on the type of the device, hence the need for passing @name + */ +static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + int groupid = vfio_device_get_groupid(vbasedev, errp); + VFIODevice *vbasedev_iter; + VFIOGroup *group; + + if (groupid < 0) { + return false; + } + + trace_vfio_device_attach(vbasedev->name, groupid); + + group = vfio_group_get(groupid, as, errp); + if (!group) { + return false; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { + error_setg(errp, "device is already attached"); + goto group_put_exit; + } + } + if (!vfio_device_get(group, name, vbasedev, errp)) { + goto group_put_exit; + } + + if (!vfio_device_hiod_create_and_realize(vbasedev, + TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + errp)) { + goto device_put_exit; + } + + if (vbasedev->mdev) { + error_setg(&vbasedev->cpr.mdev_blocker, + "CPR does not support vfio mdev %s", vbasedev->name); + if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) < 0) { + goto hiod_unref_exit; + } + } + + return true; + +hiod_unref_exit: + object_unref(vbasedev->hiod); +device_put_exit: + vfio_device_put(vbasedev); +group_put_exit: + vfio_group_put(group); + return false; +} + +static void vfio_legacy_detach_device(VFIODevice *vbasedev) +{ + VFIOGroup *group = vbasedev->group; + + trace_vfio_device_detach(vbasedev->name, group->groupid); + + vfio_device_unprepare(vbasedev); + + migrate_del_blocker(&vbasedev->cpr.mdev_blocker); + object_unref(vbasedev->hiod); + vfio_device_put(vbasedev); + vfio_group_put(group); +} + +static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + trace_vfio_pci_hot_reset_dep_devices(host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. */ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + !vfio_pci_from_vfio_device(vbasedev_iter)) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? strerror(errno) : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + !vfio_pci_from_vfio_device(vbasedev_iter)) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + +static void vfio_iommu_legacy_class_init(ObjectClass *klass, const void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->setup = vfio_legacy_setup; + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; + vioc->detach_device = vfio_legacy_detach_device; + vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; +}; + +static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); + hiod->agent = opaque; + + return true; +} + +static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) +{ + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static GList * +hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod) +{ + VFIODevice *vdev = hiod->agent; + + g_assert(vdev); + return vfio_container_get_iova_ranges(vdev->bcontainer); +} + +static uint64_t +hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod) +{ + VFIODevice *vdev = hiod->agent; + + g_assert(vdev); + return vfio_container_get_page_size_mask(vdev->bcontainer); +} + +static void vfio_iommu_legacy_instance_init(Object *obj) +{ + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(obj); + + QLIST_INIT(&container->group_list); +} + +static void hiod_legacy_vfio_class_init(ObjectClass *oc, const void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; + hioc->get_cap = hiod_legacy_vfio_get_cap; + hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges; + hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .instance_init = vfio_iommu_legacy_instance_init, + .instance_size = sizeof(VFIOLegacyContainer), + .class_init = vfio_iommu_legacy_class_init, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_legacy_vfio_class_init, + } +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 030c6d3..250b20f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -1,1275 +1,350 @@ /* - * generic functions used by VFIO devices + * VFIO BASE CONTAINER * - * Copyright Red Hat, Inc. 2012 + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 * - * Authors: - * Alex Williamson <alex.williamson@redhat.com> + * Authors: Yi Liu <yi.l.liu@intel.com> + * Eric Auger <eric.auger@redhat.com> * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Based on qemu-kvm device-assignment: - * Adapted for KVM by Qumranet. - * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) - * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) - * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) - * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) - * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + * SPDX-License-Identifier: GPL-2.0-or-later */ -#include "qemu/osdep.h" #include <sys/ioctl.h> #include <linux/vfio.h> -#include "hw/vfio/vfio-device.h" -#include "system/address-spaces.h" -#include "system/memory.h" +#include "qemu/osdep.h" +#include "system/tcg.h" #include "system/ram_addr.h" -#include "qemu/error-report.h" -#include "qemu/range.h" -#include "system/reset.h" -#include "trace.h" #include "qapi/error.h" -#include "migration/cpr.h" -#include "migration/blocker.h" -#include "pci.h" +#include "qemu/error-report.h" #include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-device.h" /* vfio_device_reset_handler */ +#include "system/reset.h" #include "vfio-helpers.h" -#include "vfio-listener.h" - -#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" - -typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; -static VFIOGroupList vfio_group_list = - QLIST_HEAD_INITIALIZER(vfio_group_list); - -static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) -{ - switch (container->iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - /* - * We support coordinated discarding of RAM via the RamDiscardManager. - */ - return ram_block_uncoordinated_discard_disable(state); - default: - /* - * VFIO_SPAPR_TCE_IOMMU most probably works just fine with - * RamDiscardManager, however, it is completely untested. - * - * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does - * completely the opposite of managing mapping/pinning dynamically as - * required by RamDiscardManager. We would have to special-case sections - * with a RamDiscardManager. - */ - return ram_block_discard_disable(state); - } -} - -static int vfio_dma_unmap_bitmap(const VFIOContainer *container, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) -{ - const VFIOContainerBase *bcontainer = VFIO_IOMMU(container); - struct vfio_iommu_type1_dma_unmap *unmap; - struct vfio_bitmap *bitmap; - VFIOBitmap vbmap; - int ret; - - ret = vfio_bitmap_alloc(&vbmap, size); - if (ret) { - return ret; - } - - unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); - - unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); - unmap->iova = iova; - unmap->size = size; - unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; - bitmap = (struct vfio_bitmap *)&unmap->data; - - /* - * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of - * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize - * to qemu_real_host_page_size. - */ - bitmap->pgsize = qemu_real_host_page_size(); - bitmap->size = vbmap.size; - bitmap->data = (__u64 *)vbmap.bitmap; - - if (vbmap.size > bcontainer->max_dirty_bitmap_size) { - error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); - ret = -E2BIG; - goto unmap_exit; - } - - ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); - if (!ret) { - cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, - iotlb->translated_addr, vbmap.pages); - } else { - error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); - } - -unmap_exit: - g_free(unmap); - g_free(vbmap.bitmap); - - return ret; -} - -static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) -{ - const VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - struct vfio_iommu_type1_dma_unmap unmap = { - .argsz = sizeof(unmap), - .flags = 0, - .iova = iova, - .size = size, - }; - bool need_dirty_sync = false; - int ret; - Error *local_err = NULL; - - g_assert(!cpr_is_incoming()); - - if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { - if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && - bcontainer->dirty_pages_supported) { - return vfio_dma_unmap_bitmap(container, iova, size, iotlb); - } - need_dirty_sync = true; - } - - while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { - /* - * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c - * v4.15) where an overflow in its wrap-around check prevents us from - * unmapping the last page of the address space. Test for the error - * condition and re-try the unmap excluding the last page. The - * expectation is that we've never mapped the last page anyway and this - * unmap request comes via vIOMMU support which also makes it unlikely - * that this page is used. This bug was introduced well after type1 v2 - * support was introduced, so we shouldn't need to test for v1. A fix - * is queued for kernel v5.0 so this workaround can be removed once - * affected kernels are sufficiently deprecated. - */ - if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && - container->iommu_type == VFIO_TYPE1v2_IOMMU) { - trace_vfio_legacy_dma_unmap_overflow_workaround(); - unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); - continue; - } - return -errno; - } - - if (need_dirty_sync) { - ret = vfio_container_query_dirty_bitmap(bcontainer, iova, size, - iotlb->translated_addr, &local_err); - if (ret) { - error_report_err(local_err); - return ret; - } - } +#include "trace.h" - return 0; -} +static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = + QLIST_HEAD_INITIALIZER(vfio_address_spaces); -/* - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 - */ -static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb, bool unmap_all) +VFIOAddressSpace *vfio_address_space_get(AddressSpace *as) { - int ret; - - if (unmap_all) { - /* The unmap ioctl doesn't accept a full 64-bit span. */ - Int128 llsize = int128_rshift(int128_2_64(), 1); - - ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize), - iotlb); + VFIOAddressSpace *space; - if (ret == 0) { - ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize), - int128_get64(llsize), iotlb); + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as == as) { + return space; } - - } else { - ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb); - } - - return ret; -} - -static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly, - MemoryRegion *mr) -{ - const VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - struct vfio_iommu_type1_dma_map map = { - .argsz = sizeof(map), - .flags = VFIO_DMA_MAP_FLAG_READ, - .vaddr = (__u64)(uintptr_t)vaddr, - .iova = iova, - .size = size, - }; - - if (!readonly) { - map.flags |= VFIO_DMA_MAP_FLAG_WRITE; - } - - /* - * Try the mapping, if it fails with EBUSY, unmap the region and try - * again. This shouldn't be necessary, but we sometimes see it in - * the VGA ROM space. - */ - if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && - vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, false) == 0 && - ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { - return 0; - } - - return -errno; -} - -static int -vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, - bool start, Error **errp) -{ - const VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - int ret; - struct vfio_iommu_type1_dirty_bitmap dirty = { - .argsz = sizeof(dirty), - }; - - if (start) { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; - } else { - dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; - } - - ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); - if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "Failed to set dirty tracking flag 0x%x", - dirty.flags); - } - - return ret; -} - -static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) -{ - const VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - struct vfio_iommu_type1_dirty_bitmap *dbitmap; - struct vfio_iommu_type1_dirty_bitmap_get *range; - int ret; - - dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); - - dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); - dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; - range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; - range->iova = iova; - range->size = size; - - /* - * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of - * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize - * to qemu_real_host_page_size. - */ - range->bitmap.pgsize = qemu_real_host_page_size(); - range->bitmap.size = vbmap->size; - range->bitmap.data = (__u64 *)vbmap->bitmap; - - ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); - if (ret) { - ret = -errno; - error_setg_errno(errp, errno, - "Failed to get dirty bitmap for iova: 0x%"PRIx64 - " size: 0x%"PRIx64, (uint64_t)range->iova, - (uint64_t)range->size); - } - - g_free(dbitmap); - - return ret; -} - -static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, - VFIOContainerBase *bcontainer) -{ - struct vfio_info_cap_header *hdr; - struct vfio_iommu_type1_info_cap_iova_range *cap; - - hdr = vfio_get_iommu_type1_info_cap(info, - VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); - if (!hdr) { - return false; } - cap = (void *)hdr; - - for (int i = 0; i < cap->nr_iovas; i++) { - Range *range = g_new(Range, 1); + /* No suitable VFIOAddressSpace, create a new one */ + space = g_malloc0(sizeof(*space)); + space->as = as; + QLIST_INIT(&space->containers); - range_set_bounds(range, cap->iova_ranges[i].start, - cap->iova_ranges[i].end); - bcontainer->iova_ranges = - range_list_insert(bcontainer->iova_ranges, range); + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_register_reset(vfio_device_reset_handler, NULL); } - return true; -} - -static void vfio_group_add_kvm_device(VFIOGroup *group) -{ - Error *err = NULL; + QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); - if (vfio_kvm_device_add_fd(group->fd, &err)) { - error_reportf_err(err, "group ID %d: ", group->groupid); - } + return space; } -static void vfio_group_del_kvm_device(VFIOGroup *group) +void vfio_address_space_put(VFIOAddressSpace *space) { - Error *err = NULL; - - if (vfio_kvm_device_del_fd(group->fd, &err)) { - error_reportf_err(err, "group ID %d: ", group->groupid); + if (!QLIST_EMPTY(&space->containers)) { + return; } -} -/* - * vfio_get_iommu_type - selects the richest iommu_type (v2 first) - */ -static int vfio_get_iommu_type(int container_fd, - Error **errp) -{ - int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU, - VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; - int i; + QLIST_REMOVE(space, list); + g_free(space); - for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { - if (ioctl(container_fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { - return iommu_types[i]; - } + if (QLIST_EMPTY(&vfio_address_spaces)) { + qemu_unregister_reset(vfio_device_reset_handler, NULL); } - error_setg(errp, "No available IOMMU models"); - return -EINVAL; } -/* - * vfio_get_iommu_ops - get a VFIOIOMMUClass associated with a type - */ -static const char *vfio_get_iommu_class_name(int iommu_type) +void vfio_address_space_insert(VFIOAddressSpace *space, + VFIOContainer *bcontainer) { - switch (iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - return TYPE_VFIO_IOMMU_LEGACY; - break; - case VFIO_SPAPR_TCE_v2_IOMMU: - case VFIO_SPAPR_TCE_IOMMU: - return TYPE_VFIO_IOMMU_SPAPR; - break; - default: - g_assert_not_reached(); - }; + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + bcontainer->space = space; } -static bool vfio_set_iommu(int container_fd, int group_fd, - int *iommu_type, Error **errp) +int vfio_container_dma_map(VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly, MemoryRegion *mr) { - if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) { - error_setg_errno(errp, errno, "Failed to set group container"); - return false; - } - - while (ioctl(container_fd, VFIO_SET_IOMMU, *iommu_type)) { - if (*iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - /* - * On sPAPR, despite the IOMMU subdriver always advertises v1 and - * v2, the running platform may not support v2 and there is no - * way to guess it until an IOMMU group gets added to the container. - * So in case it fails with v2, try v1 as a fallback. - */ - *iommu_type = VFIO_SPAPR_TCE_IOMMU; - continue; - } - error_setg_errno(errp, errno, "Failed to set iommu for container"); - return false; - } - - return true; -} - -static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, - Error **errp) -{ - int iommu_type; - const char *vioc_name; - VFIOContainer *container; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + RAMBlock *rb = mr->ram_block; + int mfd = rb ? qemu_ram_get_fd(rb) : -1; - iommu_type = vfio_get_iommu_type(fd, errp); - if (iommu_type < 0) { - return NULL; - } + if (mfd >= 0 && vioc->dma_map_file) { + unsigned long start = vaddr - qemu_ram_get_host_addr(rb); + unsigned long offset = qemu_ram_get_fd_offset(rb); - /* - * During CPR, just set the container type and skip the ioctls, as the - * container and group are already configured in the kernel. - */ - if (!cpr_is_incoming() && - !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { - return NULL; + return vioc->dma_map_file(bcontainer, iova, size, mfd, start + offset, + readonly); } - - vioc_name = vfio_get_iommu_class_name(iommu_type); - - container = VFIO_IOMMU_LEGACY(object_new(vioc_name)); - container->fd = fd; - container->iommu_type = iommu_type; - return container; + g_assert(vioc->dma_map); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } -static int vfio_get_iommu_info(VFIOContainer *container, - struct vfio_iommu_type1_info **info) +int vfio_container_dma_unmap(VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - size_t argsz = sizeof(struct vfio_iommu_type1_info); - - *info = g_new0(struct vfio_iommu_type1_info, 1); -again: - (*info)->argsz = argsz; - - if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { - g_free(*info); - *info = NULL; - return -errno; - } - - if (((*info)->argsz > argsz)) { - argsz = (*info)->argsz; - *info = g_realloc(*info, argsz); - goto again; - } - - return 0; + g_assert(vioc->dma_unmap); + return vioc->dma_unmap(bcontainer, iova, size, iotlb, unmap_all); } -static struct vfio_info_cap_header * -vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +bool vfio_container_add_section_window(VFIOContainer *bcontainer, + MemoryRegionSection *section, + Error **errp) { - struct vfio_info_cap_header *hdr; - void *ptr = info; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { - return NULL; + if (!vioc->add_window) { + return true; } - for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { - if (hdr->id == id) { - return hdr; - } - } - - return NULL; + return vioc->add_window(bcontainer, section, errp); } -static void vfio_get_iommu_info_migration(VFIOContainer *container, - struct vfio_iommu_type1_info *info) +void vfio_container_del_section_window(VFIOContainer *bcontainer, + MemoryRegionSection *section) { - struct vfio_info_cap_header *hdr; - struct vfio_iommu_type1_info_cap_migration *cap_mig; - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); - if (!hdr) { + if (!vioc->del_window) { return; } - cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, - header); - - /* - * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of - * qemu_real_host_page_size to mark those dirty. - */ - if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - bcontainer->dirty_pages_supported = true; - bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; - bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; - } + return vioc->del_window(bcontainer, section); } -static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) +int vfio_container_set_dirty_page_tracking(VFIOContainer *bcontainer, + bool start, Error **errp) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - g_autofree struct vfio_iommu_type1_info *info = NULL; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); int ret; - ret = vfio_get_iommu_info(container, &info); - if (ret) { - error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); - return false; + if (!bcontainer->dirty_pages_supported) { + return 0; } - if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - bcontainer->pgsizes = info->iova_pgsizes; - } else { - bcontainer->pgsizes = qemu_real_host_page_size(); + g_assert(vioc->set_dirty_page_tracking); + if (bcontainer->dirty_pages_started == start) { + return 0; } - if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { - bcontainer->dma_max_mappings = 65535; + ret = vioc->set_dirty_page_tracking(bcontainer, start, errp); + if (!ret) { + bcontainer->dirty_pages_started = start; } - vfio_get_info_iova_range(info, bcontainer); - - vfio_get_iommu_info_migration(container, info); - return true; + return ret; } -static bool vfio_container_attach_discard_disable(VFIOContainer *container, - VFIOGroup *group, Error **errp) +static bool vfio_container_devices_dirty_tracking_is_started( + const VFIOContainer *bcontainer) { - int ret; + VFIODevice *vbasedev; - /* - * VFIO is currently incompatible with discarding of RAM insofar as the - * madvise to purge (zap) the page from QEMU's address space does not - * interact with the memory API and therefore leaves stale virtual to - * physical mappings in the IOMMU if the page was previously pinned. We - * therefore set discarding broken for each group added to a container, - * whether the container is used individually or shared. This provides - * us with options to allow devices within a group to opt-in and allow - * discarding, so long as it is done consistently for a group (for instance - * if the device is an mdev device where it is known that the host vendor - * driver will never pin pages outside of the working set of the guest - * driver, which would thus not be discarding candidates). - * - * The first opportunity to induce pinning occurs here where we attempt to - * attach the group to existing containers within the AddressSpace. If any - * pages are already zapped from the virtual address space, such as from - * previous discards, new pinning will cause valid mappings to be - * re-established. Likewise, when the overall MemoryListener for a new - * container is registered, a replay of mappings within the AddressSpace - * will occur, re-establishing any previously zapped pages as well. - * - * Especially virtio-balloon is currently only prevented from discarding - * new memory, it will not yet set ram_block_discard_set_required() and - * therefore, neither stops us here or deals with the sudden memory - * consumption of inflated memory. - * - * We do support discarding of memory coordinated via the RamDiscardManager - * with some IOMMU types. vfio_ram_block_discard_disable() handles the - * details once we know which type of IOMMU we are using. - */ - - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { - error_report("vfio: error disconnecting group %d from" - " container", group->groupid); + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (!vbasedev->dirty_tracking) { + return false; } } - return !ret; -} -static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group, - Error **errp) -{ - if (!vfio_container_attach_discard_disable(container, group, errp)) { - return false; - } - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); - vfio_group_add_kvm_device(group); - /* - * Remember the container fd for each group, so we can attach to the same - * container after CPR. - */ - cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); return true; } -static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group) +bool vfio_container_dirty_tracking_is_started( + const VFIOContainer *bcontainer) { - QLIST_REMOVE(group, container_next); - group->container = NULL; - vfio_group_del_kvm_device(group); - vfio_ram_block_discard_disable(container, false); - cpr_delete_fd("vfio_container_for_group", group->groupid); + return vfio_container_devices_dirty_tracking_is_started(bcontainer) || + bcontainer->dirty_pages_started; } -static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, - Error **errp) +bool vfio_container_devices_dirty_tracking_is_supported( + const VFIOContainer *bcontainer) { - VFIOContainer *container; - VFIOContainerBase *bcontainer; - int ret, fd = -1; - VFIOAddressSpace *space; - VFIOIOMMUClass *vioc = NULL; - bool new_container = false; - bool group_was_added = false; - - space = vfio_address_space_get(as); - fd = cpr_find_fd("vfio_container_for_group", group->groupid); - - if (!cpr_is_incoming()) { - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = VFIO_IOMMU_LEGACY(bcontainer); - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - return vfio_container_group_add(container, group, errp); - } - } + VFIODevice *vbasedev; - fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); - if (fd < 0) { - goto fail; + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; } - } else { - /* - * For incoming CPR, the group is already attached in the kernel. - * If a container with matching fd is found, then update the - * userland group list and return. If not, then after the loop, - * create the container struct and group list. - */ - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = VFIO_IOMMU_LEGACY(bcontainer); - - if (vfio_cpr_container_match(container, group, fd)) { - return vfio_container_group_add(container, group, errp); - } - } - } - - ret = ioctl(fd, VFIO_GET_API_VERSION); - if (ret != VFIO_API_VERSION) { - error_setg(errp, "supported vfio version: %d, " - "reported version: %d", VFIO_API_VERSION, ret); - goto fail; - } - - container = vfio_create_container(fd, group, errp); - if (!container) { - goto fail; - } - new_container = true; - bcontainer = VFIO_IOMMU(container); - - if (!vfio_legacy_cpr_register_container(container, errp)) { - goto fail; - } - - vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - assert(vioc->setup); - - if (!vioc->setup(bcontainer, errp)) { - goto fail; - } - - vfio_address_space_insert(space, bcontainer); - - if (!vfio_container_group_add(container, group, errp)) { - goto fail; - } - group_was_added = true; - - /* - * If CPR, register the listener later, after all state that may - * affect regions and mapping boundaries has been cpr load'ed. Later, - * the listener will invoke its callback on each flat section and call - * dma_map to supply the new vaddr, and the calls will match the mappings - * remembered by the kernel. - */ - if (!cpr_is_incoming()) { - if (!vfio_listener_register(bcontainer, errp)) { - goto fail; + if (!vbasedev->dirty_pages_supported) { + return false; } } - bcontainer->initialized = true; - return true; - -fail: - if (new_container) { - vfio_listener_unregister(bcontainer); - } - - if (group_was_added) { - vfio_container_group_del(container, group); - } - if (vioc && vioc->release) { - vioc->release(bcontainer); - } - if (new_container) { - vfio_legacy_cpr_unregister_container(container); - object_unref(container); - } - if (fd >= 0) { - close(fd); - } - vfio_address_space_put(space); - - return false; -} - -static void vfio_container_disconnect(VFIOGroup *group) -{ - VFIOContainer *container = group->container; - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); - VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - - QLIST_REMOVE(group, container_next); - group->container = NULL; - cpr_delete_fd("vfio_container_for_group", group->groupid); - - /* - * Explicitly release the listener first before unset container, - * since unset may destroy the backend container if it's the last - * group. - */ - if (QLIST_EMPTY(&container->group_list)) { - vfio_listener_unregister(bcontainer); - if (vioc->release) { - vioc->release(bcontainer); - } - } - - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { - error_report("vfio: error disconnecting group %d from container", - group->groupid); - } - - if (QLIST_EMPTY(&container->group_list)) { - VFIOAddressSpace *space = bcontainer->space; - - trace_vfio_container_disconnect(container->fd); - vfio_legacy_cpr_unregister_container(container); - close(container->fd); - object_unref(container); - - vfio_address_space_put(space); - } } -static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) +static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, + hwaddr size, void *bitmap) { - ERRP_GUARD(); - VFIOGroup *group; - char path[32]; - struct vfio_group_status status = { .argsz = sizeof(status) }; - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == groupid) { - /* Found it. Now is it already in the right context? */ - if (VFIO_IOMMU(group->container)->space->as == as) { - return group; - } else { - error_setg(errp, "group %d used in multiple address spaces", - group->groupid); - return NULL; - } - } - } + uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + + sizeof(struct vfio_device_feature_dma_logging_report), + sizeof(uint64_t))] = {}; + struct vfio_device_feature *feature = (struct vfio_device_feature *)buf; + struct vfio_device_feature_dma_logging_report *report = + (struct vfio_device_feature_dma_logging_report *)feature->data; - group = g_malloc0(sizeof(*group)); + report->iova = iova; + report->length = size; + report->page_size = qemu_real_host_page_size(); + report->bitmap = (uintptr_t)bitmap; - snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); - if (group->fd < 0) { - goto free_group_exit; - } + feature->argsz = sizeof(buf); + feature->flags = VFIO_DEVICE_FEATURE_GET | + VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { - error_setg_errno(errp, errno, "failed to get group %d status", groupid); - goto close_fd_exit; - } - - if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { - error_setg(errp, "group %d is not viable", groupid); - error_append_hint(errp, - "Please ensure all devices within the iommu_group " - "are bound to their vfio bus driver.\n"); - goto close_fd_exit; - } - - group->groupid = groupid; - QLIST_INIT(&group->device_list); - - if (!vfio_container_connect(group, as, errp)) { - error_prepend(errp, "failed to setup container for group %d: ", - groupid); - goto close_fd_exit; - } - - QLIST_INSERT_HEAD(&vfio_group_list, group, next); - - return group; - -close_fd_exit: - cpr_delete_fd("vfio_group", groupid); - close(group->fd); - -free_group_exit: - g_free(group); - - return NULL; -} - -static void vfio_group_put(VFIOGroup *group) -{ - if (!group || !QLIST_EMPTY(&group->device_list)) { - return; - } - - if (!group->ram_block_discard_allowed) { - vfio_ram_block_discard_disable(group->container, false); - } - vfio_group_del_kvm_device(group); - vfio_container_disconnect(group); - QLIST_REMOVE(group, next); - trace_vfio_group_put(group->fd); - cpr_delete_fd("vfio_group", group->groupid); - close(group->fd); - g_free(group); -} - -static bool vfio_device_get(VFIOGroup *group, const char *name, - VFIODevice *vbasedev, Error **errp) -{ - g_autofree struct vfio_device_info *info = NULL; - int fd; - - fd = vfio_cpr_group_get_device_fd(group->fd, name); - if (fd < 0) { - error_setg_errno(errp, errno, "error getting device from group %d", - group->groupid); - error_append_hint(errp, - "Verify all devices in group %d are bound to vfio-<bus> " - "or pci-stub and not already in use\n", group->groupid); - return false; - } - - info = vfio_get_device_info(fd); - if (!info) { - error_setg_errno(errp, errno, "error getting device info"); - goto fail; - } - - /* - * Set discarding of RAM as not broken for this group if the driver knows - * the device operates compatibly with discarding. Setting must be - * consistent per group, but since compatibility is really only possible - * with mdev currently, we expect singleton groups. - */ - if (vbasedev->ram_block_discard_allowed != - group->ram_block_discard_allowed) { - if (!QLIST_EMPTY(&group->device_list)) { - error_setg(errp, "Inconsistent setting of support for discarding " - "RAM (e.g., balloon) within group"); - goto fail; - } - - if (!group->ram_block_discard_allowed) { - group->ram_block_discard_allowed = true; - vfio_ram_block_discard_disable(group->container, false); - } - } - - vfio_device_prepare(vbasedev, VFIO_IOMMU(group->container), info); - - vbasedev->fd = fd; - vbasedev->group = group; - QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); - - trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); - - return true; - -fail: - close(fd); - cpr_delete_fd(name, 0); - return false; -} - -static void vfio_device_put(VFIODevice *vbasedev) -{ - if (!vbasedev->group) { - return; - } - QLIST_REMOVE(vbasedev, next); - vbasedev->group = NULL; - trace_vfio_device_put(vbasedev->fd); - cpr_delete_fd(vbasedev->name, 0); - close(vbasedev->fd); + return vbasedev->io_ops->device_feature(vbasedev, feature); } -static int vfio_device_get_groupid(VFIODevice *vbasedev, Error **errp) +static int vfio_container_iommu_query_dirty_bitmap( + const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) { - char *tmp, group_path[PATH_MAX]; - g_autofree char *group_name = NULL; - int ret, groupid; - ssize_t len; - - tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev); - len = readlink(tmp, group_path, sizeof(group_path)); - g_free(tmp); - - if (len <= 0 || len >= sizeof(group_path)) { - ret = len < 0 ? -errno : -ENAMETOOLONG; - error_setg_errno(errp, -ret, "no iommu_group found"); - return ret; - } - - group_path[len] = 0; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); - group_name = g_path_get_basename(group_path); - if (sscanf(group_name, "%d", &groupid) != 1) { - error_setg_errno(errp, errno, "failed to read %s", group_path); - return -errno; - } - return groupid; + g_assert(vioc->query_dirty_bitmap); + return vioc->query_dirty_bitmap(bcontainer, vbmap, iova, size, + errp); } -/* - * vfio_device_attach: attach a device to a security context - * @name and @vbasedev->name are likely to be different depending - * on the type of the device, hence the need for passing @name - */ -static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +static int vfio_container_devices_query_dirty_bitmap( + const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) { - int groupid = vfio_device_get_groupid(vbasedev, errp); - VFIODevice *vbasedev_iter; - VFIOGroup *group; - - if (groupid < 0) { - return false; - } - - trace_vfio_device_attach(vbasedev->name, groupid); - - group = vfio_group_get(groupid, as, errp); - if (!group) { - return false; - } - - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) { - error_setg(errp, "device is already attached"); - goto group_put_exit; - } - } - if (!vfio_device_get(group, name, vbasedev, errp)) { - goto group_put_exit; - } + VFIODevice *vbasedev; + int ret; - if (!vfio_device_hiod_create_and_realize(vbasedev, - TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, - errp)) { - goto device_put_exit; - } + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + ret = vfio_device_dma_logging_report(vbasedev, iova, size, + vbmap->bitmap); + if (ret) { + error_setg_errno(errp, -ret, + "%s: Failed to get DMA logging report, iova: " + "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx, + vbasedev->name, iova, size); - if (vbasedev->mdev) { - error_setg(&vbasedev->cpr.mdev_blocker, - "CPR does not support vfio mdev %s", vbasedev->name); - if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, errp, - MIG_MODE_CPR_TRANSFER, -1) < 0) { - goto hiod_unref_exit; + return ret; } } - return true; - -hiod_unref_exit: - object_unref(vbasedev->hiod); -device_put_exit: - vfio_device_put(vbasedev); -group_put_exit: - vfio_group_put(group); - return false; + return 0; } -static void vfio_legacy_detach_device(VFIODevice *vbasedev) +int vfio_container_query_dirty_bitmap(const VFIOContainer *bcontainer, + uint64_t iova, uint64_t size, + ram_addr_t ram_addr, Error **errp) { - VFIOGroup *group = vbasedev->group; - - trace_vfio_device_detach(vbasedev->name, group->groupid); - - vfio_device_unprepare(vbasedev); - - migrate_del_blocker(&vbasedev->cpr.mdev_blocker); - object_unref(vbasedev->hiod); - vfio_device_put(vbasedev); - vfio_group_put(group); -} + bool all_device_dirty_tracking = + vfio_container_devices_dirty_tracking_is_supported(bcontainer); + uint64_t dirty_pages; + VFIOBitmap vbmap; + int ret; -static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) -{ - VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); - VFIOGroup *group; - struct vfio_pci_hot_reset_info *info = NULL; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; - - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { + cpu_physical_memory_set_dirty_range(ram_addr, size, + tcg_enabled() ? DIRTY_CLIENTS_ALL : + DIRTY_CLIENTS_NOCODE); + return 0; } - vdev->vbasedev.needs_reset = false; - - ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + ret = vfio_bitmap_alloc(&vbmap, size); if (ret) { - goto out_single; - } - devices = &info->devices[0]; - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - !vfio_pci_from_vfio_device(vbasedev_iter)) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } + error_setg_errno(errp, -ret, + "Failed to allocate dirty tracking bitmap"); + return ret; } - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } + if (all_device_dirty_tracking) { + ret = vfio_container_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size, + errp); + } else { + ret = vfio_container_iommu_query_dirty_bitmap(bcontainer, &vbmap, iova, size, + errp); } - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); if (ret) { - ret = -errno; + goto out; } - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? strerror(errno) : "Success"); + dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, + vbmap.pages); + trace_vfio_container_query_dirty_bitmap(iova, size, vbmap.size, ram_addr, + dirty_pages); out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } - - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - !vfio_pci_from_vfio_device(vbasedev_iter)) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); + g_free(vbmap.bitmap); return ret; } -static void vfio_iommu_legacy_class_init(ObjectClass *klass, const void *data) +static gpointer copy_iova_range(gconstpointer src, gpointer data) { - VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); - - vioc->setup = vfio_legacy_setup; - vioc->dma_map = vfio_legacy_dma_map; - vioc->dma_unmap = vfio_legacy_dma_unmap; - vioc->attach_device = vfio_legacy_attach_device; - vioc->detach_device = vfio_legacy_detach_device; - vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; - vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; - vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; -}; - -static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, - Error **errp) -{ - VFIODevice *vdev = opaque; + Range *source = (Range *)src; + Range *dest = g_new(Range, 1); - hiod->name = g_strdup(vdev->name); - hiod->agent = opaque; - - return true; + range_set_bounds(dest, range_lob(source), range_upb(source)); + return dest; } -static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, - Error **errp) +GList *vfio_container_get_iova_ranges(const VFIOContainer *bcontainer) { - switch (cap) { - case HOST_IOMMU_DEVICE_CAP_AW_BITS: - return vfio_device_get_aw_bits(hiod->agent); - default: - error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); - return -EINVAL; - } + assert(bcontainer); + return g_list_copy_deep(bcontainer->iova_ranges, copy_iova_range, NULL); } -static GList * -hiod_legacy_vfio_get_iova_ranges(HostIOMMUDevice *hiod) +static void vfio_container_instance_finalize(Object *obj) { - VFIODevice *vdev = hiod->agent; + VFIOContainer *bcontainer = VFIO_IOMMU(obj); + VFIOGuestIOMMU *giommu, *tmp; - g_assert(vdev); - return vfio_container_get_iova_ranges(vdev->bcontainer); -} + QLIST_SAFE_REMOVE(bcontainer, next); -static uint64_t -hiod_legacy_vfio_get_page_size_mask(HostIOMMUDevice *hiod) -{ - VFIODevice *vdev = hiod->agent; + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } - g_assert(vdev); - return vfio_container_get_page_size_mask(vdev->bcontainer); + g_list_free_full(bcontainer->iova_ranges, g_free); } -static void vfio_iommu_legacy_instance_init(Object *obj) +static void vfio_container_instance_init(Object *obj) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(obj); + VFIOContainer *bcontainer = VFIO_IOMMU(obj); - QLIST_INIT(&container->group_list); + bcontainer->error = NULL; + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; + bcontainer->iova_ranges = NULL; + QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); } -static void hiod_legacy_vfio_class_init(ObjectClass *oc, const void *data) -{ - HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); - - hioc->realize = hiod_legacy_vfio_realize; - hioc->get_cap = hiod_legacy_vfio_get_cap; - hioc->get_iova_ranges = hiod_legacy_vfio_get_iova_ranges; - hioc->get_page_size_mask = hiod_legacy_vfio_get_page_size_mask; -}; - static const TypeInfo types[] = { { - .name = TYPE_VFIO_IOMMU_LEGACY, - .parent = TYPE_VFIO_IOMMU, - .instance_init = vfio_iommu_legacy_instance_init, + .name = TYPE_VFIO_IOMMU, + .parent = TYPE_OBJECT, + .instance_init = vfio_container_instance_init, + .instance_finalize = vfio_container_instance_finalize, .instance_size = sizeof(VFIOContainer), - .class_init = vfio_iommu_legacy_class_init, - }, { - .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, - .parent = TYPE_HOST_IOMMU_DEVICE, - .class_init = hiod_legacy_vfio_class_init, - } + .class_size = sizeof(VFIOIOMMUClass), + .abstract = true, + }, }; DEFINE_TYPES(types) diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c index 148a06d..1d70c87 100644 --- a/hw/vfio/cpr-iommufd.c +++ b/hw/vfio/cpr-iommufd.c @@ -176,7 +176,7 @@ void vfio_iommufd_cpr_unregister_iommufd(IOMMUFDBackend *be) bool vfio_iommufd_cpr_register_container(VFIOIOMMUFDContainer *container, Error **errp) { - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, vfio_cpr_reboot_notifier, @@ -189,7 +189,7 @@ bool vfio_iommufd_cpr_register_container(VFIOIOMMUFDContainer *container, void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container) { - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); migration_remove_notifier(&bcontainer->cpr_reboot_notifier); } diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index 8f43719..bbf7a0d 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -7,7 +7,7 @@ #include <sys/ioctl.h> #include <linux/vfio.h> #include "qemu/osdep.h" -#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-container-legacy.h" #include "hw/vfio/vfio-device.h" #include "hw/vfio/vfio-listener.h" #include "migration/blocker.h" @@ -17,7 +17,8 @@ #include "qapi/error.h" #include "qemu/error-report.h" -static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) +static bool vfio_dma_unmap_vaddr_all(VFIOLegacyContainer *container, + Error **errp) { struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), @@ -37,11 +38,11 @@ static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) * Set the new @vaddr for any mappings registered during cpr load. * The incoming state is cleared thereafter. */ -static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, +static int vfio_legacy_cpr_dma_map(const VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mr) { - const VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + const VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), @@ -63,12 +64,13 @@ static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, static void vfio_region_remap(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - cpr.remap_listener); + VFIOLegacyContainer *container = container_of(listener, + VFIOLegacyContainer, + cpr.remap_listener); vfio_container_region_add(VFIO_IOMMU(container), section, true); } -static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) +static bool vfio_cpr_supported(VFIOLegacyContainer *container, Error **errp) { if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); @@ -85,7 +87,7 @@ static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) static int vfio_container_pre_save(void *opaque) { - VFIOContainer *container = opaque; + VFIOLegacyContainer *container = opaque; Error *local_err = NULL; if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { @@ -97,8 +99,8 @@ static int vfio_container_pre_save(void *opaque) static int vfio_container_post_load(void *opaque, int version_id) { - VFIOContainer *container = opaque; - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOLegacyContainer *container = opaque; + VFIOContainer *bcontainer = VFIO_IOMMU(container); VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); dma_map_fn saved_dma_map = vioc->dma_map; Error *local_err = NULL; @@ -133,9 +135,9 @@ static const VMStateDescription vfio_container_vmstate = { static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, MigrationEvent *e, Error **errp) { - VFIOContainer *container = - container_of(notifier, VFIOContainer, cpr.transfer_notifier); - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOLegacyContainer *container = + container_of(notifier, VFIOLegacyContainer, cpr.transfer_notifier); + VFIOContainer *bcontainer = VFIO_IOMMU(container); if (e->type != MIG_EVENT_PRECOPY_FAILED) { return 0; @@ -165,9 +167,10 @@ static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, return 0; } -bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) +bool vfio_legacy_cpr_register_container(VFIOLegacyContainer *container, + Error **errp) { - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOContainer *bcontainer = VFIO_IOMMU(container); Error **cpr_blocker = &container->cpr.blocker; migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, @@ -189,9 +192,9 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) return true; } -void vfio_legacy_cpr_unregister_container(VFIOContainer *container) +void vfio_legacy_cpr_unregister_container(VFIOLegacyContainer *container) { - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOContainer *bcontainer = VFIO_IOMMU(container); migration_remove_notifier(&bcontainer->cpr_reboot_notifier); migrate_del_blocker(&container->cpr.blocker); @@ -207,7 +210,7 @@ void vfio_legacy_cpr_unregister_container(VFIOContainer *container) * The giommu already exists. Find it and replay it, which calls * vfio_legacy_cpr_dma_map further down the stack. */ -void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, +void vfio_cpr_giommu_remap(VFIOContainer *bcontainer, MemoryRegionSection *section) { VFIOGuestIOMMU *giommu = NULL; @@ -232,7 +235,7 @@ void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, * The ram discard listener already exists. Call its populate function * directly, which calls vfio_legacy_cpr_dma_map. */ -bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer, +bool vfio_cpr_ram_discard_register_listener(VFIOContainer *bcontainer, MemoryRegionSection *section) { VFIORamDiscardListener *vrdl = @@ -263,7 +266,7 @@ static bool same_device(int fd1, int fd2) return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; } -bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group, +bool vfio_cpr_container_match(VFIOLegacyContainer *container, VFIOGroup *group, int fd) { if (container->fd == fd) { diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 08f12ac..64f8750 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -423,7 +423,7 @@ bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev, VFIODevice *vfio_get_vfio_device(Object *obj) { if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) { - return &VFIO_PCI_BASE(obj)->vbasedev; + return &VFIO_PCI_DEVICE(obj)->vbasedev; } else { return NULL; } @@ -460,7 +460,7 @@ void vfio_device_detach(VFIODevice *vbasedev) VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); } -void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainer *bcontainer, struct vfio_device_info *info) { int i; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 8c27222..f0ffe23 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -34,36 +34,33 @@ #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" -static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, +static int iommufd_cdev_map(const VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mr) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); return iommufd_backend_map_dma(container->be, container->ioas_id, iova, size, vaddr, readonly); } -static int iommufd_cdev_map_file(const VFIOContainerBase *bcontainer, +static int iommufd_cdev_map_file(const VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, int fd, unsigned long start, bool readonly) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); return iommufd_backend_map_file_dma(container->be, container->ioas_id, iova, size, fd, start, readonly); } -static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, +static int iommufd_cdev_unmap(const VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); /* unmap in halves */ if (unmap_all) { @@ -159,11 +156,10 @@ static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } -static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, +static int iommufd_set_dirty_page_tracking(const VFIOContainer *bcontainer, bool start, Error **errp) { - const VFIOIOMMUFDContainer *container = - container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + const VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); VFIOIOASHwpt *hwpt; QLIST_FOREACH(hwpt, &container->hwpt_list, next) { @@ -190,13 +186,11 @@ err: return -EINVAL; } -static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, +static int iommufd_query_dirty_bitmap(const VFIOContainer *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp) { - VFIOIOMMUFDContainer *container = container_of(bcontainer, - VFIOIOMMUFDContainer, - bcontainer); + VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); unsigned long page_size = qemu_real_host_page_size(); VFIOIOASHwpt *hwpt; @@ -324,6 +318,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, { ERRP_GUARD(); IOMMUFDBackend *iommufd = vbasedev->iommufd; + VFIOContainer *bcontainer = VFIO_IOMMU(container); uint32_t type, flags = 0; uint64_t hw_caps; VFIOIOASHwpt *hwpt; @@ -408,9 +403,9 @@ skip_alloc: vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); - container->bcontainer.dirty_pages_supported |= + bcontainer->dirty_pages_supported |= vbasedev->iommu_dirty_tracking; - if (container->bcontainer.dirty_pages_supported && + if (bcontainer->dirty_pages_supported && !vbasedev->iommu_dirty_tracking) { warn_report("IOMMU instance for device %s doesn't support dirty tracking", vbasedev->name); @@ -464,7 +459,7 @@ static void iommufd_cdev_detach_container(VFIODevice *vbasedev, static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) { - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); if (!QLIST_EMPTY(&bcontainer->device_list)) { return; @@ -486,7 +481,7 @@ static int iommufd_cdev_ram_block_discard_disable(bool state) static bool iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, uint32_t ioas_id, Error **errp) { - VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOContainer *bcontainer = VFIO_IOMMU(container); g_autofree struct iommu_ioas_iova_ranges *info = NULL; struct iommu_iova_range *iova_ranges; int sz, fd = container->be->fd; @@ -528,7 +523,7 @@ error: static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp) { - VFIOContainerBase *bcontainer; + VFIOContainer *bcontainer; VFIOIOMMUFDContainer *container; VFIOAddressSpace *space; struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; @@ -559,7 +554,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, /* try to attach to an existing container in this space */ QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + container = VFIO_IOMMU_IOMMUFD(bcontainer); if (VFIO_IOMMU_GET_CLASS(bcontainer) != iommufd_vioc || vbasedev->iommufd != container->be) { continue; @@ -609,7 +604,7 @@ skip_ioas_alloc: QLIST_INIT(&container->hwpt_list); vbasedev->cpr.ioas_id = ioas_id; - bcontainer = &container->bcontainer; + bcontainer = VFIO_IOMMU(container); vfio_address_space_insert(space, bcontainer); if (!iommufd_cdev_attach_container(vbasedev, container, errp)) { @@ -687,11 +682,10 @@ err_connect_bind: static void iommufd_cdev_detach(VFIODevice *vbasedev) { - VFIOContainerBase *bcontainer = vbasedev->bcontainer; + VFIOContainer *bcontainer = vbasedev->bcontainer; VFIOAddressSpace *space = bcontainer->space; - VFIOIOMMUFDContainer *container = container_of(bcontainer, - VFIOIOMMUFDContainer, - bcontainer); + VFIOIOMMUFDContainer *container = VFIO_IOMMU_IOMMUFD(bcontainer); + vfio_device_unprepare(vbasedev); if (!vbasedev->ram_block_discard_allowed) { diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index e093833..3b6f17f 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -52,7 +52,7 @@ */ -static bool vfio_log_sync_needed(const VFIOContainerBase *bcontainer) +static bool vfio_log_sync_needed(const VFIOContainer *bcontainer) { VFIODevice *vbasedev; @@ -125,7 +125,7 @@ static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainerBase *bcontainer = giommu->bcontainer; + VFIOContainer *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; MemoryRegion *mr; hwaddr xlat; @@ -202,7 +202,7 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); - VFIOContainerBase *bcontainer = vrdl->bcontainer; + VFIOContainer *bcontainer = vrdl->bcontainer; const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; int ret; @@ -220,7 +220,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); - VFIOContainerBase *bcontainer = vrdl->bcontainer; + VFIOContainer *bcontainer = vrdl->bcontainer; const hwaddr end = section->offset_within_region + int128_get64(section->size); hwaddr start, next, iova; @@ -250,7 +250,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, return 0; } -static bool vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer, +static bool vfio_ram_discard_register_listener(VFIOContainer *bcontainer, MemoryRegionSection *section, Error **errp) { @@ -328,7 +328,7 @@ static bool vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer, return true; } -static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer, +static void vfio_ram_discard_unregister_listener(VFIOContainer *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -396,7 +396,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, return true; } -static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, +static bool vfio_get_section_iova_range(VFIOContainer *bcontainer, MemoryRegionSection *section, hwaddr *out_iova, hwaddr *out_end, Int128 *out_llend) @@ -423,9 +423,9 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, static void vfio_listener_begin(MemoryListener *listener) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - void (*listener_begin)(VFIOContainerBase *bcontainer); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); + void (*listener_begin)(VFIOContainer *bcontainer); listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; @@ -436,9 +436,9 @@ static void vfio_listener_begin(MemoryListener *listener) static void vfio_listener_commit(MemoryListener *listener) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); - void (*listener_commit)(VFIOContainerBase *bcontainer); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); + void (*listener_commit)(VFIOContainer *bcontainer); listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; @@ -460,7 +460,7 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) } VFIORamDiscardListener *vfio_find_ram_discard_listener( - VFIOContainerBase *bcontainer, MemoryRegionSection *section) + VFIOContainer *bcontainer, MemoryRegionSection *section) { VFIORamDiscardListener *vrdl = NULL; @@ -482,12 +482,12 @@ VFIORamDiscardListener *vfio_find_ram_discard_listener( static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); vfio_container_region_add(bcontainer, section, false); } -void vfio_container_region_add(VFIOContainerBase *bcontainer, +void vfio_container_region_add(VFIOContainer *bcontainer, MemoryRegionSection *section, bool cpr_remap) { @@ -656,8 +656,8 @@ fail: static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -744,13 +744,13 @@ typedef struct VFIODirtyRanges { } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { - VFIOContainerBase *bcontainer; + VFIOContainer *bcontainer; VFIODirtyRanges ranges; MemoryListener listener; } VFIODirtyRangesListener; static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainerBase *bcontainer) + VFIOContainer *bcontainer) { VFIOPCIDevice *pcidev; VFIODevice *vbasedev; @@ -835,7 +835,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, +static void vfio_dirty_tracking_init(VFIOContainer *bcontainer, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; @@ -860,7 +860,7 @@ static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, memory_listener_unregister(&dirty.listener); } -static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) +static void vfio_devices_dma_logging_stop(VFIOContainer *bcontainer) { uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; @@ -889,7 +889,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, +vfio_device_feature_dma_logging_start_create(VFIOContainer *bcontainer, VFIODirtyRanges *tracking) { struct vfio_device_feature *feature; @@ -962,7 +962,7 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, +static bool vfio_devices_dma_logging_start(VFIOContainer *bcontainer, Error **errp) { struct vfio_device_feature *feature; @@ -1006,8 +1006,8 @@ static bool vfio_listener_log_global_start(MemoryListener *listener, Error **errp) { ERRP_GUARD(); - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); bool ret; if (vfio_container_devices_dirty_tracking_is_supported(bcontainer)) { @@ -1024,8 +1024,8 @@ static bool vfio_listener_log_global_start(MemoryListener *listener, static void vfio_listener_log_global_stop(MemoryListener *listener) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); Error *local_err = NULL; int ret = 0; @@ -1057,7 +1057,7 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier *gdn = container_of(n, vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainerBase *bcontainer = giommu->bcontainer; + VFIOContainer *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; Error *local_err = NULL; @@ -1127,7 +1127,7 @@ static int vfio_ram_discard_query_dirty_bitmap(MemoryRegionSection *section, } static int -vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -1143,7 +1143,7 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, &vrdl); } -static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, +static int vfio_sync_iommu_dirty_bitmap(VFIOContainer *bcontainer, MemoryRegionSection *section) { VFIOGuestIOMMU *giommu; @@ -1180,7 +1180,7 @@ static int vfio_sync_iommu_dirty_bitmap(VFIOContainerBase *bcontainer, return 0; } -static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, +static int vfio_sync_dirty_bitmap(VFIOContainer *bcontainer, MemoryRegionSection *section, Error **errp) { ram_addr_t ram_addr; @@ -1209,8 +1209,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, - listener); + VFIOContainer *bcontainer = container_of(listener, VFIOContainer, + listener); int ret; Error *local_err = NULL; @@ -1241,7 +1241,7 @@ static const MemoryListener vfio_memory_listener = { .log_sync = vfio_listener_log_sync, }; -bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp) +bool vfio_listener_register(VFIOContainer *bcontainer, Error **errp) { bcontainer->listener = vfio_memory_listener; memory_listener_register(&bcontainer->listener, bcontainer->space->as); @@ -1255,7 +1255,7 @@ bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp) return true; } -void vfio_listener_unregister(VFIOContainerBase *bcontainer) +void vfio_listener_unregister(VFIOContainer *bcontainer) { memory_listener_unregister(&bcontainer->listener); } diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index d3ed3cb..82f6869 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -3,8 +3,8 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'listener.c', - 'container-base.c', 'container.c', + 'container-legacy.c', 'helpers.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index bc0b4c4..5b022da 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -305,7 +305,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) static void vfio_intx_routing_notifier(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); PCIINTxRoute route; if (vdev->interrupt != VFIO_INT_INTx) { @@ -660,7 +660,7 @@ void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIOMSIVector *vector; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -755,7 +755,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev, static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); @@ -1346,7 +1346,7 @@ static const MemoryRegionOps vfio_vga_ops = { */ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIORegion *region = &vdev->bars[bar].region; MemoryRegion *mmap_mr, *region_mr, *base_mr; PCIIORegion *r; @@ -1392,7 +1392,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) */ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; @@ -1426,7 +1426,7 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); int ret; @@ -3392,7 +3392,7 @@ bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) static void vfio_pci_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; int i; char uuid[UUID_STR_LEN]; @@ -3550,16 +3550,16 @@ error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } -static void vfio_instance_finalize(Object *obj) +static void vfio_pci_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); vfio_pci_put_device(vdev); } static void vfio_exitfn(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); @@ -3583,7 +3583,7 @@ static void vfio_exitfn(PCIDevice *pdev) static void vfio_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(dev); /* Do not reset the device during qemu_system_reset prior to cpr load */ if (cpr_is_incoming()) { @@ -3625,10 +3625,10 @@ post_reset: vfio_pci_post_reset(vdev); } -static void vfio_instance_init(Object *obj) +static void vfio_pci_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -3656,7 +3656,7 @@ static void vfio_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; } -static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) +static void vfio_pci_device_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); @@ -3668,12 +3668,12 @@ static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) pdc->config_write = vfio_pci_write_config; } -static const TypeInfo vfio_pci_base_dev_info = { - .name = TYPE_VFIO_PCI_BASE, +static const TypeInfo vfio_pci_device_info = { + .name = TYPE_VFIO_PCI_DEVICE, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(VFIOPCIDevice), .abstract = true, - .class_init = vfio_pci_base_dev_class_init, + .class_init = vfio_pci_device_class_init, .interfaces = (const InterfaceInfo[]) { { INTERFACE_PCIE_DEVICE }, { INTERFACE_CONVENTIONAL_PCI_DEVICE }, @@ -3683,7 +3683,7 @@ static const TypeInfo vfio_pci_base_dev_info = { static PropertyInfo vfio_pci_migration_multifd_transfer_prop; -static const Property vfio_pci_dev_properties[] = { +static const Property vfio_pci_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), @@ -3758,18 +3758,18 @@ static const Property vfio_pci_dev_properties[] = { #ifdef CONFIG_IOMMUFD static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) { - VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_DEVICE(obj); vfio_device_set_fd(&vdev->vbasedev, str, errp); } #endif -static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) +static void vfio_pci_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); device_class_set_legacy_reset(dc, vfio_pci_reset); - device_class_set_props(dc, vfio_pci_dev_properties); + device_class_set_props(dc, vfio_pci_properties); #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif @@ -3912,15 +3912,15 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) "multifd channels"); } -static const TypeInfo vfio_pci_dev_info = { +static const TypeInfo vfio_pci_info = { .name = TYPE_VFIO_PCI, - .parent = TYPE_VFIO_PCI_BASE, - .class_init = vfio_pci_dev_class_init, - .instance_init = vfio_instance_init, - .instance_finalize = vfio_instance_finalize, + .parent = TYPE_VFIO_PCI_DEVICE, + .class_init = vfio_pci_class_init, + .instance_init = vfio_pci_init, + .instance_finalize = vfio_pci_finalize, }; -static const Property vfio_pci_dev_nohotplug_properties[] = { +static const Property vfio_pci_nohotplug_properties[] = { DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), DEFINE_PROP_BOOL("use-legacy-x86-rom", VFIOPCIDevice, use_legacy_x86_rom, false), @@ -3928,12 +3928,12 @@ static const Property vfio_pci_dev_nohotplug_properties[] = { ON_OFF_AUTO_AUTO), }; -static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, +static void vfio_pci_nohotplug_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - device_class_set_props(dc, vfio_pci_dev_nohotplug_properties); + device_class_set_props(dc, vfio_pci_nohotplug_properties); dc->hotpluggable = false; object_class_property_set_description(klass, /* 3.1 */ @@ -3949,11 +3949,11 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, "Controls loading of a legacy VGA BIOS ROM"); } -static const TypeInfo vfio_pci_nohotplug_dev_info = { +static const TypeInfo vfio_pci_nohotplug_info = { .name = TYPE_VFIO_PCI_NOHOTPLUG, .parent = TYPE_VFIO_PCI, .instance_size = sizeof(VFIOPCIDevice), - .class_init = vfio_pci_nohotplug_dev_class_init, + .class_init = vfio_pci_nohotplug_class_init, }; static void register_vfio_pci_dev_type(void) @@ -3969,9 +3969,9 @@ static void register_vfio_pci_dev_type(void) vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; - type_register_static(&vfio_pci_base_dev_info); - type_register_static(&vfio_pci_dev_info); - type_register_static(&vfio_pci_nohotplug_dev_info); + type_register_static(&vfio_pci_device_info); + type_register_static(&vfio_pci_info); + type_register_static(&vfio_pci_nohotplug_info); } type_init(register_vfio_pci_dev_type) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index e0aef82..0f78cf9 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -120,7 +120,7 @@ typedef struct VFIOMSIXInfo { MemoryRegion *pba_region; } VFIOMSIXInfo; -OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_BASE) +OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_DEVICE) struct VFIOPCIDevice { PCIDevice parent_obj; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index c41e458..8d9d68d 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -15,7 +15,7 @@ #include "system/hostmem.h" #include "system/address-spaces.h" -#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-container-legacy.h" #include "hw/hw.h" #include "system/ram_addr.h" #include "qemu/error-report.h" @@ -30,12 +30,13 @@ typedef struct VFIOHostDMAWindow { QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; } VFIOHostDMAWindow; -typedef struct VFIOSpaprContainer { - VFIOContainer container; +struct VFIOSpaprContainer { + VFIOLegacyContainer parent_obj; + MemoryListener prereg_listener; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; unsigned int levels; -} VFIOSpaprContainer; +}; OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR); @@ -61,8 +62,8 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, { VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, prereg_listener); - VFIOContainer *container = &scontainer->container; - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(scontainer); + VFIOContainer *bcontainer = VFIO_IOMMU(container); const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -121,7 +122,7 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener, { VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, prereg_listener); - VFIOContainer *container = &scontainer->container; + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(scontainer); const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -218,7 +219,7 @@ static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, return hostwin_found ? hostwin : NULL; } -static int vfio_spapr_remove_window(VFIOContainer *container, +static int vfio_spapr_remove_window(VFIOLegacyContainer *container, hwaddr offset_within_address_space) { struct vfio_iommu_spapr_tce_remove remove = { @@ -239,14 +240,13 @@ static int vfio_spapr_remove_window(VFIOContainer *container, return 0; } -static bool vfio_spapr_create_window(VFIOContainer *container, +static bool vfio_spapr_create_window(VFIOLegacyContainer *container, MemoryRegionSection *section, hwaddr *pgsize, Error **errp) { int ret = 0; - VFIOContainerBase *bcontainer = VFIO_IOMMU(container); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOContainer *bcontainer = VFIO_IOMMU(container); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(bcontainer); IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels; @@ -348,13 +348,12 @@ static bool vfio_spapr_create_window(VFIOContainer *container, } static bool -vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, +vfio_spapr_container_add_section_window(VFIOContainer *bcontainer, MemoryRegionSection *section, Error **errp) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -439,12 +438,11 @@ vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, } static void -vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, +vfio_spapr_container_del_section_window(VFIOContainer *bcontainer, MemoryRegionSection *section) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; @@ -461,11 +459,10 @@ vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, } } -static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) +static void vfio_spapr_container_release(VFIOContainer *bcontainer) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { @@ -478,12 +475,11 @@ static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) } } -static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer, +static bool vfio_spapr_container_setup(VFIOContainer *bcontainer, Error **errp) { - VFIOContainer *container = VFIO_IOMMU_LEGACY(bcontainer); - VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, - container); + VFIOLegacyContainer *container = VFIO_IOMMU_LEGACY(bcontainer); + VFIOSpaprContainer *scontainer = VFIO_IOMMU_SPAPR(container); struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; diff --git a/hw/vfio/types.h b/hw/vfio/types.h index c19334f..5482d90 100644 --- a/hw/vfio/types.h +++ b/hw/vfio/types.h @@ -9,11 +9,11 @@ #define HW_VFIO_VFIO_TYPES_H /* - * TYPE_VFIO_PCI_BASE is an abstract type used to share code + * TYPE_VFIO_PCI_DEVICE is an abstract type used to share code * between VFIO implementations that use a kernel driver * with those that use user sockets. */ -#define TYPE_VFIO_PCI_BASE "vfio-pci-base" +#define TYPE_VFIO_PCI_DEVICE "vfio-pci-device" #define TYPE_VFIO_PCI "vfio-pci" /* TYPE_VFIO_PCI shares struct VFIOPCIDevice. */ diff --git a/hw/vfio/vfio-iommufd.h b/hw/vfio/vfio-iommufd.h index 07ea0f4..6b28e1f 100644 --- a/hw/vfio/vfio-iommufd.h +++ b/hw/vfio/vfio-iommufd.h @@ -9,7 +9,7 @@ #ifndef HW_VFIO_VFIO_IOMMUFD_H #define HW_VFIO_VFIO_IOMMUFD_H -#include "hw/vfio/vfio-container-base.h" +#include "hw/vfio/vfio-container.h" typedef struct VFIODevice VFIODevice; @@ -22,12 +22,13 @@ typedef struct VFIOIOASHwpt { typedef struct IOMMUFDBackend IOMMUFDBackend; -typedef struct VFIOIOMMUFDContainer { - VFIOContainerBase bcontainer; +struct VFIOIOMMUFDContainer { + VFIOContainer parent_obj; + IOMMUFDBackend *be; uint32_t ioas_id; QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; -} VFIOIOMMUFDContainer; +}; OBJECT_DECLARE_SIMPLE_TYPE(VFIOIOMMUFDContainer, VFIO_IOMMU_IOMMUFD); diff --git a/hw/vfio/vfio-listener.h b/hw/vfio/vfio-listener.h index eb69ddd..a90674c 100644 --- a/hw/vfio/vfio-listener.h +++ b/hw/vfio/vfio-listener.h @@ -9,7 +9,7 @@ #ifndef HW_VFIO_VFIO_LISTENER_H #define HW_VFIO_VFIO_LISTENER_H -bool vfio_listener_register(VFIOContainerBase *bcontainer, Error **errp); -void vfio_listener_unregister(VFIOContainerBase *bcontainer); +bool vfio_listener_register(VFIOContainer *bcontainer, Error **errp); +void vfio_listener_unregister(VFIOContainer *bcontainer); #endif /* HW_VFIO_VFIO_LISTENER_H */ diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h deleted file mode 100644 index acbd48a..0000000 --- a/include/hw/vfio/vfio-container-base.h +++ /dev/null @@ -1,279 +0,0 @@ -/* - * VFIO BASE CONTAINER - * - * Copyright (C) 2023 Intel Corporation. - * Copyright Red Hat, Inc. 2023 - * - * Authors: Yi Liu <yi.l.liu@intel.com> - * Eric Auger <eric.auger@redhat.com> - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H -#define HW_VFIO_VFIO_CONTAINER_BASE_H - -#include "system/memory.h" - -typedef struct VFIODevice VFIODevice; -typedef struct VFIOIOMMUClass VFIOIOMMUClass; - -typedef struct { - unsigned long *bitmap; - hwaddr size; - hwaddr pages; -} VFIOBitmap; - -typedef struct VFIOAddressSpace { - AddressSpace *as; - QLIST_HEAD(, VFIOContainerBase) containers; - QLIST_ENTRY(VFIOAddressSpace) list; -} VFIOAddressSpace; - -/* - * This is the base object for vfio container backends - */ -struct VFIOContainerBase { - Object parent_obj; - - VFIOAddressSpace *space; - MemoryListener listener; - Error *error; - bool initialized; - uint64_t dirty_pgsizes; - uint64_t max_dirty_bitmap_size; - unsigned long pgsizes; - unsigned int dma_max_mappings; - bool dirty_pages_supported; - bool dirty_pages_started; /* Protected by BQL */ - QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; - QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; - QLIST_ENTRY(VFIOContainerBase) next; - QLIST_HEAD(, VFIODevice) device_list; - GList *iova_ranges; - NotifierWithReturn cpr_reboot_notifier; -}; - -#define TYPE_VFIO_IOMMU "vfio-iommu" -OBJECT_DECLARE_TYPE(VFIOContainerBase, VFIOIOMMUClass, VFIO_IOMMU) - -typedef struct VFIOGuestIOMMU { - VFIOContainerBase *bcontainer; - IOMMUMemoryRegion *iommu_mr; - hwaddr iommu_offset; - IOMMUNotifier n; - QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; -} VFIOGuestIOMMU; - -typedef struct VFIORamDiscardListener { - VFIOContainerBase *bcontainer; - MemoryRegion *mr; - hwaddr offset_within_address_space; - hwaddr size; - uint64_t granularity; - RamDiscardListener listener; - QLIST_ENTRY(VFIORamDiscardListener) next; -} VFIORamDiscardListener; - -VFIOAddressSpace *vfio_address_space_get(AddressSpace *as); -void vfio_address_space_put(VFIOAddressSpace *space); -void vfio_address_space_insert(VFIOAddressSpace *space, - VFIOContainerBase *bcontainer); - -int vfio_container_dma_map(VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly, MemoryRegion *mr); -int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb, bool unmap_all); -bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, - Error **errp); -void vfio_container_del_section_window(VFIOContainerBase *bcontainer, - MemoryRegionSection *section); -int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, - bool start, Error **errp); -bool vfio_container_dirty_tracking_is_started( - const VFIOContainerBase *bcontainer); -bool vfio_container_devices_dirty_tracking_is_supported( - const VFIOContainerBase *bcontainer); -int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, - uint64_t iova, uint64_t size, ram_addr_t ram_addr, Error **errp); - -GList *vfio_container_get_iova_ranges(const VFIOContainerBase *bcontainer); - -static inline uint64_t -vfio_container_get_page_size_mask(const VFIOContainerBase *bcontainer) -{ - assert(bcontainer); - return bcontainer->pgsizes; -} - -#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" -#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" -#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" -#define TYPE_VFIO_IOMMU_USER TYPE_VFIO_IOMMU "-user" - -struct VFIOIOMMUClass { - ObjectClass parent_class; - - /** - * @setup - * - * Perform basic setup of the container, including configuring IOMMU - * capabilities, IOVA ranges, supported page sizes, etc. - * - * @bcontainer: #VFIOContainerBase - * @errp: pointer to Error*, to store an error if it happens. - * - * Returns true to indicate success and false for error. - */ - bool (*setup)(VFIOContainerBase *bcontainer, Error **errp); - - /** - * @listener_begin - * - * Called at the beginning of an address space update transaction. - * See #MemoryListener. - * - * @bcontainer: #VFIOContainerBase - */ - void (*listener_begin)(VFIOContainerBase *bcontainer); - - /** - * @listener_commit - * - * Called at the end of an address space update transaction, - * See #MemoryListener. - * - * @bcontainer: #VFIOContainerBase - */ - void (*listener_commit)(VFIOContainerBase *bcontainer); - - /** - * @dma_map - * - * Map an address range into the container. Note that the memory region is - * referenced within an RCU read lock region across this call. - * - * @bcontainer: #VFIOContainerBase to use - * @iova: start address to map - * @size: size of the range to map - * @vaddr: process virtual address of mapping - * @readonly: true if mapping should be readonly - * @mr: the memory region for this mapping - * - * Returns 0 to indicate success and -errno otherwise. - */ - int (*dma_map)(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly, MemoryRegion *mr); - /** - * @dma_map_file - * - * Map a file range for the container. - * - * @bcontainer: #VFIOContainerBase to use for map - * @iova: start address to map - * @size: size of the range to map - * @fd: descriptor of the file to map - * @start: starting file offset of the range to map - * @readonly: map read only if true - */ - int (*dma_map_file)(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - int fd, unsigned long start, bool readonly); - /** - * @dma_unmap - * - * Unmap an address range from the container. - * - * @bcontainer: #VFIOContainerBase to use for unmap - * @iova: start address to unmap - * @size: size of the range to unmap - * @iotlb: The IOMMU TLB mapping entry (or NULL) - * @unmap_all: if set, unmap the entire address space - * - * Returns 0 to indicate success and -errno otherwise. - */ - int (*dma_unmap)(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb, bool unmap_all); - - - /** - * @attach_device - * - * Associate the given device with a container and do some related - * initialization of the device context. - * - * @name: name of the device - * @vbasedev: the device - * @as: address space to use - * @errp: pointer to Error*, to store an error if it happens. - * - * Returns true to indicate success and false for error. - */ - bool (*attach_device)(const char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp); - - /* - * @detach_device - * - * Detach the given device from its container and clean up any necessary - * state. - * - * @vbasedev: the device to disassociate - */ - void (*detach_device)(VFIODevice *vbasedev); - - /* migration feature */ - - /** - * @set_dirty_page_tracking - * - * Start or stop dirty pages tracking on VFIO container - * - * @bcontainer: #VFIOContainerBase on which to de/activate dirty - * page tracking - * @start: indicates whether to start or stop dirty pages tracking - * @errp: pointer to Error*, to store an error if it happens. - * - * Returns zero to indicate success and negative for error. - */ - int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, - bool start, Error **errp); - /** - * @query_dirty_bitmap - * - * Get bitmap of dirty pages from container - * - * @bcontainer: #VFIOContainerBase from which to get dirty pages - * @vbmap: #VFIOBitmap internal bitmap structure - * @iova: iova base address - * @size: size of iova range - * @errp: pointer to Error*, to store an error if it happens. - * - * Returns zero to indicate success and negative for error. - */ - int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, - VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp); - /* PCI specific */ - int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); - - /* SPAPR specific */ - bool (*add_window)(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, - Error **errp); - void (*del_window)(VFIOContainerBase *bcontainer, - MemoryRegionSection *section); - void (*release)(VFIOContainerBase *bcontainer); -}; - -VFIORamDiscardListener *vfio_find_ram_discard_listener( - VFIOContainerBase *bcontainer, MemoryRegionSection *section); - -void vfio_container_region_add(VFIOContainerBase *bcontainer, - MemoryRegionSection *section, bool cpr_remap); - -#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ diff --git a/include/hw/vfio/vfio-container-legacy.h b/include/hw/vfio/vfio-container-legacy.h new file mode 100644 index 0000000..74a72df --- /dev/null +++ b/include/hw/vfio/vfio-container-legacy.h @@ -0,0 +1,39 @@ +/* + * VFIO container + * + * Copyright Red Hat, Inc. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_CONTAINER_LEGACY_H +#define HW_VFIO_CONTAINER_LEGACY_H + +#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-cpr.h" + +typedef struct VFIOLegacyContainer VFIOLegacyContainer; +typedef struct VFIODevice VFIODevice; + +typedef struct VFIOGroup { + int fd; + int groupid; + VFIOLegacyContainer *container; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOGroup) next; + QLIST_ENTRY(VFIOGroup) container_next; + bool ram_block_discard_allowed; +} VFIOGroup; + +struct VFIOLegacyContainer { + VFIOContainer parent_obj; + + int fd; /* /dev/vfio/vfio, empowered by the attached groups */ + unsigned iommu_type; + QLIST_HEAD(, VFIOGroup) group_list; + VFIOContainerCPR cpr; +}; + +OBJECT_DECLARE_SIMPLE_TYPE(VFIOLegacyContainer, VFIO_IOMMU_LEGACY); + +#endif /* HW_VFIO_CONTAINER_LEGACY_H */ diff --git a/include/hw/vfio/vfio-container.h b/include/hw/vfio/vfio-container.h index 240f566..b8fb2b8 100644 --- a/include/hw/vfio/vfio-container.h +++ b/include/hw/vfio/vfio-container.h @@ -1,39 +1,279 @@ /* - * VFIO container + * VFIO BASE CONTAINER * - * Copyright Red Hat, Inc. 2025 + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu <yi.l.liu@intel.com> + * Eric Auger <eric.auger@redhat.com> * * SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef HW_VFIO_CONTAINER_H -#define HW_VFIO_CONTAINER_H +#ifndef HW_VFIO_VFIO_CONTAINER_H +#define HW_VFIO_VFIO_CONTAINER_H -#include "hw/vfio/vfio-container-base.h" -#include "hw/vfio/vfio-cpr.h" +#include "system/memory.h" -typedef struct VFIOContainer VFIOContainer; typedef struct VFIODevice VFIODevice; +typedef struct VFIOIOMMUClass VFIOIOMMUClass; -typedef struct VFIOGroup { - int fd; - int groupid; - VFIOContainer *container; - QLIST_HEAD(, VFIODevice) device_list; - QLIST_ENTRY(VFIOGroup) next; - QLIST_ENTRY(VFIOGroup) container_next; - bool ram_block_discard_allowed; -} VFIOGroup; +typedef struct { + unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainer) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + +/* + * This is the base object for vfio container backends + */ struct VFIOContainer { - VFIOContainerBase parent_obj; + Object parent_obj; - int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - unsigned iommu_type; - QLIST_HEAD(, VFIOGroup) group_list; - VFIOContainerCPR cpr; + VFIOAddressSpace *space; + MemoryListener listener; + Error *error; + bool initialized; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; + unsigned int dma_max_mappings; + bool dirty_pages_supported; + bool dirty_pages_started; /* Protected by BQL */ + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_ENTRY(VFIOContainer) next; + QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; + NotifierWithReturn cpr_reboot_notifier; }; -OBJECT_DECLARE_SIMPLE_TYPE(VFIOContainer, VFIO_IOMMU_LEGACY); +#define TYPE_VFIO_IOMMU "vfio-iommu" +OBJECT_DECLARE_TYPE(VFIOContainer, VFIOIOMMUClass, VFIO_IOMMU) + +typedef struct VFIOGuestIOMMU { + VFIOContainer *bcontainer; + IOMMUMemoryRegion *iommu_mr; + hwaddr iommu_offset; + IOMMUNotifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + +typedef struct VFIORamDiscardListener { + VFIOContainer *bcontainer; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + +VFIOAddressSpace *vfio_address_space_get(AddressSpace *as); +void vfio_address_space_put(VFIOAddressSpace *space); +void vfio_address_space_insert(VFIOAddressSpace *space, + VFIOContainer *bcontainer); + +int vfio_container_dma_map(VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly, MemoryRegion *mr); +int vfio_container_dma_unmap(VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all); +bool vfio_container_add_section_window(VFIOContainer *bcontainer, + MemoryRegionSection *section, + Error **errp); +void vfio_container_del_section_window(VFIOContainer *bcontainer, + MemoryRegionSection *section); +int vfio_container_set_dirty_page_tracking(VFIOContainer *bcontainer, + bool start, Error **errp); +bool vfio_container_dirty_tracking_is_started( + const VFIOContainer *bcontainer); +bool vfio_container_devices_dirty_tracking_is_supported( + const VFIOContainer *bcontainer); +int vfio_container_query_dirty_bitmap(const VFIOContainer *bcontainer, + uint64_t iova, uint64_t size, ram_addr_t ram_addr, Error **errp); + +GList *vfio_container_get_iova_ranges(const VFIOContainer *bcontainer); + +static inline uint64_t +vfio_container_get_page_size_mask(const VFIOContainer *bcontainer) +{ + assert(bcontainer); + return bcontainer->pgsizes; +} + +#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" +#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" +#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" +#define TYPE_VFIO_IOMMU_USER TYPE_VFIO_IOMMU "-user" + +struct VFIOIOMMUClass { + ObjectClass parent_class; + + /** + * @setup + * + * Perform basic setup of the container, including configuring IOMMU + * capabilities, IOVA ranges, supported page sizes, etc. + * + * @bcontainer: #VFIOContainer + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for error. + */ + bool (*setup)(VFIOContainer *bcontainer, Error **errp); + + /** + * @listener_begin + * + * Called at the beginning of an address space update transaction. + * See #MemoryListener. + * + * @bcontainer: #VFIOContainer + */ + void (*listener_begin)(VFIOContainer *bcontainer); + + /** + * @listener_commit + * + * Called at the end of an address space update transaction, + * See #MemoryListener. + * + * @bcontainer: #VFIOContainer + */ + void (*listener_commit)(VFIOContainer *bcontainer); + + /** + * @dma_map + * + * Map an address range into the container. Note that the memory region is + * referenced within an RCU read lock region across this call. + * + * @bcontainer: #VFIOContainer to use + * @iova: start address to map + * @size: size of the range to map + * @vaddr: process virtual address of mapping + * @readonly: true if mapping should be readonly + * @mr: the memory region for this mapping + * + * Returns 0 to indicate success and -errno otherwise. + */ + int (*dma_map)(const VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly, MemoryRegion *mr); + /** + * @dma_map_file + * + * Map a file range for the container. + * + * @bcontainer: #VFIOContainer to use for map + * @iova: start address to map + * @size: size of the range to map + * @fd: descriptor of the file to map + * @start: starting file offset of the range to map + * @readonly: map read only if true + */ + int (*dma_map_file)(const VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + int fd, unsigned long start, bool readonly); + /** + * @dma_unmap + * + * Unmap an address range from the container. + * + * @bcontainer: #VFIOContainer to use for unmap + * @iova: start address to unmap + * @size: size of the range to unmap + * @iotlb: The IOMMU TLB mapping entry (or NULL) + * @unmap_all: if set, unmap the entire address space + * + * Returns 0 to indicate success and -errno otherwise. + */ + int (*dma_unmap)(const VFIOContainer *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all); + + + /** + * @attach_device + * + * Associate the given device with a container and do some related + * initialization of the device context. + * + * @name: name of the device + * @vbasedev: the device + * @as: address space to use + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for error. + */ + bool (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + + /* + * @detach_device + * + * Detach the given device from its container and clean up any necessary + * state. + * + * @vbasedev: the device to disassociate + */ + void (*detach_device)(VFIODevice *vbasedev); + + /* migration feature */ + + /** + * @set_dirty_page_tracking + * + * Start or stop dirty pages tracking on VFIO container + * + * @bcontainer: #VFIOContainer on which to de/activate dirty + * page tracking + * @start: indicates whether to start or stop dirty pages tracking + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns zero to indicate success and negative for error. + */ + int (*set_dirty_page_tracking)(const VFIOContainer *bcontainer, + bool start, Error **errp); + /** + * @query_dirty_bitmap + * + * Get bitmap of dirty pages from container + * + * @bcontainer: #VFIOContainer from which to get dirty pages + * @vbmap: #VFIOBitmap internal bitmap structure + * @iova: iova base address + * @size: size of iova range + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns zero to indicate success and negative for error. + */ + int (*query_dirty_bitmap)(const VFIOContainer *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp); + /* PCI specific */ + int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); + + /* SPAPR specific */ + bool (*add_window)(VFIOContainer *bcontainer, + MemoryRegionSection *section, + Error **errp); + void (*del_window)(VFIOContainer *bcontainer, + MemoryRegionSection *section); + void (*release)(VFIOContainer *bcontainer); +}; + +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainer *bcontainer, MemoryRegionSection *section); + +void vfio_container_region_add(VFIOContainer *bcontainer, + MemoryRegionSection *section, bool cpr_remap); -#endif /* HW_VFIO_CONTAINER_H */ +#endif /* HW_VFIO_VFIO_CONTAINER_H */ diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index d37daff..26ee0c4 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -12,15 +12,15 @@ #include "migration/misc.h" #include "system/memory.h" +struct VFIOLegacyContainer; struct VFIOContainer; -struct VFIOContainerBase; struct VFIOGroup; struct VFIODevice; struct VFIOPCIDevice; struct VFIOIOMMUFDContainer; struct IOMMUFDBackend; -typedef int (*dma_map_fn)(const struct VFIOContainerBase *bcontainer, +typedef int (*dma_map_fn)(const struct VFIOContainer *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mr); @@ -42,9 +42,10 @@ typedef struct VFIOPCICPR { NotifierWithReturn transfer_notifier; } VFIOPCICPR; -bool vfio_legacy_cpr_register_container(struct VFIOContainer *container, +bool vfio_legacy_cpr_register_container(struct VFIOLegacyContainer *container, Error **errp); -void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container); +void vfio_legacy_cpr_unregister_container( + struct VFIOLegacyContainer *container); int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, MigrationEvent *e, Error **errp); @@ -61,14 +62,14 @@ void vfio_cpr_load_device(struct VFIODevice *vbasedev); int vfio_cpr_group_get_device_fd(int d, const char *name); -bool vfio_cpr_container_match(struct VFIOContainer *container, +bool vfio_cpr_container_match(struct VFIOLegacyContainer *container, struct VFIOGroup *group, int fd); -void vfio_cpr_giommu_remap(struct VFIOContainerBase *bcontainer, +void vfio_cpr_giommu_remap(struct VFIOContainer *bcontainer, MemoryRegionSection *section); bool vfio_cpr_ram_discard_register_listener( - struct VFIOContainerBase *bcontainer, MemoryRegionSection *section); + struct VFIOContainer *bcontainer, MemoryRegionSection *section); void vfio_cpr_save_vector_fd(struct VFIOPCIDevice *vdev, const char *name, int nr, int fd); diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index e7e6243..7e9aed6 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -18,8 +18,8 @@ * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) */ -#ifndef HW_VFIO_VFIO_COMMON_H -#define HW_VFIO_VFIO_COMMON_H +#ifndef HW_VFIO_VFIO_DEVICE_H +#define HW_VFIO_VFIO_DEVICE_H #include "system/memory.h" #include "qemu/queue.h" @@ -27,7 +27,7 @@ #include <linux/vfio.h> #endif #include "system/system.h" -#include "hw/vfio/vfio-container-base.h" +#include "hw/vfio/vfio-container.h" #include "hw/vfio/vfio-cpr.h" #include "system/host_iommu_device.h" #include "system/iommufd.h" @@ -54,7 +54,7 @@ typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) container_next; QLIST_ENTRY(VFIODevice) global_next; struct VFIOGroup *group; - VFIOContainerBase *bcontainer; + VFIOContainer *bcontainer; char *sysfsdev; char *name; DeviceState *dev; @@ -252,7 +252,7 @@ struct VFIODeviceIOOps { void *data, bool post); }; -void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainer *bcontainer, struct vfio_device_info *info); void vfio_device_unprepare(VFIODevice *vbasedev); @@ -288,4 +288,4 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, int vfio_device_get_aw_bits(VFIODevice *vdev); void vfio_kvm_device_close(void); -#endif /* HW_VFIO_VFIO_COMMON_H */ +#endif /* HW_VFIO_VFIO_DEVICE_H */ diff --git a/include/semihosting/common-semi.h b/include/semihosting/common-semi.h index 0a91db7..aa511a4 100644 --- a/include/semihosting/common-semi.h +++ b/include/semihosting/common-semi.h @@ -35,5 +35,11 @@ #define COMMON_SEMI_H void do_common_semihosting(CPUState *cs); +uint64_t common_semi_arg(CPUState *cs, int argno); +void common_semi_set_ret(CPUState *cs, uint64_t ret); +bool is_64bit_semihosting(CPUArchState *env); +bool common_semi_sys_exit_is_extended(CPUState *cs); +uint64_t common_semi_stack_bottom(CPUState *cs); +bool common_semi_has_synccache(CPUArchState *env); #endif /* COMMON_SEMI_H */ diff --git a/include/semihosting/guestfd.h b/include/semihosting/guestfd.h index 3d426fe..a7ea104 100644 --- a/include/semihosting/guestfd.h +++ b/include/semihosting/guestfd.h @@ -35,13 +35,6 @@ typedef struct GuestFD { }; } GuestFD; -/* - * For ARM semihosting, we have a separate structure for routing - * data for the console which is outside the guest fd address space. - */ -extern GuestFD console_in_gf; -extern GuestFD console_out_gf; - /** * alloc_guestfd: * diff --git a/include/semihosting/semihost.h b/include/semihosting/semihost.h index b03e637..231dc89 100644 --- a/include/semihosting/semihost.h +++ b/include/semihosting/semihost.h @@ -33,6 +33,8 @@ typedef enum SemihostingTarget { * Return true if guest code is allowed to make semihosting calls. */ bool semihosting_enabled(bool is_user); +bool semihosting_arm_compatible(void); +void semihosting_arm_compatible_init(void); SemihostingTarget semihosting_get_target(void); const char *semihosting_get_arg(int i); diff --git a/include/semihosting/syscalls.h b/include/semihosting/syscalls.h index 6627c45..03aa45b 100644 --- a/include/semihosting/syscalls.h +++ b/include/semihosting/syscalls.h @@ -9,7 +9,7 @@ #ifndef SEMIHOSTING_SYSCALLS_H #define SEMIHOSTING_SYSCALLS_H -#include "exec/cpu-defs.h" +#include "exec/vaddr.h" #include "gdbstub/syscalls.h" /* @@ -24,23 +24,23 @@ typedef struct GuestFD GuestFD; void semihost_sys_open(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, + vaddr fname, uint64_t fname_len, int gdb_flags, int mode); void semihost_sys_close(CPUState *cs, gdb_syscall_complete_cb complete, int fd); void semihost_sys_read(CPUState *cs, gdb_syscall_complete_cb complete, - int fd, target_ulong buf, target_ulong len); + int fd, vaddr buf, uint64_t len); void semihost_sys_read_gf(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len); + GuestFD *gf, vaddr buf, uint64_t len); void semihost_sys_write(CPUState *cs, gdb_syscall_complete_cb complete, - int fd, target_ulong buf, target_ulong len); + int fd, vaddr buf, uint64_t len); void semihost_sys_write_gf(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len); + GuestFD *gf, vaddr buf, uint64_t len); void semihost_sys_lseek(CPUState *cs, gdb_syscall_complete_cb complete, int fd, int64_t off, int gdb_whence); @@ -50,27 +50,27 @@ void semihost_sys_isatty(CPUState *cs, gdb_syscall_complete_cb complete, void semihost_sys_flen(CPUState *cs, gdb_syscall_complete_cb fstat_cb, gdb_syscall_complete_cb flen_cb, - int fd, target_ulong fstat_addr); + int fd, vaddr fstat_addr); void semihost_sys_fstat(CPUState *cs, gdb_syscall_complete_cb complete, - int fd, target_ulong addr); + int fd, vaddr addr); void semihost_sys_stat(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, - target_ulong addr); + vaddr fname, uint64_t fname_len, + vaddr addr); void semihost_sys_remove(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len); + vaddr fname, uint64_t fname_len); void semihost_sys_rename(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong oname, target_ulong oname_len, - target_ulong nname, target_ulong nname_len); + vaddr oname, uint64_t oname_len, + vaddr nname, uint64_t nname_len); void semihost_sys_system(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong cmd, target_ulong cmd_len); + vaddr cmd, uint64_t cmd_len); void semihost_sys_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong tv_addr, target_ulong tz_addr); + vaddr tv_addr, vaddr tz_addr); void semihost_sys_poll_one(CPUState *cs, gdb_syscall_complete_cb complete, int fd, GIOCondition cond, int timeout); diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 91616c9..40b6955 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1816,7 +1816,8 @@ sub process { } # Check SPDX-License-Identifier references a permitted license - if ($rawline =~ m,SPDX-License-Identifier: (.*?)(\*/)?\s*$,) { + if (($rawline =~ m,SPDX-License-Identifier: (.*?)(\*/)?\s*$,) && + $rawline !~ /^-/) { $fileinfo->{facts}->{sawspdx} = 1; &checkspdx($realfile, $1); } diff --git a/scripts/ci/gitlab-failure-analysis b/scripts/ci/gitlab-failure-analysis new file mode 100755 index 0000000..906725b --- /dev/null +++ b/scripts/ci/gitlab-failure-analysis @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# +# A script to analyse failures in the gitlab pipelines. It requires an +# API key from gitlab with the following permissions: +# - api +# - read_repository +# - read_user +# + +import argparse +import gitlab +import os + +# +# Arguments +# +class NoneForEmptyStringAction(argparse.Action): + def __call__(self, parser, namespace, value, option_string=None): + if value == '': + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, value) + + +parser = argparse.ArgumentParser(description="Analyse failed GitLab CI runs.") + +parser.add_argument("--gitlab", + default="https://gitlab.com", + help="GitLab instance URL (default: https://gitlab.com).") +parser.add_argument("--id", default=11167699, + type=int, + help="GitLab project id (default: 11167699 for qemu-project/qemu)") +parser.add_argument("--token", + default=os.getenv("GITLAB_TOKEN"), + help="Your personal access token with 'api' scope.") +parser.add_argument("--branch", + type=str, + default="staging", + action=NoneForEmptyStringAction, + help="The name of the branch (default: 'staging')") +parser.add_argument("--status", + type=str, + action=NoneForEmptyStringAction, + default="failed", + help="Filter by branch status (default: 'failed')") +parser.add_argument("--count", type=int, + default=3, + help="The number of failed runs to fetch.") +parser.add_argument("--skip-jobs", + default=False, + action='store_true', + help="Skip dumping the job info") +parser.add_argument("--pipeline", type=int, + nargs="+", + default=None, + help="Explicit pipeline ID(s) to fetch.") + + +if __name__ == "__main__": + args = parser.parse_args() + + gl = gitlab.Gitlab(url=args.gitlab, private_token=args.token) + project = gl.projects.get(args.id) + + + pipelines_to_process = [] + + # Use explicit pipeline IDs if provided, otherwise fetch a list + if args.pipeline: + args.count = len(args.pipeline) + for p_id in args.pipeline: + pipelines_to_process.append(project.pipelines.get(p_id)) + else: + # Use an iterator to fetch the pipelines + pipe_iter = project.pipelines.list(iterator=True, + status=args.status, + ref=args.branch) + # Check each failed pipeline + pipelines_to_process = [next(pipe_iter) for _ in range(args.count)] + + # Check each pipeline + for p in pipelines_to_process: + + jobs = p.jobs.list(get_all=True) + failed_jobs = [j for j in jobs if j.status == "failed"] + skipped_jobs = [j for j in jobs if j.status == "skipped"] + manual_jobs = [j for j in jobs if j.status == "manual"] + + trs = p.test_report_summary.get() + total = trs.total["count"] + skipped = trs.total["skipped"] + failed = trs.total["failed"] + + print(f"{p.status} pipeline {p.id}, total jobs {len(jobs)}, " + f"skipped {len(skipped_jobs)}, " + f"failed {len(failed_jobs)}, ", + f"{total} tests, " + f"{skipped} skipped tests, " + f"{failed} failed tests") + + if not args.skip_jobs: + for j in failed_jobs: + print(f" Failed job {j.id}, {j.name}, {j.web_url}") + + # It seems we can only extract failing tests from the full + # test report, maybe there is some way to filter it. + + if failed > 0: + ftr = p.test_report.get() + failed_suites = [s for s in ftr.test_suites if + s["failed_count"] > 0] + for fs in failed_suites: + name = fs["name"] + tests = fs["test_cases"] + failed_tests = [t for t in tests if t["status"] == 'failed'] + for t in failed_tests: + print(f" Failed test {t["classname"]}, {name}, {t["name"]}") diff --git a/semihosting/arm-compat-semi-stub.c b/semihosting/arm-compat-semi-stub.c new file mode 100644 index 0000000..bfa3681 --- /dev/null +++ b/semihosting/arm-compat-semi-stub.c @@ -0,0 +1,19 @@ +/* + * Stubs for platforms different from ARM + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "semihosting/semihost.h" +#include <glib.h> + +bool semihosting_arm_compatible(void) +{ + return false; +} + +void semihosting_arm_compatible_init(void) +{ + g_assert_not_reached(); +} diff --git a/semihosting/arm-compat-semi.c b/semihosting/arm-compat-semi.c index bcd13cd..6100126 100644 --- a/semihosting/arm-compat-semi.c +++ b/semihosting/arm-compat-semi.c @@ -100,6 +100,13 @@ static int gdb_open_modeflags[12] = { GDB_O_RDWR | GDB_O_CREAT | GDB_O_APPEND, }; +/* + * For ARM semihosting, we have a separate structure for routing + * data for the console which is outside the guest fd address space. + */ +static GuestFD console_in_gf; +static GuestFD console_out_gf; + #ifndef CONFIG_USER_ONLY /** @@ -115,7 +122,7 @@ static int gdb_open_modeflags[12] = { */ typedef struct LayoutInfo { - target_ulong rambase; + vaddr rambase; size_t ramsize; hwaddr heapbase; hwaddr heaplimit; @@ -166,8 +173,7 @@ static LayoutInfo common_semi_find_bases(CPUState *cs) #endif -#include "cpu.h" -#include "common-semi-target.h" +#include "semihosting/common-semi.h" /* * Read the input value from the argument block; fail the semihosting @@ -207,7 +213,7 @@ static LayoutInfo common_semi_find_bases(CPUState *cs) * global, and we assume that the guest takes care of avoiding any races. */ #ifndef CONFIG_USER_ONLY -static target_ulong syscall_err; +static uint64_t syscall_err; #include "semihosting/uaccess.h" #endif @@ -253,8 +259,8 @@ static void common_semi_rw_cb(CPUState *cs, uint64_t ret, int err) { /* Recover the original length from the third argument. */ CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); - target_ulong args = common_semi_arg(cs, 1); - target_ulong arg2; + uint64_t args = common_semi_arg(cs, 1); + uint64_t arg2; GET_ARG(2); if (err) { @@ -293,9 +299,9 @@ static void common_semi_seek_cb(CPUState *cs, uint64_t ret, int err) * is defined by GDB's remote protocol and is not target-specific.) * We put this on the guest's stack just below SP. */ -static target_ulong common_semi_flen_buf(CPUState *cs) +static uint64_t common_semi_flen_buf(CPUState *cs) { - target_ulong sp = common_semi_stack_bottom(cs); + vaddr sp = common_semi_stack_bottom(cs); return sp - 64; } @@ -352,6 +358,25 @@ static const uint8_t featurefile_data[] = { SH_EXT_EXIT_EXTENDED | SH_EXT_STDOUT_STDERR, /* Feature byte 0 */ }; +bool semihosting_arm_compatible(void) +{ + return true; +} + +void semihosting_arm_compatible_init(void) +{ + /* For ARM-compat, the console is in a separate namespace. */ + if (use_gdb_syscalls()) { + console_in_gf.type = GuestFDGDB; + console_in_gf.hostfd = 0; + console_out_gf.type = GuestFDGDB; + console_out_gf.hostfd = 2; + } else { + console_in_gf.type = GuestFDConsole; + console_out_gf.type = GuestFDConsole; + } +} + /* * Do a semihosting call. * @@ -363,9 +388,9 @@ static const uint8_t featurefile_data[] = { void do_common_semihosting(CPUState *cs) { CPUArchState *env = cpu_env(cs); - target_ulong args; - target_ulong arg0, arg1, arg2, arg3; - target_ulong ul_ret; + uint64_t args; + uint64_t arg0, arg1, arg2, arg3; + uint64_t ul_ret; char * s; int nr; int64_t elapsed; @@ -436,7 +461,7 @@ void do_common_semihosting(CPUState *cs) case TARGET_SYS_WRITEC: /* - * FIXME: the byte to be written is in a target_ulong slot, + * FIXME: the byte to be written is in a uint64_t slot, * which means this is wrong for a big-endian guest. */ semihost_sys_write_gf(cs, common_semi_dead_cb, @@ -475,10 +500,13 @@ void do_common_semihosting(CPUState *cs) break; case TARGET_SYS_ISERROR: + { GET_ARG(0); - common_semi_set_ret(cs, (target_long)arg0 < 0); + bool ret = is_64bit_semihosting(env) ? + (int64_t)arg0 < 0 : (int32_t)arg0 < 0; + common_semi_set_ret(cs, ret); break; - + } case TARGET_SYS_ISTTY: GET_ARG(0); semihost_sys_isatty(cs, common_semi_istty_cb, arg0); @@ -662,7 +690,7 @@ void do_common_semihosting(CPUState *cs) case TARGET_SYS_HEAPINFO: { - target_ulong retvals[4]; + uint64_t retvals[4]; int i; #ifdef CONFIG_USER_ONLY TaskState *ts = get_task_state(cs); @@ -728,7 +756,8 @@ void do_common_semihosting(CPUState *cs) { uint32_t ret; - if (common_semi_sys_exit_extended(cs, nr)) { + if (nr == TARGET_SYS_EXIT_EXTENDED || + common_semi_sys_exit_is_extended(cs)) { /* * The A64 version of SYS_EXIT takes a parameter block, * so the application-exit type can return a subcode which @@ -759,7 +788,7 @@ void do_common_semihosting(CPUState *cs) case TARGET_SYS_ELAPSED: elapsed = get_clock() - clock_start; - if (sizeof(target_ulong) == 8) { + if (is_64bit_semihosting(env)) { if (SET_ARG(0, elapsed)) { goto do_fault; } diff --git a/semihosting/guestfd.c b/semihosting/guestfd.c index d324143..e8f236c 100644 --- a/semihosting/guestfd.c +++ b/semihosting/guestfd.c @@ -12,35 +12,20 @@ #include "gdbstub/syscalls.h" #include "semihosting/semihost.h" #include "semihosting/guestfd.h" -#ifndef CONFIG_USER_ONLY -#include CONFIG_DEVICES -#endif static GArray *guestfd_array; -#ifdef CONFIG_ARM_COMPATIBLE_SEMIHOSTING -GuestFD console_in_gf; -GuestFD console_out_gf; -#endif - void qemu_semihosting_guestfd_init(void) { /* New entries zero-initialized, i.e. type GuestFDUnused */ guestfd_array = g_array_new(FALSE, TRUE, sizeof(GuestFD)); -#ifdef CONFIG_ARM_COMPATIBLE_SEMIHOSTING - /* For ARM-compat, the console is in a separate namespace. */ - if (use_gdb_syscalls()) { - console_in_gf.type = GuestFDGDB; - console_in_gf.hostfd = 0; - console_out_gf.type = GuestFDGDB; - console_out_gf.hostfd = 2; - } else { - console_in_gf.type = GuestFDConsole; - console_out_gf.type = GuestFDConsole; + if (semihosting_arm_compatible()) { + semihosting_arm_compatible_init(); + return; } -#else - /* Otherwise, the stdio file descriptors apply. */ + + /* Out of ARM, the stdio file descriptors apply. */ guestfd_array = g_array_set_size(guestfd_array, 3); #ifndef CONFIG_USER_ONLY if (!use_gdb_syscalls()) { @@ -54,7 +39,6 @@ void qemu_semihosting_guestfd_init(void) associate_guestfd(0, 0); associate_guestfd(1, 1); associate_guestfd(2, 2); -#endif } /* diff --git a/semihosting/meson.build b/semihosting/meson.build index b1ab250..99f10e2 100644 --- a/semihosting/meson.build +++ b/semihosting/meson.build @@ -1,17 +1,21 @@ -specific_ss.add(when: 'CONFIG_SEMIHOSTING', if_true: files( - 'guestfd.c', - 'syscalls.c', -)) - common_ss.add(when: 'CONFIG_SEMIHOSTING', if_false: files('stubs-all.c')) -user_ss.add(when: 'CONFIG_SEMIHOSTING', if_true: files('user.c')) +user_ss.add(when: 'CONFIG_SEMIHOSTING', if_true: files( + 'user.c', + 'guestfd.c')) system_ss.add(when: 'CONFIG_SEMIHOSTING', if_true: files( 'config.c', 'console.c', + 'guestfd.c', 'uaccess.c', + 'syscalls.c', ), if_false: files( 'stubs-system.c', )) +system_ss.add(when: 'CONFIG_ARM_COMPATIBLE_SEMIHOSTING', + if_true: files('arm-compat-semi.c'), + if_false: files('arm-compat-semi-stub.c')) -specific_ss.add(when: ['CONFIG_ARM_COMPATIBLE_SEMIHOSTING'], +specific_ss.add(when: ['CONFIG_SEMIHOSTING', 'CONFIG_USER_ONLY'], + if_true: files('syscalls.c')) +specific_ss.add(when: ['CONFIG_ARM_COMPATIBLE_SEMIHOSTING', 'CONFIG_USER_ONLY'], if_true: files('arm-compat-semi.c')) diff --git a/semihosting/syscalls.c b/semihosting/syscalls.c index f6451d9..20f155f 100644 --- a/semihosting/syscalls.c +++ b/semihosting/syscalls.c @@ -8,7 +8,6 @@ #include "qemu/osdep.h" #include "qemu/log.h" -#include "cpu.h" #include "gdbstub/syscalls.h" #include "semihosting/guestfd.h" #include "semihosting/syscalls.h" @@ -23,7 +22,7 @@ /* * Validate or compute the length of the string (including terminator). */ -static int validate_strlen(CPUState *cs, target_ulong str, target_ulong tlen) +static int validate_strlen(CPUState *cs, vaddr str, uint64_t tlen) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char c; @@ -52,7 +51,7 @@ static int validate_strlen(CPUState *cs, target_ulong str, target_ulong tlen) } static int validate_lock_user_string(char **pstr, CPUState *cs, - target_ulong tstr, target_ulong tlen) + vaddr tstr, uint64_t tlen) { int ret = validate_strlen(cs, tstr, tlen); CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); @@ -72,7 +71,7 @@ static int validate_lock_user_string(char **pstr, CPUState *cs, * big-endian. Until we do something with gdb, also produce the * same big-endian result from the host. */ -static int copy_stat_to_user(CPUState *cs, target_ulong addr, +static int copy_stat_to_user(CPUState *cs, vaddr addr, const struct stat *s) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); @@ -129,7 +128,7 @@ static void gdb_open_cb(CPUState *cs, uint64_t ret, int err) } static void gdb_open(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, + vaddr fname, uint64_t fname_len, int gdb_flags, int mode) { int len = validate_strlen(cs, fname, fname_len); @@ -140,7 +139,7 @@ static void gdb_open(CPUState *cs, gdb_syscall_complete_cb complete, gdb_open_complete = complete; gdb_do_syscall(gdb_open_cb, "open,%s,%x,%x", - (uint64_t)fname, (uint32_t)len, + (vaddr)fname, (uint32_t)len, (uint32_t)gdb_flags, (uint32_t)mode); } @@ -151,17 +150,17 @@ static void gdb_close(CPUState *cs, gdb_syscall_complete_cb complete, } static void gdb_read(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { gdb_do_syscall(complete, "read,%x,%lx,%lx", - (uint32_t)gf->hostfd, (uint64_t)buf, (uint64_t)len); + (uint32_t)gf->hostfd, (vaddr)buf, (uint64_t)len); } static void gdb_write(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { gdb_do_syscall(complete, "write,%x,%lx,%lx", - (uint32_t)gf->hostfd, (uint64_t)buf, (uint64_t)len); + (uint32_t)gf->hostfd, (vaddr)buf, (uint64_t)len); } static void gdb_lseek(CPUState *cs, gdb_syscall_complete_cb complete, @@ -178,15 +177,15 @@ static void gdb_isatty(CPUState *cs, gdb_syscall_complete_cb complete, } static void gdb_fstat(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong addr) + GuestFD *gf, vaddr addr) { gdb_do_syscall(complete, "fstat,%x,%lx", - (uint32_t)gf->hostfd, (uint64_t)addr); + (uint32_t)gf->hostfd, (vaddr)addr); } static void gdb_stat(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, - target_ulong addr) + vaddr fname, uint64_t fname_len, + vaddr addr) { int len = validate_strlen(cs, fname, fname_len); if (len < 0) { @@ -195,11 +194,11 @@ static void gdb_stat(CPUState *cs, gdb_syscall_complete_cb complete, } gdb_do_syscall(complete, "stat,%s,%lx", - (uint64_t)fname, (uint32_t)len, (uint64_t)addr); + (vaddr)fname, (uint32_t)len, (vaddr)addr); } static void gdb_remove(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len) + vaddr fname, uint64_t fname_len) { int len = validate_strlen(cs, fname, fname_len); if (len < 0) { @@ -207,12 +206,12 @@ static void gdb_remove(CPUState *cs, gdb_syscall_complete_cb complete, return; } - gdb_do_syscall(complete, "unlink,%s", (uint64_t)fname, (uint32_t)len); + gdb_do_syscall(complete, "unlink,%s", (vaddr)fname, (uint32_t)len); } static void gdb_rename(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong oname, target_ulong oname_len, - target_ulong nname, target_ulong nname_len) + vaddr oname, uint64_t oname_len, + vaddr nname, uint64_t nname_len) { int olen, nlen; @@ -228,12 +227,12 @@ static void gdb_rename(CPUState *cs, gdb_syscall_complete_cb complete, } gdb_do_syscall(complete, "rename,%s,%s", - (uint64_t)oname, (uint32_t)olen, - (uint64_t)nname, (uint32_t)nlen); + (vaddr)oname, (uint32_t)olen, + (vaddr)nname, (uint32_t)nlen); } static void gdb_system(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong cmd, target_ulong cmd_len) + vaddr cmd, uint64_t cmd_len) { int len = validate_strlen(cs, cmd, cmd_len); if (len < 0) { @@ -241,14 +240,14 @@ static void gdb_system(CPUState *cs, gdb_syscall_complete_cb complete, return; } - gdb_do_syscall(complete, "system,%s", (uint64_t)cmd, (uint32_t)len); + gdb_do_syscall(complete, "system,%s", (vaddr)cmd, (uint32_t)len); } static void gdb_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong tv_addr, target_ulong tz_addr) + vaddr tv_addr, vaddr tz_addr) { gdb_do_syscall(complete, "gettimeofday,%lx,%lx", - (uint64_t)tv_addr, (uint64_t)tz_addr); + (vaddr)tv_addr, (vaddr)tz_addr); } /* @@ -256,7 +255,7 @@ static void gdb_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete, */ static void host_open(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, + vaddr fname, uint64_t fname_len, int gdb_flags, int mode) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); @@ -316,7 +315,7 @@ static void host_close(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_read(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); void *ptr = lock_user(VERIFY_WRITE, buf, len, 0); @@ -337,7 +336,7 @@ static void host_read(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_write(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); void *ptr = lock_user(VERIFY_READ, buf, len, 1); @@ -395,7 +394,7 @@ static void host_flen(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_fstat(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong addr) + GuestFD *gf, vaddr addr) { struct stat buf; int ret; @@ -410,8 +409,8 @@ static void host_fstat(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_stat(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, - target_ulong addr) + vaddr fname, uint64_t fname_len, + vaddr addr) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); struct stat buf; @@ -440,7 +439,7 @@ static void host_stat(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_remove(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len) + vaddr fname, uint64_t fname_len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char *p; @@ -458,8 +457,8 @@ static void host_remove(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_rename(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong oname, target_ulong oname_len, - target_ulong nname, target_ulong nname_len) + vaddr oname, uint64_t oname_len, + vaddr nname, uint64_t nname_len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char *ostr, *nstr; @@ -484,7 +483,7 @@ static void host_rename(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_system(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong cmd, target_ulong cmd_len) + vaddr cmd, uint64_t cmd_len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char *p; @@ -502,7 +501,7 @@ static void host_system(CPUState *cs, gdb_syscall_complete_cb complete, } static void host_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong tv_addr, target_ulong tz_addr) + vaddr tv_addr, vaddr tz_addr) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); struct gdb_timeval *p; @@ -547,10 +546,10 @@ static void host_poll_one(CPUState *cs, gdb_syscall_complete_cb complete, */ static void staticfile_read(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); - target_ulong rest = gf->staticfile.len - gf->staticfile.off; + uint64_t rest = gf->staticfile.len - gf->staticfile.off; void *ptr; if (len > rest) { @@ -605,7 +604,7 @@ static void staticfile_flen(CPUState *cs, gdb_syscall_complete_cb complete, */ static void console_read(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char *ptr; @@ -622,7 +621,7 @@ static void console_read(CPUState *cs, gdb_syscall_complete_cb complete, } static void console_write(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { CPUArchState *env G_GNUC_UNUSED = cpu_env(cs); char *ptr = lock_user(VERIFY_READ, buf, len, 1); @@ -638,7 +637,7 @@ static void console_write(CPUState *cs, gdb_syscall_complete_cb complete, } static void console_fstat(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong addr) + GuestFD *gf, vaddr addr) { static const struct stat tty_buf = { .st_mode = 020666, /* S_IFCHR, ugo+rw */ @@ -683,7 +682,7 @@ static void console_poll_one(CPUState *cs, gdb_syscall_complete_cb complete, */ void semihost_sys_open(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, + vaddr fname, uint64_t fname_len, int gdb_flags, int mode) { if (use_gdb_syscalls()) { @@ -719,7 +718,7 @@ void semihost_sys_close(CPUState *cs, gdb_syscall_complete_cb complete, int fd) } void semihost_sys_read_gf(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { /* * Bound length for 64-bit guests on 32-bit hosts, not overflowing ssize_t. @@ -748,7 +747,7 @@ void semihost_sys_read_gf(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_read(CPUState *cs, gdb_syscall_complete_cb complete, - int fd, target_ulong buf, target_ulong len) + int fd, vaddr buf, uint64_t len) { GuestFD *gf = get_guestfd(fd); @@ -760,7 +759,7 @@ void semihost_sys_read(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_write_gf(CPUState *cs, gdb_syscall_complete_cb complete, - GuestFD *gf, target_ulong buf, target_ulong len) + GuestFD *gf, vaddr buf, uint64_t len) { /* * Bound length for 64-bit guests on 32-bit hosts, not overflowing ssize_t. @@ -790,7 +789,7 @@ void semihost_sys_write_gf(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_write(CPUState *cs, gdb_syscall_complete_cb complete, - int fd, target_ulong buf, target_ulong len) + int fd, vaddr buf, uint64_t len) { GuestFD *gf = get_guestfd(fd); @@ -856,7 +855,7 @@ void semihost_sys_isatty(CPUState *cs, gdb_syscall_complete_cb complete, int fd) void semihost_sys_flen(CPUState *cs, gdb_syscall_complete_cb fstat_cb, gdb_syscall_complete_cb flen_cb, int fd, - target_ulong fstat_addr) + vaddr fstat_addr) { GuestFD *gf = get_guestfd(fd); @@ -881,7 +880,7 @@ void semihost_sys_flen(CPUState *cs, gdb_syscall_complete_cb fstat_cb, } void semihost_sys_fstat(CPUState *cs, gdb_syscall_complete_cb complete, - int fd, target_ulong addr) + int fd, vaddr addr) { GuestFD *gf = get_guestfd(fd); @@ -906,8 +905,8 @@ void semihost_sys_fstat(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_stat(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len, - target_ulong addr) + vaddr fname, uint64_t fname_len, + vaddr addr) { if (use_gdb_syscalls()) { gdb_stat(cs, complete, fname, fname_len, addr); @@ -917,7 +916,7 @@ void semihost_sys_stat(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_remove(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong fname, target_ulong fname_len) + vaddr fname, uint64_t fname_len) { if (use_gdb_syscalls()) { gdb_remove(cs, complete, fname, fname_len); @@ -927,8 +926,8 @@ void semihost_sys_remove(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_rename(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong oname, target_ulong oname_len, - target_ulong nname, target_ulong nname_len) + vaddr oname, uint64_t oname_len, + vaddr nname, uint64_t nname_len) { if (use_gdb_syscalls()) { gdb_rename(cs, complete, oname, oname_len, nname, nname_len); @@ -938,7 +937,7 @@ void semihost_sys_rename(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_system(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong cmd, target_ulong cmd_len) + vaddr cmd, uint64_t cmd_len) { if (use_gdb_syscalls()) { gdb_system(cs, complete, cmd, cmd_len); @@ -948,7 +947,7 @@ void semihost_sys_system(CPUState *cs, gdb_syscall_complete_cb complete, } void semihost_sys_gettimeofday(CPUState *cs, gdb_syscall_complete_cb complete, - target_ulong tv_addr, target_ulong tz_addr) + vaddr tv_addr, vaddr tz_addr) { if (use_gdb_syscalls()) { gdb_gettimeofday(cs, complete, tv_addr, tz_addr); diff --git a/target/arm/common-semi-target.h b/target/arm/common-semi-target.c index da51f2d..2b77ce9 100644 --- a/target/arm/common-semi-target.h +++ b/target/arm/common-semi-target.c @@ -7,12 +7,12 @@ * SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef TARGET_ARM_COMMON_SEMI_TARGET_H -#define TARGET_ARM_COMMON_SEMI_TARGET_H - +#include "qemu/osdep.h" +#include "cpu.h" +#include "semihosting/common-semi.h" #include "target/arm/cpu-qom.h" -static inline target_ulong common_semi_arg(CPUState *cs, int argno) +uint64_t common_semi_arg(CPUState *cs, int argno) { ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; @@ -23,7 +23,7 @@ static inline target_ulong common_semi_arg(CPUState *cs, int argno) } } -static inline void common_semi_set_ret(CPUState *cs, target_ulong ret) +void common_semi_set_ret(CPUState *cs, uint64_t ret) { ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; @@ -34,27 +34,25 @@ static inline void common_semi_set_ret(CPUState *cs, target_ulong ret) } } -static inline bool common_semi_sys_exit_extended(CPUState *cs, int nr) +bool common_semi_sys_exit_is_extended(CPUState *cs) { - return nr == TARGET_SYS_EXIT_EXTENDED || is_a64(cpu_env(cs)); + return is_a64(cpu_env(cs)); } -static inline bool is_64bit_semihosting(CPUArchState *env) +bool is_64bit_semihosting(CPUArchState *env) { return is_a64(env); } -static inline target_ulong common_semi_stack_bottom(CPUState *cs) +uint64_t common_semi_stack_bottom(CPUState *cs) { ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; return is_a64(env) ? env->xregs[31] : env->regs[13]; } -static inline bool common_semi_has_synccache(CPUArchState *env) +bool common_semi_has_synccache(CPUArchState *env) { /* Ok for A64, invalid for A32/T32 */ return is_a64(env); } - -#endif diff --git a/target/arm/meson.build b/target/arm/meson.build index 914f149..638ee62 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -28,12 +28,16 @@ arm_user_ss.add(files( 'vfp_fpscr.c', 'el2-stubs.c', )) +arm_user_ss.add(when: 'CONFIG_ARM_COMPATIBLE_SEMIHOSTING', + if_true: files('common-semi-target.c')) arm_common_system_ss.add(files('cpu.c')) arm_common_system_ss.add(when: 'TARGET_AARCH64', if_false: files( 'cpu32-stubs.c')) arm_common_system_ss.add(when: 'CONFIG_KVM', if_false: files('kvm-stub.c')) arm_common_system_ss.add(when: 'CONFIG_HVF', if_false: files('hvf-stub.c')) +arm_common_system_ss.add(when: 'CONFIG_ARM_COMPATIBLE_SEMIHOSTING', + if_true: files('common-semi-target.c')) arm_common_system_ss.add(files( 'arch_dump.c', 'arm-powerctl.c', diff --git a/target/riscv/common-semi-target.h b/target/riscv/common-semi-target.c index 7c8a59e..aeaeb88 100644 --- a/target/riscv/common-semi-target.h +++ b/target/riscv/common-semi-target.c @@ -8,43 +8,42 @@ * SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef TARGET_RISCV_COMMON_SEMI_TARGET_H -#define TARGET_RISCV_COMMON_SEMI_TARGET_H +#include "qemu/osdep.h" +#include "cpu.h" +#include "semihosting/common-semi.h" -static inline target_ulong common_semi_arg(CPUState *cs, int argno) +uint64_t common_semi_arg(CPUState *cs, int argno) { RISCVCPU *cpu = RISCV_CPU(cs); CPURISCVState *env = &cpu->env; return env->gpr[xA0 + argno]; } -static inline void common_semi_set_ret(CPUState *cs, target_ulong ret) +void common_semi_set_ret(CPUState *cs, uint64_t ret) { RISCVCPU *cpu = RISCV_CPU(cs); CPURISCVState *env = &cpu->env; env->gpr[xA0] = ret; } -static inline bool common_semi_sys_exit_extended(CPUState *cs, int nr) +bool is_64bit_semihosting(CPUArchState *env) { - return (nr == TARGET_SYS_EXIT_EXTENDED || sizeof(target_ulong) == 8); + return riscv_cpu_mxl(env) != MXL_RV32; } -static inline bool is_64bit_semihosting(CPUArchState *env) +bool common_semi_sys_exit_is_extended(CPUState *cs) { - return riscv_cpu_mxl(env) != MXL_RV32; + return is_64bit_semihosting(cpu_env(cs)); } -static inline target_ulong common_semi_stack_bottom(CPUState *cs) +uint64_t common_semi_stack_bottom(CPUState *cs) { RISCVCPU *cpu = RISCV_CPU(cs); CPURISCVState *env = &cpu->env; return env->gpr[xSP]; } -static inline bool common_semi_has_synccache(CPUArchState *env) +bool common_semi_has_synccache(CPUArchState *env) { return true; } - -#endif diff --git a/target/riscv/meson.build b/target/riscv/meson.build index a4bd61e..fdefe88 100644 --- a/target/riscv/meson.build +++ b/target/riscv/meson.build @@ -8,6 +8,10 @@ gen = [ riscv_ss = ss.source_set() riscv_ss.add(gen) + +riscv_ss.add(when: 'CONFIG_ARM_COMPATIBLE_SEMIHOSTING', + if_true: files('common-semi-target.c')) + riscv_ss.add(files( 'cpu.c', 'cpu_helper.c', diff --git a/tests/functional/x86_64/meson.build b/tests/functional/x86_64/meson.build index 967426c..f78eec5 100644 --- a/tests/functional/x86_64/meson.build +++ b/tests/functional/x86_64/meson.build @@ -33,6 +33,7 @@ tests_x86_64_system_thorough = [ 'replay', 'reverse_debug', 'tuxrun', + 'vfio_user_client', 'virtio_balloon', 'virtio_gpu', ] diff --git a/tests/functional/x86_64/test_vfio_user_client.py b/tests/functional/x86_64/test_vfio_user_client.py new file mode 100755 index 0000000..8bc16e5 --- /dev/null +++ b/tests/functional/x86_64/test_vfio_user_client.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2025 Nutanix, Inc. +# +# Author: +# Mark Cave-Ayland <mark.caveayland@nutanix.com> +# John Levon <john.levon@nutanix.com> +# +# SPDX-License-Identifier: GPL-2.0-or-later +""" +Check basic vfio-user-pci client functionality. The test starts two VMs: + + - the server VM runs the libvfio-user "gpio" example server inside it, + piping vfio-user traffic between a local UNIX socket and a virtio-serial + port. On the host, the virtio-serial port is backed by a local socket. + + - the client VM loads the gpio-pci-idio-16 kernel module, with the + vfio-user client connecting to the above local UNIX socket. + +This way, we don't depend on trying to run a vfio-user server on the host +itself. + +Once both VMs are running, we run some basic configuration on the gpio device +and verify that the server is logging the expected out. As this is consistent +given the same VM images, we just do a simple direct comparison. +""" + +import os + +from qemu_test import Asset +from qemu_test import QemuSystemTest +from qemu_test import exec_command_and_wait_for_pattern +from qemu_test import wait_for_console_pattern + +# Exact output can vary, so we just sample for some expected lines. +EXPECTED_SERVER_LINES = [ + "gpio: adding DMA region [0, 0xc0000) offset=0 flags=0x3", + "gpio: devinfo flags 0x3, num_regions 9, num_irqs 5", + "gpio: region_info[0] offset 0 flags 0 size 0 argsz 32", + "gpio: region_info[1] offset 0 flags 0 size 0 argsz 32", + "gpio: region_info[2] offset 0 flags 0x3 size 256 argsz 32", + "gpio: region_info[3] offset 0 flags 0 size 0 argsz 32", + "gpio: region_info[4] offset 0 flags 0 size 0 argsz 32", + "gpio: region_info[5] offset 0 flags 0 size 0 argsz 32", + "gpio: region_info[7] offset 0 flags 0x3 size 256 argsz 32", + "gpio: region7: read 256 bytes at 0", + "gpio: region7: read 0 from (0x30:4)", + "gpio: cleared EROM", + "gpio: I/O space enabled", + "gpio: memory space enabled", + "gpio: SERR# enabled", + "gpio: region7: wrote 0x103 to (0x4:2)", + "gpio: I/O space enabled", + "gpio: memory space enabled", +] + +class VfioUserClient(QemuSystemTest): + """vfio-user testing class.""" + + ASSET_REPO = 'https://github.com/mcayland-ntx/libvfio-user-test' + + ASSET_KERNEL = Asset( + f'{ASSET_REPO}/raw/refs/heads/main/images/bzImage', + '40292fa6ce95d516e26bccf5974e138d0db65a6de0bc540cabae060fe9dea605' + ) + + ASSET_ROOTFS = Asset( + f'{ASSET_REPO}/raw/refs/heads/main/images/rootfs.ext2', + 'e1e3abae8aebb8e6e77f08b1c531caeacf46250c94c815655c6bbea59fc3d1c1' + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.kernel_path = None + self.rootfs_path = None + + def configure_server_vm_args(self, server_vm, sock_path): + """ + Configuration for the server VM. Set up virtio-serial device backed by + the given socket path. + """ + server_vm.add_args('-kernel', self.kernel_path) + server_vm.add_args('-append', 'console=ttyS0 root=/dev/sda') + server_vm.add_args('-drive', + f"file={self.rootfs_path},if=ide,format=raw,id=drv0") + server_vm.add_args('-snapshot') + server_vm.add_args('-chardev', + f"socket,id=sock0,path={sock_path},telnet=off,server=on,wait=off") + server_vm.add_args('-device', 'virtio-serial') + server_vm.add_args('-device', + 'virtserialport,chardev=sock0,name=org.fedoraproject.port.0') + + def configure_client_vm_args(self, client_vm, sock_path): + """ + Configuration for the client VM. Point the vfio-user-pci device to the + socket path configured above. + """ + + client_vm.add_args('-kernel', self.kernel_path) + client_vm.add_args('-append', 'console=ttyS0 root=/dev/sda') + client_vm.add_args('-drive', + f'file={self.rootfs_path},if=ide,format=raw,id=drv0') + client_vm.add_args('-snapshot') + client_vm.add_args('-device', + '{"driver":"vfio-user-pci",' + + '"socket":{"path": "%s", "type": "unix"}}' % sock_path) + + def setup_vfio_user_pci_server(self, server_vm): + """ + Start the libvfio-user server within the server VM, and arrange + for data to shuttle between its socket and the virtio serial port. + """ + wait_for_console_pattern(self, 'login:', None, server_vm) + exec_command_and_wait_for_pattern(self, 'root', '#', None, server_vm) + + exec_command_and_wait_for_pattern(self, + 'gpio-pci-idio-16 -v /tmp/vfio-user.sock >/var/tmp/gpio.out 2>&1 &', + '#', None, server_vm) + + # wait for libvfio-user socket to appear + while True: + out = exec_command_and_wait_for_pattern(self, + 'ls --color=no /tmp/vfio-user.sock', '#', None, server_vm) + ls_out = out.decode().splitlines()[1].strip() + if ls_out == "/tmp/vfio-user.sock": + break + + exec_command_and_wait_for_pattern(self, + 'socat UNIX-CONNECT:/tmp/vfio-user.sock /dev/vport0p1,ignoreeof ' + + ' &', '#', None, server_vm) + + def test_vfio_user_pci(self): + """Run basic sanity test.""" + + self.set_machine('pc') + self.require_device('virtio-serial') + self.require_device('vfio-user-pci') + + self.kernel_path = self.ASSET_KERNEL.fetch() + self.rootfs_path = self.ASSET_ROOTFS.fetch() + + sock_dir = self.socket_dir() + socket_path = os.path.join(sock_dir.name, 'vfio-user.sock') + + server_vm = self.get_vm(name='server') + server_vm.set_console() + self.configure_server_vm_args(server_vm, socket_path) + + server_vm.launch() + + self.log.debug('starting libvfio-user server') + + self.setup_vfio_user_pci_server(server_vm) + + client_vm = self.get_vm(name="client") + client_vm.set_console() + self.configure_client_vm_args(client_vm, socket_path) + + try: + client_vm.launch() + except: + self.log.error('client VM failed to start, dumping server logs') + exec_command_and_wait_for_pattern(self, 'cat /var/tmp/gpio.out', + '#', None, server_vm) + raise + + self.log.debug('waiting for client VM boot') + + wait_for_console_pattern(self, 'login:', None, client_vm) + exec_command_and_wait_for_pattern(self, 'root', '#', None, client_vm) + + # + # Here, we'd like to actually interact with the gpio device a little + # more as described at: + # + # https://github.com/nutanix/libvfio-user/blob/master/docs/qemu.md + # + # Unfortunately, the buildroot Linux kernel has some undiagnosed issue + # so we don't get /sys/class/gpio. Nonetheless just the basic + # initialization and setup is enough for basic testing of vfio-user. + # + + self.log.debug('collecting libvfio-user server output') + + out = exec_command_and_wait_for_pattern(self, + 'cat /var/tmp/gpio.out', + 'gpio: region2: wrote 0 to (0x1:1)', + None, server_vm) + + gpio_server_out = [s for s in out.decode().splitlines() + if s.startswith("gpio:")] + + for line in EXPECTED_SERVER_LINES: + if line not in gpio_server_out: + self.log.error(f'Missing server debug line: {line}') + self.fail(False) + + +if __name__ == '__main__': + QemuSystemTest.main() |