diff options
-rw-r--r-- | MAINTAINERS | 6 | ||||
-rw-r--r-- | hw/arm/virt-acpi-build.c | 56 | ||||
-rw-r--r-- | hw/arm/virt.c | 124 | ||||
-rw-r--r-- | hw/intc/Makefile.objs | 2 | ||||
-rw-r--r-- | hw/intc/arm_gic_kvm.c | 98 | ||||
-rw-r--r-- | hw/intc/arm_gicv3_common.c | 140 | ||||
-rw-r--r-- | hw/intc/arm_gicv3_kvm.c | 149 | ||||
-rw-r--r-- | hw/intc/vgic_common.h | 35 | ||||
-rw-r--r-- | hw/vfio/Makefile.objs | 2 | ||||
-rw-r--r-- | hw/vfio/common.c | 2 | ||||
-rw-r--r-- | hw/vfio/pci-quirks.c | 1204 | ||||
-rw-r--r-- | hw/vfio/pci.c | 1424 | ||||
-rw-r--r-- | hw/vfio/pci.h | 159 | ||||
-rw-r--r-- | hw/vfio/platform.c | 2 | ||||
-rw-r--r-- | include/hw/acpi/acpi-defs.h | 9 | ||||
-rw-r--r-- | include/hw/arm/virt-acpi-build.h | 1 | ||||
-rw-r--r-- | include/hw/arm/virt.h | 4 | ||||
-rw-r--r-- | include/hw/intc/arm_gicv3_common.h | 68 | ||||
-rw-r--r-- | include/hw/vfio/vfio-common.h | 7 | ||||
-rw-r--r-- | include/sysemu/kvm.h | 26 | ||||
-rw-r--r-- | kvm-all.c | 34 | ||||
-rw-r--r-- | target-arm/kvm.c | 19 | ||||
-rw-r--r-- | target-arm/kvm_arm.h | 19 | ||||
-rw-r--r-- | target-arm/machine.c | 18 | ||||
-rw-r--r-- | trace-events | 87 |
25 files changed, 2244 insertions, 1451 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 71c652b..cf02890 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -266,10 +266,10 @@ F: *win32* ARM Machines ------------ Allwinner-a10 -M: Li Guang <lig.fnst@cn.fujitsu.com> +M: Beniamino Galvani <b.galvani@gmail.com> S: Maintained -F: hw/*/allwinner-a10* -F: include/hw/*/allwinner-a10* +F: hw/*/allwinner* +F: include/hw/*/allwinner* F: hw/arm/cubieboard.c Exynos diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 9088248..1aaff1f 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -114,7 +114,7 @@ static void acpi_dsdt_add_flash(Aml *scope, const MemMapEntry *flash_memmap) { Aml *dev, *crs; hwaddr base = flash_memmap->base; - hwaddr size = flash_memmap->size; + hwaddr size = flash_memmap->size / 2; dev = aml_device("FLS0"); aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); @@ -443,33 +443,43 @@ build_madt(GArray *table_data, GArray *linker, VirtGuestInfo *guest_info, madt = acpi_data_push(table_data, sizeof *madt); - for (i = 0; i < guest_info->smp_cpus; i++) { - AcpiMadtGenericInterrupt *gicc = acpi_data_push(table_data, - sizeof *gicc); - gicc->type = ACPI_APIC_GENERIC_INTERRUPT; - gicc->length = sizeof(*gicc); - gicc->base_address = memmap[VIRT_GIC_CPU].base; - gicc->cpu_interface_number = i; - gicc->arm_mpidr = i; - gicc->uid = i; - if (test_bit(i, cpuinfo->found_cpus)) { - gicc->flags = cpu_to_le32(ACPI_GICC_ENABLED); - } - } - gicd = acpi_data_push(table_data, sizeof *gicd); gicd->type = ACPI_APIC_GENERIC_DISTRIBUTOR; gicd->length = sizeof(*gicd); gicd->base_address = memmap[VIRT_GIC_DIST].base; - gic_msi = acpi_data_push(table_data, sizeof *gic_msi); - gic_msi->type = ACPI_APIC_GENERIC_MSI_FRAME; - gic_msi->length = sizeof(*gic_msi); - gic_msi->gic_msi_frame_id = 0; - gic_msi->base_address = cpu_to_le64(memmap[VIRT_GIC_V2M].base); - gic_msi->flags = cpu_to_le32(1); - gic_msi->spi_count = cpu_to_le16(NUM_GICV2M_SPIS); - gic_msi->spi_base = cpu_to_le16(irqmap[VIRT_GIC_V2M] + ARM_SPI_BASE); + if (guest_info->gic_version == 3) { + AcpiMadtGenericRedistributor *gicr = acpi_data_push(table_data, + sizeof *gicr); + + gicr->type = ACPI_APIC_GENERIC_REDISTRIBUTOR; + gicr->length = sizeof(*gicr); + gicr->base_address = cpu_to_le64(memmap[VIRT_GIC_REDIST].base); + gicr->range_length = cpu_to_le32(memmap[VIRT_GIC_REDIST].size); + } else { + for (i = 0; i < guest_info->smp_cpus; i++) { + AcpiMadtGenericInterrupt *gicc = acpi_data_push(table_data, + sizeof *gicc); + gicc->type = ACPI_APIC_GENERIC_INTERRUPT; + gicc->length = sizeof(*gicc); + gicc->base_address = memmap[VIRT_GIC_CPU].base; + gicc->cpu_interface_number = i; + gicc->arm_mpidr = i; + gicc->uid = i; + if (test_bit(i, cpuinfo->found_cpus)) { + gicc->flags = cpu_to_le32(ACPI_GICC_ENABLED); + } + } + + gic_msi = acpi_data_push(table_data, sizeof *gic_msi); + gic_msi->type = ACPI_APIC_GENERIC_MSI_FRAME; + gic_msi->length = sizeof(*gic_msi); + gic_msi->gic_msi_frame_id = 0; + gic_msi->base_address = cpu_to_le64(memmap[VIRT_GIC_V2M].base); + gic_msi->flags = cpu_to_le32(1); + gic_msi->spi_count = cpu_to_le16(NUM_GICV2M_SPIS); + gic_msi->spi_base = cpu_to_le16(irqmap[VIRT_GIC_V2M] + ARM_SPI_BASE); + } build_header(linker, table_data, (void *)(table_data->data + madt_start), "APIC", diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 6bf0d6d..d25d6cf 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -51,6 +51,7 @@ #include "hw/intc/arm_gic_common.h" #include "kvm_arm.h" #include "hw/smbios/smbios.h" +#include "qapi/visitor.h" /* Number of external interrupt lines to configure the GIC with */ #define NUM_IRQS 256 @@ -81,6 +82,7 @@ typedef struct { MachineState parent; bool secure; bool highmem; + int32_t gic_version; } VirtMachineState; #define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt") @@ -111,6 +113,10 @@ static const MemMapEntry a15memmap[] = { [VIRT_GIC_DIST] = { 0x08000000, 0x00010000 }, [VIRT_GIC_CPU] = { 0x08010000, 0x00010000 }, [VIRT_GIC_V2M] = { 0x08020000, 0x00001000 }, + /* The space in between here is reserved for GICv3 CPU/vCPU/HYP */ + [VIRT_GIC_ITS] = { 0x08080000, 0x00020000 }, + /* This redistributor space allows up to 2*64kB*123 CPUs */ + [VIRT_GIC_REDIST] = { 0x080A0000, 0x00F60000 }, [VIRT_UART] = { 0x09000000, 0x00001000 }, [VIRT_RTC] = { 0x09010000, 0x00001000 }, [VIRT_FW_CFG] = { 0x09020000, 0x0000000a }, @@ -255,7 +261,7 @@ static void fdt_add_psci_node(const VirtBoardInfo *vbi) qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn); } -static void fdt_add_timer_nodes(const VirtBoardInfo *vbi) +static void fdt_add_timer_nodes(const VirtBoardInfo *vbi, int gictype) { /* Note that on A15 h/w these interrupts are level-triggered, * but for the GIC implementation provided by both QEMU and KVM @@ -264,8 +270,11 @@ static void fdt_add_timer_nodes(const VirtBoardInfo *vbi) ARMCPU *armcpu; uint32_t irqflags = GIC_FDT_IRQ_FLAGS_EDGE_LO_HI; - irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START, - GIC_FDT_IRQ_PPI_CPU_WIDTH, (1 << vbi->smp_cpus) - 1); + if (gictype == 2) { + irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START, + GIC_FDT_IRQ_PPI_CPU_WIDTH, + (1 << vbi->smp_cpus) - 1); + } qemu_fdt_add_subnode(vbi->fdt, "/timer"); @@ -355,25 +364,36 @@ static void fdt_add_v2m_gic_node(VirtBoardInfo *vbi) qemu_fdt_setprop_cell(vbi->fdt, "/intc/v2m", "phandle", vbi->v2m_phandle); } -static void fdt_add_gic_node(VirtBoardInfo *vbi) +static void fdt_add_gic_node(VirtBoardInfo *vbi, int type) { vbi->gic_phandle = qemu_fdt_alloc_phandle(vbi->fdt); qemu_fdt_setprop_cell(vbi->fdt, "/", "interrupt-parent", vbi->gic_phandle); qemu_fdt_add_subnode(vbi->fdt, "/intc"); - /* 'cortex-a15-gic' means 'GIC v2' */ - qemu_fdt_setprop_string(vbi->fdt, "/intc", "compatible", - "arm,cortex-a15-gic"); qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#interrupt-cells", 3); qemu_fdt_setprop(vbi->fdt, "/intc", "interrupt-controller", NULL, 0); - qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc", "reg", - 2, vbi->memmap[VIRT_GIC_DIST].base, - 2, vbi->memmap[VIRT_GIC_DIST].size, - 2, vbi->memmap[VIRT_GIC_CPU].base, - 2, vbi->memmap[VIRT_GIC_CPU].size); qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#address-cells", 0x2); qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#size-cells", 0x2); qemu_fdt_setprop(vbi->fdt, "/intc", "ranges", NULL, 0); + if (type == 3) { + qemu_fdt_setprop_string(vbi->fdt, "/intc", "compatible", + "arm,gic-v3"); + qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc", "reg", + 2, vbi->memmap[VIRT_GIC_DIST].base, + 2, vbi->memmap[VIRT_GIC_DIST].size, + 2, vbi->memmap[VIRT_GIC_REDIST].base, + 2, vbi->memmap[VIRT_GIC_REDIST].size); + } else { + /* 'cortex-a15-gic' means 'GIC v2' */ + qemu_fdt_setprop_string(vbi->fdt, "/intc", "compatible", + "arm,cortex-a15-gic"); + qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc", "reg", + 2, vbi->memmap[VIRT_GIC_DIST].base, + 2, vbi->memmap[VIRT_GIC_DIST].size, + 2, vbi->memmap[VIRT_GIC_CPU].base, + 2, vbi->memmap[VIRT_GIC_CPU].size); + } + qemu_fdt_setprop_cell(vbi->fdt, "/intc", "phandle", vbi->gic_phandle); } @@ -396,18 +416,18 @@ static void create_v2m(VirtBoardInfo *vbi, qemu_irq *pic) fdt_add_v2m_gic_node(vbi); } -static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, bool secure) +static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, int type, bool secure) { - /* We create a standalone GIC v2 */ + /* We create a standalone GIC */ DeviceState *gicdev; SysBusDevice *gicbusdev; const char *gictype; int i; - gictype = gic_class_name(); + gictype = (type == 3) ? gicv3_class_name() : gic_class_name(); gicdev = qdev_create(NULL, gictype); - qdev_prop_set_uint32(gicdev, "revision", 2); + qdev_prop_set_uint32(gicdev, "revision", type); qdev_prop_set_uint32(gicdev, "num-cpu", smp_cpus); /* Note that the num-irq property counts both internal and external * interrupts; there are always 32 of the former (mandated by GIC spec). @@ -419,7 +439,11 @@ static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, bool secure) qdev_init_nofail(gicdev); gicbusdev = SYS_BUS_DEVICE(gicdev); sysbus_mmio_map(gicbusdev, 0, vbi->memmap[VIRT_GIC_DIST].base); - sysbus_mmio_map(gicbusdev, 1, vbi->memmap[VIRT_GIC_CPU].base); + if (type == 3) { + sysbus_mmio_map(gicbusdev, 1, vbi->memmap[VIRT_GIC_REDIST].base); + } else { + sysbus_mmio_map(gicbusdev, 1, vbi->memmap[VIRT_GIC_CPU].base); + } /* Wire the outputs from each CPU's generic timer to the * appropriate GIC PPI inputs, and the GIC's IRQ output to @@ -454,9 +478,11 @@ static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, bool secure) pic[i] = qdev_get_gpio_in(gicdev, i); } - fdt_add_gic_node(vbi); + fdt_add_gic_node(vbi, type); - create_v2m(vbi, pic); + if (type == 2) { + create_v2m(vbi, pic); + } } static void create_uart(const VirtBoardInfo *vbi, qemu_irq *pic) @@ -773,7 +799,10 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic, qemu_fdt_setprop_cells(vbi->fdt, nodename, "bus-range", 0, nr_pcie_buses - 1); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "msi-parent", vbi->v2m_phandle); + if (vbi->v2m_phandle) { + qemu_fdt_setprop_cells(vbi->fdt, nodename, "msi-parent", + vbi->v2m_phandle); + } qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", 2, base_ecam, 2, size_ecam); @@ -888,6 +917,7 @@ static void machvirt_init(MachineState *machine) VirtMachineState *vms = VIRT_MACHINE(machine); qemu_irq pic[NUM_IRQS]; MemoryRegion *sysmem = get_system_memory(); + int gic_version = vms->gic_version; int n; MemoryRegion *ram = g_new(MemoryRegion, 1); const char *cpu_model = machine->cpu_model; @@ -900,6 +930,18 @@ static void machvirt_init(MachineState *machine) cpu_model = "cortex-a15"; } + /* We can probe only here because during property set + * KVM is not available yet + */ + if (!gic_version) { + gic_version = kvm_arm_vgic_probe(); + if (!gic_version) { + error_report("Unable to determine GIC version supported by host\n" + "Probably KVM acceleration is not supported\n"); + exit(1); + } + } + /* Separate the actual CPU model name from any appended features */ cpustr = g_strsplit(cpu_model, ",", 2); @@ -960,7 +1002,7 @@ static void machvirt_init(MachineState *machine) object_property_set_bool(cpuobj, true, "realized", NULL); } g_strfreev(cpustr); - fdt_add_timer_nodes(vbi); + fdt_add_timer_nodes(vbi, gic_version); fdt_add_cpu_nodes(vbi); fdt_add_psci_node(vbi); @@ -970,7 +1012,7 @@ static void machvirt_init(MachineState *machine) create_flash(vbi); - create_gic(vbi, pic, vms->secure); + create_gic(vbi, pic, gic_version, vms->secure); create_uart(vbi, pic); @@ -992,6 +1034,7 @@ static void machvirt_init(MachineState *machine) guest_info->memmap = vbi->memmap; guest_info->irqmap = vbi->irqmap; guest_info->use_highmem = vms->highmem; + guest_info->gic_version = gic_version; guest_info_state->machine_done.notify = virt_guest_info_machine_done; qemu_add_machine_init_done_notifier(&guest_info_state->machine_done); @@ -1043,6 +1086,31 @@ static void virt_set_highmem(Object *obj, bool value, Error **errp) vms->highmem = value; } +static char *virt_get_gic_version(Object *obj, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + const char *val = vms->gic_version == 3 ? "3" : "2"; + + return g_strdup(val); +} + +static void virt_set_gic_version(Object *obj, const char *value, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + + if (!strcmp(value, "3")) { + vms->gic_version = 3; + } else if (!strcmp(value, "2")) { + vms->gic_version = 2; + } else if (!strcmp(value, "host")) { + vms->gic_version = 0; /* Will probe later */ + } else { + error_report("Invalid gic-version option value\n" + "Allowed values are: 3, 2, host\n"); + exit(1); + } +} + static void virt_instance_init(Object *obj) { VirtMachineState *vms = VIRT_MACHINE(obj); @@ -1067,6 +1135,13 @@ static void virt_instance_init(Object *obj) "Set on/off to enable/disable using " "physical address space above 32 bits", NULL); + /* Default GIC type is v2 */ + vms->gic_version = 2; + object_property_add_str(obj, "gic-version", virt_get_gic_version, + virt_set_gic_version, NULL); + object_property_set_description(obj, "gic-version", + "Set GIC version. " + "Valid values are 2, 3 and host", NULL); } static void virt_class_init(ObjectClass *oc, void *data) @@ -1075,7 +1150,10 @@ static void virt_class_init(ObjectClass *oc, void *data) mc->desc = "ARM Virtual Machine", mc->init = machvirt_init; - mc->max_cpus = 8; + /* Our maximum number of CPUs depends on how many redistributors + * we can fit into memory map + */ + mc->max_cpus = a15memmap[VIRT_GIC_REDIST].size / 0x20000; mc->has_dynamic_sysbus = true; mc->block_default_type = IF_VIRTIO; mc->no_cdrom = 1; diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs index 092d8a8..004b0c2 100644 --- a/hw/intc/Makefile.objs +++ b/hw/intc/Makefile.objs @@ -12,10 +12,12 @@ common-obj-$(CONFIG_IOAPIC) += ioapic_common.o common-obj-$(CONFIG_ARM_GIC) += arm_gic_common.o common-obj-$(CONFIG_ARM_GIC) += arm_gic.o common-obj-$(CONFIG_ARM_GIC) += arm_gicv2m.o +common-obj-$(CONFIG_ARM_GIC) += arm_gicv3_common.o common-obj-$(CONFIG_OPENPIC) += openpic.o obj-$(CONFIG_APIC) += apic.o apic_common.o obj-$(CONFIG_ARM_GIC_KVM) += arm_gic_kvm.o +obj-$(call land,$(CONFIG_ARM_GIC_KVM),$(TARGET_AARCH64)) += arm_gicv3_kvm.o obj-$(CONFIG_STELLARIS) += armv7m_nvic.o obj-$(CONFIG_EXYNOS4) += exynos4210_gic.o exynos4210_combiner.o obj-$(CONFIG_GRLIB) += grlib_irqmp.o diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c index e5d0f67..e8b2386 100644 --- a/hw/intc/arm_gic_kvm.c +++ b/hw/intc/arm_gic_kvm.c @@ -23,6 +23,7 @@ #include "sysemu/kvm.h" #include "kvm_arm.h" #include "gic_internal.h" +#include "vgic_common.h" //#define DEBUG_GIC_KVM @@ -52,7 +53,7 @@ typedef struct KVMARMGICClass { void (*parent_reset)(DeviceState *dev); } KVMARMGICClass; -static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) +void kvm_arm_gic_set_irq(uint32_t num_irq, int irq, int level) { /* Meaning of the 'irq' parameter: * [0..N-1] : external interrupts @@ -63,10 +64,9 @@ static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) * has separate fields in the irq number for type, * CPU number and interrupt number. */ - GICState *s = (GICState *)opaque; int kvm_irq, irqtype, cpu; - if (irq < (s->num_irq - GIC_INTERNAL)) { + if (irq < (num_irq - GIC_INTERNAL)) { /* External interrupt. The kernel numbers these like the GIC * hardware, with external interrupt IDs starting after the * internal ones. @@ -77,7 +77,7 @@ static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) } else { /* Internal interrupt: decode into (cpu, interrupt id) */ irqtype = KVM_ARM_IRQ_TYPE_PPI; - irq -= (s->num_irq - GIC_INTERNAL); + irq -= (num_irq - GIC_INTERNAL); cpu = irq / GIC_INTERNAL; irq %= GIC_INTERNAL; } @@ -87,69 +87,36 @@ static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) kvm_set_irq(kvm_state, kvm_irq, !!level); } -static bool kvm_arm_gic_can_save_restore(GICState *s) -{ - return s->dev_fd >= 0; -} - -static bool kvm_gic_supports_attr(GICState *s, int group, int attrnum) +static void kvm_arm_gicv2_set_irq(void *opaque, int irq, int level) { - struct kvm_device_attr attr = { - .group = group, - .attr = attrnum, - .flags = 0, - }; - - if (s->dev_fd == -1) { - return false; - } + GICState *s = (GICState *)opaque; - return kvm_device_ioctl(s->dev_fd, KVM_HAS_DEVICE_ATTR, &attr) == 0; + kvm_arm_gic_set_irq(s->num_irq, irq, level); } -static void kvm_gic_access(GICState *s, int group, int offset, - int cpu, uint32_t *val, bool write) +static bool kvm_arm_gic_can_save_restore(GICState *s) { - struct kvm_device_attr attr; - int type; - int err; - - cpu = cpu & 0xff; - - attr.flags = 0; - attr.group = group; - attr.attr = (((uint64_t)cpu << KVM_DEV_ARM_VGIC_CPUID_SHIFT) & - KVM_DEV_ARM_VGIC_CPUID_MASK) | - (((uint64_t)offset << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) & - KVM_DEV_ARM_VGIC_OFFSET_MASK); - attr.addr = (uintptr_t)val; - - if (write) { - type = KVM_SET_DEVICE_ATTR; - } else { - type = KVM_GET_DEVICE_ATTR; - } - - err = kvm_device_ioctl(s->dev_fd, type, &attr); - if (err < 0) { - fprintf(stderr, "KVM_{SET/GET}_DEVICE_ATTR failed: %s\n", - strerror(-err)); - abort(); - } + return s->dev_fd >= 0; } +#define KVM_VGIC_ATTR(offset, cpu) \ + ((((uint64_t)(cpu) << KVM_DEV_ARM_VGIC_CPUID_SHIFT) & \ + KVM_DEV_ARM_VGIC_CPUID_MASK) | \ + (((uint64_t)(offset) << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) & \ + KVM_DEV_ARM_VGIC_OFFSET_MASK)) + static void kvm_gicd_access(GICState *s, int offset, int cpu, uint32_t *val, bool write) { - kvm_gic_access(s, KVM_DEV_ARM_VGIC_GRP_DIST_REGS, - offset, cpu, val, write); + kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS, + KVM_VGIC_ATTR(offset, cpu), val, write); } static void kvm_gicc_access(GICState *s, int offset, int cpu, uint32_t *val, bool write) { - kvm_gic_access(s, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, - offset, cpu, val, write); + kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_ATTR(offset, cpu), val, write); } #define for_each_irq_reg(_ctr, _max_irq, _field_width) \ @@ -559,7 +526,7 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) return; } - gic_init_irqs_and_mmio(s, kvm_arm_gic_set_irq, NULL); + gic_init_irqs_and_mmio(s, kvm_arm_gicv2_set_irq, NULL); for (i = 0; i < s->num_irq - GIC_INTERNAL; i++) { qemu_irq irq = qdev_get_gpio_in(dev, i); @@ -571,23 +538,24 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) ret = kvm_create_device(kvm_state, KVM_DEV_TYPE_ARM_VGIC_V2, false); if (ret >= 0) { s->dev_fd = ret; + + /* Newstyle API is used, we may have attributes */ + if (kvm_device_check_attr(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0)) { + uint32_t numirqs = s->num_irq; + kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, + &numirqs, true); + } + /* Tell the kernel to complete VGIC initialization now */ + if (kvm_device_check_attr(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT)) { + kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + } } else if (ret != -ENODEV && ret != -ENOTSUP) { error_setg_errno(errp, -ret, "error creating in-kernel VGIC"); return; } - if (kvm_gic_supports_attr(s, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0)) { - uint32_t numirqs = s->num_irq; - kvm_gic_access(s, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, 0, &numirqs, 1); - } - - /* Tell the kernel to complete VGIC initialization now */ - if (kvm_gic_supports_attr(s, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT)) { - kvm_gic_access(s, KVM_DEV_ARM_VGIC_GRP_CTRL, - KVM_DEV_ARM_VGIC_CTRL_INIT, 0, 0, 1); - } - /* Distributor */ kvm_arm_register_device(&s->iomem, (KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT) diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c new file mode 100644 index 0000000..032ece2 --- /dev/null +++ b/hw/intc/arm_gicv3_common.c @@ -0,0 +1,140 @@ +/* + * ARM GICv3 support - common bits of emulated and KVM kernel model + * + * Copyright (c) 2012 Linaro Limited + * Copyright (c) 2015 Huawei. + * Written by Peter Maydell + * Extended to 64 cores by Shlomo Pongratz + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "hw/intc/arm_gicv3_common.h" + +static void gicv3_pre_save(void *opaque) +{ + GICv3State *s = (GICv3State *)opaque; + ARMGICv3CommonClass *c = ARM_GICV3_COMMON_GET_CLASS(s); + + if (c->pre_save) { + c->pre_save(s); + } +} + +static int gicv3_post_load(void *opaque, int version_id) +{ + GICv3State *s = (GICv3State *)opaque; + ARMGICv3CommonClass *c = ARM_GICV3_COMMON_GET_CLASS(s); + + if (c->post_load) { + c->post_load(s); + } + return 0; +} + +static const VMStateDescription vmstate_gicv3 = { + .name = "arm_gicv3", + .unmigratable = 1, + .pre_save = gicv3_pre_save, + .post_load = gicv3_post_load, +}; + +void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, + const MemoryRegionOps *ops) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(s); + int i; + + /* For the GIC, also expose incoming GPIO lines for PPIs for each CPU. + * GPIO array layout is thus: + * [0..N-1] spi + * [N..N+31] PPIs for CPU 0 + * [N+32..N+63] PPIs for CPU 1 + * ... + */ + i = s->num_irq - GIC_INTERNAL + GIC_INTERNAL * s->num_cpu; + qdev_init_gpio_in(DEVICE(s), handler, i); + + s->parent_irq = g_malloc(s->num_cpu * sizeof(qemu_irq)); + s->parent_fiq = g_malloc(s->num_cpu * sizeof(qemu_irq)); + + for (i = 0; i < s->num_cpu; i++) { + sysbus_init_irq(sbd, &s->parent_irq[i]); + } + for (i = 0; i < s->num_cpu; i++) { + sysbus_init_irq(sbd, &s->parent_fiq[i]); + } + + memory_region_init_io(&s->iomem_dist, OBJECT(s), ops, s, + "gicv3_dist", 0x10000); + memory_region_init_io(&s->iomem_redist, OBJECT(s), ops ? &ops[1] : NULL, s, + "gicv3_redist", 0x20000 * s->num_cpu); + + sysbus_init_mmio(sbd, &s->iomem_dist); + sysbus_init_mmio(sbd, &s->iomem_redist); +} + +static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) +{ + GICv3State *s = ARM_GICV3_COMMON(dev); + + /* revision property is actually reserved and currently used only in order + * to keep the interface compatible with GICv2 code, avoiding extra + * conditions. However, in future it could be used, for example, if we + * implement GICv4. + */ + if (s->revision != 3) { + error_setg(errp, "unsupported GIC revision %d", s->revision); + return; + } +} + +static void arm_gicv3_common_reset(DeviceState *dev) +{ + /* TODO */ +} + +static Property arm_gicv3_common_properties[] = { + DEFINE_PROP_UINT32("num-cpu", GICv3State, num_cpu, 1), + DEFINE_PROP_UINT32("num-irq", GICv3State, num_irq, 32), + DEFINE_PROP_UINT32("revision", GICv3State, revision, 3), + DEFINE_PROP_BOOL("has-security-extensions", GICv3State, security_extn, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void arm_gicv3_common_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = arm_gicv3_common_reset; + dc->realize = arm_gicv3_common_realize; + dc->props = arm_gicv3_common_properties; + dc->vmsd = &vmstate_gicv3; +} + +static const TypeInfo arm_gicv3_common_type = { + .name = TYPE_ARM_GICV3_COMMON, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(GICv3State), + .class_size = sizeof(ARMGICv3CommonClass), + .class_init = arm_gicv3_common_class_init, + .abstract = true, +}; + +static void register_types(void) +{ + type_register_static(&arm_gicv3_common_type); +} + +type_init(register_types) diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c new file mode 100644 index 0000000..b48f78f --- /dev/null +++ b/hw/intc/arm_gicv3_kvm.c @@ -0,0 +1,149 @@ +/* + * ARM Generic Interrupt Controller using KVM in-kernel support + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Written by Pavel Fedin + * Based on vGICv2 code by Peter Maydell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "hw/intc/arm_gicv3_common.h" +#include "hw/sysbus.h" +#include "sysemu/kvm.h" +#include "kvm_arm.h" +#include "vgic_common.h" + +#ifdef DEBUG_GICV3_KVM +#define DPRINTF(fmt, ...) \ + do { fprintf(stderr, "kvm_gicv3: " fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ + do { } while (0) +#endif + +#define TYPE_KVM_ARM_GICV3 "kvm-arm-gicv3" +#define KVM_ARM_GICV3(obj) \ + OBJECT_CHECK(GICv3State, (obj), TYPE_KVM_ARM_GICV3) +#define KVM_ARM_GICV3_CLASS(klass) \ + OBJECT_CLASS_CHECK(KVMARMGICv3Class, (klass), TYPE_KVM_ARM_GICV3) +#define KVM_ARM_GICV3_GET_CLASS(obj) \ + OBJECT_GET_CLASS(KVMARMGICv3Class, (obj), TYPE_KVM_ARM_GICV3) + +typedef struct KVMARMGICv3Class { + ARMGICv3CommonClass parent_class; + DeviceRealize parent_realize; + void (*parent_reset)(DeviceState *dev); +} KVMARMGICv3Class; + +static void kvm_arm_gicv3_set_irq(void *opaque, int irq, int level) +{ + GICv3State *s = (GICv3State *)opaque; + + kvm_arm_gic_set_irq(s->num_irq, irq, level); +} + +static void kvm_arm_gicv3_put(GICv3State *s) +{ + /* TODO */ + DPRINTF("Cannot put kernel gic state, no kernel interface\n"); +} + +static void kvm_arm_gicv3_get(GICv3State *s) +{ + /* TODO */ + DPRINTF("Cannot get kernel gic state, no kernel interface\n"); +} + +static void kvm_arm_gicv3_reset(DeviceState *dev) +{ + GICv3State *s = ARM_GICV3_COMMON(dev); + KVMARMGICv3Class *kgc = KVM_ARM_GICV3_GET_CLASS(s); + + DPRINTF("Reset\n"); + + kgc->parent_reset(dev); + kvm_arm_gicv3_put(s); +} + +static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) +{ + GICv3State *s = KVM_ARM_GICV3(dev); + KVMARMGICv3Class *kgc = KVM_ARM_GICV3_GET_CLASS(s); + Error *local_err = NULL; + + DPRINTF("kvm_arm_gicv3_realize\n"); + + kgc->parent_realize(dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + if (s->security_extn) { + error_setg(errp, "the in-kernel VGICv3 does not implement the " + "security extensions"); + return; + } + + gicv3_init_irqs_and_mmio(s, kvm_arm_gicv3_set_irq, NULL); + + /* Try to create the device via the device control API */ + s->dev_fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_ARM_VGIC_V3, false); + if (s->dev_fd < 0) { + error_setg_errno(errp, -s->dev_fd, "error creating in-kernel VGIC"); + return; + } + + kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, + 0, &s->num_irq, true); + + /* Tell the kernel to complete VGIC initialization now */ + kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); + + kvm_arm_register_device(&s->iomem_dist, -1, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, s->dev_fd); + kvm_arm_register_device(&s->iomem_redist, -1, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, s->dev_fd); +} + +static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_CLASS(klass); + KVMARMGICv3Class *kgc = KVM_ARM_GICV3_CLASS(klass); + + agcc->pre_save = kvm_arm_gicv3_get; + agcc->post_load = kvm_arm_gicv3_put; + kgc->parent_realize = dc->realize; + kgc->parent_reset = dc->reset; + dc->realize = kvm_arm_gicv3_realize; + dc->reset = kvm_arm_gicv3_reset; +} + +static const TypeInfo kvm_arm_gicv3_info = { + .name = TYPE_KVM_ARM_GICV3, + .parent = TYPE_ARM_GICV3_COMMON, + .instance_size = sizeof(GICv3State), + .class_init = kvm_arm_gicv3_class_init, + .class_size = sizeof(KVMARMGICv3Class), +}; + +static void kvm_arm_gicv3_register_types(void) +{ + type_register_static(&kvm_arm_gicv3_info); +} + +type_init(kvm_arm_gicv3_register_types) diff --git a/hw/intc/vgic_common.h b/hw/intc/vgic_common.h new file mode 100644 index 0000000..80d919e --- /dev/null +++ b/hw/intc/vgic_common.h @@ -0,0 +1,35 @@ +/* + * ARM KVM vGIC utility functions + * + * Copyright (c) 2015 Samsung Electronics + * Written by Pavel Fedin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef QEMU_ARM_VGIC_COMMON_H +#define QEMU_ARM_VGIC_COMMON_H + +/** + * kvm_arm_gic_set_irq - Send an IRQ to the in-kernel vGIC + * @num_irq: Total number of IRQs configured for the GIC instance + * @irq: qemu internal IRQ line number: + * [0..N-1] : external interrupts + * [N..N+31] : PPI (internal) interrupts for CPU 0 + * [N+32..N+63] : PPI (internal interrupts for CPU 1 + * @level: level of the IRQ line. + */ +void kvm_arm_gic_set_irq(uint32_t num_irq, int irq, int level); + +#endif diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index d540c9d..d324863 100644 --- a/hw/vfio/Makefile.objs +++ b/hw/vfio/Makefile.objs @@ -1,6 +1,6 @@ ifeq ($(CONFIG_LINUX), y) obj-$(CONFIG_SOFTMMU) += common.o -obj-$(CONFIG_PCI) += pci.o +obj-$(CONFIG_PCI) += pci.o pci-quirks.o obj-$(CONFIG_SOFTMMU) += platform.o obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o endif diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6d21311..0d341a3 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -496,7 +496,7 @@ int vfio_mmap_region(Object *obj, VFIORegion *region, int ret = 0; VFIODevice *vbasedev = region->vbasedev; - if (vbasedev->allow_mmap && size && region->flags & + if (!vbasedev->no_mmap && size && region->flags & VFIO_REGION_INFO_FLAG_MMAP) { int prot = 0; diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c new file mode 100644 index 0000000..2bdaef1 --- /dev/null +++ b/hw/vfio/pci-quirks.c @@ -0,0 +1,1204 @@ +/* + * device quirks for PCI devices + * + * Copyright Red Hat, Inc. 2012-2015 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "pci.h" +#include "trace.h" +#include "qemu/range.h" + +/* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */ +static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device) +{ + return (vendor == PCI_ANY_ID || vendor == vdev->vendor_id) && + (device == PCI_ANY_ID || device == vdev->device_id); +} + +static bool vfio_is_vga(VFIOPCIDevice *vdev) +{ + PCIDevice *pdev = &vdev->pdev; + uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE); + + return class == PCI_CLASS_DISPLAY_VGA; +} + +/* + * List of device ids/vendor ids for which to disable + * option rom loading. This avoids the guest hangs during rom + * execution as noticed with the BCM 57810 card for lack of a + * more better way to handle such issues. + * The user can still override by specifying a romfile or + * rombar=1. + * Please see https://bugs.launchpad.net/qemu/+bug/1284874 + * for an analysis of the 57810 card hang. When adding + * a new vendor id/device id combination below, please also add + * your card/environment details and information that could + * help in debugging to the bug tracking this issue + */ +static const struct { + uint32_t vendor; + uint32_t device; +} romblacklist[] = { + { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */ +}; + +bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev) +{ + int i; + + for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) { + if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) { + trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name, + romblacklist[i].vendor, + romblacklist[i].device); + return true; + } + } + return false; +} + +/* + * Device specific region quirks (mostly backdoors to PCI config space) + */ + +/* + * The generic window quirks operate on an address and data register, + * vfio_generic_window_address_quirk handles the address register and + * vfio_generic_window_data_quirk handles the data register. These ops + * pass reads and writes through to hardware until a value matching the + * stored address match/mask is written. When this occurs, the data + * register access emulated PCI config space for the device rather than + * passing through accesses. This enables devices where PCI config space + * is accessible behind a window register to maintain the virtualization + * provided through vfio. + */ +typedef struct VFIOConfigWindowMatch { + uint32_t match; + uint32_t mask; +} VFIOConfigWindowMatch; + +typedef struct VFIOConfigWindowQuirk { + struct VFIOPCIDevice *vdev; + + uint32_t address_val; + + uint32_t address_offset; + uint32_t data_offset; + + bool window_enabled; + uint8_t bar; + + MemoryRegion *addr_mem; + MemoryRegion *data_mem; + + uint32_t nr_matches; + VFIOConfigWindowMatch matches[]; +} VFIOConfigWindowQuirk; + +static uint64_t vfio_generic_window_quirk_address_read(void *opaque, + hwaddr addr, + unsigned size) +{ + VFIOConfigWindowQuirk *window = opaque; + VFIOPCIDevice *vdev = window->vdev; + + return vfio_region_read(&vdev->bars[window->bar].region, + addr + window->address_offset, size); +} + +static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr, + uint64_t data, + unsigned size) +{ + VFIOConfigWindowQuirk *window = opaque; + VFIOPCIDevice *vdev = window->vdev; + int i; + + window->window_enabled = false; + + vfio_region_write(&vdev->bars[window->bar].region, + addr + window->address_offset, data, size); + + for (i = 0; i < window->nr_matches; i++) { + if ((data & ~window->matches[i].mask) == window->matches[i].match) { + window->window_enabled = true; + window->address_val = data & window->matches[i].mask; + trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name, + memory_region_name(window->addr_mem), data); + break; + } + } +} + +static const MemoryRegionOps vfio_generic_window_address_quirk = { + .read = vfio_generic_window_quirk_address_read, + .write = vfio_generic_window_quirk_address_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t vfio_generic_window_quirk_data_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOConfigWindowQuirk *window = opaque; + VFIOPCIDevice *vdev = window->vdev; + uint64_t data; + + /* Always read data reg, discard if window enabled */ + data = vfio_region_read(&vdev->bars[window->bar].region, + addr + window->data_offset, size); + + if (window->window_enabled) { + data = vfio_pci_read_config(&vdev->pdev, window->address_val, size); + trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name, + memory_region_name(window->data_mem), data); + } + + return data; +} + +static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOConfigWindowQuirk *window = opaque; + VFIOPCIDevice *vdev = window->vdev; + + if (window->window_enabled) { + vfio_pci_write_config(&vdev->pdev, window->address_val, data, size); + trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name, + memory_region_name(window->data_mem), data); + return; + } + + vfio_region_write(&vdev->bars[window->bar].region, + addr + window->data_offset, data, size); +} + +static const MemoryRegionOps vfio_generic_window_data_quirk = { + .read = vfio_generic_window_quirk_data_read, + .write = vfio_generic_window_quirk_data_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +/* + * The generic mirror quirk handles devices which expose PCI config space + * through a region within a BAR. When enabled, reads and writes are + * redirected through to emulated PCI config space. XXX if PCI config space + * used memory regions, this could just be an alias. + */ +typedef struct VFIOConfigMirrorQuirk { + struct VFIOPCIDevice *vdev; + uint32_t offset; + uint8_t bar; + MemoryRegion *mem; +} VFIOConfigMirrorQuirk; + +static uint64_t vfio_generic_quirk_mirror_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOConfigMirrorQuirk *mirror = opaque; + VFIOPCIDevice *vdev = mirror->vdev; + uint64_t data; + + /* Read and discard in case the hardware cares */ + (void)vfio_region_read(&vdev->bars[mirror->bar].region, + addr + mirror->offset, size); + + data = vfio_pci_read_config(&vdev->pdev, addr, size); + trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name, + memory_region_name(mirror->mem), + addr, data); + return data; +} + +static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOConfigMirrorQuirk *mirror = opaque; + VFIOPCIDevice *vdev = mirror->vdev; + + vfio_pci_write_config(&vdev->pdev, addr, data, size); + trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name, + memory_region_name(mirror->mem), + addr, data); +} + +static const MemoryRegionOps vfio_generic_mirror_quirk = { + .read = vfio_generic_quirk_mirror_read, + .write = vfio_generic_quirk_mirror_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +/* Is range1 fully contained within range2? */ +static bool vfio_range_contained(uint64_t first1, uint64_t len1, + uint64_t first2, uint64_t len2) { + return (first1 >= first2 && first1 + len1 <= first2 + len2); +} + +#define PCI_VENDOR_ID_ATI 0x1002 + +/* + * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR + * through VGA register 0x3c3. On newer cards, the I/O port BAR is always + * BAR4 (older cards like the X550 used BAR1, but we don't care to support + * those). Note that on bare metal, a read of 0x3c3 doesn't always return the + * I/O port BAR address. Originally this was coded to return the virtual BAR + * address only if the physical register read returns the actual BAR address, + * but users have reported greater success if we return the virtual address + * unconditionally. + */ +static uint64_t vfio_ati_3c3_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOPCIDevice *vdev = opaque; + uint64_t data = vfio_pci_read_config(&vdev->pdev, + PCI_BASE_ADDRESS_4 + 1, size); + + trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data); + + return data; +} + +static const MemoryRegionOps vfio_ati_3c3_quirk = { + .read = vfio_ati_3c3_quirk_read, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) +{ + VFIOQuirk *quirk; + + /* + * As long as the BAR is >= 256 bytes it will be aligned such that the + * lower byte is always zero. Filter out anything else, if it exists. + */ + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || + !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 1); + quirk->nr_mem = 1; + + memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev, + "vfio-ati-3c3-quirk", 1); + memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + 3 /* offset 3 bytes from 0x3c0 */, quirk->mem); + + QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks, + quirk, next); + + trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name); +} + +/* + * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI + * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access + * the MMIO space directly, but a window to this space is provided through + * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the + * data register. When the address is programmed to a range of 0x4000-0x4fff + * PCI configuration space is available. Experimentation seems to indicate + * that read-only may be provided by hardware. + */ +static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOConfigWindowQuirk *window; + + /* This windows doesn't seem to be used except by legacy VGA code */ + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || + !vdev->has_vga || nr != 4) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 2); + quirk->nr_mem = 2; + window = quirk->data = g_malloc0(sizeof(*window) + + sizeof(VFIOConfigWindowMatch)); + window->vdev = vdev; + window->address_offset = 0; + window->data_offset = 4; + window->nr_matches = 1; + window->matches[0].match = 0x4000; + window->matches[0].mask = PCIE_CONFIG_SPACE_SIZE - 1; + window->bar = nr; + window->addr_mem = &quirk->mem[0]; + window->data_mem = &quirk->mem[1]; + + memory_region_init_io(window->addr_mem, OBJECT(vdev), + &vfio_generic_window_address_quirk, window, + "vfio-ati-bar4-window-address-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + window->address_offset, + window->addr_mem, 1); + + memory_region_init_io(window->data_mem, OBJECT(vdev), + &vfio_generic_window_data_quirk, window, + "vfio-ati-bar4-window-data-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + window->data_offset, + window->data_mem, 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name); +} + +/* + * Trap the BAR2 MMIO mirror to config space as well. + */ +static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOConfigMirrorQuirk *mirror; + + /* Only enable on newer devices where BAR2 is 64bit */ + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || + !vdev->has_vga || nr != 2 || !vdev->bars[2].mem64) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + mirror = quirk->data = g_malloc0(sizeof(*mirror)); + mirror->mem = quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 1); + quirk->nr_mem = 1; + mirror->vdev = vdev; + mirror->offset = 0x4000; + mirror->bar = nr; + + memory_region_init_io(mirror->mem, OBJECT(vdev), + &vfio_generic_mirror_quirk, mirror, + "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + mirror->offset, mirror->mem, 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name); +} + +/* + * Older ATI/AMD cards like the X550 have a similar window to that above. + * I/O port BAR1 provides a window to a mirror of PCI config space located + * in BAR2 at offset 0xf00. We don't care to support such older cards, but + * note it for future reference. + */ + +#define PCI_VENDOR_ID_NVIDIA 0x10de + +/* + * Nvidia has several different methods to get to config space, the + * nouveu project has several of these documented here: + * https://github.com/pathscale/envytools/tree/master/hwdocs + * + * The first quirk is actually not documented in envytools and is found + * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an + * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access + * the mirror of PCI config space found at BAR0 offset 0x1800. The access + * sequence first writes 0x338 to I/O port 0x3d4. The target offset is + * then written to 0x3d0. Finally 0x538 is written for a read and 0x738 + * is written for a write to 0x3d4. The BAR0 offset is then accessible + * through 0x3d0. This quirk doesn't seem to be necessary on newer cards + * that use the I/O port BAR5 window but it doesn't hurt to leave it. + */ +typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State; +static const char *nv3d0_states[] = { "NONE", "SELECT", + "WINDOW", "READ", "WRITE" }; + +typedef struct VFIONvidia3d0Quirk { + VFIOPCIDevice *vdev; + VFIONvidia3d0State state; + uint32_t offset; +} VFIONvidia3d0Quirk; + +static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIONvidia3d0Quirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + + quirk->state = NONE; + + return vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + addr + 0x14, size); +} + +static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIONvidia3d0Quirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + VFIONvidia3d0State old_state = quirk->state; + + quirk->state = NONE; + + switch (data) { + case 0x338: + if (old_state == NONE) { + quirk->state = SELECT; + trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, + nv3d0_states[quirk->state]); + } + break; + case 0x538: + if (old_state == WINDOW) { + quirk->state = READ; + trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, + nv3d0_states[quirk->state]); + } + break; + case 0x738: + if (old_state == WINDOW) { + quirk->state = WRITE; + trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, + nv3d0_states[quirk->state]); + } + break; + } + + vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + addr + 0x14, data, size); +} + +static const MemoryRegionOps vfio_nvidia_3d4_quirk = { + .read = vfio_nvidia_3d4_quirk_read, + .write = vfio_nvidia_3d4_quirk_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIONvidia3d0Quirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + VFIONvidia3d0State old_state = quirk->state; + uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + addr + 0x10, size); + + quirk->state = NONE; + + if (old_state == READ && + (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { + uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); + + data = vfio_pci_read_config(&vdev->pdev, offset, size); + trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name, + offset, size, data); + } + + return data; +} + +static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIONvidia3d0Quirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + VFIONvidia3d0State old_state = quirk->state; + + quirk->state = NONE; + + if (old_state == SELECT) { + quirk->offset = (uint32_t)data; + quirk->state = WINDOW; + trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name, + nv3d0_states[quirk->state]); + } else if (old_state == WRITE) { + if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) { + uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1); + + vfio_pci_write_config(&vdev->pdev, offset, data, size); + trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name, + offset, data, size); + return; + } + } + + vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], + addr + 0x10, data, size); +} + +static const MemoryRegionOps vfio_nvidia_3d0_quirk = { + .read = vfio_nvidia_3d0_quirk_read, + .write = vfio_nvidia_3d0_quirk_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev) +{ + VFIOQuirk *quirk; + VFIONvidia3d0Quirk *data; + + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || + !vdev->bars[1].region.size) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->data = data = g_malloc0(sizeof(*data)); + quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 2); + quirk->nr_mem = 2; + data->vdev = vdev; + + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk, + data, "vfio-nvidia-3d4-quirk", 2); + memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + 0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]); + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk, + data, "vfio-nvidia-3d0-quirk", 2); + memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, + 0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]); + + QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks, + quirk, next); + + trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name); +} + +/* + * The second quirk is documented in envytools. The I/O port BAR5 is just + * a set of address/data ports to the MMIO BARs. The BAR we care about is + * again BAR0. This backdoor is apparently a bit newer than the one above + * so we need to not only trap 256 bytes @0x1800, but all of PCI config + * space, including extended space is available at the 4k @0x88000. + */ +typedef struct VFIONvidiaBAR5Quirk { + uint32_t master; + uint32_t enable; + MemoryRegion *addr_mem; + MemoryRegion *data_mem; + bool enabled; + VFIOConfigWindowQuirk window; /* last for match data */ +} VFIONvidiaBAR5Quirk; + +static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5) +{ + VFIOPCIDevice *vdev = bar5->window.vdev; + + if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) { + return; + } + + bar5->enabled = !bar5->enabled; + trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name, + bar5->enabled ? "Enable" : "Disable"); + memory_region_set_enabled(bar5->addr_mem, bar5->enabled); + memory_region_set_enabled(bar5->data_mem, bar5->enabled); +} + +static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIONvidiaBAR5Quirk *bar5 = opaque; + VFIOPCIDevice *vdev = bar5->window.vdev; + + return vfio_region_read(&vdev->bars[5].region, addr, size); +} + +static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIONvidiaBAR5Quirk *bar5 = opaque; + VFIOPCIDevice *vdev = bar5->window.vdev; + + vfio_region_write(&vdev->bars[5].region, addr, data, size); + + bar5->master = data; + vfio_nvidia_bar5_enable(bar5); +} + +static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = { + .read = vfio_nvidia_bar5_quirk_master_read, + .write = vfio_nvidia_bar5_quirk_master_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIONvidiaBAR5Quirk *bar5 = opaque; + VFIOPCIDevice *vdev = bar5->window.vdev; + + return vfio_region_read(&vdev->bars[5].region, addr + 4, size); +} + +static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIONvidiaBAR5Quirk *bar5 = opaque; + VFIOPCIDevice *vdev = bar5->window.vdev; + + vfio_region_write(&vdev->bars[5].region, addr + 4, data, size); + + bar5->enable = data; + vfio_nvidia_bar5_enable(bar5); +} + +static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = { + .read = vfio_nvidia_bar5_quirk_enable_read, + .write = vfio_nvidia_bar5_quirk_enable_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIONvidiaBAR5Quirk *bar5; + VFIOConfigWindowQuirk *window; + + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || + !vdev->has_vga || nr != 5) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 4); + quirk->nr_mem = 4; + bar5 = quirk->data = g_malloc0(sizeof(*bar5) + + (sizeof(VFIOConfigWindowMatch) * 2)); + window = &bar5->window; + + window->vdev = vdev; + window->address_offset = 0x8; + window->data_offset = 0xc; + window->nr_matches = 2; + window->matches[0].match = 0x1800; + window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1; + window->matches[1].match = 0x88000; + window->matches[1].mask = PCIE_CONFIG_SPACE_SIZE - 1; + window->bar = nr; + window->addr_mem = bar5->addr_mem = &quirk->mem[0]; + window->data_mem = bar5->data_mem = &quirk->mem[1]; + + memory_region_init_io(window->addr_mem, OBJECT(vdev), + &vfio_generic_window_address_quirk, window, + "vfio-nvidia-bar5-window-address-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + window->address_offset, + window->addr_mem, 1); + memory_region_set_enabled(window->addr_mem, false); + + memory_region_init_io(window->data_mem, OBJECT(vdev), + &vfio_generic_window_data_quirk, window, + "vfio-nvidia-bar5-window-data-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + window->data_offset, + window->data_mem, 1); + memory_region_set_enabled(window->data_mem, false); + + memory_region_init_io(&quirk->mem[2], OBJECT(vdev), + &vfio_nvidia_bar5_quirk_master, bar5, + "vfio-nvidia-bar5-master-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + 0, &quirk->mem[2], 1); + + memory_region_init_io(&quirk->mem[3], OBJECT(vdev), + &vfio_nvidia_bar5_quirk_enable, bar5, + "vfio-nvidia-bar5-enable-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + 4, &quirk->mem[3], 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name); +} + +/* + * Finally, BAR0 itself. We want to redirect any accesses to either + * 0x1800 or 0x88000 through the PCI config space access functions. + */ +static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOConfigMirrorQuirk *mirror = opaque; + VFIOPCIDevice *vdev = mirror->vdev; + PCIDevice *pdev = &vdev->pdev; + + vfio_generic_quirk_mirror_write(opaque, addr, data, size); + + /* + * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the + * MSI capability ID register. Both the ID and next register are + * read-only, so we allow writes covering either of those to real hw. + */ + if ((pdev->cap_present & QEMU_PCI_CAP_MSI) && + vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) { + vfio_region_write(&vdev->bars[mirror->bar].region, + addr + mirror->offset, data, size); + trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name); + } +} + +static const MemoryRegionOps vfio_nvidia_mirror_quirk = { + .read = vfio_generic_quirk_mirror_read, + .write = vfio_nvidia_quirk_mirror_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOConfigMirrorQuirk *mirror; + + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || + !vfio_is_vga(vdev) || nr != 0) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + mirror = quirk->data = g_malloc0(sizeof(*mirror)); + mirror->mem = quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 1); + quirk->nr_mem = 1; + mirror->vdev = vdev; + mirror->offset = 0x88000; + mirror->bar = nr; + + memory_region_init_io(mirror->mem, OBJECT(vdev), + &vfio_nvidia_mirror_quirk, mirror, + "vfio-nvidia-bar0-88000-mirror-quirk", + PCIE_CONFIG_SPACE_SIZE); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + mirror->offset, mirror->mem, 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + /* The 0x1800 offset mirror only seems to get used by legacy VGA */ + if (vdev->has_vga) { + quirk = g_malloc0(sizeof(*quirk)); + mirror = quirk->data = g_malloc0(sizeof(*mirror)); + mirror->mem = quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 1); + quirk->nr_mem = 1; + mirror->vdev = vdev; + mirror->offset = 0x1800; + mirror->bar = nr; + + memory_region_init_io(mirror->mem, OBJECT(vdev), + &vfio_nvidia_mirror_quirk, mirror, + "vfio-nvidia-bar0-1800-mirror-quirk", + PCI_CONFIG_SPACE_SIZE); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + mirror->offset, mirror->mem, 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + } + + trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name); +} + +/* + * TODO - Some Nvidia devices provide config access to their companion HDA + * device and even to their parent bridge via these config space mirrors. + * Add quirks for those regions. + */ + +#define PCI_VENDOR_ID_REALTEK 0x10ec + +/* + * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2 + * offset 0x70 there is a dword data register, offset 0x74 is a dword address + * register. According to the Linux r8169 driver, the MSI-X table is addressed + * when the "type" portion of the address register is set to 0x1. This appears + * to be bits 16:30. Bit 31 is both a write indicator and some sort of + * "address latched" indicator. Bits 12:15 are a mask field, which we can + * ignore because the MSI-X table should always be accessed as a dword (full + * mask). Bits 0:11 is offset within the type. + * + * Example trace: + * + * Read from MSI-X table offset 0 + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data + * + * Write 0xfee00000 to MSI-X table offset 0 + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete + */ +typedef struct VFIOrtl8168Quirk { + VFIOPCIDevice *vdev; + uint32_t addr; + uint32_t data; + bool enabled; +} VFIOrtl8168Quirk; + +static uint64_t vfio_rtl8168_quirk_address_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOrtl8168Quirk *rtl = opaque; + VFIOPCIDevice *vdev = rtl->vdev; + uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size); + + if (rtl->enabled) { + data = rtl->addr ^ 0x80000000U; /* latch/complete */ + trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data); + } + + return data; +} + +static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOrtl8168Quirk *rtl = opaque; + VFIOPCIDevice *vdev = rtl->vdev; + + rtl->enabled = false; + + if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */ + rtl->enabled = true; + rtl->addr = (uint32_t)data; + + if (data & 0x80000000U) { /* Do write */ + if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { + hwaddr offset = data & 0xfff; + uint64_t val = rtl->data; + + trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name, + (uint16_t)offset, val); + + /* Write to the proper guest MSI-X table instead */ + memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, + offset, val, size, + MEMTXATTRS_UNSPECIFIED); + } + return; /* Do not write guest MSI-X data to hardware */ + } + } + + vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size); +} + +static const MemoryRegionOps vfio_rtl_address_quirk = { + .read = vfio_rtl8168_quirk_address_read, + .write = vfio_rtl8168_quirk_address_write, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static uint64_t vfio_rtl8168_quirk_data_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOrtl8168Quirk *rtl = opaque; + VFIOPCIDevice *vdev = rtl->vdev; + uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size); + + if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { + hwaddr offset = rtl->addr & 0xfff; + memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset, + &data, size, MEMTXATTRS_UNSPECIFIED); + trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data); + } + + return data; +} + +static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOrtl8168Quirk *rtl = opaque; + VFIOPCIDevice *vdev = rtl->vdev; + + rtl->data = (uint32_t)data; + + vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size); +} + +static const MemoryRegionOps vfio_rtl_data_quirk = { + .read = vfio_rtl8168_quirk_data_read, + .write = vfio_rtl8168_quirk_data_write, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOrtl8168Quirk *rtl; + + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->mem = g_malloc0_n(sizeof(MemoryRegion), 2); + quirk->nr_mem = 2; + quirk->data = rtl = g_malloc0(sizeof(*rtl)); + rtl->vdev = vdev; + + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_rtl_address_quirk, rtl, + "vfio-rtl8168-window-address-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + 0x74, &quirk->mem[0], 1); + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), + &vfio_rtl_data_quirk, rtl, + "vfio-rtl8168-window-data-quirk", 4); + memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, + 0x70, &quirk->mem[1], 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); +} + +/* + * Common quirk probe entry points. + */ +void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) +{ + vfio_vga_probe_ati_3c3_quirk(vdev); + vfio_vga_probe_nvidia_3d0_quirk(vdev); +} + +void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev) +{ + VFIOQuirk *quirk; + int i, j; + + for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { + QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) { + for (j = 0; j < quirk->nr_mem; j++) { + memory_region_del_subregion(&vdev->vga.region[i].mem, + &quirk->mem[j]); + } + } + } +} + +void vfio_vga_quirk_free(VFIOPCIDevice *vdev) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { + while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) { + VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks); + QLIST_REMOVE(quirk, next); + for (j = 0; j < quirk->nr_mem; j++) { + object_unparent(OBJECT(&quirk->mem[j])); + } + g_free(quirk->mem); + g_free(quirk->data); + g_free(quirk); + } + } +} + +void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) +{ + vfio_probe_ati_bar4_quirk(vdev, nr); + vfio_probe_ati_bar2_quirk(vdev, nr); + vfio_probe_nvidia_bar5_quirk(vdev, nr); + vfio_probe_nvidia_bar0_quirk(vdev, nr); + vfio_probe_rtl8168_bar2_quirk(vdev, nr); +} + +void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + VFIOQuirk *quirk; + int i; + + QLIST_FOREACH(quirk, &bar->quirks, next) { + for (i = 0; i < quirk->nr_mem; i++) { + memory_region_del_subregion(&bar->region.mem, &quirk->mem[i]); + } + } +} + +void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + int i; + + while (!QLIST_EMPTY(&bar->quirks)) { + VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); + QLIST_REMOVE(quirk, next); + for (i = 0; i < quirk->nr_mem; i++) { + object_unparent(OBJECT(&quirk->mem[i])); + } + g_free(quirk->mem); + g_free(quirk->data); + g_free(quirk); + } +} + +/* + * Reset quirks + */ + +/* + * AMD Radeon PCI config reset, based on Linux: + * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running() + * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset + * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc() + * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock() + * IDs: include/drm/drm_pciids.h + * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0 + * + * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the + * hardware that should be fixed on future ASICs. The symptom of this is that + * once the accerlated driver loads, Windows guests will bsod on subsequent + * attmpts to load the driver, such as after VM reset or shutdown/restart. To + * work around this, we do an AMD specific PCI config reset, followed by an SMC + * reset. The PCI config reset only works if SMC firmware is running, so we + * have a dependency on the state of the device as to whether this reset will + * be effective. There are still cases where we won't be able to kick the + * device into working, but this greatly improves the usability overall. The + * config reset magic is relatively common on AMD GPUs, but the setup and SMC + * poking is largely ASIC specific. + */ +static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev) +{ + uint32_t clk, pc_c; + + /* + * Registers 200h and 204h are index and data registers for accessing + * indirect configuration registers within the device. + */ + vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); + clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4); + vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4); + pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4); + + return (!(clk & 1) && (0x20100 <= pc_c)); +} + +/* + * The scope of a config reset is controlled by a mode bit in the misc register + * and a fuse, exposed as a bit in another register. The fuse is the default + * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula + * scope = !(misc ^ fuse), where the resulting scope is defined the same as + * the fuse. A truth table therefore tells us that if misc == fuse, we need + * to flip the value of the bit in the misc register. + */ +static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev) +{ + uint32_t misc, fuse; + bool a, b; + + vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4); + fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4); + b = fuse & 64; + + vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4); + misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4); + a = misc & 2; + + if (a == b) { + vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4); + vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */ + } +} + +static int vfio_radeon_reset(VFIOPCIDevice *vdev) +{ + PCIDevice *pdev = &vdev->pdev; + int i, ret = 0; + uint32_t data; + + /* Defer to a kernel implemented reset */ + if (vdev->vbasedev.reset_works) { + trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name); + return -ENODEV; + } + + /* Enable only memory BAR access */ + vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2); + + /* Reset only works if SMC firmware is loaded and running */ + if (!vfio_radeon_smc_is_running(vdev)) { + ret = -EINVAL; + trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name); + goto out; + } + + /* Make sure only the GFX function is reset */ + vfio_radeon_set_gfx_only_reset(vdev); + + /* AMD PCI config reset */ + vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4); + usleep(100); + + /* Read back the memory size to make sure we're out of reset */ + for (i = 0; i < 100000; i++) { + if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) { + goto reset_smc; + } + usleep(1); + } + + trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name); + +reset_smc: + /* Reset SMC */ + vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4); + data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); + data |= 1; + vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); + + /* Disable SMC clock */ + vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); + data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); + data |= 1; + vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); + + trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name); + +out: + /* Restore PCI command register */ + vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2); + + return ret; +} + +void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev) +{ + switch (vdev->vendor_id) { + case 0x1002: + switch (vdev->device_id) { + /* Bonaire */ + case 0x6649: /* Bonaire [FirePro W5100] */ + case 0x6650: + case 0x6651: + case 0x6658: /* Bonaire XTX [Radeon R7 260X] */ + case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */ + case 0x665d: /* Bonaire [Radeon R7 200 Series] */ + /* Hawaii */ + case 0x67A0: /* Hawaii XT GL [FirePro W9100] */ + case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */ + case 0x67A2: + case 0x67A8: + case 0x67A9: + case 0x67AA: + case 0x67B0: /* Hawaii XT [Radeon R9 290X] */ + case 0x67B1: /* Hawaii PRO [Radeon R9 290] */ + case 0x67B8: + case 0x67B9: + case 0x67BA: + case 0x67BE: + vdev->resetfn = vfio_radeon_reset; + trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name); + break; + } + break; + } +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 73d34b9..dcabb6d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -26,178 +26,18 @@ #include <unistd.h> #include "config.h" -#include "exec/address-spaces.h" -#include "exec/memory.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" -#include "hw/pci/pci.h" -#include "qemu-common.h" #include "qemu/error-report.h" -#include "qemu/event_notifier.h" -#include "qemu/queue.h" #include "qemu/range.h" #include "sysemu/kvm.h" #include "sysemu/sysemu.h" +#include "pci.h" #include "trace.h" -#include "hw/vfio/vfio.h" -#include "hw/vfio/vfio-common.h" - -struct VFIOPCIDevice; - -typedef struct VFIOQuirk { - MemoryRegion mem; - struct VFIOPCIDevice *vdev; - QLIST_ENTRY(VFIOQuirk) next; - struct { - uint32_t base_offset:TARGET_PAGE_BITS; - uint32_t address_offset:TARGET_PAGE_BITS; - uint32_t address_size:3; - uint32_t bar:3; - - uint32_t address_match; - uint32_t address_mask; - - uint32_t address_val:TARGET_PAGE_BITS; - uint32_t data_offset:TARGET_PAGE_BITS; - uint32_t data_size:3; - - uint8_t flags; - uint8_t read_flags; - uint8_t write_flags; - } data; -} VFIOQuirk; - -typedef struct VFIOBAR { - VFIORegion region; - bool ioport; - bool mem64; - QLIST_HEAD(, VFIOQuirk) quirks; -} VFIOBAR; - -typedef struct VFIOVGARegion { - MemoryRegion mem; - off_t offset; - int nr; - QLIST_HEAD(, VFIOQuirk) quirks; -} VFIOVGARegion; - -typedef struct VFIOVGA { - off_t fd_offset; - int fd; - VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS]; -} VFIOVGA; - -typedef struct VFIOINTx { - bool pending; /* interrupt pending */ - bool kvm_accel; /* set when QEMU bypass through KVM enabled */ - uint8_t pin; /* which pin to pull for qemu_set_irq */ - EventNotifier interrupt; /* eventfd triggered on interrupt */ - EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ - PCIINTxRoute route; /* routing info for QEMU bypass */ - uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ - QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ -} VFIOINTx; - -typedef struct VFIOMSIVector { - /* - * Two interrupt paths are configured per vector. The first, is only used - * for interrupts injected via QEMU. This is typically the non-accel path, - * but may also be used when we want QEMU to handle masking and pending - * bits. The KVM path bypasses QEMU and is therefore higher performance, - * but requires masking at the device. virq is used to track the MSI route - * through KVM, thus kvm_interrupt is only available when virq is set to a - * valid (>= 0) value. - */ - EventNotifier interrupt; - EventNotifier kvm_interrupt; - struct VFIOPCIDevice *vdev; /* back pointer to device */ - int virq; - bool use; -} VFIOMSIVector; - -enum { - VFIO_INT_NONE = 0, - VFIO_INT_INTx = 1, - VFIO_INT_MSI = 2, - VFIO_INT_MSIX = 3, -}; - -/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ -typedef struct VFIOMSIXInfo { - uint8_t table_bar; - uint8_t pba_bar; - uint16_t entries; - uint32_t table_offset; - uint32_t pba_offset; - MemoryRegion mmap_mem; - void *mmap; -} VFIOMSIXInfo; - -typedef struct VFIOPCIDevice { - PCIDevice pdev; - VFIODevice vbasedev; - VFIOINTx intx; - unsigned int config_size; - uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */ - off_t config_offset; /* Offset of config space region within device fd */ - unsigned int rom_size; - off_t rom_offset; /* Offset of ROM region within device fd */ - void *rom; - int msi_cap_size; - VFIOMSIVector *msi_vectors; - VFIOMSIXInfo *msix; - int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ - int interrupt; /* Current interrupt type */ - VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ - VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */ - PCIHostDeviceAddress host; - EventNotifier err_notifier; - EventNotifier req_notifier; - int (*resetfn)(struct VFIOPCIDevice *); - uint32_t features; -#define VFIO_FEATURE_ENABLE_VGA_BIT 0 -#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) -#define VFIO_FEATURE_ENABLE_REQ_BIT 1 -#define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT) - int32_t bootindex; - uint8_t pm_cap; - bool has_vga; - bool pci_aer; - bool req_enabled; - bool has_flr; - bool has_pm_reset; - bool rom_read_failed; -} VFIOPCIDevice; - -typedef struct VFIORomBlacklistEntry { - uint16_t vendor_id; - uint16_t device_id; -} VFIORomBlacklistEntry; - -/* - * List of device ids/vendor ids for which to disable - * option rom loading. This avoids the guest hangs during rom - * execution as noticed with the BCM 57810 card for lack of a - * more better way to handle such issues. - * The user can still override by specifying a romfile or - * rombar=1. - * Please see https://bugs.launchpad.net/qemu/+bug/1284874 - * for an analysis of the 57810 card hang. When adding - * a new vendor id/device id combination below, please also add - * your card/environment details and information that could - * help in debugging to the bug tracking this issue - */ -static const VFIORomBlacklistEntry romblacklist[] = { - /* Broadcom BCM 57810 */ - { 0x14e4, 0x168e } -}; #define MSIX_CAP_LENGTH 12 static void vfio_disable_interrupts(VFIOPCIDevice *vdev); -static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); -static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, - uint32_t val, int len); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); /* @@ -247,7 +87,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_eoi(VFIODevice *vbasedev) +static void vfio_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -255,14 +95,14 @@ static void vfio_eoi(VFIODevice *vbasedev) return; } - trace_vfio_eoi(vbasedev->name); + trace_vfio_intx_eoi(vbasedev->name); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); } -static void vfio_enable_intx_kvm(VFIOPCIDevice *vdev) +static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev) { #ifdef CONFIG_KVM struct kvm_irqfd irqfd = { @@ -274,7 +114,7 @@ static void vfio_enable_intx_kvm(VFIOPCIDevice *vdev) int ret, argsz; int32_t *pfd; - if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() || + if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || vdev->intx.route.mode != PCI_INTX_ENABLED || !kvm_resamplefds_enabled()) { return; @@ -324,7 +164,7 @@ static void vfio_enable_intx_kvm(VFIOPCIDevice *vdev) vdev->intx.kvm_accel = true; - trace_vfio_enable_intx_kvm(vdev->vbasedev.name); + trace_vfio_intx_enable_kvm(vdev->vbasedev.name); return; @@ -339,7 +179,7 @@ fail: #endif } -static void vfio_disable_intx_kvm(VFIOPCIDevice *vdev) +static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) { #ifdef CONFIG_KVM struct kvm_irqfd irqfd = { @@ -376,11 +216,11 @@ static void vfio_disable_intx_kvm(VFIOPCIDevice *vdev) /* If we've missed an event, let it re-fire through QEMU */ vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); - trace_vfio_disable_intx_kvm(vdev->vbasedev.name); + trace_vfio_intx_disable_kvm(vdev->vbasedev.name); #endif } -static void vfio_update_irq(PCIDevice *pdev) +static void vfio_intx_update(PCIDevice *pdev) { VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); PCIINTxRoute route; @@ -395,10 +235,10 @@ static void vfio_update_irq(PCIDevice *pdev) return; /* Nothing changed */ } - trace_vfio_update_irq(vdev->vbasedev.name, - vdev->intx.route.irq, route.irq); + trace_vfio_intx_update(vdev->vbasedev.name, + vdev->intx.route.irq, route.irq); - vfio_disable_intx_kvm(vdev); + vfio_intx_disable_kvm(vdev); vdev->intx.route = route; @@ -406,13 +246,13 @@ static void vfio_update_irq(PCIDevice *pdev) return; } - vfio_enable_intx_kvm(vdev); + vfio_intx_enable_kvm(vdev); /* Re-enable the interrupt in cased we missed an EOI */ - vfio_eoi(&vdev->vbasedev); + vfio_intx_eoi(&vdev->vbasedev); } -static int vfio_enable_intx(VFIOPCIDevice *vdev) +static int vfio_intx_enable(VFIOPCIDevice *vdev) { uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); int ret, argsz; @@ -467,21 +307,21 @@ static int vfio_enable_intx(VFIOPCIDevice *vdev) return -errno; } - vfio_enable_intx_kvm(vdev); + vfio_intx_enable_kvm(vdev); vdev->interrupt = VFIO_INT_INTx; - trace_vfio_enable_intx(vdev->vbasedev.name); + trace_vfio_intx_enable(vdev->vbasedev.name); return 0; } -static void vfio_disable_intx(VFIOPCIDevice *vdev) +static void vfio_intx_disable(VFIOPCIDevice *vdev) { int fd; timer_del(vdev->intx.mmap_timer); - vfio_disable_intx_kvm(vdev); + vfio_intx_disable_kvm(vdev); vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); @@ -493,7 +333,7 @@ static void vfio_disable_intx(VFIOPCIDevice *vdev) vdev->interrupt = VFIO_INT_NONE; - trace_vfio_disable_intx(vdev->vbasedev.name); + trace_vfio_intx_disable(vdev->vbasedev.name); } /* @@ -503,33 +343,28 @@ static void vfio_msi_interrupt(void *opaque) { VFIOMSIVector *vector = opaque; VFIOPCIDevice *vdev = vector->vdev; + MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector); + void (*notify)(PCIDevice *dev, unsigned vector); + MSIMessage msg; int nr = vector - vdev->msi_vectors; if (!event_notifier_test_and_clear(&vector->interrupt)) { return; } -#ifdef DEBUG_VFIO - MSIMessage msg; - if (vdev->interrupt == VFIO_INT_MSIX) { - msg = msix_get_message(&vdev->pdev, nr); + get_msg = msix_get_message; + notify = msix_notify; } else if (vdev->interrupt == VFIO_INT_MSI) { - msg = msi_get_message(&vdev->pdev, nr); + get_msg = msi_get_message; + notify = msi_notify; } else { abort(); } + msg = get_msg(&vdev->pdev, nr); trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data); -#endif - - if (vdev->interrupt == VFIO_INT_MSIX) { - msix_notify(&vdev->pdev, nr); - } else if (vdev->interrupt == VFIO_INT_MSI) { - msi_notify(&vdev->pdev, nr); - } else { - error_report("vfio: MSI interrupt receieved, but not enabled?"); - } + notify(&vdev->pdev, nr); } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) @@ -576,13 +411,12 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) return ret; } -static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, - bool msix) +static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + MSIMessage *msg, bool msix) { int virq; - if ((msix && !VFIO_ALLOW_KVM_MSIX) || - (!msix && !VFIO_ALLOW_KVM_MSI) || !msg) { + if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi) || !msg) { return; } @@ -655,7 +489,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vfio_update_kvm_msi_virq(vector, *msg); } } else { - vfio_add_kvm_msi_virq(vector, msg, true); + vfio_add_kvm_msi_virq(vdev, vector, msg, true); } /* @@ -747,7 +581,7 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_enable_msix(VFIOPCIDevice *vdev) +static void vfio_msix_enable(VFIOPCIDevice *vdev) { vfio_disable_interrupts(vdev); @@ -776,10 +610,10 @@ static void vfio_enable_msix(VFIOPCIDevice *vdev) error_report("vfio: msix_set_vector_notifiers failed"); } - trace_vfio_enable_msix(vdev->vbasedev.name); + trace_vfio_msix_enable(vdev->vbasedev.name); } -static void vfio_enable_msi(VFIOPCIDevice *vdev) +static void vfio_msi_enable(VFIOPCIDevice *vdev) { int ret, i; @@ -808,7 +642,7 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vfio_add_kvm_msi_virq(vector, &msg, false); + vfio_add_kvm_msi_virq(vdev, vector, &msg, false); } /* Set interrupt type prior to possible interrupts */ @@ -852,10 +686,10 @@ retry: return; } - trace_vfio_enable_msi(vdev->vbasedev.name, vdev->nr_vectors); + trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors); } -static void vfio_disable_msi_common(VFIOPCIDevice *vdev) +static void vfio_msi_disable_common(VFIOPCIDevice *vdev) { int i; @@ -876,10 +710,10 @@ static void vfio_disable_msi_common(VFIOPCIDevice *vdev) vdev->nr_vectors = 0; vdev->interrupt = VFIO_INT_NONE; - vfio_enable_intx(vdev); + vfio_intx_enable(vdev); } -static void vfio_disable_msix(VFIOPCIDevice *vdev) +static void vfio_msix_disable(VFIOPCIDevice *vdev) { int i; @@ -900,17 +734,17 @@ static void vfio_disable_msix(VFIOPCIDevice *vdev) vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); } - vfio_disable_msi_common(vdev); + vfio_msi_disable_common(vdev); - trace_vfio_disable_msix(vdev->vbasedev.name); + trace_vfio_msix_disable(vdev->vbasedev.name); } -static void vfio_disable_msi(VFIOPCIDevice *vdev) +static void vfio_msi_disable(VFIOPCIDevice *vdev) { vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); - vfio_disable_msi_common(vdev); + vfio_msi_disable_common(vdev); - trace_vfio_disable_msi(vdev->vbasedev.name); + trace_vfio_msi_disable(vdev->vbasedev.name); } static void vfio_update_msi(VFIOPCIDevice *vdev) @@ -1033,26 +867,6 @@ static const MemoryRegionOps vfio_rom_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev) -{ - PCIDevice *pdev = &vdev->pdev; - uint16_t vendor_id, device_id; - int count = 0; - - vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); - device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); - - while (count < ARRAY_SIZE(romblacklist)) { - if (romblacklist[count].vendor_id == vendor_id && - romblacklist[count].device_id == device_id) { - return true; - } - count++; - } - - return false; -} - static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); @@ -1130,7 +944,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) vdev->rom_read_failed = false; } -static void vfio_vga_write(void *opaque, hwaddr addr, +void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { VFIOVGARegion *region = opaque; @@ -1166,7 +980,7 @@ static void vfio_vga_write(void *opaque, hwaddr addr, trace_vfio_vga_write(region->offset + addr, data, size); } -static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size) +uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size) { VFIOVGARegion *region = opaque; VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]); @@ -1212,858 +1026,9 @@ static const MemoryRegionOps vfio_vga_ops = { }; /* - * Device specific quirks - */ - -/* Is range1 fully contained within range2? */ -static bool vfio_range_contained(uint64_t first1, uint64_t len1, - uint64_t first2, uint64_t len2) { - return (first1 >= first2 && first1 + len1 <= first2 + len2); -} - -static bool vfio_flags_enabled(uint8_t flags, uint8_t mask) -{ - return (mask && (flags & mask) == mask); -} - -static uint64_t vfio_generic_window_quirk_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - uint64_t data; - - if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) && - ranges_overlap(addr, size, - quirk->data.data_offset, quirk->data.data_size)) { - hwaddr offset = addr - quirk->data.data_offset; - - if (!vfio_range_contained(addr, size, quirk->data.data_offset, - quirk->data.data_size)) { - hw_error("%s: window data read not fully contained: %s", - __func__, memory_region_name(&quirk->mem)); - } - - data = vfio_pci_read_config(&vdev->pdev, - quirk->data.address_val + offset, size); - - trace_vfio_generic_window_quirk_read(memory_region_name(&quirk->mem), - vdev->vbasedev.name, - quirk->data.bar, - addr, size, data); - } else { - data = vfio_region_read(&vdev->bars[quirk->data.bar].region, - addr + quirk->data.base_offset, size); - } - - return data; -} - -static void vfio_generic_window_quirk_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - - if (ranges_overlap(addr, size, - quirk->data.address_offset, quirk->data.address_size)) { - - if (addr != quirk->data.address_offset) { - hw_error("%s: offset write into address window: %s", - __func__, memory_region_name(&quirk->mem)); - } - - if ((data & ~quirk->data.address_mask) == quirk->data.address_match) { - quirk->data.flags |= quirk->data.write_flags | - quirk->data.read_flags; - quirk->data.address_val = data & quirk->data.address_mask; - } else { - quirk->data.flags &= ~(quirk->data.write_flags | - quirk->data.read_flags); - } - } - - if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) && - ranges_overlap(addr, size, - quirk->data.data_offset, quirk->data.data_size)) { - hwaddr offset = addr - quirk->data.data_offset; - - if (!vfio_range_contained(addr, size, quirk->data.data_offset, - quirk->data.data_size)) { - hw_error("%s: window data write not fully contained: %s", - __func__, memory_region_name(&quirk->mem)); - } - - vfio_pci_write_config(&vdev->pdev, - quirk->data.address_val + offset, data, size); - trace_vfio_generic_window_quirk_write(memory_region_name(&quirk->mem), - vdev->vbasedev.name, - quirk->data.bar, - addr, data, size); - return; - } - - vfio_region_write(&vdev->bars[quirk->data.bar].region, - addr + quirk->data.base_offset, data, size); -} - -static const MemoryRegionOps vfio_generic_window_quirk = { - .read = vfio_generic_window_quirk_read, - .write = vfio_generic_window_quirk_write, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -static uint64_t vfio_generic_quirk_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK; - hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK; - uint64_t data; - - if (vfio_flags_enabled(quirk->data.flags, quirk->data.read_flags) && - ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) { - if (!vfio_range_contained(addr, size, offset, - quirk->data.address_mask + 1)) { - hw_error("%s: read not fully contained: %s", - __func__, memory_region_name(&quirk->mem)); - } - - data = vfio_pci_read_config(&vdev->pdev, addr - offset, size); - - trace_vfio_generic_quirk_read(memory_region_name(&quirk->mem), - vdev->vbasedev.name, quirk->data.bar, - addr + base, size, data); - } else { - data = vfio_region_read(&vdev->bars[quirk->data.bar].region, - addr + base, size); - } - - return data; -} - -static void vfio_generic_quirk_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK; - hwaddr offset = quirk->data.address_match & ~TARGET_PAGE_MASK; - - if (vfio_flags_enabled(quirk->data.flags, quirk->data.write_flags) && - ranges_overlap(addr, size, offset, quirk->data.address_mask + 1)) { - if (!vfio_range_contained(addr, size, offset, - quirk->data.address_mask + 1)) { - hw_error("%s: write not fully contained: %s", - __func__, memory_region_name(&quirk->mem)); - } - - vfio_pci_write_config(&vdev->pdev, addr - offset, data, size); - - trace_vfio_generic_quirk_write(memory_region_name(&quirk->mem), - vdev->vbasedev.name, quirk->data.bar, - addr + base, data, size); - } else { - vfio_region_write(&vdev->bars[quirk->data.bar].region, - addr + base, data, size); - } -} - -static const MemoryRegionOps vfio_generic_quirk = { - .read = vfio_generic_quirk_read, - .write = vfio_generic_quirk_write, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -#define PCI_VENDOR_ID_ATI 0x1002 - -/* - * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR - * through VGA register 0x3c3. On newer cards, the I/O port BAR is always - * BAR4 (older cards like the X550 used BAR1, but we don't care to support - * those). Note that on bare metal, a read of 0x3c3 doesn't always return the - * I/O port BAR address. Originally this was coded to return the virtual BAR - * address only if the physical register read returns the actual BAR address, - * but users have reported greater success if we return the virtual address - * unconditionally. - */ -static uint64_t vfio_ati_3c3_quirk_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - uint64_t data = vfio_pci_read_config(&vdev->pdev, - PCI_BASE_ADDRESS_0 + (4 * 4) + 1, - size); - trace_vfio_ati_3c3_quirk_read(data); - - return data; -} - -static const MemoryRegionOps vfio_ati_3c3_quirk = { - .read = vfio_ati_3c3_quirk_read, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) { - return; - } - - /* - * As long as the BAR is >= 256 bytes it will be aligned such that the - * lower byte is always zero. Filter out anything else, if it exists. - */ - if (!vdev->bars[4].ioport || vdev->bars[4].region.size < 256) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, quirk, - "vfio-ati-3c3-quirk", 1); - memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, - 3 /* offset 3 bytes from 0x3c0 */, &quirk->mem); - - QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks, - quirk, next); - - trace_vfio_vga_probe_ati_3c3_quirk(vdev->vbasedev.name); -} - -/* - * Newer ATI/AMD devices, including HD5450 and HD7850, have a window to PCI - * config space through MMIO BAR2 at offset 0x4000. Nothing seems to access - * the MMIO space directly, but a window to this space is provided through - * I/O port BAR4. Offset 0x0 is the address register and offset 0x4 is the - * data register. When the address is programmed to a range of 0x4000-0x4fff - * PCI configuration space is available. Experimentation seems to indicate - * that only read-only access is provided, but we drop writes when the window - * is enabled to config space nonetheless. - */ -static void vfio_probe_ati_bar4_window_quirk(VFIOPCIDevice *vdev, int nr) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - if (!vdev->has_vga || nr != 4 || - pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.address_size = 4; - quirk->data.data_offset = 4; - quirk->data.data_size = 4; - quirk->data.address_match = 0x4000; - quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1; - quirk->data.bar = nr; - quirk->data.read_flags = quirk->data.write_flags = 1; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), - &vfio_generic_window_quirk, quirk, - "vfio-ati-bar4-window-quirk", 8); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, - quirk->data.base_offset, &quirk->mem, 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - - trace_vfio_probe_ati_bar4_window_quirk(vdev->vbasedev.name); -} - -#define PCI_VENDOR_ID_REALTEK 0x10ec - -/* - * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2 - * offset 0x70 there is a dword data register, offset 0x74 is a dword address - * register. According to the Linux r8169 driver, the MSI-X table is addressed - * when the "type" portion of the address register is set to 0x1. This appears - * to be bits 16:30. Bit 31 is both a write indicator and some sort of - * "address latched" indicator. Bits 12:15 are a mask field, which we can - * ignore because the MSI-X table should always be accessed as a dword (full - * mask). Bits 0:11 is offset within the type. - * - * Example trace: - * - * Read from MSI-X table offset 0 - * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr - * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch - * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data - * - * Write 0xfee00000 to MSI-X table offset 0 - * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data - * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write - * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete - */ - -static uint64_t vfio_rtl8168_window_quirk_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - - switch (addr) { - case 4: /* address */ - if (quirk->data.flags) { - trace_vfio_rtl8168_window_quirk_read_fake( - memory_region_name(&quirk->mem), - vdev->vbasedev.name); - - return quirk->data.address_match ^ 0x80000000U; - } - break; - case 0: /* data */ - if (quirk->data.flags) { - uint64_t val; - - trace_vfio_rtl8168_window_quirk_read_table( - memory_region_name(&quirk->mem), - vdev->vbasedev.name); - - if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { - return 0; - } - - memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, - (hwaddr)(quirk->data.address_match - & 0xfff), - &val, - size, - MEMTXATTRS_UNSPECIFIED); - return val; - } - } - - trace_vfio_rtl8168_window_quirk_read_direct(memory_region_name(&quirk->mem), - vdev->vbasedev.name); - - return vfio_region_read(&vdev->bars[quirk->data.bar].region, - addr + 0x70, size); -} - -static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - - switch (addr) { - case 4: /* address */ - if ((data & 0x7fff0000) == 0x10000) { - if (data & 0x80000000U && - vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { - - trace_vfio_rtl8168_window_quirk_write_table( - memory_region_name(&quirk->mem), - vdev->vbasedev.name); - - memory_region_dispatch_write(&vdev->pdev.msix_table_mmio, - (hwaddr)(data & 0xfff), - (uint64_t)quirk->data.address_mask, - size, MEMTXATTRS_UNSPECIFIED); - } - - quirk->data.flags = 1; - quirk->data.address_match = data; - - return; - } - quirk->data.flags = 0; - break; - case 0: /* data */ - quirk->data.address_mask = data; - break; - } - - trace_vfio_rtl8168_window_quirk_write_direct( - memory_region_name(&quirk->mem), - vdev->vbasedev.name); - - vfio_region_write(&vdev->bars[quirk->data.bar].region, - addr + 0x70, data, size); -} - -static const MemoryRegionOps vfio_rtl8168_window_quirk = { - .read = vfio_rtl8168_window_quirk_read, - .write = vfio_rtl8168_window_quirk_write, - .valid = { - .min_access_size = 4, - .max_access_size = 4, - .unaligned = false, - }, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -static void vfio_probe_rtl8168_bar2_window_quirk(VFIOPCIDevice *vdev, int nr) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK || - pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.bar = nr; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk, - quirk, "vfio-rtl8168-window-quirk", 8); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, - 0x70, &quirk->mem, 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - - trace_vfio_probe_rtl8168_bar2_window_quirk(vdev->vbasedev.name); -} -/* - * Trap the BAR2 MMIO window to config space as well. - */ -static void vfio_probe_ati_bar2_4000_quirk(VFIOPCIDevice *vdev, int nr) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - /* Only enable on newer devices where BAR2 is 64bit */ - if (!vdev->has_vga || nr != 2 || !vdev->bars[2].mem64 || - pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1; - quirk->data.address_match = 0x4000; - quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1; - quirk->data.bar = nr; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk, - "vfio-ati-bar2-4000-quirk", - TARGET_PAGE_ALIGN(quirk->data.address_mask + 1)); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, - quirk->data.address_match & TARGET_PAGE_MASK, - &quirk->mem, 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - - trace_vfio_probe_ati_bar2_4000_quirk(vdev->vbasedev.name); -} - -/* - * Older ATI/AMD cards like the X550 have a similar window to that above. - * I/O port BAR1 provides a window to a mirror of PCI config space located - * in BAR2 at offset 0xf00. We don't care to support such older cards, but - * note it for future reference. - */ - -#define PCI_VENDOR_ID_NVIDIA 0x10de - -/* - * Nvidia has several different methods to get to config space, the - * nouveu project has several of these documented here: - * https://github.com/pathscale/envytools/tree/master/hwdocs - * - * The first quirk is actually not documented in envytools and is found - * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]). This is an - * NV46 chipset. The backdoor uses the legacy VGA I/O ports to access - * the mirror of PCI config space found at BAR0 offset 0x1800. The access - * sequence first writes 0x338 to I/O port 0x3d4. The target offset is - * then written to 0x3d0. Finally 0x538 is written for a read and 0x738 - * is written for a write to 0x3d4. The BAR0 offset is then accessible - * through 0x3d0. This quirk doesn't seem to be necessary on newer cards - * that use the I/O port BAR5 window but it doesn't hurt to leave it. - */ -enum { - NV_3D0_NONE = 0, - NV_3D0_SELECT, - NV_3D0_WINDOW, - NV_3D0_READ, - NV_3D0_WRITE, -}; - -static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - PCIDevice *pdev = &vdev->pdev; - uint64_t data = vfio_vga_read(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], - addr + quirk->data.base_offset, size); - - if (quirk->data.flags == NV_3D0_READ && addr == quirk->data.data_offset) { - data = vfio_pci_read_config(pdev, quirk->data.address_val, size); - trace_vfio_nvidia_3d0_quirk_read(size, data); - } - - quirk->data.flags = NV_3D0_NONE; - - return data; -} - -static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - PCIDevice *pdev = &vdev->pdev; - - switch (quirk->data.flags) { - case NV_3D0_NONE: - if (addr == quirk->data.address_offset && data == 0x338) { - quirk->data.flags = NV_3D0_SELECT; - } - break; - case NV_3D0_SELECT: - quirk->data.flags = NV_3D0_NONE; - if (addr == quirk->data.data_offset && - (data & ~quirk->data.address_mask) == quirk->data.address_match) { - quirk->data.flags = NV_3D0_WINDOW; - quirk->data.address_val = data & quirk->data.address_mask; - } - break; - case NV_3D0_WINDOW: - quirk->data.flags = NV_3D0_NONE; - if (addr == quirk->data.address_offset) { - if (data == 0x538) { - quirk->data.flags = NV_3D0_READ; - } else if (data == 0x738) { - quirk->data.flags = NV_3D0_WRITE; - } - } - break; - case NV_3D0_WRITE: - quirk->data.flags = NV_3D0_NONE; - if (addr == quirk->data.data_offset) { - vfio_pci_write_config(pdev, quirk->data.address_val, data, size); - trace_vfio_nvidia_3d0_quirk_write(data, size); - return; - } - break; - } - - vfio_vga_write(&vdev->vga.region[QEMU_PCI_VGA_IO_HI], - addr + quirk->data.base_offset, data, size); -} - -static const MemoryRegionOps vfio_nvidia_3d0_quirk = { - .read = vfio_nvidia_3d0_quirk_read, - .write = vfio_nvidia_3d0_quirk_write, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA || - !vdev->bars[1].region.size) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.base_offset = 0x10; - quirk->data.address_offset = 4; - quirk->data.address_size = 2; - quirk->data.address_match = 0x1800; - quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1; - quirk->data.data_offset = 0; - quirk->data.data_size = 4; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_3d0_quirk, - quirk, "vfio-nvidia-3d0-quirk", 6); - memory_region_add_subregion(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem, - quirk->data.base_offset, &quirk->mem); - - QLIST_INSERT_HEAD(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks, - quirk, next); - - trace_vfio_vga_probe_nvidia_3d0_quirk(vdev->vbasedev.name); -} - -/* - * The second quirk is documented in envytools. The I/O port BAR5 is just - * a set of address/data ports to the MMIO BARs. The BAR we care about is - * again BAR0. This backdoor is apparently a bit newer than the one above - * so we need to not only trap 256 bytes @0x1800, but all of PCI config - * space, including extended space is available at the 4k @0x88000. - */ -enum { - NV_BAR5_ADDRESS = 0x1, - NV_BAR5_ENABLE = 0x2, - NV_BAR5_MASTER = 0x4, - NV_BAR5_VALID = 0x7, -}; - -static void vfio_nvidia_bar5_window_quirk_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOQuirk *quirk = opaque; - - switch (addr) { - case 0x0: - if (data & 0x1) { - quirk->data.flags |= NV_BAR5_MASTER; - } else { - quirk->data.flags &= ~NV_BAR5_MASTER; - } - break; - case 0x4: - if (data & 0x1) { - quirk->data.flags |= NV_BAR5_ENABLE; - } else { - quirk->data.flags &= ~NV_BAR5_ENABLE; - } - break; - case 0x8: - if (quirk->data.flags & NV_BAR5_MASTER) { - if ((data & ~0xfff) == 0x88000) { - quirk->data.flags |= NV_BAR5_ADDRESS; - quirk->data.address_val = data & 0xfff; - } else if ((data & ~0xff) == 0x1800) { - quirk->data.flags |= NV_BAR5_ADDRESS; - quirk->data.address_val = data & 0xff; - } else { - quirk->data.flags &= ~NV_BAR5_ADDRESS; - } - } - break; - } - - vfio_generic_window_quirk_write(opaque, addr, data, size); -} - -static const MemoryRegionOps vfio_nvidia_bar5_window_quirk = { - .read = vfio_generic_window_quirk_read, - .write = vfio_nvidia_bar5_window_quirk_write, - .valid.min_access_size = 4, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -static void vfio_probe_nvidia_bar5_window_quirk(VFIOPCIDevice *vdev, int nr) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - if (!vdev->has_vga || nr != 5 || - pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.read_flags = quirk->data.write_flags = NV_BAR5_VALID; - quirk->data.address_offset = 0x8; - quirk->data.address_size = 0; /* actually 4, but avoids generic code */ - quirk->data.data_offset = 0xc; - quirk->data.data_size = 4; - quirk->data.bar = nr; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), - &vfio_nvidia_bar5_window_quirk, quirk, - "vfio-nvidia-bar5-window-quirk", 16); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, - 0, &quirk->mem, 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - - trace_vfio_probe_nvidia_bar5_window_quirk(vdev->vbasedev.name); -} - -static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOQuirk *quirk = opaque; - VFIOPCIDevice *vdev = quirk->vdev; - PCIDevice *pdev = &vdev->pdev; - hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK; - - vfio_generic_quirk_write(opaque, addr, data, size); - - /* - * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the - * MSI capability ID register. Both the ID and next register are - * read-only, so we allow writes covering either of those to real hw. - * NB - only fixed for the 0x88000 MMIO window. - */ - if ((pdev->cap_present & QEMU_PCI_CAP_MSI) && - vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) { - vfio_region_write(&vdev->bars[quirk->data.bar].region, - addr + base, data, size); - } -} - -static const MemoryRegionOps vfio_nvidia_88000_quirk = { - .read = vfio_generic_quirk_read, - .write = vfio_nvidia_88000_quirk_write, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -/* - * Finally, BAR0 itself. We want to redirect any accesses to either - * 0x1800 or 0x88000 through the PCI config space access functions. - * - * NB - quirk at a page granularity or else they don't seem to work when - * BARs are mmap'd - * - * Here's offset 0x88000... - */ -static void vfio_probe_nvidia_bar0_88000_quirk(VFIOPCIDevice *vdev, int nr) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - uint16_t vendor, class; - - vendor = pci_get_word(pdev->config + PCI_VENDOR_ID); - class = pci_get_word(pdev->config + PCI_CLASS_DEVICE); - - if (nr != 0 || vendor != PCI_VENDOR_ID_NVIDIA || - class != PCI_CLASS_DISPLAY_VGA) { - return; - } - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1; - quirk->data.address_match = 0x88000; - quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1; - quirk->data.bar = nr; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk, - quirk, "vfio-nvidia-bar0-88000-quirk", - TARGET_PAGE_ALIGN(quirk->data.address_mask + 1)); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, - quirk->data.address_match & TARGET_PAGE_MASK, - &quirk->mem, 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - - trace_vfio_probe_nvidia_bar0_88000_quirk(vdev->vbasedev.name); -} - -/* - * And here's the same for BAR0 offset 0x1800... - */ -static void vfio_probe_nvidia_bar0_1800_quirk(VFIOPCIDevice *vdev, int nr) -{ - PCIDevice *pdev = &vdev->pdev; - VFIOQuirk *quirk; - - if (!vdev->has_vga || nr != 0 || - pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_NVIDIA) { - return; - } - - /* Log the chipset ID */ - trace_vfio_probe_nvidia_bar0_1800_quirk_id( - (unsigned int)(vfio_region_read(&vdev->bars[0].region, 0, 4) >> 20) - & 0xff); - - quirk = g_malloc0(sizeof(*quirk)); - quirk->vdev = vdev; - quirk->data.flags = quirk->data.read_flags = quirk->data.write_flags = 1; - quirk->data.address_match = 0x1800; - quirk->data.address_mask = PCI_CONFIG_SPACE_SIZE - 1; - quirk->data.bar = nr; - - memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk, quirk, - "vfio-nvidia-bar0-1800-quirk", - TARGET_PAGE_ALIGN(quirk->data.address_mask + 1)); - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, - quirk->data.address_match & TARGET_PAGE_MASK, - &quirk->mem, 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - - trace_vfio_probe_nvidia_bar0_1800_quirk(vdev->vbasedev.name); -} - -/* - * TODO - Some Nvidia devices provide config access to their companion HDA - * device and even to their parent bridge via these config space mirrors. - * Add quirks for those regions. - */ - -/* - * Common quirk probe entry points. - */ -static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) -{ - vfio_vga_probe_ati_3c3_quirk(vdev); - vfio_vga_probe_nvidia_3d0_quirk(vdev); -} - -static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev) -{ - VFIOQuirk *quirk; - int i; - - for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { - QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) { - memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem); - } - } -} - -static void vfio_vga_quirk_free(VFIOPCIDevice *vdev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { - while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) { - VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks); - object_unparent(OBJECT(&quirk->mem)); - QLIST_REMOVE(quirk, next); - g_free(quirk); - } - } -} - -static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) -{ - vfio_probe_ati_bar4_window_quirk(vdev, nr); - vfio_probe_ati_bar2_4000_quirk(vdev, nr); - vfio_probe_nvidia_bar5_window_quirk(vdev, nr); - vfio_probe_nvidia_bar0_88000_quirk(vdev, nr); - vfio_probe_nvidia_bar0_1800_quirk(vdev, nr); - vfio_probe_rtl8168_bar2_window_quirk(vdev, nr); -} - -static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr) -{ - VFIOBAR *bar = &vdev->bars[nr]; - VFIOQuirk *quirk; - - QLIST_FOREACH(quirk, &bar->quirks, next) { - memory_region_del_subregion(&bar->region.mem, &quirk->mem); - } -} - -static void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr) -{ - VFIOBAR *bar = &vdev->bars[nr]; - - while (!QLIST_EMPTY(&bar->quirks)) { - VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); - object_unparent(OBJECT(&quirk->mem)); - QLIST_REMOVE(quirk, next); - g_free(quirk); - } -} - -/* * PCI config space */ -static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) +uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; @@ -2096,8 +1061,8 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) return val; } -static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, - uint32_t val, int len) +void vfio_pci_write_config(PCIDevice *pdev, + uint32_t addr, uint32_t val, int len) { VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); uint32_t val_le = cpu_to_le32(val); @@ -2123,11 +1088,11 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, if (!was_enabled) { if (is_enabled) { - vfio_enable_msi(vdev); + vfio_msi_enable(vdev); } } else { if (!is_enabled) { - vfio_disable_msi(vdev); + vfio_msi_disable(vdev); } else { vfio_update_msi(vdev); } @@ -2141,9 +1106,9 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, is_enabled = msix_enabled(pdev); if (!was_enabled && is_enabled) { - vfio_enable_msix(vdev); + vfio_msix_enable(vdev); } else if (was_enabled && !is_enabled) { - vfio_disable_msix(vdev); + vfio_msix_disable(vdev); } } else { /* Write everything to QEMU to keep emulated bits correct */ @@ -2162,17 +1127,17 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev) * disable MSI/X and then cleanup by disabling INTx. */ if (vdev->interrupt == VFIO_INT_MSIX) { - vfio_disable_msix(vdev); + vfio_msix_disable(vdev); } else if (vdev->interrupt == VFIO_INT_MSI) { - vfio_disable_msi(vdev); + vfio_msi_disable(vdev); } if (vdev->interrupt == VFIO_INT_INTx) { - vfio_disable_intx(vdev); + vfio_intx_disable(vdev); } } -static int vfio_setup_msi(VFIOPCIDevice *vdev, int pos) +static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos) { uint16_t ctrl; bool msi_64bit, msi_maskbit; @@ -2188,7 +1153,7 @@ static int vfio_setup_msi(VFIOPCIDevice *vdev, int pos) msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT); entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1); - trace_vfio_setup_msi(vdev->vbasedev.name, pos); + trace_vfio_msi_setup(vdev->vbasedev.name, pos); ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit); if (ret < 0) { @@ -2211,12 +1176,13 @@ static int vfio_setup_msi(VFIOPCIDevice *vdev, int pos) * need to first look for where the MSI-X table lives. So we * unfortunately split MSI-X setup across two functions. */ -static int vfio_early_setup_msix(VFIOPCIDevice *vdev) +static int vfio_msix_early_setup(VFIOPCIDevice *vdev) { uint8_t pos; uint16_t ctrl; uint32_t table, pba; int fd = vdev->vbasedev.fd; + VFIOMSIXInfo *msix; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); if (!pos) { @@ -2242,49 +1208,44 @@ static int vfio_early_setup_msix(VFIOPCIDevice *vdev) table = le32_to_cpu(table); pba = le32_to_cpu(pba); - vdev->msix = g_malloc0(sizeof(*(vdev->msix))); - vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; - vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; - vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; - vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; - vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + msix = g_malloc0(sizeof(*msix)); + msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK; + msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK; + msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK; + msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; + msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; /* * Test the size of the pba_offset variable and catch if it extends outside * of the specified BAR. If it is the case, we need to apply a hardware * specific quirk if the device is known or we have a broken configuration. */ - if (vdev->msix->pba_offset >= - vdev->bars[vdev->msix->pba_bar].region.size) { - - PCIDevice *pdev = &vdev->pdev; - uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID); - uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID); - + if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) { /* * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5 * adapters. The T5 hardware returns an incorrect value of 0x8000 for * the VF PBA offset while the BAR itself is only 8k. The correct value * is 0x1000, so we hard code that here. */ - if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) { - vdev->msix->pba_offset = 0x1000; + if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO && + (vdev->device_id & 0xff00) == 0x5800) { + msix->pba_offset = 0x1000; } else { error_report("vfio: Hardware reports invalid configuration, " "MSIX PBA outside of specified BAR"); + g_free(msix); return -EINVAL; } } - trace_vfio_early_setup_msix(vdev->vbasedev.name, pos, - vdev->msix->table_bar, - vdev->msix->table_offset, - vdev->msix->entries); + trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, + msix->table_offset, msix->entries); + vdev->msix = msix; return 0; } -static int vfio_setup_msix(VFIOPCIDevice *vdev, int pos) +static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos) { int ret; @@ -2707,14 +1668,14 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) switch (cap_id) { case PCI_CAP_ID_MSI: - ret = vfio_setup_msi(vdev, pos); + ret = vfio_msi_setup(vdev, pos); break; case PCI_CAP_ID_EXP: vfio_check_pcie_flr(vdev, pos); ret = vfio_setup_pcie_cap(vdev, pos, size); break; case PCI_CAP_ID_MSIX: - ret = vfio_setup_msix(vdev, pos); + ret = vfio_msix_setup(vdev, pos); break; case PCI_CAP_ID_PM: vfio_check_pm_reset(vdev, pos); @@ -2792,7 +1753,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) static void vfio_pci_post_reset(VFIOPCIDevice *vdev) { - vfio_enable_intx(vdev); + vfio_intx_enable(vdev); } static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, @@ -3016,7 +1977,7 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_eoi, + .vfio_eoi = vfio_intx_eoi, }; static int vfio_populate_device(VFIOPCIDevice *vdev) @@ -3351,162 +2312,6 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) vdev->req_enabled = false; } -/* - * AMD Radeon PCI config reset, based on Linux: - * drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running() - * drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset - * drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc() - * drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock() - * IDs: include/drm/drm_pciids.h - * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0 - * - * Bonaire and Hawaii GPUs do not respond to a bus reset. This is a bug in the - * hardware that should be fixed on future ASICs. The symptom of this is that - * once the accerlated driver loads, Windows guests will bsod on subsequent - * attmpts to load the driver, such as after VM reset or shutdown/restart. To - * work around this, we do an AMD specific PCI config reset, followed by an SMC - * reset. The PCI config reset only works if SMC firmware is running, so we - * have a dependency on the state of the device as to whether this reset will - * be effective. There are still cases where we won't be able to kick the - * device into working, but this greatly improves the usability overall. The - * config reset magic is relatively common on AMD GPUs, but the setup and SMC - * poking is largely ASIC specific. - */ -static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev) -{ - uint32_t clk, pc_c; - - /* - * Registers 200h and 204h are index and data registers for accessing - * indirect configuration registers within the device. - */ - vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); - clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4); - vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4); - pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4); - - return (!(clk & 1) && (0x20100 <= pc_c)); -} - -/* - * The scope of a config reset is controlled by a mode bit in the misc register - * and a fuse, exposed as a bit in another register. The fuse is the default - * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the forumula - * scope = !(misc ^ fuse), where the resulting scope is defined the same as - * the fuse. A truth table therefore tells us that if misc == fuse, we need - * to flip the value of the bit in the misc register. - */ -static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev) -{ - uint32_t misc, fuse; - bool a, b; - - vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4); - fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4); - b = fuse & 64; - - vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4); - misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4); - a = misc & 2; - - if (a == b) { - vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4); - vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */ - } -} - -static int vfio_radeon_reset(VFIOPCIDevice *vdev) -{ - PCIDevice *pdev = &vdev->pdev; - int i, ret = 0; - uint32_t data; - - /* Defer to a kernel implemented reset */ - if (vdev->vbasedev.reset_works) { - return -ENODEV; - } - - /* Enable only memory BAR access */ - vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2); - - /* Reset only works if SMC firmware is loaded and running */ - if (!vfio_radeon_smc_is_running(vdev)) { - ret = -EINVAL; - goto out; - } - - /* Make sure only the GFX function is reset */ - vfio_radeon_set_gfx_only_reset(vdev); - - /* AMD PCI config reset */ - vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4); - usleep(100); - - /* Read back the memory size to make sure we're out of reset */ - for (i = 0; i < 100000; i++) { - if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) { - break; - } - usleep(1); - } - - /* Reset SMC */ - vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4); - data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); - data |= 1; - vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); - - /* Disable SMC clock */ - vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4); - data = vfio_region_read(&vdev->bars[5].region, 0x204, 4); - data |= 1; - vfio_region_write(&vdev->bars[5].region, 0x204, data, 4); - -out: - /* Restore PCI command register */ - vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2); - - return ret; -} - -static void vfio_setup_resetfn(VFIOPCIDevice *vdev) -{ - PCIDevice *pdev = &vdev->pdev; - uint16_t vendor, device; - - vendor = pci_get_word(pdev->config + PCI_VENDOR_ID); - device = pci_get_word(pdev->config + PCI_DEVICE_ID); - - switch (vendor) { - case 0x1002: - switch (device) { - /* Bonaire */ - case 0x6649: /* Bonaire [FirePro W5100] */ - case 0x6650: - case 0x6651: - case 0x6658: /* Bonaire XTX [Radeon R7 260X] */ - case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */ - case 0x665d: /* Bonaire [Radeon R7 200 Series] */ - /* Hawaii */ - case 0x67A0: /* Hawaii XT GL [FirePro W9100] */ - case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */ - case 0x67A2: - case 0x67A8: - case 0x67A9: - case 0x67AA: - case 0x67B0: /* Hawaii XT [Radeon R9 290X] */ - case 0x67B1: /* Hawaii PRO [Radeon R9 290] */ - case 0x67B8: - case 0x67B9: - case 0x67BA: - case 0x67BE: - vdev->resetfn = vfio_radeon_reset; - break; - } - break; - } -} - static int vfio_initfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); @@ -3599,6 +2404,54 @@ static int vfio_initfn(PCIDevice *pdev) /* QEMU can choose to expose the ROM or not */ memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4); + /* + * The PCI spec reserves vendor ID 0xffff as an invalid value. The + * device ID is managed by the vendor and need only be a 16-bit value. + * Allow any 16-bit value for subsystem so they can be hidden or changed. + */ + if (vdev->vendor_id != PCI_ANY_ID) { + if (vdev->vendor_id >= 0xffff) { + error_report("vfio: Invalid PCI vendor ID provided"); + return -EINVAL; + } + vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); + trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id); + } else { + vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); + } + + if (vdev->device_id != PCI_ANY_ID) { + if (vdev->device_id > 0xffff) { + error_report("vfio: Invalid PCI device ID provided"); + return -EINVAL; + } + vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); + trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id); + } else { + vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); + } + + if (vdev->sub_vendor_id != PCI_ANY_ID) { + if (vdev->sub_vendor_id > 0xffff) { + error_report("vfio: Invalid PCI subsystem vendor ID provided"); + return -EINVAL; + } + vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, + vdev->sub_vendor_id, ~0); + trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name, + vdev->sub_vendor_id); + } + + if (vdev->sub_device_id != PCI_ANY_ID) { + if (vdev->sub_device_id > 0xffff) { + error_report("vfio: Invalid PCI subsystem device ID provided"); + return -EINVAL; + } + vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); + trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name, + vdev->sub_device_id); + } + /* QEMU can change multi-function devices to single function, or reverse */ vdev->emulated_config_bits[PCI_HEADER_TYPE] = PCI_HEADER_TYPE_MULTI_FUNCTION; @@ -3620,7 +2473,7 @@ static int vfio_initfn(PCIDevice *pdev) vfio_pci_size_rom(vdev); - ret = vfio_early_setup_msix(vdev); + ret = vfio_msix_early_setup(vdev); if (ret) { return ret; } @@ -3646,8 +2499,8 @@ static int vfio_initfn(PCIDevice *pdev) if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) { vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, vfio_intx_mmap_enable, vdev); - pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq); - ret = vfio_enable_intx(vdev); + pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update); + ret = vfio_intx_enable(vdev); if (ret) { goto out_teardown; } @@ -3655,7 +2508,7 @@ static int vfio_initfn(PCIDevice *pdev) vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); - vfio_setup_resetfn(vdev); + vfio_setup_resetfn_quirk(vdev); return 0; @@ -3748,7 +2601,16 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_VGA_BIT, false), DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), - DEFINE_PROP_BOOL("x-mmap", VFIOPCIDevice, vbasedev.allow_mmap, true), + DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), + DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false), + DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false), + DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false), + DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice, + sub_vendor_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, + sub_device_id, PCI_ANY_ID), /* * TODO - support passed fds... is this necessary? * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h new file mode 100644 index 0000000..f004d52 --- /dev/null +++ b/hw/vfio/pci.h @@ -0,0 +1,159 @@ +/* + * vfio based device assignment support - PCI devices + * + * Copyright Red Hat, Inc. 2012-2015 + * + * Authors: + * Alex Williamson <alex.williamson@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#ifndef HW_VFIO_VFIO_PCI_H +#define HW_VFIO_VFIO_PCI_H + +#include "qemu-common.h" +#include "exec/memory.h" +#include "hw/pci/pci.h" +#include "hw/vfio/vfio-common.h" +#include "qemu/event_notifier.h" +#include "qemu/queue.h" +#include "qemu/timer.h" + +#define PCI_ANY_ID (~0) + +struct VFIOPCIDevice; + +typedef struct VFIOQuirk { + QLIST_ENTRY(VFIOQuirk) next; + void *data; + int nr_mem; + MemoryRegion *mem; +} VFIOQuirk; + +typedef struct VFIOBAR { + VFIORegion region; + bool ioport; + bool mem64; + QLIST_HEAD(, VFIOQuirk) quirks; +} VFIOBAR; + +typedef struct VFIOVGARegion { + MemoryRegion mem; + off_t offset; + int nr; + QLIST_HEAD(, VFIOQuirk) quirks; +} VFIOVGARegion; + +typedef struct VFIOVGA { + off_t fd_offset; + int fd; + VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS]; +} VFIOVGA; + +typedef struct VFIOINTx { + bool pending; /* interrupt pending */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ + uint8_t pin; /* which pin to pull for qemu_set_irq */ + EventNotifier interrupt; /* eventfd triggered on interrupt */ + EventNotifier unmask; /* eventfd for unmask on QEMU bypass */ + PCIINTxRoute route; /* routing info for QEMU bypass */ + uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ + QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */ +} VFIOINTx; + +typedef struct VFIOMSIVector { + /* + * Two interrupt paths are configured per vector. The first, is only used + * for interrupts injected via QEMU. This is typically the non-accel path, + * but may also be used when we want QEMU to handle masking and pending + * bits. The KVM path bypasses QEMU and is therefore higher performance, + * but requires masking at the device. virq is used to track the MSI route + * through KVM, thus kvm_interrupt is only available when virq is set to a + * valid (>= 0) value. + */ + EventNotifier interrupt; + EventNotifier kvm_interrupt; + struct VFIOPCIDevice *vdev; /* back pointer to device */ + int virq; + bool use; +} VFIOMSIVector; + +enum { + VFIO_INT_NONE = 0, + VFIO_INT_INTx = 1, + VFIO_INT_MSI = 2, + VFIO_INT_MSIX = 3, +}; + +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ +typedef struct VFIOMSIXInfo { + uint8_t table_bar; + uint8_t pba_bar; + uint16_t entries; + uint32_t table_offset; + uint32_t pba_offset; + MemoryRegion mmap_mem; + void *mmap; +} VFIOMSIXInfo; + +typedef struct VFIOPCIDevice { + PCIDevice pdev; + VFIODevice vbasedev; + VFIOINTx intx; + unsigned int config_size; + uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */ + off_t config_offset; /* Offset of config space region within device fd */ + unsigned int rom_size; + off_t rom_offset; /* Offset of ROM region within device fd */ + void *rom; + int msi_cap_size; + VFIOMSIVector *msi_vectors; + VFIOMSIXInfo *msix; + int nr_vectors; /* Number of MSI/MSIX vectors currently in use */ + int interrupt; /* Current interrupt type */ + VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */ + VFIOVGA vga; /* 0xa0000, 0x3b0, 0x3c0 */ + PCIHostDeviceAddress host; + EventNotifier err_notifier; + EventNotifier req_notifier; + int (*resetfn)(struct VFIOPCIDevice *); + uint32_t vendor_id; + uint32_t device_id; + uint32_t sub_vendor_id; + uint32_t sub_device_id; + uint32_t features; +#define VFIO_FEATURE_ENABLE_VGA_BIT 0 +#define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) +#define VFIO_FEATURE_ENABLE_REQ_BIT 1 +#define VFIO_FEATURE_ENABLE_REQ (1 << VFIO_FEATURE_ENABLE_REQ_BIT) + int32_t bootindex; + uint8_t pm_cap; + bool has_vga; + bool pci_aer; + bool req_enabled; + bool has_flr; + bool has_pm_reset; + bool rom_read_failed; + bool no_kvm_intx; + bool no_kvm_msi; + bool no_kvm_msix; +} VFIOPCIDevice; + +uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); +void vfio_pci_write_config(PCIDevice *pdev, + uint32_t addr, uint32_t val, int len); + +uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size); +void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); + +bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev); +void vfio_vga_quirk_setup(VFIOPCIDevice *vdev); +void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev); +void vfio_vga_quirk_free(VFIOPCIDevice *vdev); +void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr); +void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr); +void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr); +void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev); + +#endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index de4ec52..a6726cd 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -678,7 +678,7 @@ static const VMStateDescription vfio_platform_vmstate = { static Property vfio_platform_dev_properties[] = { DEFINE_PROP_STRING("host", VFIOPlatformDevice, vbasedev.name), - DEFINE_PROP_BOOL("x-mmap", VFIOPlatformDevice, vbasedev.allow_mmap, true), + DEFINE_PROP_BOOL("x-no-mmap", VFIOPlatformDevice, vbasedev.no_mmap, false), DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index 2b431e6..c7a03d4 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -384,6 +384,15 @@ struct AcpiMadtGenericMsiFrame { typedef struct AcpiMadtGenericMsiFrame AcpiMadtGenericMsiFrame; +struct AcpiMadtGenericRedistributor { + ACPI_SUB_HEADER_DEF + uint16_t reserved; + uint64_t base_address; + uint32_t range_length; +} QEMU_PACKED; + +typedef struct AcpiMadtGenericRedistributor AcpiMadtGenericRedistributor; + /* * Generic Timer Description Table (GTDT) */ diff --git a/include/hw/arm/virt-acpi-build.h b/include/hw/arm/virt-acpi-build.h index 19b68a4..744b666 100644 --- a/include/hw/arm/virt-acpi-build.h +++ b/include/hw/arm/virt-acpi-build.h @@ -32,6 +32,7 @@ typedef struct VirtGuestInfo { const MemMapEntry *memmap; const int *irqmap; bool use_highmem; + int gic_version; } VirtGuestInfo; diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 808753f..f464586 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -46,6 +46,9 @@ enum { VIRT_CPUPERIPHS, VIRT_GIC_DIST, VIRT_GIC_CPU, + VIRT_GIC_V2M, + VIRT_GIC_ITS, + VIRT_GIC_REDIST, VIRT_UART, VIRT_MMIO, VIRT_RTC, @@ -54,7 +57,6 @@ enum { VIRT_PCIE_MMIO, VIRT_PCIE_PIO, VIRT_PCIE_ECAM, - VIRT_GIC_V2M, VIRT_PLATFORM_BUS, VIRT_PCIE_MMIO_HIGH, }; diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h new file mode 100644 index 0000000..c2fd8da --- /dev/null +++ b/include/hw/intc/arm_gicv3_common.h @@ -0,0 +1,68 @@ +/* + * ARM GIC support + * + * Copyright (c) 2012 Linaro Limited + * Copyright (c) 2015 Huawei. + * Written by Peter Maydell + * Extended to 64 cores by Shlomo Pongratz + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef HW_ARM_GICV3_COMMON_H +#define HW_ARM_GICV3_COMMON_H + +#include "hw/sysbus.h" +#include "hw/intc/arm_gic_common.h" + +typedef struct GICv3State { + /*< private >*/ + SysBusDevice parent_obj; + /*< public >*/ + + qemu_irq *parent_irq; + qemu_irq *parent_fiq; + + MemoryRegion iomem_dist; /* Distributor */ + MemoryRegion iomem_redist; /* Redistributors */ + + uint32_t num_cpu; + uint32_t num_irq; + uint32_t revision; + bool security_extn; + + int dev_fd; /* kvm device fd if backed by kvm vgic support */ +} GICv3State; + +#define TYPE_ARM_GICV3_COMMON "arm-gicv3-common" +#define ARM_GICV3_COMMON(obj) \ + OBJECT_CHECK(GICv3State, (obj), TYPE_ARM_GICV3_COMMON) +#define ARM_GICV3_COMMON_CLASS(klass) \ + OBJECT_CLASS_CHECK(ARMGICv3CommonClass, (klass), TYPE_ARM_GICV3_COMMON) +#define ARM_GICV3_COMMON_GET_CLASS(obj) \ + OBJECT_GET_CLASS(ARMGICv3CommonClass, (obj), TYPE_ARM_GICV3_COMMON) + +typedef struct ARMGICv3CommonClass { + /*< private >*/ + SysBusDeviceClass parent_class; + /*< public >*/ + + void (*pre_save)(GICv3State *s); + void (*post_load)(GICv3State *s); +} ARMGICv3CommonClass; + +void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, + const MemoryRegionOps *ops); + +#endif diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 59a321d..9b9901f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -35,11 +35,6 @@ do { } while (0) #endif -/* Extra debugging, trap acceleration paths for more logging */ -#define VFIO_ALLOW_KVM_INTX 1 -#define VFIO_ALLOW_KVM_MSI 1 -#define VFIO_ALLOW_KVM_MSIX 1 - enum { VFIO_DEVICE_TYPE_PCI = 0, VFIO_DEVICE_TYPE_PLATFORM = 1, @@ -102,7 +97,7 @@ typedef struct VFIODevice { int type; bool reset_works; bool needs_reset; - bool allow_mmap; + bool no_mmap; VFIODeviceOps *ops; unsigned int num_irqs; unsigned int num_regions; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 983e99e..2a58b4d 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -240,6 +240,32 @@ int kvm_device_ioctl(int fd, int type, ...); int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr); /** + * kvm_device_check_attr - check for existence of a specific device attribute + * @fd: The device file descriptor + * @group: the group + * @attr: the attribute of that group to query for + * + * Returns: 1 if the attribute exists + * 0 if the attribute either does not exist or if the vm device + * interface is unavailable + */ +int kvm_device_check_attr(int fd, uint32_t group, uint64_t attr); + +/** + * kvm_device_access - set or get value of a specific vm attribute + * @fd: The device file descriptor + * @group: the group + * @attr: the attribute of that group to set or get + * @val: pointer to a storage area for the value + * @write: true for set and false for get operation + * + * This function is not allowed to fail. Use kvm_device_check_attr() + * in order to check for the availability of optional attributes. + */ +void kvm_device_access(int fd, int group, uint64_t attr, + void *val, bool write); + +/** * kvm_create_device - create a KVM device for the device control API * @KVMState: The KVMState pointer * @type: The KVM device type (see Documentation/virtual/kvm/devices in the @@ -24,6 +24,7 @@ #include "qemu/atomic.h" #include "qemu/option.h" #include "qemu/config-file.h" +#include "qemu/error-report.h" #include "hw/hw.h" #include "hw/pci/msi.h" #include "hw/s390x/adapter.h" @@ -2008,6 +2009,39 @@ int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) return ret ? 0 : 1; } +int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + struct kvm_device_attr attribute = { + .group = group, + .attr = attr, + .flags = 0, + }; + + return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1; +} + +void kvm_device_access(int fd, int group, uint64_t attr, + void *val, bool write) +{ + struct kvm_device_attr kvmattr; + int err; + + kvmattr.flags = 0; + kvmattr.group = group; + kvmattr.attr = attr; + kvmattr.addr = (uintptr_t)val; + + err = kvm_device_ioctl(fd, + write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, + &kvmattr); + if (err < 0) { + error_report("KVM_%s_DEVICE_ATTR failed: %s\n" + "Group %d attr 0x%016" PRIx64, write ? "SET" : "GET", + strerror(-err), group, attr); + abort(); + } +} + int kvm_has_sync_mmu(void) { return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); diff --git a/target-arm/kvm.c b/target-arm/kvm.c index b278542..6aadcd8 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -585,18 +585,23 @@ void kvm_arch_init_irq_routing(KVMState *s) int kvm_arch_irqchip_create(KVMState *s) { - int ret; - /* If we can create the VGIC using the newer device control API, we * let the device do this when it initializes itself, otherwise we * fall back to the old API */ + return kvm_check_extension(s, KVM_CAP_DEVICE_CTRL); +} - ret = kvm_create_device(s, KVM_DEV_TYPE_ARM_VGIC_V2, true); - if (ret == 0) { - return 1; +int kvm_arm_vgic_probe(void) +{ + if (kvm_create_device(kvm_state, + KVM_DEV_TYPE_ARM_VGIC_V3, true) == 0) { + return 3; + } else if (kvm_create_device(kvm_state, + KVM_DEV_TYPE_ARM_VGIC_V2, true) == 0) { + return 2; + } else { + return 0; } - - return 0; } int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, diff --git a/target-arm/kvm_arm.h b/target-arm/kvm_arm.h index b3e0ab7..b516041 100644 --- a/target-arm/kvm_arm.h +++ b/target-arm/kvm_arm.h @@ -189,6 +189,15 @@ int kvm_arm_sync_mpstate_to_kvm(ARMCPU *cpu); */ int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu); +int kvm_arm_vgic_probe(void); + +#else + +static inline int kvm_arm_vgic_probe(void) +{ + return 0; +} + #endif static inline const char *gic_class_name(void) @@ -196,4 +205,14 @@ static inline const char *gic_class_name(void) return kvm_irqchip_in_kernel() ? "kvm-arm-gic" : "arm_gic"; } +/** + * gicv3_class_name + * + * Return name of GICv3 class to use depending on whether KVM acceleration is + * in use. May throw an error if the chosen implementation is not available. + * + * Returns: class name to use + */ +const char *gicv3_class_name(void); + #endif diff --git a/target-arm/machine.c b/target-arm/machine.c index 32adfe7..36a0d15 100644 --- a/target-arm/machine.c +++ b/target-arm/machine.c @@ -1,5 +1,6 @@ #include "hw/hw.h" #include "hw/boards.h" +#include "qemu/error-report.h" #include "sysemu/kvm.h" #include "kvm_arm.h" #include "internals.h" @@ -328,3 +329,20 @@ const VMStateDescription vmstate_arm_cpu = { NULL } }; + +const char *gicv3_class_name(void) +{ + if (kvm_irqchip_in_kernel()) { +#ifdef TARGET_AARCH64 + return "kvm-arm-gicv3"; +#else + error_report("KVM GICv3 acceleration is not supported on this " + "platform\n"); +#endif + } else { + /* TODO: Software emulation is not implemented yet */ + error_report("KVM is currently required for GICv3 emulation\n"); + } + + exit(1); +} diff --git a/trace-events b/trace-events index 88a2f14..25c53e0 100644 --- a/trace-events +++ b/trace-events @@ -1522,56 +1522,30 @@ xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (ad pci_cfg_read(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x -> 0x%x" pci_cfg_write(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x <- 0x%x" -# hw/vfio/vfio-pci.c +# hw/vfio/pci.c vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c" -vfio_eoi(const char *name) " (%s) EOI" -vfio_enable_intx_kvm(const char *name) " (%s) KVM INTx accel enabled" -vfio_disable_intx_kvm(const char *name) " (%s) KVM INTx accel disabled" -vfio_update_irq(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" -vfio_enable_intx(const char *name) " (%s)" -vfio_disable_intx(const char *name) " (%s)" +vfio_intx_eoi(const char *name) " (%s) EOI" +vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled" +vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled" +vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" +vfio_intx_enable(const char *name) " (%s)" +vfio_intx_disable(const char *name) " (%s)" vfio_msi_interrupt(const char *name, int index, uint64_t addr, int data) " (%s) vector %d 0x%"PRIx64"/0x%x" vfio_msix_vector_do_use(const char *name, int index) " (%s) vector %d used" vfio_msix_vector_release(const char *name, int index) " (%s) vector %d released" -vfio_enable_msix(const char *name) " (%s)" -vfio_enable_msi(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors" -vfio_disable_msix(const char *name) " (%s)" -vfio_disable_msi(const char *name) " (%s)" +vfio_msix_enable(const char *name) " (%s)" +vfio_msix_disable(const char *name) " (%s)" +vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors" +vfio_msi_disable(const char *name) " (%s)" vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx" vfio_rom_read(const char *name, uint64_t addr, int size, uint64_t data) " (%s, 0x%"PRIx64", 0x%x) = 0x%"PRIx64 vfio_pci_size_rom(const char *name, int size) "%s ROM size 0x%x" vfio_vga_write(uint64_t addr, uint64_t data, int size) " (0x%"PRIx64", 0x%"PRIx64", %d)" vfio_vga_read(uint64_t addr, int size, uint64_t data) " (0x%"PRIx64", %d) = 0x%"PRIx64 -# remove ) = -vfio_generic_window_quirk_read(const char * region_name, const char *name, int index, uint64_t addr, int size, uint64_t data) "%s read(%s:BAR%d+0x%"PRIx64", %d = 0x%"PRIx64 -## remove ) -vfio_generic_window_quirk_write(const char * region_name, const char *name, int index, uint64_t addr, uint64_t data, int size) "%s write(%s:BAR%d+0x%"PRIx64", 0x%"PRIx64", %d" -# remove ) = -vfio_generic_quirk_read(const char * region_name, const char *name, int index, uint64_t addr, int size, uint64_t data) "%s read(%s:BAR%d+0x%"PRIx64", %d = 0x%"PRIx64 -# remove ) -vfio_generic_quirk_write(const char * region_name, const char *name, int index, uint64_t addr, uint64_t data, int size) "%s write(%s:BAR%d+0x%"PRIx64", 0x%"PRIx64", %d" -vfio_ati_3c3_quirk_read(uint64_t data) " (0x3c3, 1) = 0x%"PRIx64 -vfio_vga_probe_ati_3c3_quirk(const char *name) "Enabled ATI/AMD quirk 0x3c3 BAR4for device %s" -vfio_probe_ati_bar4_window_quirk(const char *name) "Enabled ATI/AMD BAR4 window quirk for device %s" -#issue with ) -vfio_rtl8168_window_quirk_read_fake(const char *region_name, const char *name) "%s fake read(%s" -vfio_rtl8168_window_quirk_read_table(const char *region_name, const char *name) "%s MSI-X table read(%s" -vfio_rtl8168_window_quirk_read_direct(const char *region_name, const char *name) "%s direct read(%s" -vfio_rtl8168_window_quirk_write_table(const char *region_name, const char *name) "%s MSI-X table write(%s" -vfio_rtl8168_window_quirk_write_direct(const char *region_name, const char *name) "%s direct write(%s" -vfio_probe_rtl8168_bar2_window_quirk(const char *name) "Enabled RTL8168 BAR2 window quirk for device %s" -vfio_probe_ati_bar2_4000_quirk(const char *name) "Enabled ATI/AMD BAR2 0x4000 quirk for device %s" -vfio_nvidia_3d0_quirk_read(int size, uint64_t data) " (0x3d0, %d) = 0x%"PRIx64 -vfio_nvidia_3d0_quirk_write(uint64_t data, int size) " (0x3d0, 0x%"PRIx64", %d)" -vfio_vga_probe_nvidia_3d0_quirk(const char *name) "Enabled NVIDIA VGA 0x3d0 quirk for device %s" -vfio_probe_nvidia_bar5_window_quirk(const char *name) "Enabled NVIDIA BAR5 window quirk for device %s" -vfio_probe_nvidia_bar0_88000_quirk(const char *name) "Enabled NVIDIA BAR0 0x88000 quirk for device %s" -vfio_probe_nvidia_bar0_1800_quirk_id(int id) "Nvidia NV%02x" -vfio_probe_nvidia_bar0_1800_quirk(const char *name) "Enabled NVIDIA BAR0 0x1800 quirk for device %s" vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, @0x%x, len=0x%x) %x" vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, @0x%x, 0x%x, len=0x%x)" -vfio_setup_msi(const char *name, int pos) "%s PCI MSI CAP @0x%x" -vfio_early_setup_msix(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" +vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x" +vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d" vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap" vfio_check_pm_reset(const char *name) "%s Supports PM reset" vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap" @@ -1586,6 +1560,41 @@ vfio_initfn(const char *name, int group_id) " (%s) group %d" vfio_pci_reset(const char *name) " (%s)" vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET" vfio_pci_reset_pm(const char *name) "%s PCI PM Reset" +vfio_pci_emulated_vendor_id(const char *name, uint16_t val) "%s %04x" +vfio_pci_emulated_device_id(const char *name, uint16_t val) "%s %04x" +vfio_pci_emulated_sub_vendor_id(const char *name, uint16_t val) "%s %04x" +vfio_pci_emulated_sub_device_id(const char *name, uint16_t val) "%s %04x" + +# hw/vfio/pci-quirks. +vfio_quirk_rom_blacklisted(const char *name, uint16_t vid, uint16_t did) "%s %04x:%04x" +vfio_quirk_generic_window_address_write(const char *name, const char * region_name, uint64_t data) "%s %s 0x%"PRIx64 +vfio_quirk_generic_window_data_read(const char *name, const char * region_name, uint64_t data) "%s %s 0x%"PRIx64 +vfio_quirk_generic_window_data_write(const char *name, const char * region_name, uint64_t data) "%s %s 0x%"PRIx64 +vfio_quirk_generic_mirror_read(const char *name, const char * region_name, uint64_t addr, uint64_t data) "%s %s 0x%"PRIx64": 0x%"PRIx64 +vfio_quirk_generic_mirror_write(const char *name, const char * region_name, uint64_t addr, uint64_t data) "%s %s 0x%"PRIx64": 0x%"PRIx64 +vfio_quirk_ati_3c3_read(const char *name, uint64_t data) "%s 0x%"PRIx64 +vfio_quirk_ati_3c3_probe(const char *name) "%s" +vfio_quirk_ati_bar4_probe(const char *name) "%s" +vfio_quirk_ati_bar2_probe(const char *name) "%s" +vfio_quirk_nvidia_3d0_state(const char *name, const char *state) "%s %s" +vfio_quirk_nvidia_3d0_read(const char *name, uint8_t offset, unsigned size, uint64_t val) " (%s, @0x%x, len=0x%x) %"PRIx64 +vfio_quirk_nvidia_3d0_write(const char *name, uint8_t offset, uint64_t data, unsigned size) "(%s, @0x%x, 0x%"PRIx64", len=0x%x)" +vfio_quirk_nvidia_3d0_probe(const char *name) "%s" +vfio_quirk_nvidia_bar5_state(const char *name, const char *state) "%s %s" +vfio_quirk_nvidia_bar5_probe(const char *name) "%s" +vfio_quirk_nvidia_bar0_msi_ack(const char *name) "%s" +vfio_quirk_nvidia_bar0_probe(const char *name) "%s" +vfio_quirk_rtl8168_fake_latch(const char *name, uint64_t val) "%s 0x%"PRIx64 +vfio_quirk_rtl8168_msix_write(const char *name, uint16_t offset, uint64_t val) "%s MSI-X table write[0x%x]: 0x%"PRIx64 +vfio_quirk_rtl8168_msix_read(const char *name, uint16_t offset, uint64_t val) "%s MSI-X table read[0x%x]: 0x%"PRIx64 +vfio_quirk_rtl8168_probe(const char *name) "%s" + +vfio_quirk_ati_bonaire_reset_skipped(const char *name) "%s" +vfio_quirk_ati_bonaire_reset_no_smc(const char *name) "%s" +vfio_quirk_ati_bonaire_reset_timeout(const char *name) "%s" +vfio_quirk_ati_bonaire_reset_done(const char *name) "%s" +vfio_quirk_ati_bonaire_reset(const char *name) "%s" + # hw/vfio/vfio-common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" |