diff options
Diffstat (limited to 'hw')
183 files changed, 7971 insertions, 2513 deletions
@@ -42,6 +42,7 @@ source ufs/Kconfig source usb/Kconfig source virtio/Kconfig source vfio/Kconfig +source vfio-user/Kconfig source vmapple/Kconfig source xen/Kconfig source watchdog/Kconfig diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 9ba9080..732d613 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -535,7 +535,7 @@ nvdimm_dsm_no_payload(uint32_t func_ret_status, hwaddr dsm_mem_addr) #define NVDIMM_QEMU_RSVD_HANDLE_ROOT 0x10000 -/* Read FIT data, defined in docs/specs/acpi_nvdimm.txt. */ +/* Read FIT data, defined in docs/specs/acpi_nvdimm.rst. */ static void nvdimm_dsm_func_read_fit(NVDIMMState *state, NvdimmDsmIn *in, hwaddr dsm_mem_addr) { diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index aac9001..497281a 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -3,7 +3,7 @@ * * QEMU supports PCI hotplug via ACPI. This module * implements the interface between QEMU and the ACPI BIOS. - * Interface specification - see docs/specs/acpi_pci_hotplug.txt + * Interface specification - see docs/specs/acpi_pci_hotplug.rst * * Copyright (c) 2013, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com) * Copyright (c) 2006 Fabrice Bellard diff --git a/hw/acpi/vmgenid.c b/hw/acpi/vmgenid.c index fac3d6d..33c35c8 100644 --- a/hw/acpi/vmgenid.c +++ b/hw/acpi/vmgenid.c @@ -38,7 +38,7 @@ void vmgenid_build_acpi(VmGenIdState *vms, GArray *table_data, GArray *guid, guid_le = qemu_uuid_bswap(vms->guid); /* The GUID is written at a fixed offset into the fw_cfg file * in order to implement the "OVMF SDT Header probe suppressor" - * see docs/specs/vmgenid.txt for more details + * see docs/specs/vmgenid.rst for more details */ g_array_insert_vals(guid, VMGENID_GUID_OFFSET, guid_le.data, ARRAY_SIZE(guid_le.data)); @@ -101,7 +101,7 @@ void vmgenid_build_acpi(VmGenIdState *vms, GArray *table_data, GArray *guid, * < 4GB, but write 64 bits anyway. * The address that is patched in is offset in order to implement * the "OVMF SDT Header probe suppressor" - * see docs/specs/vmgenid.txt for more details. + * see docs/specs/vmgenid.rst for more details. */ bios_linker_loader_write_pointer(linker, VMGENID_ADDR_FW_CFG_FILE, 0, sizeof(uint64_t), @@ -153,7 +153,7 @@ static void vmgenid_update_guest(VmGenIdState *vms) guid_le = qemu_uuid_bswap(vms->guid); /* The GUID is written at a fixed offset into the fw_cfg file * in order to implement the "OVMF SDT Header probe suppressor" - * see docs/specs/vmgenid.txt for more details. + * see docs/specs/vmgenid.rst for more details. */ cpu_physical_memory_write(vmgenid_addr, guid_le.data, sizeof(guid_le.data)); diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index a55b44d..6ea8653 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -147,7 +147,6 @@ config OMAP bool select FRAMEBUFFER select I2C - select NAND select PFLASH_CFI01 select SD select SERIAL_MM @@ -533,6 +532,7 @@ config ASPEED_SOC select I2C select DPS310 select PCA9552 + select PCA9554 select SERIAL_MM select SMBUS_EEPROM select PCA954X diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c index d0b3336..c31bbe7 100644 --- a/hw/arm/aspeed.c +++ b/hw/arm/aspeed.c @@ -19,6 +19,7 @@ #include "hw/i2c/i2c_mux_pca954x.h" #include "hw/i2c/smbus_eeprom.h" #include "hw/gpio/pca9552.h" +#include "hw/gpio/pca9554.h" #include "hw/nvram/eeprom_at24c.h" #include "hw/sensor/tmp105.h" #include "hw/misc/led.h" @@ -197,9 +198,12 @@ struct AspeedMachineState { #define FUJI_BMC_HW_STRAP2 0x00000000 /* Bletchley hardware value */ -/* TODO: Leave same as EVB for now. */ -#define BLETCHLEY_BMC_HW_STRAP1 AST2600_EVB_HW_STRAP1 -#define BLETCHLEY_BMC_HW_STRAP2 AST2600_EVB_HW_STRAP2 +#define BLETCHLEY_BMC_HW_STRAP1 0x00002000 +#define BLETCHLEY_BMC_HW_STRAP2 0x00000801 + +/* GB200NVL hardware value */ +#define GB200NVL_BMC_HW_STRAP1 AST2600_EVB_HW_STRAP1 +#define GB200NVL_BMC_HW_STRAP2 AST2600_EVB_HW_STRAP2 /* Qualcomm DC-SCM hardware value */ #define QCOM_DC_SCM_V1_BMC_HW_STRAP1 0x00000000 @@ -465,6 +469,8 @@ static void aspeed_machine_init(MachineState *machine) aspeed_board_init_flashes(&bmc->soc->spi[0], bmc->spi_model ? bmc->spi_model : amc->spi_model, 1, amc->num_cs); + aspeed_board_init_flashes(&bmc->soc->spi[1], + amc->spi2_model, 1, amc->num_cs2); } if (machine->kernel_filename && sc->num_cpus > 1) { @@ -645,6 +651,12 @@ static void create_pca9552(AspeedSoCState *soc, int bus_id, int addr) TYPE_PCA9552, addr); } +static I2CSlave *create_pca9554(AspeedSoCState *soc, int bus_id, int addr) +{ + return i2c_slave_create_simple(aspeed_i2c_get_bus(&soc->i2c, bus_id), + TYPE_PCA9554, addr); +} + static void sonorapass_bmc_i2c_init(AspeedMachineState *bmc) { AspeedSoCState *soc = bmc->soc; @@ -1003,6 +1015,180 @@ static void fuji_bmc_i2c_init(AspeedMachineState *bmc) } #define TYPE_TMP421 "tmp421" +#define TYPE_DS1338 "ds1338" + +/* Catalina hardware value */ +#define CATALINA_BMC_HW_STRAP1 0x00002002 +#define CATALINA_BMC_HW_STRAP2 0x00000800 + +#define CATALINA_BMC_RAM_SIZE ASPEED_RAM_SIZE(2 * GiB) + +static void catalina_bmc_i2c_init(AspeedMachineState *bmc) +{ + /* Reference from v6.16-rc2 aspeed-bmc-facebook-catalina.dts */ + + AspeedSoCState *soc = bmc->soc; + I2CBus *i2c[16] = {}; + I2CSlave *i2c_mux; + + /* busses 0-15 are all used. */ + for (int i = 0; i < ARRAY_SIZE(i2c); i++) { + i2c[i] = aspeed_i2c_get_bus(&soc->i2c, i); + } + + /* &i2c0 */ + /* i2c-mux@71 (PCA9546) on i2c0 */ + i2c_slave_create_simple(i2c[0], TYPE_PCA9546, 0x71); + + /* i2c-mux@72 (PCA9546) on i2c0 */ + i2c_mux = i2c_slave_create_simple(i2c[0], TYPE_PCA9546, 0x72); + + /* i2c0mux1ch1 */ + /* io_expander7 - pca9535@20 */ + i2c_slave_create_simple(pca954x_i2c_get_bus(i2c_mux, 1), + TYPE_PCA9552, 0x20); + /* eeprom@50 */ + at24c_eeprom_init(pca954x_i2c_get_bus(i2c_mux, 1), 0x50, 8 * KiB); + + /* i2c-mux@73 (PCA9546) on i2c0 */ + i2c_slave_create_simple(i2c[0], TYPE_PCA9546, 0x73); + + /* i2c-mux@75 (PCA9546) on i2c0 */ + i2c_slave_create_simple(i2c[0], TYPE_PCA9546, 0x75); + + /* i2c-mux@76 (PCA9546) on i2c0 */ + i2c_mux = i2c_slave_create_simple(i2c[0], TYPE_PCA9546, 0x76); + + /* i2c0mux4ch1 */ + /* io_expander8 - pca9535@21 */ + i2c_slave_create_simple(pca954x_i2c_get_bus(i2c_mux, 1), + TYPE_PCA9552, 0x21); + /* eeprom@50 */ + at24c_eeprom_init(pca954x_i2c_get_bus(i2c_mux, 1), 0x50, 8 * KiB); + + /* i2c-mux@77 (PCA9546) on i2c0 */ + i2c_slave_create_simple(i2c[0], TYPE_PCA9546, 0x77); + + + /* &i2c1 */ + /* i2c-mux@70 (PCA9548) on i2c1 */ + i2c_mux = i2c_slave_create_simple(i2c[1], TYPE_PCA9548, 0x70); + /* i2c1mux0ch0 */ + /* ina238@41 - no model */ + /* ina238@42 - no model */ + /* ina238@44 - no model */ + /* i2c1mux0ch1 */ + /* ina238@41 - no model */ + /* ina238@43 - no model */ + /* i2c1mux0ch4 */ + /* ltc4287@42 - no model */ + /* ltc4287@43 - no model */ + + /* i2c1mux0ch5 */ + /* eeprom@54 */ + at24c_eeprom_init(pca954x_i2c_get_bus(i2c_mux, 5), 0x54, 8 * KiB); + /* tpm75@4f */ + i2c_slave_create_simple(pca954x_i2c_get_bus(i2c_mux, 5), TYPE_TMP75, 0x4f); + + /* i2c1mux0ch6 */ + /* io_expander5 - pca9554@27 */ + i2c_slave_create_simple(pca954x_i2c_get_bus(i2c_mux, 6), + TYPE_PCA9554, 0x27); + /* io_expander6 - pca9555@25 */ + i2c_slave_create_simple(pca954x_i2c_get_bus(i2c_mux, 6), + TYPE_PCA9552, 0x25); + /* eeprom@51 */ + at24c_eeprom_init(pca954x_i2c_get_bus(i2c_mux, 6), 0x51, 8 * KiB); + + /* i2c1mux0ch7 */ + /* eeprom@53 */ + at24c_eeprom_init(pca954x_i2c_get_bus(i2c_mux, 7), 0x53, 8 * KiB); + /* temperature-sensor@4b - tmp75 */ + i2c_slave_create_simple(pca954x_i2c_get_bus(i2c_mux, 7), TYPE_TMP75, 0x4b); + + /* &i2c2 */ + /* io_expander0 - pca9555@20 */ + i2c_slave_create_simple(i2c[2], TYPE_PCA9552, 0x20); + /* io_expander0 - pca9555@21 */ + i2c_slave_create_simple(i2c[2], TYPE_PCA9552, 0x21); + /* io_expander0 - pca9555@27 */ + i2c_slave_create_simple(i2c[2], TYPE_PCA9552, 0x27); + /* eeprom@50 */ + at24c_eeprom_init(i2c[2], 0x50, 8 * KiB); + /* eeprom@51 */ + at24c_eeprom_init(i2c[2], 0x51, 8 * KiB); + + /* &i2c5 */ + /* i2c-mux@70 (PCA9548) on i2c5 */ + i2c_mux = i2c_slave_create_simple(i2c[5], TYPE_PCA9548, 0x70); + /* i2c5mux0ch6 */ + /* eeprom@52 */ + at24c_eeprom_init(pca954x_i2c_get_bus(i2c_mux, 6), 0x52, 8 * KiB); + /* i2c5mux0ch7 */ + /* ina230@40 - no model */ + /* ina230@41 - no model */ + /* ina230@44 - no model */ + /* ina230@45 - no model */ + + /* &i2c6 */ + /* io_expander3 - pca9555@21 */ + i2c_slave_create_simple(i2c[6], TYPE_PCA9552, 0x21); + /* rtc@6f - nct3018y */ + i2c_slave_create_simple(i2c[6], TYPE_DS1338, 0x6f); + + /* &i2c9 */ + /* io_expander4 - pca9555@4f */ + i2c_slave_create_simple(i2c[9], TYPE_PCA9552, 0x4f); + /* temperature-sensor@4b - tpm75 */ + i2c_slave_create_simple(i2c[9], TYPE_TMP75, 0x4b); + /* eeprom@50 */ + at24c_eeprom_init(i2c[9], 0x50, 8 * KiB); + /* eeprom@56 */ + at24c_eeprom_init(i2c[9], 0x56, 8 * KiB); + + /* &i2c10 */ + /* temperature-sensor@1f - tpm421 */ + i2c_slave_create_simple(i2c[10], TYPE_TMP421, 0x1f); + /* eeprom@50 */ + at24c_eeprom_init(i2c[10], 0x50, 8 * KiB); + + /* &i2c11 */ + /* ssif-bmc@10 - no model */ + + /* &i2c12 */ + /* eeprom@50 */ + at24c_eeprom_init(i2c[12], 0x50, 8 * KiB); + + /* &i2c13 */ + /* eeprom@50 */ + at24c_eeprom_init(i2c[13], 0x50, 8 * KiB); + /* eeprom@54 */ + at24c_eeprom_init(i2c[13], 0x54, 256); + /* eeprom@55 */ + at24c_eeprom_init(i2c[13], 0x55, 256); + /* eeprom@57 */ + at24c_eeprom_init(i2c[13], 0x57, 256); + + /* &i2c14 */ + /* io_expander9 - pca9555@10 */ + i2c_slave_create_simple(i2c[14], TYPE_PCA9552, 0x10); + /* io_expander10 - pca9555@11 */ + i2c_slave_create_simple(i2c[14], TYPE_PCA9552, 0x11); + /* io_expander11 - pca9555@12 */ + i2c_slave_create_simple(i2c[14], TYPE_PCA9552, 0x12); + /* io_expander12 - pca9555@13 */ + i2c_slave_create_simple(i2c[14], TYPE_PCA9552, 0x13); + /* io_expander13 - pca9555@14 */ + i2c_slave_create_simple(i2c[14], TYPE_PCA9552, 0x14); + /* io_expander14 - pca9555@15 */ + i2c_slave_create_simple(i2c[14], TYPE_PCA9552, 0x15); + + /* &i2c15 */ + /* temperature-sensor@1f - tmp421 */ + i2c_slave_create_simple(i2c[15], TYPE_TMP421, 0x1f); + /* eeprom@52 */ + at24c_eeprom_init(i2c[15], 0x52, 8 * KiB); +} static void bletchley_bmc_i2c_init(AspeedMachineState *bmc) { @@ -1050,6 +1236,45 @@ static void bletchley_bmc_i2c_init(AspeedMachineState *bmc) i2c_slave_create_simple(i2c[12], TYPE_PCA9552, 0x67); } + +static void gb200nvl_bmc_i2c_init(AspeedMachineState *bmc) +{ + AspeedSoCState *soc = bmc->soc; + I2CBus *i2c[15] = {}; + DeviceState *dev; + for (int i = 0; i < sizeof(i2c) / sizeof(i2c[0]); i++) { + if ((i == 11) || (i == 12) || (i == 13)) { + continue; + } + i2c[i] = aspeed_i2c_get_bus(&soc->i2c, i); + } + + /* Bus 5 Expander */ + create_pca9554(soc, 4, 0x21); + + /* Mux I2c Expanders */ + i2c_slave_create_simple(i2c[5], "pca9546", 0x71); + i2c_slave_create_simple(i2c[5], "pca9546", 0x72); + i2c_slave_create_simple(i2c[5], "pca9546", 0x73); + i2c_slave_create_simple(i2c[5], "pca9546", 0x75); + i2c_slave_create_simple(i2c[5], "pca9546", 0x76); + i2c_slave_create_simple(i2c[5], "pca9546", 0x77); + + /* Bus 10 */ + dev = DEVICE(create_pca9554(soc, 9, 0x20)); + + /* Set FPGA_READY */ + object_property_set_str(OBJECT(dev), "pin1", "high", &error_fatal); + + create_pca9554(soc, 9, 0x21); + at24c_eeprom_init(i2c[9], 0x50, 64 * KiB); + at24c_eeprom_init(i2c[9], 0x51, 64 * KiB); + + /* Bus 11 */ + at24c_eeprom_init_rom(i2c[10], 0x50, 256, gb200nvl_bmc_fruid, + gb200nvl_bmc_fruid_len); +} + static void fby35_i2c_init(AspeedMachineState *bmc) { AspeedSoCState *soc = bmc->soc; @@ -1585,6 +1810,52 @@ static void aspeed_machine_bletchley_class_init(ObjectClass *oc, aspeed_machine_class_init_cpus_defaults(mc); } +static void aspeed_machine_catalina_class_init(ObjectClass *oc, + const void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + AspeedMachineClass *amc = ASPEED_MACHINE_CLASS(oc); + + mc->desc = "Facebook Catalina BMC (Cortex-A7)"; + amc->soc_name = "ast2600-a3"; + amc->hw_strap1 = CATALINA_BMC_HW_STRAP1; + amc->hw_strap2 = CATALINA_BMC_HW_STRAP2; + amc->fmc_model = "w25q01jvq"; + amc->spi_model = NULL; + amc->num_cs = 2; + amc->macs_mask = ASPEED_MAC2_ON; + amc->i2c_init = catalina_bmc_i2c_init; + mc->auto_create_sdcard = true; + mc->default_ram_size = CATALINA_BMC_RAM_SIZE; + aspeed_machine_class_init_cpus_defaults(mc); + aspeed_machine_ast2600_class_emmc_init(oc); +} + +#define GB200NVL_BMC_RAM_SIZE ASPEED_RAM_SIZE(1 * GiB) + +static void aspeed_machine_gb200nvl_class_init(ObjectClass *oc, + const void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + AspeedMachineClass *amc = ASPEED_MACHINE_CLASS(oc); + + mc->desc = "Nvidia GB200NVL BMC (Cortex-A7)"; + amc->soc_name = "ast2600-a3"; + amc->hw_strap1 = GB200NVL_BMC_HW_STRAP1; + amc->hw_strap2 = GB200NVL_BMC_HW_STRAP2; + amc->fmc_model = "mx66u51235f"; + amc->spi_model = "mx66u51235f"; + amc->num_cs = 2; + + amc->spi2_model = "mx66u51235f"; + amc->num_cs2 = 1; + amc->macs_mask = ASPEED_MAC0_ON | ASPEED_MAC1_ON; + amc->i2c_init = gb200nvl_bmc_i2c_init; + mc->default_ram_size = GB200NVL_BMC_RAM_SIZE; + aspeed_machine_class_init_cpus_defaults(mc); + aspeed_machine_ast2600_class_emmc_init(oc); +} + static void fby35_reset(MachineState *state, ResetType type) { AspeedMachineState *bmc = ASPEED_MACHINE(state); @@ -1878,6 +2149,14 @@ static const TypeInfo aspeed_machine_types[] = { .parent = TYPE_ASPEED_MACHINE, .class_init = aspeed_machine_bletchley_class_init, }, { + .name = MACHINE_TYPE_NAME("gb200nvl-bmc"), + .parent = TYPE_ASPEED_MACHINE, + .class_init = aspeed_machine_gb200nvl_class_init, + }, { + .name = MACHINE_TYPE_NAME("catalina-bmc"), + .parent = TYPE_ASPEED_MACHINE, + .class_init = aspeed_machine_catalina_class_init, + }, { .name = MACHINE_TYPE_NAME("fby35-bmc"), .parent = MACHINE_TYPE_NAME("ast2600-evb"), .class_init = aspeed_machine_fby35_class_init, diff --git a/hw/arm/aspeed_ast27x0-fc.c b/hw/arm/aspeed_ast27x0-fc.c index 125a3ad..7087be4 100644 --- a/hw/arm/aspeed_ast27x0-fc.c +++ b/hw/arm/aspeed_ast27x0-fc.c @@ -48,7 +48,7 @@ struct Ast2700FCState { bool mmio_exec; }; -#define AST2700FC_BMC_RAM_SIZE (2 * GiB) +#define AST2700FC_BMC_RAM_SIZE (1 * GiB) #define AST2700FC_CM4_DRAM_SIZE (32 * MiB) #define AST2700FC_HW_STRAP1 0x000000C0 @@ -68,6 +68,7 @@ static void ast2700fc_ca35_init(MachineState *machine) memory_region_init(&s->ca35_memory, OBJECT(&s->ca35), "ca35-memory", UINT64_MAX); + memory_region_add_subregion(get_system_memory(), 0, &s->ca35_memory); if (!memory_region_init_ram(&s->ca35_dram, OBJECT(&s->ca35), "ca35-dram", AST2700FC_BMC_RAM_SIZE, &error_abort)) { @@ -86,6 +87,13 @@ static void ast2700fc_ca35_init(MachineState *machine) AST2700FC_BMC_RAM_SIZE, &error_abort)) { return; } + + for (int i = 0; i < sc->macs_num; i++) { + if (!qemu_configure_nic_device(DEVICE(&soc->ftgmac100[i]), + true, NULL)) { + break; + } + } if (!object_property_set_int(OBJECT(&s->ca35), "hw-strap1", AST2700FC_HW_STRAP1, &error_abort)) { return; diff --git a/hw/arm/aspeed_ast27x0.c b/hw/arm/aspeed_ast27x0.c index 1974a25..6aa3841 100644 --- a/hw/arm/aspeed_ast27x0.c +++ b/hw/arm/aspeed_ast27x0.c @@ -23,14 +23,14 @@ #include "qobject/qlist.h" #include "qemu/log.h" -#define AST2700_SOC_IO_SIZE 0x01000000 +#define AST2700_SOC_IO_SIZE 0x00FE0000 #define AST2700_SOC_IOMEM_SIZE 0x01000000 #define AST2700_SOC_DPMCU_SIZE 0x00040000 #define AST2700_SOC_LTPI_SIZE 0x01000000 static const hwaddr aspeed_soc_ast2700_memmap[] = { - [ASPEED_DEV_IOMEM] = 0x00000000, [ASPEED_DEV_VBOOTROM] = 0x00000000, + [ASPEED_DEV_IOMEM] = 0x00020000, [ASPEED_DEV_SRAM] = 0x10000000, [ASPEED_DEV_DPMCU] = 0x11000000, [ASPEED_DEV_IOMEM0] = 0x12000000, @@ -346,8 +346,9 @@ static void aspeed_ram_capacity_write(void *opaque, hwaddr addr, uint64_t data, * If writes the data to the address which is beyond the ram size, * it would write the data to the "address % ram_size". */ - result = address_space_write(&s->dram_as, addr % ram_size, - MEMTXATTRS_UNSPECIFIED, &data, 4); + address_space_stl_le(&s->dram_as, addr % ram_size, data, + MEMTXATTRS_UNSPECIFIED, &result); + if (result != MEMTX_OK) { qemu_log_mask(LOG_GUEST_ERROR, "%s: DRAM write failed, addr:0x%" HWADDR_PRIx @@ -360,9 +361,10 @@ static const MemoryRegionOps aspeed_ram_capacity_ops = { .read = aspeed_ram_capacity_read, .write = aspeed_ram_capacity_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { - .min_access_size = 1, - .max_access_size = 8, + .min_access_size = 4, + .max_access_size = 4, }, }; diff --git a/hw/arm/aspeed_eeprom.c b/hw/arm/aspeed_eeprom.c index daa3d32..8bbbdec 100644 --- a/hw/arm/aspeed_eeprom.c +++ b/hw/arm/aspeed_eeprom.c @@ -162,6 +162,25 @@ const uint8_t rainier_bmc_fruid[] = { 0x31, 0x50, 0x46, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, }; +const uint8_t gb200nvl_bmc_fruid[] = { + 0x01, 0x00, 0x00, 0x01, 0x0b, 0x00, 0x00, 0xf3, 0x01, 0x0a, 0x19, 0x1f, + 0x0f, 0xe6, 0xc6, 0x4e, 0x56, 0x49, 0x44, 0x49, 0x41, 0xc5, 0x50, 0x33, + 0x38, 0x30, 0x39, 0xcd, 0x31, 0x35, 0x38, 0x33, 0x33, 0x32, 0x34, 0x38, + 0x30, 0x30, 0x31, 0x35, 0x30, 0xd2, 0x36, 0x39, 0x39, 0x2d, 0x31, 0x33, + 0x38, 0x30, 0x39, 0x2d, 0x30, 0x34, 0x30, 0x34, 0x2d, 0x36, 0x30, 0x30, + 0xc0, 0x01, 0x01, 0xd6, 0x4d, 0x41, 0x43, 0x3a, 0x20, 0x33, 0x43, 0x3a, + 0x36, 0x44, 0x3a, 0x36, 0x36, 0x3a, 0x31, 0x34, 0x3a, 0x43, 0x38, 0x3a, + 0x37, 0x41, 0xc1, 0x3b, 0x01, 0x09, 0x19, 0xc6, 0x4e, 0x56, 0x49, 0x44, + 0x49, 0x41, 0xc9, 0x50, 0x33, 0x38, 0x30, 0x39, 0x2d, 0x42, 0x4d, 0x43, + 0xd2, 0x36, 0x39, 0x39, 0x2d, 0x31, 0x33, 0x38, 0x30, 0x39, 0x2d, 0x30, + 0x34, 0x30, 0x34, 0x2d, 0x36, 0x30, 0x30, 0xc4, 0x41, 0x45, 0x2e, 0x31, + 0xcd, 0x31, 0x35, 0x38, 0x33, 0x33, 0x32, 0x34, 0x38, 0x30, 0x30, 0x31, + 0x35, 0x30, 0xc0, 0xc4, 0x76, 0x30, 0x2e, 0x31, 0xc1, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +}; + const size_t tiogapass_bmc_fruid_len = sizeof(tiogapass_bmc_fruid); const size_t fby35_nic_fruid_len = sizeof(fby35_nic_fruid); const size_t fby35_bb_fruid_len = sizeof(fby35_bb_fruid); @@ -169,3 +188,5 @@ const size_t fby35_bmc_fruid_len = sizeof(fby35_bmc_fruid); const size_t yosemitev2_bmc_fruid_len = sizeof(yosemitev2_bmc_fruid); const size_t rainier_bb_fruid_len = sizeof(rainier_bb_fruid); const size_t rainier_bmc_fruid_len = sizeof(rainier_bmc_fruid); +const size_t gb200nvl_bmc_fruid_len = sizeof(gb200nvl_bmc_fruid); + diff --git a/hw/arm/aspeed_eeprom.h b/hw/arm/aspeed_eeprom.h index f08c16e..3ed9bc1 100644 --- a/hw/arm/aspeed_eeprom.h +++ b/hw/arm/aspeed_eeprom.h @@ -26,4 +26,7 @@ extern const size_t rainier_bb_fruid_len; extern const uint8_t rainier_bmc_fruid[]; extern const size_t rainier_bmc_fruid_len; +extern const uint8_t gb200nvl_bmc_fruid[]; +extern const size_t gb200nvl_bmc_fruid_len; + #endif diff --git a/hw/arm/boot.c b/hw/arm/boot.c index f94b940..becd827 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -19,6 +19,7 @@ #include "system/kvm.h" #include "system/tcg.h" #include "system/system.h" +#include "system/memory.h" #include "system/numa.h" #include "hw/boards.h" #include "system/reset.h" @@ -526,7 +527,7 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, if (binfo->dtb_filename) { char *filename; - filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, binfo->dtb_filename); + filename = qemu_find_file(QEMU_FILE_TYPE_DTB, binfo->dtb_filename); if (!filename) { fprintf(stderr, "Couldn't open dtb file %s\n", binfo->dtb_filename); goto fail; @@ -743,7 +744,7 @@ static void do_cpu_reset(void *opaque) } else { if (arm_feature(env, ARM_FEATURE_EL3) && (info->secure_boot || - (info->secure_board_setup && cs == first_cpu))) { + (info->secure_board_setup && cpu == info->primary_cpu))) { /* Start this CPU in Secure SVC */ target_el = 3; } @@ -751,7 +752,7 @@ static void do_cpu_reset(void *opaque) arm_emulate_firmware_reset(cs, target_el); - if (cs == first_cpu) { + if (cpu == info->primary_cpu) { AddressSpace *as = arm_boot_address_space(cpu, info); cpu_set_pc(cs, info->loader_start); @@ -1238,6 +1239,9 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) info->dtb_filename = ms->dtb; info->dtb_limit = 0; + /* We assume the CPU passed as argument is the primary CPU. */ + info->primary_cpu = cpu; + /* Load the kernel. */ if (!info->kernel_filename || info->firmware_loaded) { arm_setup_firmware_boot(cpu, info); @@ -1287,12 +1291,8 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) object_property_set_int(cpuobj, "psci-conduit", info->psci_conduit, &error_abort); - /* - * Secondary CPUs start in PSCI powered-down state. Like the - * code in do_cpu_reset(), we assume first_cpu is the primary - * CPU. - */ - if (cs != first_cpu) { + /* Secondary CPUs start in PSCI powered-down state. */ + if (ARM_CPU(cs) != info->primary_cpu) { object_property_set_bool(cpuobj, "start-powered-off", true, &error_abort); } diff --git a/hw/arm/fby35.c b/hw/arm/fby35.c index e123fa6..c14fc2e 100644 --- a/hw/arm/fby35.c +++ b/hw/arm/fby35.c @@ -77,6 +77,7 @@ static void fby35_bmc_init(Fby35State *s) memory_region_init(&s->bmc_memory, OBJECT(&s->bmc), "bmc-memory", UINT64_MAX); + memory_region_add_subregion(get_system_memory(), 0, &s->bmc_memory); memory_region_init_ram(&s->bmc_dram, OBJECT(&s->bmc), "bmc-dram", FBY35_BMC_RAM_SIZE, &error_abort); diff --git a/hw/arm/meson.build b/hw/arm/meson.build index 5098795..d90be8f 100644 --- a/hw/arm/meson.build +++ b/hw/arm/meson.build @@ -8,7 +8,7 @@ arm_common_ss.add(when: 'CONFIG_HIGHBANK', if_true: files('highbank.c')) arm_common_ss.add(when: 'CONFIG_INTEGRATOR', if_true: files('integratorcp.c')) arm_common_ss.add(when: 'CONFIG_MICROBIT', if_true: files('microbit.c')) arm_common_ss.add(when: 'CONFIG_MPS3R', if_true: files('mps3r.c')) -arm_common_ss.add(when: 'CONFIG_MUSICPAL', if_true: [pixman, files('musicpal.c')]) +arm_common_ss.add(when: 'CONFIG_MUSICPAL', if_true: [files('musicpal.c')]) arm_common_ss.add(when: 'CONFIG_NETDUINOPLUS2', if_true: files('netduinoplus2.c')) arm_common_ss.add(when: 'CONFIG_OLIMEX_STM32_H405', if_true: files('olimex-stm32-h405.c')) arm_common_ss.add(when: 'CONFIG_NPCM7XX', if_true: files('npcm7xx.c', 'npcm7xx_boards.c')) @@ -79,7 +79,7 @@ arm_common_ss.add(when: 'CONFIG_SX1', if_true: files('omap_sx1.c')) arm_common_ss.add(when: 'CONFIG_VERSATILE', if_true: files('versatilepb.c')) arm_common_ss.add(when: 'CONFIG_VEXPRESS', if_true: files('vexpress.c')) -arm_common_ss.add(fdt, files('boot.c')) +arm_common_ss.add(files('boot.c')) hw_arch += {'arm': arm_ss} hw_common_arch += {'arm': arm_common_ss} diff --git a/hw/arm/mps2.c b/hw/arm/mps2.c index 58efb41..bd378e3 100644 --- a/hw/arm/mps2.c +++ b/hw/arm/mps2.c @@ -224,7 +224,11 @@ static void mps2_common_init(MachineState *machine) switch (mmc->fpga_type) { case FPGA_AN385: case FPGA_AN386: + qdev_prop_set_uint32(armv7m, "num-irq", 32); + break; case FPGA_AN500: + /* The AN500 configures its Cortex-M7 with 16 MPU regions */ + qdev_prop_set_uint32(armv7m, "mpu-ns-regions", 16); qdev_prop_set_uint32(armv7m, "num-irq", 32); break; case FPGA_AN511: diff --git a/hw/arm/npcm8xx.c b/hw/arm/npcm8xx.c index d7ee306..a276fea 100644 --- a/hw/arm/npcm8xx.c +++ b/hw/arm/npcm8xx.c @@ -67,6 +67,9 @@ /* SDHCI Modules */ #define NPCM8XX_MMC_BA 0xf0842000 +/* PCS Module */ +#define NPCM8XX_PCS_BA 0xf0780000 + /* PSPI Modules */ #define NPCM8XX_PSPI_BA 0xf0201000 @@ -85,6 +88,10 @@ enum NPCM8xxInterrupt { NPCM8XX_ADC_IRQ = 0, NPCM8XX_PECI_IRQ = 6, NPCM8XX_KCS_HIB_IRQ = 9, + NPCM8XX_GMAC1_IRQ = 14, + NPCM8XX_GMAC2_IRQ, + NPCM8XX_GMAC3_IRQ, + NPCM8XX_GMAC4_IRQ, NPCM8XX_MMC_IRQ = 26, NPCM8XX_PSPI_IRQ = 28, NPCM8XX_TIMER0_IRQ = 32, /* Timer Module 0 */ @@ -260,6 +267,14 @@ static const hwaddr npcm8xx_smbus_addr[] = { 0xfff0a000, }; +/* Register base address for each GMAC Module */ +static const hwaddr npcm8xx_gmac_addr[] = { + 0xf0802000, + 0xf0804000, + 0xf0806000, + 0xf0808000, +}; + /* Register base address for each USB host EHCI registers */ static const hwaddr npcm8xx_ehci_addr[] = { 0xf0828100, @@ -350,6 +365,7 @@ static struct arm_boot_info npcm8xx_binfo = { .secure_boot = false, .board_id = -1, .board_setup_addr = NPCM8XX_BOARD_SETUP_ADDR, + .psci_conduit = QEMU_PSCI_CONDUIT_SMC, }; void npcm8xx_load_kernel(MachineState *machine, NPCM8xxState *soc) @@ -444,6 +460,11 @@ static void npcm8xx_init(Object *obj) object_initialize_child(obj, "mft[*]", &s->mft[i], TYPE_NPCM7XX_MFT); } + for (i = 0; i < ARRAY_SIZE(s->gmac); i++) { + object_initialize_child(obj, "gmac[*]", &s->gmac[i], TYPE_NPCM_GMAC); + } + object_initialize_child(obj, "pcs", &s->pcs, TYPE_NPCM_PCS); + object_initialize_child(obj, "mmc", &s->mmc, TYPE_NPCM7XX_SDHCI); object_initialize_child(obj, "pspi", &s->pspi, TYPE_NPCM_PSPI); } @@ -669,6 +690,35 @@ static void npcm8xx_realize(DeviceState *dev, Error **errp) } /* + * GMAC Modules. Cannot fail. + */ + QEMU_BUILD_BUG_ON(ARRAY_SIZE(npcm8xx_gmac_addr) != ARRAY_SIZE(s->gmac)); + for (i = 0; i < ARRAY_SIZE(s->gmac); i++) { + SysBusDevice *sbd = SYS_BUS_DEVICE(&s->gmac[i]); + + /* This is used to make sure that the NIC can create the device */ + qemu_configure_nic_device(DEVICE(sbd), false, NULL); + + /* + * The device exists regardless of whether it's connected to a QEMU + * netdev backend. So always instantiate it even if there is no + * backend. + */ + sysbus_realize(sbd, &error_abort); + sysbus_mmio_map(sbd, 0, npcm8xx_gmac_addr[i]); + /* + * N.B. The values for the second argument sysbus_connect_irq are + * chosen to match the registration order in npcm7xx_emc_realize. + */ + sysbus_connect_irq(sbd, 0, npcm8xx_irq(s, NPCM8XX_GMAC1_IRQ + i)); + } + /* + * GMAC Physical Coding Sublayer(PCS) Module. Cannot fail. + */ + sysbus_realize(SYS_BUS_DEVICE(&s->pcs), &error_abort); + sysbus_mmio_map(SYS_BUS_DEVICE(&s->pcs), 0, NPCM8XX_PCS_BA); + + /* * Flash Interface Unit (FIU). Can fail if incorrect number of chip selects * specified, but this is a programming error. */ @@ -741,12 +791,7 @@ static void npcm8xx_realize(DeviceState *dev, Error **errp) create_unimplemented_device("npcm8xx.ahbpci", 0xf0400000, 1 * MiB); create_unimplemented_device("npcm8xx.dap", 0xf0500000, 960 * KiB); create_unimplemented_device("npcm8xx.mcphy", 0xf05f0000, 64 * KiB); - create_unimplemented_device("npcm8xx.pcs", 0xf0780000, 256 * KiB); create_unimplemented_device("npcm8xx.tsgen", 0xf07fc000, 8 * KiB); - create_unimplemented_device("npcm8xx.gmac1", 0xf0802000, 8 * KiB); - create_unimplemented_device("npcm8xx.gmac2", 0xf0804000, 8 * KiB); - create_unimplemented_device("npcm8xx.gmac3", 0xf0806000, 8 * KiB); - create_unimplemented_device("npcm8xx.gmac4", 0xf0808000, 8 * KiB); create_unimplemented_device("npcm8xx.copctl", 0xf080c000, 4 * KiB); create_unimplemented_device("npcm8xx.tipctl", 0xf080d000, 4 * KiB); create_unimplemented_device("npcm8xx.rst", 0xf080e000, 4 * KiB); diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c index deae5cf..15c1ff4 100644 --- a/hw/arm/sbsa-ref.c +++ b/hw/arm/sbsa-ref.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/datadir.h" #include "qapi/error.h" #include "qemu/error-report.h" @@ -53,8 +54,7 @@ #include "target/arm/cpu-qom.h" #include "target/arm/gtimer.h" -#define RAMLIMIT_GB 8192 -#define RAMLIMIT_BYTES (RAMLIMIT_GB * GiB) +#define RAMLIMIT_BYTES (8 * TiB) #define NUM_IRQS 256 #define NUM_SMMU_IRQS 4 @@ -756,7 +756,9 @@ static void sbsa_ref_init(MachineState *machine) sms->smp_cpus = smp_cpus; if (machine->ram_size > sbsa_ref_memmap[SBSA_MEM].size) { - error_report("sbsa-ref: cannot model more than %dGB RAM", RAMLIMIT_GB); + char *size_str = size_to_str(RAMLIMIT_BYTES); + + error_report("sbsa-ref: cannot model more than %s of RAM", size_str); exit(1); } diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 7e8e0f0..cd90c47 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -266,6 +266,43 @@ static int iort_idmap_compare(gconstpointer a, gconstpointer b) return idmap_a->input_base - idmap_b->input_base; } +/* Compute ID ranges (RIDs) from RC that are directed to the ITS Group node */ +static void create_rc_its_idmaps(GArray *its_idmaps, GArray *smmu_idmaps) +{ + AcpiIortIdMapping *idmap; + AcpiIortIdMapping next_range = {0}; + + /* + * Based on the RID ranges that are directed to the SMMU, determine the + * bypassed RID ranges, i.e., the ones that are directed to the ITS Group + * node and do not pass through the SMMU, by subtracting the SMMU-bound + * ranges from the full RID range (0x0000–0xFFFF). + */ + for (int i = 0; i < smmu_idmaps->len; i++) { + idmap = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + + if (next_range.input_base < idmap->input_base) { + next_range.id_count = idmap->input_base - next_range.input_base; + g_array_append_val(its_idmaps, next_range); + } + + next_range.input_base = idmap->input_base + idmap->id_count; + } + + /* + * Append the last RC -> ITS ID mapping. + * + * RIDs are 16-bit, according to the PCI Express 2.0 Base Specification, rev + * 0.9, section 2.2.6.2, "Transaction Descriptor - Transaction ID Field", + * hence the end of the range is 0x10000. + */ + if (next_range.input_base < 0x10000) { + next_range.id_count = 0x10000 - next_range.input_base; + g_array_append_val(its_idmaps, next_range); + } +} + + /* * Input Output Remapping Table (IORT) * Conforms to "IO Remapping Table System Software on ARM Platforms", @@ -276,10 +313,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; size_t node_size, smmu_offset = 0; - AcpiIortIdMapping *idmap; uint32_t id = 0; - GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); - GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); + GArray *rc_smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); + GArray *rc_its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; @@ -287,40 +323,39 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) acpi_table_begin(&table, table_data); if (vms->iommu == VIRT_IOMMU_SMMUV3) { - AcpiIortIdMapping next_range = {0}; - object_child_foreach_recursive(object_get_root(), - iort_host_bridges, smmu_idmaps); + iort_host_bridges, rc_smmu_idmaps); /* Sort the smmu idmap by input_base */ - g_array_sort(smmu_idmaps, iort_idmap_compare); + g_array_sort(rc_smmu_idmaps, iort_idmap_compare); /* - * Split the whole RIDs by mapping from RC to SMMU, - * build the ID mapping from RC to ITS directly. + * Knowing the ID ranges from the RC to the SMMU, it's possible to + * determine the ID ranges from RC that are directed to the ITS. */ - for (i = 0; i < smmu_idmaps->len; i++) { - idmap = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + create_rc_its_idmaps(rc_its_idmaps, rc_smmu_idmaps); - if (next_range.input_base < idmap->input_base) { - next_range.id_count = idmap->input_base - next_range.input_base; - g_array_append_val(its_idmaps, next_range); - } + nb_nodes = 2; /* RC and SMMUv3 */ + rc_mapping_count = rc_smmu_idmaps->len; - next_range.input_base = idmap->input_base + idmap->id_count; - } + if (vms->its) { + /* + * Knowing the ID ranges from the RC to the SMMU, it's possible to + * determine the ID ranges from RC that go directly to ITS. + */ + create_rc_its_idmaps(rc_its_idmaps, rc_smmu_idmaps); - /* Append the last RC -> ITS ID mapping */ - if (next_range.input_base < 0x10000) { - next_range.id_count = 0x10000 - next_range.input_base; - g_array_append_val(its_idmaps, next_range); + nb_nodes++; /* ITS */ + rc_mapping_count += rc_its_idmaps->len; } - - nb_nodes = 3; /* RC, ITS, SMMUv3 */ - rc_mapping_count = smmu_idmaps->len + its_idmaps->len; } else { - nb_nodes = 2; /* RC, ITS */ - rc_mapping_count = 1; + if (vms->its) { + nb_nodes = 2; /* RC and ITS */ + rc_mapping_count = 1; /* Direct map to ITS */ + } else { + nb_nodes = 1; /* RC only */ + rc_mapping_count = 0; /* No output mapping */ + } } /* Number of IORT Nodes */ build_append_int_noprefix(table_data, nb_nodes, 4); @@ -329,31 +364,43 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, IORT_NODE_OFFSET, 4); build_append_int_noprefix(table_data, 0, 4); /* Reserved */ - /* Table 12 ITS Group Format */ - build_append_int_noprefix(table_data, 0 /* ITS Group */, 1); /* Type */ - node_size = 20 /* fixed header size */ + 4 /* 1 GIC ITS Identifier */; - build_append_int_noprefix(table_data, node_size, 2); /* Length */ - build_append_int_noprefix(table_data, 1, 1); /* Revision */ - build_append_int_noprefix(table_data, id++, 4); /* Identifier */ - build_append_int_noprefix(table_data, 0, 4); /* Number of ID mappings */ - build_append_int_noprefix(table_data, 0, 4); /* Reference to ID Array */ - build_append_int_noprefix(table_data, 1, 4); /* Number of ITSs */ - /* GIC ITS Identifier Array */ - build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); + if (vms->its) { + /* Table 12 ITS Group Format */ + build_append_int_noprefix(table_data, 0 /* ITS Group */, 1); /* Type */ + node_size = 20 /* fixed header size */ + 4 /* 1 GIC ITS Identifier */; + build_append_int_noprefix(table_data, node_size, 2); /* Length */ + build_append_int_noprefix(table_data, 1, 1); /* Revision */ + build_append_int_noprefix(table_data, id++, 4); /* Identifier */ + build_append_int_noprefix(table_data, 0, 4); /* Number of ID mappings */ + build_append_int_noprefix(table_data, 0, 4); /* Reference to ID Array */ + build_append_int_noprefix(table_data, 1, 4); /* Number of ITSs */ + /* GIC ITS Identifier Array */ + build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); + } if (vms->iommu == VIRT_IOMMU_SMMUV3) { int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; - + int smmu_mapping_count, offset_to_id_array; + + if (vms->its) { + smmu_mapping_count = 1; /* ITS Group node */ + offset_to_id_array = SMMU_V3_ENTRY_SIZE; /* Just after the header */ + } else { + smmu_mapping_count = 0; /* No ID mappings */ + offset_to_id_array = 0; /* No ID mappings array */ + } smmu_offset = table_data->len - table.table_offset; /* Table 9 SMMUv3 Format */ build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */ - node_size = SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE; + node_size = SMMU_V3_ENTRY_SIZE + + (ID_MAPPING_ENTRY_SIZE * smmu_mapping_count); build_append_int_noprefix(table_data, node_size, 2); /* Length */ build_append_int_noprefix(table_data, 4, 1); /* Revision */ build_append_int_noprefix(table_data, id++, 4); /* Identifier */ - build_append_int_noprefix(table_data, 1, 4); /* Number of ID mappings */ + /* Number of ID mappings */ + build_append_int_noprefix(table_data, smmu_mapping_count, 4); /* Reference to ID Array */ - build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4); + build_append_int_noprefix(table_data, offset_to_id_array, 4); /* Base address */ build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8); /* Flags */ @@ -369,9 +416,11 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */ /* DeviceID mapping index (ignored since interrupts are GSIV based) */ build_append_int_noprefix(table_data, 0, 4); - - /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); + /* Array of ID mappings */ + if (smmu_mapping_count) { + /* Output IORT node is the ITS Group node (the first node). */ + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); + } } /* Table 17 Root Complex Node */ @@ -407,29 +456,44 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) if (vms->iommu == VIRT_IOMMU_SMMUV3) { AcpiIortIdMapping *range; - /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ - for (i = 0; i < smmu_idmaps->len; i++) { - range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); - /* output IORT node is the smmuv3 node */ + /* + * Map RIDs (input) from RC to SMMUv3 nodes: RC -> SMMUv3. + * + * N.B.: The mapping from SMMUv3 to ITS Group node (SMMUv3 -> ITS) is + * defined in the SMMUv3 table, where all SMMUv3 IDs are mapped to the + * ITS Group node, if ITS is available. + */ + for (i = 0; i < rc_smmu_idmaps->len; i++) { + range = &g_array_index(rc_smmu_idmaps, AcpiIortIdMapping, i); + /* Output IORT node is the SMMUv3 node. */ build_iort_id_mapping(table_data, range->input_base, range->id_count, smmu_offset); } - /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ - for (i = 0; i < its_idmaps->len; i++) { - range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); - /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, range->input_base, - range->id_count, IORT_NODE_OFFSET); + if (vms->its) { + /* + * Map bypassed (don't go through the SMMU) RIDs (input) to + * ITS Group node directly: RC -> ITS. + */ + for (i = 0; i < rc_its_idmaps->len; i++) { + range = &g_array_index(rc_its_idmaps, AcpiIortIdMapping, i); + /* Output IORT node is the ITS Group node (the first node). */ + build_iort_id_mapping(table_data, range->input_base, + range->id_count, IORT_NODE_OFFSET); + } } } else { - /* output IORT node is the ITS group node (the first node) */ + /* + * Map all RIDs (input) to ITS Group node directly, since there is no + * SMMU: RC -> ITS. + * Output IORT node is the ITS Group node (the first node). + */ build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET); } acpi_table_end(linker, &table); - g_array_free(smmu_idmaps, true); - g_array_free(its_idmaps, true); + g_array_free(rc_smmu_idmaps, true); + g_array_free(rc_its_idmaps, true); } /* @@ -737,7 +801,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) memmap[VIRT_HIGH_GIC_REDIST2].size); } - if (its_class_name()) { + if (vms->its) { /* * ACPI spec, Revision 6.0 Errata A * (original 6.0 definition has invalid Length) @@ -969,10 +1033,8 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) vms->oem_table_id); } - if (its_class_name()) { - acpi_add_table(table_offsets, tables_blob); - build_iort(tables_blob, tables->linker, vms); - } + acpi_add_table(table_offsets, tables_blob); + build_iort(tables_blob, tables->linker, vms); #ifdef CONFIG_TPM if (tpm_get_version(tpm_find()) == TPM_VERSION_2_0) { diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 9a6cd08..3bcdf92 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -705,21 +705,18 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) static void create_its(VirtMachineState *vms) { - const char *itsclass = its_class_name(); DeviceState *dev; - if (!strcmp(itsclass, "arm-gicv3-its")) { - if (!vms->tcg_its) { - itsclass = NULL; - } - } - - if (!itsclass) { - /* Do nothing if not supported */ + assert(vms->its); + if (!kvm_irqchip_in_kernel() && !vms->tcg_its) { + /* + * Do nothing if ITS is neither supported by the host nor emulated by + * the machine. + */ return; } - dev = qdev_new(itsclass); + dev = qdev_new(its_class_name()); object_property_set_link(OBJECT(dev), "parent-gicv3", OBJECT(vms->gic), &error_abort); @@ -1487,9 +1484,12 @@ static void create_virtio_iommu_dt_bindings(VirtMachineState *vms) qemu_fdt_setprop_cell(ms->fdt, node, "phandle", vms->iommu_phandle); g_free(node); - qemu_fdt_setprop_cells(ms->fdt, vms->pciehb_nodename, "iommu-map", - 0x0, vms->iommu_phandle, 0x0, bdf, - bdf + 1, vms->iommu_phandle, bdf + 1, 0xffff - bdf); + if (!vms->default_bus_bypass_iommu) { + qemu_fdt_setprop_cells(ms->fdt, vms->pciehb_nodename, "iommu-map", + 0x0, vms->iommu_phandle, 0x0, bdf, + bdf + 1, vms->iommu_phandle, bdf + 1, + 0xffff - bdf); + } } static void create_pcie(VirtMachineState *vms) @@ -1612,8 +1612,10 @@ static void create_pcie(VirtMachineState *vms) switch (vms->iommu) { case VIRT_IOMMU_SMMUV3: create_smmu(vms, vms->bus); - qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", - 0x0, vms->iommu_phandle, 0x0, 0x10000); + if (!vms->default_bus_bypass_iommu) { + qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", + 0x0, vms->iommu_phandle, 0x0, 0x10000); + } break; default: g_assert_not_reached(); @@ -2024,10 +2026,11 @@ static void finalize_gic_version(VirtMachineState *vms) } /* - * virt_cpu_post_init() must be called after the CPUs have - * been realized and the GIC has been created. + * virt_post_cpus_gic_realized() must be called after the CPUs and + * the GIC have both been realized. */ -static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) +static void virt_post_cpus_gic_realized(VirtMachineState *vms, + MemoryRegion *sysmem) { int max_cpus = MACHINE(vms)->smp.max_cpus; bool aarch64, pmu, steal_time; @@ -2198,14 +2201,14 @@ static void machvirt_init(MachineState *machine) exit(1); } - if (vms->secure && (kvm_enabled() || hvf_enabled())) { + if (vms->secure && !tcg_enabled() && !qtest_enabled()) { error_report("mach-virt: %s does not support providing " "Security extensions (TrustZone) to the guest CPU", current_accel_name()); exit(1); } - if (vms->virt && (kvm_enabled() || hvf_enabled())) { + if (vms->virt && !tcg_enabled() && !qtest_enabled()) { error_report("mach-virt: %s does not support providing " "Virtualization extensions to the guest CPU", current_accel_name()); @@ -2344,7 +2347,7 @@ static void machvirt_init(MachineState *machine) create_gic(vms, sysmem); - virt_cpu_post_init(vms, sysmem); + virt_post_cpus_gic_realized(vms, sysmem); fdt_add_pmu_nodes(vms); @@ -3337,12 +3340,8 @@ static void virt_instance_init(Object *obj) /* Default allows ITS instantiation */ vms->its = true; - - if (vmc->no_tcg_its) { - vms->tcg_its = false; - } else { - vms->tcg_its = true; - } + /* Allow ITS emulation if the machine version supports it */ + vms->tcg_its = !vmc->no_tcg_its; /* Default disallows iommu instantiation */ vms->iommu = VIRT_IOMMU_NONE; diff --git a/hw/audio/ac97.c b/hw/audio/ac97.c index 669a046..eb7a847 100644 --- a/hw/audio/ac97.c +++ b/hw/audio/ac97.c @@ -886,7 +886,7 @@ static void nabm_writel(void *opaque, uint32_t addr, uint32_t val) static int write_audio(AC97LinkState *s, AC97BusMasterRegs *r, int max, int *stop) { - uint8_t tmpbuf[4096]; + QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; uint32_t addr = r->bd.addr; uint32_t temp = r->picb << 1; uint32_t written = 0; @@ -959,7 +959,7 @@ static void write_bup(AC97LinkState *s, int elapsed) static int read_audio(AC97LinkState *s, AC97BusMasterRegs *r, int max, int *stop) { - uint8_t tmpbuf[4096]; + QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; uint32_t addr = r->bd.addr; uint32_t temp = r->picb << 1; uint32_t nread = 0; diff --git a/hw/audio/asc.c b/hw/audio/asc.c index 18382cc..edd42d6 100644 --- a/hw/audio/asc.c +++ b/hw/audio/asc.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qemu/timer.h" +#include "qapi/error.h" #include "hw/sysbus.h" #include "hw/irq.h" #include "audio/audio.h" @@ -653,11 +654,17 @@ static void asc_realize(DeviceState *dev, Error **errp) s->voice = AUD_open_out(&s->card, s->voice, "asc.out", s, asc_out_cb, &as); + if (!s->voice) { + AUD_remove_card(&s->card); + error_setg(errp, "Initializing audio stream failed"); + return; + } + s->shift = 1; s->samples = AUD_get_buffer_size_out(s->voice) >> s->shift; s->mixbuf = g_malloc0(s->samples << s->shift); - s->silentbuf = g_malloc0(s->samples << s->shift); + s->silentbuf = g_malloc(s->samples << s->shift); memset(s->silentbuf, 0x80, s->samples << s->shift); /* Add easc registers if required */ diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c index eb9a458..6dfff20 100644 --- a/hw/audio/cs4231a.c +++ b/hw/audio/cs4231a.c @@ -528,7 +528,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos, int dma_len, int len) { int temp, net; - uint8_t tmpbuf[4096]; + QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma); temp = len; @@ -547,7 +547,7 @@ static int cs_write_audio (CSState *s, int nchan, int dma_pos, copied = k->read_memory(s->isa_dma, nchan, tmpbuf, dma_pos, to_copy); if (s->tab) { int i; - int16_t linbuf[4096]; + QEMU_UNINITIALIZED int16_t linbuf[4096]; for (i = 0; i < copied; ++i) linbuf[i] = s->tab[tmpbuf[i]]; diff --git a/hw/audio/es1370.c b/hw/audio/es1370.c index 8efb969..a6a32a6 100644 --- a/hw/audio/es1370.c +++ b/hw/audio/es1370.c @@ -604,7 +604,7 @@ static uint64_t es1370_read(void *opaque, hwaddr addr, unsigned size) static void es1370_transfer_audio (ES1370State *s, struct chan *d, int loop_sel, int max, bool *irq) { - uint8_t tmpbuf[4096]; + QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; size_t to_transfer; uint32_t addr = d->frame_addr; int sc = d->scount & 0xffff; diff --git a/hw/audio/gus.c b/hw/audio/gus.c index 87e8634..c36df02 100644 --- a/hw/audio/gus.c +++ b/hw/audio/gus.c @@ -183,7 +183,7 @@ static int GUS_read_DMA (void *opaque, int nchan, int dma_pos, int dma_len) { GUSState *s = opaque; IsaDmaClass *k = ISADMA_GET_CLASS(s->isa_dma); - char tmpbuf[4096]; + QEMU_UNINITIALIZED char tmpbuf[4096]; int pos = dma_pos, mode, left = dma_len - dma_pos; ldebug ("read DMA %#x %d\n", dma_pos, dma_len); diff --git a/hw/audio/marvell_88w8618.c b/hw/audio/marvell_88w8618.c index 6d3ebbb..c5c79d0 100644 --- a/hw/audio/marvell_88w8618.c +++ b/hw/audio/marvell_88w8618.c @@ -66,7 +66,7 @@ static void mv88w8618_audio_callback(void *opaque, int free_out, int free_in) { mv88w8618_audio_state *s = opaque; int16_t *codec_buffer; - int8_t buf[4096]; + QEMU_UNINITIALIZED int8_t buf[4096]; int8_t *mem_buffer; int pos, block_size; diff --git a/hw/audio/sb16.c b/hw/audio/sb16.c index 19fd3b9..bac6411 100644 --- a/hw/audio/sb16.c +++ b/hw/audio/sb16.c @@ -1181,7 +1181,7 @@ static int write_audio (SB16State *s, int nchan, int dma_pos, IsaDma *isa_dma = nchan == s->dma ? s->isa_dma : s->isa_hdma; IsaDmaClass *k = ISADMA_GET_CLASS(isa_dma); int temp, net; - uint8_t tmpbuf[4096]; + QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; temp = len; net = 0; diff --git a/hw/audio/via-ac97.c b/hw/audio/via-ac97.c index 1e0a5c7..d5231e1 100644 --- a/hw/audio/via-ac97.c +++ b/hw/audio/via-ac97.c @@ -175,7 +175,7 @@ static void out_cb(void *opaque, int avail) ViaAC97SGDChannel *c = &s->aur; int temp, to_copy, copied; bool stop = false; - uint8_t tmpbuf[4096]; + QEMU_UNINITIALIZED uint8_t tmpbuf[4096]; if (c->stat & STAT_PAUSED) { return; diff --git a/hw/block/Kconfig b/hw/block/Kconfig index a898e04..737dbcd 100644 --- a/hw/block/Kconfig +++ b/hw/block/Kconfig @@ -13,9 +13,6 @@ config FDC_SYSBUS config SSI_M25P80 bool -config NAND - bool - config PFLASH_CFI01 bool diff --git a/hw/block/meson.build b/hw/block/meson.build index 16a51bf..6557044 100644 --- a/hw/block/meson.build +++ b/hw/block/meson.build @@ -6,7 +6,6 @@ system_ss.add(files( system_ss.add(when: 'CONFIG_FDC', if_true: files('fdc.c')) system_ss.add(when: 'CONFIG_FDC_ISA', if_true: files('fdc-isa.c')) system_ss.add(when: 'CONFIG_FDC_SYSBUS', if_true: files('fdc-sysbus.c')) -system_ss.add(when: 'CONFIG_NAND', if_true: files('nand.c')) system_ss.add(when: 'CONFIG_PFLASH_CFI01', if_true: files('pflash_cfi01.c')) system_ss.add(when: 'CONFIG_PFLASH_CFI02', if_true: files('pflash_cfi02.c')) system_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c')) diff --git a/hw/block/nand.c b/hw/block/nand.c deleted file mode 100644 index c80bf78..0000000 --- a/hw/block/nand.c +++ /dev/null @@ -1,835 +0,0 @@ -/* - * Flash NAND memory emulation. Based on "16M x 8 Bit NAND Flash - * Memory" datasheet for the KM29U128AT / K9F2808U0A chips from - * Samsung Electronic. - * - * Copyright (c) 2006 Openedhand Ltd. - * Written by Andrzej Zaborowski <balrog@zabor.org> - * - * Support for additional features based on "MT29F2G16ABCWP 2Gx16" - * datasheet from Micron Technology and "NAND02G-B2C" datasheet - * from ST Microelectronics. - * - * This code is licensed under the GNU GPL v2. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#ifndef NAND_IO - -#include "qemu/osdep.h" -#include "hw/hw.h" -#include "hw/qdev-properties.h" -#include "hw/qdev-properties-system.h" -#include "hw/block/flash.h" -#include "system/block-backend.h" -#include "migration/vmstate.h" -#include "qapi/error.h" -#include "qemu/error-report.h" -#include "qemu/module.h" -#include "qom/object.h" - -# define NAND_CMD_READ0 0x00 -# define NAND_CMD_READ1 0x01 -# define NAND_CMD_READ2 0x50 -# define NAND_CMD_LPREAD2 0x30 -# define NAND_CMD_NOSERIALREAD2 0x35 -# define NAND_CMD_RANDOMREAD1 0x05 -# define NAND_CMD_RANDOMREAD2 0xe0 -# define NAND_CMD_READID 0x90 -# define NAND_CMD_RESET 0xff -# define NAND_CMD_PAGEPROGRAM1 0x80 -# define NAND_CMD_PAGEPROGRAM2 0x10 -# define NAND_CMD_CACHEPROGRAM2 0x15 -# define NAND_CMD_BLOCKERASE1 0x60 -# define NAND_CMD_BLOCKERASE2 0xd0 -# define NAND_CMD_READSTATUS 0x70 -# define NAND_CMD_COPYBACKPRG1 0x85 - -# define NAND_IOSTATUS_ERROR (1 << 0) -# define NAND_IOSTATUS_PLANE0 (1 << 1) -# define NAND_IOSTATUS_PLANE1 (1 << 2) -# define NAND_IOSTATUS_PLANE2 (1 << 3) -# define NAND_IOSTATUS_PLANE3 (1 << 4) -# define NAND_IOSTATUS_READY (1 << 6) -# define NAND_IOSTATUS_UNPROTCT (1 << 7) - -# define MAX_PAGE 0x800 -# define MAX_OOB 0x40 - -typedef struct NANDFlashState NANDFlashState; -struct NANDFlashState { - DeviceState parent_obj; - - uint8_t manf_id, chip_id; - uint8_t buswidth; /* in BYTES */ - int size, pages; - int page_shift, oob_shift, erase_shift, addr_shift; - uint8_t *storage; - BlockBackend *blk; - int mem_oob; - - uint8_t cle, ale, ce, wp, gnd; - - uint8_t io[MAX_PAGE + MAX_OOB + 0x400]; - uint8_t *ioaddr; - int iolen; - - uint32_t cmd; - uint64_t addr; - int addrlen; - int status; - int offset; - - void (*blk_write)(NANDFlashState *s); - void (*blk_erase)(NANDFlashState *s); - /* - * Returns %true when block containing (@addr + @offset) is - * successfully loaded, otherwise %false. - */ - bool (*blk_load)(NANDFlashState *s, uint64_t addr, unsigned offset); - - uint32_t ioaddr_vmstate; -}; - -#define TYPE_NAND "nand" - -OBJECT_DECLARE_SIMPLE_TYPE(NANDFlashState, NAND) - -static void mem_and(uint8_t *dest, const uint8_t *src, size_t n) -{ - /* Like memcpy() but we logical-AND the data into the destination */ - int i; - for (i = 0; i < n; i++) { - dest[i] &= src[i]; - } -} - -# define NAND_NO_AUTOINCR 0x00000001 -# define NAND_BUSWIDTH_16 0x00000002 -# define NAND_NO_PADDING 0x00000004 -# define NAND_CACHEPRG 0x00000008 -# define NAND_COPYBACK 0x00000010 -# define NAND_IS_AND 0x00000020 -# define NAND_4PAGE_ARRAY 0x00000040 -# define NAND_NO_READRDY 0x00000100 -# define NAND_SAMSUNG_LP (NAND_NO_PADDING | NAND_COPYBACK) - -# define NAND_IO - -# define PAGE(addr) ((addr) >> ADDR_SHIFT) -# define PAGE_START(page) (PAGE(page) * (NAND_PAGE_SIZE + OOB_SIZE)) -# define PAGE_MASK ((1 << ADDR_SHIFT) - 1) -# define OOB_SHIFT (PAGE_SHIFT - 5) -# define OOB_SIZE (1 << OOB_SHIFT) -# define SECTOR(addr) ((addr) >> (9 + ADDR_SHIFT - PAGE_SHIFT)) -# define SECTOR_OFFSET(addr) ((addr) & ((511 >> PAGE_SHIFT) << 8)) - -# define NAND_PAGE_SIZE 256 -# define PAGE_SHIFT 8 -# define PAGE_SECTORS 1 -# define ADDR_SHIFT 8 -# include "nand.c" -# define NAND_PAGE_SIZE 512 -# define PAGE_SHIFT 9 -# define PAGE_SECTORS 1 -# define ADDR_SHIFT 8 -# include "nand.c" -# define NAND_PAGE_SIZE 2048 -# define PAGE_SHIFT 11 -# define PAGE_SECTORS 4 -# define ADDR_SHIFT 16 -# include "nand.c" - -/* Information based on Linux drivers/mtd/nand/raw/nand_ids.c */ -static const struct { - int size; - int width; - int page_shift; - int erase_shift; - uint32_t options; -} nand_flash_ids[0x100] = { - [0 ... 0xff] = { 0 }, - - [0x6b] = { 4, 8, 9, 4, 0 }, - [0xe3] = { 4, 8, 9, 4, 0 }, - [0xe5] = { 4, 8, 9, 4, 0 }, - [0xd6] = { 8, 8, 9, 4, 0 }, - [0xe6] = { 8, 8, 9, 4, 0 }, - - [0x33] = { 16, 8, 9, 5, 0 }, - [0x73] = { 16, 8, 9, 5, 0 }, - [0x43] = { 16, 16, 9, 5, NAND_BUSWIDTH_16 }, - [0x53] = { 16, 16, 9, 5, NAND_BUSWIDTH_16 }, - - [0x35] = { 32, 8, 9, 5, 0 }, - [0x75] = { 32, 8, 9, 5, 0 }, - [0x45] = { 32, 16, 9, 5, NAND_BUSWIDTH_16 }, - [0x55] = { 32, 16, 9, 5, NAND_BUSWIDTH_16 }, - - [0x36] = { 64, 8, 9, 5, 0 }, - [0x76] = { 64, 8, 9, 5, 0 }, - [0x46] = { 64, 16, 9, 5, NAND_BUSWIDTH_16 }, - [0x56] = { 64, 16, 9, 5, NAND_BUSWIDTH_16 }, - - [0x78] = { 128, 8, 9, 5, 0 }, - [0x39] = { 128, 8, 9, 5, 0 }, - [0x79] = { 128, 8, 9, 5, 0 }, - [0x72] = { 128, 16, 9, 5, NAND_BUSWIDTH_16 }, - [0x49] = { 128, 16, 9, 5, NAND_BUSWIDTH_16 }, - [0x74] = { 128, 16, 9, 5, NAND_BUSWIDTH_16 }, - [0x59] = { 128, 16, 9, 5, NAND_BUSWIDTH_16 }, - - [0x71] = { 256, 8, 9, 5, 0 }, - - /* - * These are the new chips with large page size. The pagesize and the - * erasesize is determined from the extended id bytes - */ -# define LP_OPTIONS (NAND_SAMSUNG_LP | NAND_NO_READRDY | NAND_NO_AUTOINCR) -# define LP_OPTIONS16 (LP_OPTIONS | NAND_BUSWIDTH_16) - - /* 512 Megabit */ - [0xa2] = { 64, 8, 0, 0, LP_OPTIONS }, - [0xf2] = { 64, 8, 0, 0, LP_OPTIONS }, - [0xb2] = { 64, 16, 0, 0, LP_OPTIONS16 }, - [0xc2] = { 64, 16, 0, 0, LP_OPTIONS16 }, - - /* 1 Gigabit */ - [0xa1] = { 128, 8, 0, 0, LP_OPTIONS }, - [0xf1] = { 128, 8, 0, 0, LP_OPTIONS }, - [0xb1] = { 128, 16, 0, 0, LP_OPTIONS16 }, - [0xc1] = { 128, 16, 0, 0, LP_OPTIONS16 }, - - /* 2 Gigabit */ - [0xaa] = { 256, 8, 0, 0, LP_OPTIONS }, - [0xda] = { 256, 8, 0, 0, LP_OPTIONS }, - [0xba] = { 256, 16, 0, 0, LP_OPTIONS16 }, - [0xca] = { 256, 16, 0, 0, LP_OPTIONS16 }, - - /* 4 Gigabit */ - [0xac] = { 512, 8, 0, 0, LP_OPTIONS }, - [0xdc] = { 512, 8, 0, 0, LP_OPTIONS }, - [0xbc] = { 512, 16, 0, 0, LP_OPTIONS16 }, - [0xcc] = { 512, 16, 0, 0, LP_OPTIONS16 }, - - /* 8 Gigabit */ - [0xa3] = { 1024, 8, 0, 0, LP_OPTIONS }, - [0xd3] = { 1024, 8, 0, 0, LP_OPTIONS }, - [0xb3] = { 1024, 16, 0, 0, LP_OPTIONS16 }, - [0xc3] = { 1024, 16, 0, 0, LP_OPTIONS16 }, - - /* 16 Gigabit */ - [0xa5] = { 2048, 8, 0, 0, LP_OPTIONS }, - [0xd5] = { 2048, 8, 0, 0, LP_OPTIONS }, - [0xb5] = { 2048, 16, 0, 0, LP_OPTIONS16 }, - [0xc5] = { 2048, 16, 0, 0, LP_OPTIONS16 }, -}; - -static void nand_reset(DeviceState *dev) -{ - NANDFlashState *s = NAND(dev); - s->cmd = NAND_CMD_READ0; - s->addr = 0; - s->addrlen = 0; - s->iolen = 0; - s->offset = 0; - s->status &= NAND_IOSTATUS_UNPROTCT; - s->status |= NAND_IOSTATUS_READY; -} - -static inline void nand_pushio_byte(NANDFlashState *s, uint8_t value) -{ - s->ioaddr[s->iolen++] = value; - for (value = s->buswidth; --value;) { - s->ioaddr[s->iolen++] = 0; - } -} - -/* - * nand_load_block: Load block containing (s->addr + @offset). - * Returns length of data available at @offset in this block. - */ -static unsigned nand_load_block(NANDFlashState *s, unsigned offset) -{ - unsigned iolen; - - if (!s->blk_load(s, s->addr, offset)) { - return 0; - } - - iolen = (1 << s->page_shift); - if (s->gnd) { - iolen += 1 << s->oob_shift; - } - assert(offset <= iolen); - iolen -= offset; - - return iolen; -} - -static void nand_command(NANDFlashState *s) -{ - switch (s->cmd) { - case NAND_CMD_READ0: - s->iolen = 0; - break; - - case NAND_CMD_READID: - s->ioaddr = s->io; - s->iolen = 0; - nand_pushio_byte(s, s->manf_id); - nand_pushio_byte(s, s->chip_id); - nand_pushio_byte(s, 'Q'); /* Don't-care byte (often 0xa5) */ - if (nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP) { - /* Page Size, Block Size, Spare Size; bit 6 indicates - * 8 vs 16 bit width NAND. - */ - nand_pushio_byte(s, (s->buswidth == 2) ? 0x55 : 0x15); - } else { - nand_pushio_byte(s, 0xc0); /* Multi-plane */ - } - break; - - case NAND_CMD_RANDOMREAD2: - case NAND_CMD_NOSERIALREAD2: - if (!(nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP)) - break; - s->iolen = nand_load_block(s, s->addr & ((1 << s->addr_shift) - 1)); - break; - - case NAND_CMD_RESET: - nand_reset(DEVICE(s)); - break; - - case NAND_CMD_PAGEPROGRAM1: - s->ioaddr = s->io; - s->iolen = 0; - break; - - case NAND_CMD_PAGEPROGRAM2: - if (s->wp) { - s->blk_write(s); - } - break; - - case NAND_CMD_BLOCKERASE1: - break; - - case NAND_CMD_BLOCKERASE2: - s->addr &= (1ull << s->addrlen * 8) - 1; - s->addr <<= nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP ? - 16 : 8; - - if (s->wp) { - s->blk_erase(s); - } - break; - - case NAND_CMD_READSTATUS: - s->ioaddr = s->io; - s->iolen = 0; - nand_pushio_byte(s, s->status); - break; - - default: - printf("%s: Unknown NAND command 0x%02x\n", __func__, s->cmd); - } -} - -static int nand_pre_save(void *opaque) -{ - NANDFlashState *s = NAND(opaque); - - s->ioaddr_vmstate = s->ioaddr - s->io; - - return 0; -} - -static int nand_post_load(void *opaque, int version_id) -{ - NANDFlashState *s = NAND(opaque); - - if (s->ioaddr_vmstate > sizeof(s->io)) { - return -EINVAL; - } - s->ioaddr = s->io + s->ioaddr_vmstate; - - return 0; -} - -static const VMStateDescription vmstate_nand = { - .name = "nand", - .version_id = 1, - .minimum_version_id = 1, - .pre_save = nand_pre_save, - .post_load = nand_post_load, - .fields = (const VMStateField[]) { - VMSTATE_UINT8(cle, NANDFlashState), - VMSTATE_UINT8(ale, NANDFlashState), - VMSTATE_UINT8(ce, NANDFlashState), - VMSTATE_UINT8(wp, NANDFlashState), - VMSTATE_UINT8(gnd, NANDFlashState), - VMSTATE_BUFFER(io, NANDFlashState), - VMSTATE_UINT32(ioaddr_vmstate, NANDFlashState), - VMSTATE_INT32(iolen, NANDFlashState), - VMSTATE_UINT32(cmd, NANDFlashState), - VMSTATE_UINT64(addr, NANDFlashState), - VMSTATE_INT32(addrlen, NANDFlashState), - VMSTATE_INT32(status, NANDFlashState), - VMSTATE_INT32(offset, NANDFlashState), - /* XXX: do we want to save s->storage too? */ - VMSTATE_END_OF_LIST() - } -}; - -static void nand_realize(DeviceState *dev, Error **errp) -{ - int pagesize; - NANDFlashState *s = NAND(dev); - int ret; - - - s->buswidth = nand_flash_ids[s->chip_id].width >> 3; - s->size = nand_flash_ids[s->chip_id].size << 20; - if (nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP) { - s->page_shift = 11; - s->erase_shift = 6; - } else { - s->page_shift = nand_flash_ids[s->chip_id].page_shift; - s->erase_shift = nand_flash_ids[s->chip_id].erase_shift; - } - - switch (1 << s->page_shift) { - case 256: - nand_init_256(s); - break; - case 512: - nand_init_512(s); - break; - case 2048: - nand_init_2048(s); - break; - default: - error_setg(errp, "Unsupported NAND block size %#x", - 1 << s->page_shift); - return; - } - - pagesize = 1 << s->oob_shift; - s->mem_oob = 1; - if (s->blk) { - if (!blk_supports_write_perm(s->blk)) { - error_setg(errp, "Can't use a read-only drive"); - return; - } - ret = blk_set_perm(s->blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE, - BLK_PERM_ALL, errp); - if (ret < 0) { - return; - } - if (blk_getlength(s->blk) >= - (s->pages << s->page_shift) + (s->pages << s->oob_shift)) { - pagesize = 0; - s->mem_oob = 0; - } - } else { - pagesize += 1 << s->page_shift; - } - if (pagesize) { - s->storage = (uint8_t *) memset(g_malloc(s->pages * pagesize), - 0xff, s->pages * pagesize); - } - /* Give s->ioaddr a sane value in case we save state before it is used. */ - s->ioaddr = s->io; -} - -static const Property nand_properties[] = { - DEFINE_PROP_UINT8("manufacturer_id", NANDFlashState, manf_id, 0), - DEFINE_PROP_UINT8("chip_id", NANDFlashState, chip_id, 0), - DEFINE_PROP_DRIVE("drive", NANDFlashState, blk), -}; - -static void nand_class_init(ObjectClass *klass, const void *data) -{ - DeviceClass *dc = DEVICE_CLASS(klass); - - dc->realize = nand_realize; - device_class_set_legacy_reset(dc, nand_reset); - dc->vmsd = &vmstate_nand; - device_class_set_props(dc, nand_properties); - set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); -} - -static const TypeInfo nand_info = { - .name = TYPE_NAND, - .parent = TYPE_DEVICE, - .instance_size = sizeof(NANDFlashState), - .class_init = nand_class_init, -}; - -static void nand_register_types(void) -{ - type_register_static(&nand_info); -} - -/* - * Chip inputs are CLE, ALE, CE, WP, GND and eight I/O pins. Chip - * outputs are R/B and eight I/O pins. - * - * CE, WP and R/B are active low. - */ -void nand_setpins(DeviceState *dev, uint8_t cle, uint8_t ale, - uint8_t ce, uint8_t wp, uint8_t gnd) -{ - NANDFlashState *s = NAND(dev); - - s->cle = cle; - s->ale = ale; - s->ce = ce; - s->wp = wp; - s->gnd = gnd; - if (wp) { - s->status |= NAND_IOSTATUS_UNPROTCT; - } else { - s->status &= ~NAND_IOSTATUS_UNPROTCT; - } -} - -void nand_getpins(DeviceState *dev, int *rb) -{ - *rb = 1; -} - -void nand_setio(DeviceState *dev, uint32_t value) -{ - int i; - NANDFlashState *s = NAND(dev); - - if (!s->ce && s->cle) { - if (nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP) { - if (s->cmd == NAND_CMD_READ0 && value == NAND_CMD_LPREAD2) - return; - if (value == NAND_CMD_RANDOMREAD1) { - s->addr &= ~((1 << s->addr_shift) - 1); - s->addrlen = 0; - return; - } - } - if (value == NAND_CMD_READ0) { - s->offset = 0; - } else if (value == NAND_CMD_READ1) { - s->offset = 0x100; - value = NAND_CMD_READ0; - } else if (value == NAND_CMD_READ2) { - s->offset = 1 << s->page_shift; - value = NAND_CMD_READ0; - } - - s->cmd = value; - - if (s->cmd == NAND_CMD_READSTATUS || - s->cmd == NAND_CMD_PAGEPROGRAM2 || - s->cmd == NAND_CMD_BLOCKERASE1 || - s->cmd == NAND_CMD_BLOCKERASE2 || - s->cmd == NAND_CMD_NOSERIALREAD2 || - s->cmd == NAND_CMD_RANDOMREAD2 || - s->cmd == NAND_CMD_RESET) { - nand_command(s); - } - - if (s->cmd != NAND_CMD_RANDOMREAD2) { - s->addrlen = 0; - } - } - - if (s->ale) { - unsigned int shift = s->addrlen * 8; - uint64_t mask = ~(0xffull << shift); - uint64_t v = (uint64_t)value << shift; - - s->addr = (s->addr & mask) | v; - s->addrlen ++; - - switch (s->addrlen) { - case 1: - if (s->cmd == NAND_CMD_READID) { - nand_command(s); - } - break; - case 2: /* fix cache address as a byte address */ - s->addr <<= (s->buswidth - 1); - break; - case 3: - if (!(nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP) && - (s->cmd == NAND_CMD_READ0 || - s->cmd == NAND_CMD_PAGEPROGRAM1)) { - nand_command(s); - } - break; - case 4: - if ((nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP) && - nand_flash_ids[s->chip_id].size < 256 && /* 1Gb or less */ - (s->cmd == NAND_CMD_READ0 || - s->cmd == NAND_CMD_PAGEPROGRAM1)) { - nand_command(s); - } - break; - case 5: - if ((nand_flash_ids[s->chip_id].options & NAND_SAMSUNG_LP) && - nand_flash_ids[s->chip_id].size >= 256 && /* 2Gb or more */ - (s->cmd == NAND_CMD_READ0 || - s->cmd == NAND_CMD_PAGEPROGRAM1)) { - nand_command(s); - } - break; - default: - break; - } - } - - if (!s->cle && !s->ale && s->cmd == NAND_CMD_PAGEPROGRAM1) { - if (s->iolen < (1 << s->page_shift) + (1 << s->oob_shift)) { - for (i = s->buswidth; i--; value >>= 8) { - s->io[s->iolen ++] = (uint8_t) (value & 0xff); - } - } - } else if (!s->cle && !s->ale && s->cmd == NAND_CMD_COPYBACKPRG1) { - if ((s->addr & ((1 << s->addr_shift) - 1)) < - (1 << s->page_shift) + (1 << s->oob_shift)) { - for (i = s->buswidth; i--; s->addr++, value >>= 8) { - s->io[s->iolen + (s->addr & ((1 << s->addr_shift) - 1))] = - (uint8_t) (value & 0xff); - } - } - } -} - -uint32_t nand_getio(DeviceState *dev) -{ - int offset; - uint32_t x = 0; - NANDFlashState *s = NAND(dev); - - /* Allow sequential reading */ - if (!s->iolen && s->cmd == NAND_CMD_READ0) { - offset = (int) (s->addr & ((1 << s->addr_shift) - 1)) + s->offset; - s->offset = 0; - s->iolen = nand_load_block(s, offset); - } - - if (s->ce || s->iolen <= 0) { - return 0; - } - - for (offset = s->buswidth; offset--;) { - x |= s->ioaddr[offset] << (offset << 3); - } - /* after receiving READ STATUS command all subsequent reads will - * return the status register value until another command is issued - */ - if (s->cmd != NAND_CMD_READSTATUS) { - s->addr += s->buswidth; - s->ioaddr += s->buswidth; - s->iolen -= s->buswidth; - } - return x; -} - -uint32_t nand_getbuswidth(DeviceState *dev) -{ - NANDFlashState *s = (NANDFlashState *) dev; - return s->buswidth << 3; -} - -DeviceState *nand_init(BlockBackend *blk, int manf_id, int chip_id) -{ - DeviceState *dev; - - if (nand_flash_ids[chip_id].size == 0) { - hw_error("%s: Unsupported NAND chip ID.\n", __func__); - } - dev = qdev_new(TYPE_NAND); - qdev_prop_set_uint8(dev, "manufacturer_id", manf_id); - qdev_prop_set_uint8(dev, "chip_id", chip_id); - if (blk) { - qdev_prop_set_drive_err(dev, "drive", blk, &error_fatal); - } - - qdev_realize(dev, NULL, &error_fatal); - return dev; -} - -type_init(nand_register_types) - -#else - -/* Program a single page */ -static void glue(nand_blk_write_, NAND_PAGE_SIZE)(NANDFlashState *s) -{ - uint64_t off, page, sector, soff; - uint8_t iobuf[(PAGE_SECTORS + 2) * 0x200]; - if (PAGE(s->addr) >= s->pages) - return; - - if (!s->blk) { - mem_and(s->storage + PAGE_START(s->addr) + (s->addr & PAGE_MASK) + - s->offset, s->io, s->iolen); - } else if (s->mem_oob) { - sector = SECTOR(s->addr); - off = (s->addr & PAGE_MASK) + s->offset; - soff = SECTOR_OFFSET(s->addr); - if (blk_pread(s->blk, sector << BDRV_SECTOR_BITS, - PAGE_SECTORS << BDRV_SECTOR_BITS, iobuf, 0) < 0) { - printf("%s: read error in sector %" PRIu64 "\n", __func__, sector); - return; - } - - mem_and(iobuf + (soff | off), s->io, MIN(s->iolen, NAND_PAGE_SIZE - off)); - if (off + s->iolen > NAND_PAGE_SIZE) { - page = PAGE(s->addr); - mem_and(s->storage + (page << OOB_SHIFT), s->io + NAND_PAGE_SIZE - off, - MIN(OOB_SIZE, off + s->iolen - NAND_PAGE_SIZE)); - } - - if (blk_pwrite(s->blk, sector << BDRV_SECTOR_BITS, - PAGE_SECTORS << BDRV_SECTOR_BITS, iobuf, 0) < 0) { - printf("%s: write error in sector %" PRIu64 "\n", __func__, sector); - } - } else { - off = PAGE_START(s->addr) + (s->addr & PAGE_MASK) + s->offset; - sector = off >> 9; - soff = off & 0x1ff; - if (blk_pread(s->blk, sector << BDRV_SECTOR_BITS, - (PAGE_SECTORS + 2) << BDRV_SECTOR_BITS, iobuf, 0) < 0) { - printf("%s: read error in sector %" PRIu64 "\n", __func__, sector); - return; - } - - mem_and(iobuf + soff, s->io, s->iolen); - - if (blk_pwrite(s->blk, sector << BDRV_SECTOR_BITS, - (PAGE_SECTORS + 2) << BDRV_SECTOR_BITS, iobuf, 0) < 0) { - printf("%s: write error in sector %" PRIu64 "\n", __func__, sector); - } - } - s->offset = 0; -} - -/* Erase a single block */ -static void glue(nand_blk_erase_, NAND_PAGE_SIZE)(NANDFlashState *s) -{ - uint64_t i, page, addr; - uint8_t iobuf[0x200] = { [0 ... 0x1ff] = 0xff, }; - addr = s->addr & ~((1 << (ADDR_SHIFT + s->erase_shift)) - 1); - - if (PAGE(addr) >= s->pages) { - return; - } - - if (!s->blk) { - memset(s->storage + PAGE_START(addr), - 0xff, (NAND_PAGE_SIZE + OOB_SIZE) << s->erase_shift); - } else if (s->mem_oob) { - memset(s->storage + (PAGE(addr) << OOB_SHIFT), - 0xff, OOB_SIZE << s->erase_shift); - i = SECTOR(addr); - page = SECTOR(addr + (1 << (ADDR_SHIFT + s->erase_shift))); - for (; i < page; i ++) - if (blk_pwrite(s->blk, i << BDRV_SECTOR_BITS, - BDRV_SECTOR_SIZE, iobuf, 0) < 0) { - printf("%s: write error in sector %" PRIu64 "\n", __func__, i); - } - } else { - addr = PAGE_START(addr); - page = addr >> 9; - if (blk_pread(s->blk, page << BDRV_SECTOR_BITS, - BDRV_SECTOR_SIZE, iobuf, 0) < 0) { - printf("%s: read error in sector %" PRIu64 "\n", __func__, page); - } - memset(iobuf + (addr & 0x1ff), 0xff, (~addr & 0x1ff) + 1); - if (blk_pwrite(s->blk, page << BDRV_SECTOR_BITS, - BDRV_SECTOR_SIZE, iobuf, 0) < 0) { - printf("%s: write error in sector %" PRIu64 "\n", __func__, page); - } - - memset(iobuf, 0xff, 0x200); - i = (addr & ~0x1ff) + 0x200; - for (addr += ((NAND_PAGE_SIZE + OOB_SIZE) << s->erase_shift) - 0x200; - i < addr; i += 0x200) { - if (blk_pwrite(s->blk, i, BDRV_SECTOR_SIZE, iobuf, 0) < 0) { - printf("%s: write error in sector %" PRIu64 "\n", - __func__, i >> 9); - } - } - - page = i >> 9; - if (blk_pread(s->blk, page << BDRV_SECTOR_BITS, - BDRV_SECTOR_SIZE, iobuf, 0) < 0) { - printf("%s: read error in sector %" PRIu64 "\n", __func__, page); - } - memset(iobuf, 0xff, ((addr - 1) & 0x1ff) + 1); - if (blk_pwrite(s->blk, page << BDRV_SECTOR_BITS, - BDRV_SECTOR_SIZE, iobuf, 0) < 0) { - printf("%s: write error in sector %" PRIu64 "\n", __func__, page); - } - } -} - -static bool glue(nand_blk_load_, NAND_PAGE_SIZE)(NANDFlashState *s, - uint64_t addr, unsigned offset) -{ - if (PAGE(addr) >= s->pages) { - return false; - } - - if (offset > NAND_PAGE_SIZE + OOB_SIZE) { - return false; - } - - if (s->blk) { - if (s->mem_oob) { - if (blk_pread(s->blk, SECTOR(addr) << BDRV_SECTOR_BITS, - PAGE_SECTORS << BDRV_SECTOR_BITS, s->io, 0) < 0) { - printf("%s: read error in sector %" PRIu64 "\n", - __func__, SECTOR(addr)); - } - memcpy(s->io + SECTOR_OFFSET(s->addr) + NAND_PAGE_SIZE, - s->storage + (PAGE(s->addr) << OOB_SHIFT), - OOB_SIZE); - s->ioaddr = s->io + SECTOR_OFFSET(s->addr) + offset; - } else { - if (blk_pread(s->blk, PAGE_START(addr), - (PAGE_SECTORS + 2) << BDRV_SECTOR_BITS, s->io, 0) - < 0) { - printf("%s: read error in sector %" PRIu64 "\n", - __func__, PAGE_START(addr) >> 9); - } - s->ioaddr = s->io + (PAGE_START(addr) & 0x1ff) + offset; - } - } else { - memcpy(s->io, s->storage + PAGE_START(s->addr) + - offset, NAND_PAGE_SIZE + OOB_SIZE - offset); - s->ioaddr = s->io; - } - - return true; -} - -static void glue(nand_init_, NAND_PAGE_SIZE)(NANDFlashState *s) -{ - s->oob_shift = PAGE_SHIFT - 5; - s->pages = s->size >> PAGE_SHIFT; - s->addr_shift = ADDR_SHIFT; - - s->blk_erase = glue(nand_blk_erase_, NAND_PAGE_SIZE); - s->blk_write = glue(nand_blk_write_, NAND_PAGE_SIZE); - s->blk_load = glue(nand_blk_load_, NAND_PAGE_SIZE); -} - -# undef NAND_PAGE_SIZE -# undef PAGE_SHIFT -# undef PAGE_SECTORS -# undef ADDR_SHIFT -#endif /* NAND_IO */ diff --git a/hw/char/sclpconsole-lm.c b/hw/char/sclpconsole-lm.c index e9580aa..3e40d5e 100644 --- a/hw/char/sclpconsole-lm.c +++ b/hw/char/sclpconsole-lm.c @@ -214,7 +214,7 @@ static int process_mdb(SCLPEvent *event, MDBO *mdbo) { int rc; int len; - uint8_t buffer[SIZE_BUFFER]; + QEMU_UNINITIALIZED uint8_t buffer[SIZE_BUFFER]; len = be16_to_cpu(mdbo->length); len -= sizeof(mdbo->length) + sizeof(mdbo->type) diff --git a/hw/char/sh_serial.c b/hw/char/sh_serial.c index 6abd803..30447fa 100644 --- a/hw/char/sh_serial.c +++ b/hw/char/sh_serial.c @@ -78,10 +78,6 @@ struct SHSerialState { qemu_irq bri; }; -typedef struct {} SHSerialStateClass; - -OBJECT_DEFINE_TYPE(SHSerialState, sh_serial, SH_SERIAL, SYS_BUS_DEVICE) - static void sh_serial_clear_fifo(SHSerialState *s) { memset(s->rx_fifo, 0, SH_RX_FIFO_LENGTH); @@ -434,17 +430,13 @@ static void sh_serial_realize(DeviceState *d, Error **errp) s->etu = NANOSECONDS_PER_SECOND / 9600; } -static void sh_serial_finalize(Object *obj) +static void sh_serial_unrealize(DeviceState *dev) { - SHSerialState *s = SH_SERIAL(obj); + SHSerialState *s = SH_SERIAL(dev); timer_del(&s->fifo_timeout_timer); } -static void sh_serial_init(Object *obj) -{ -} - static const Property sh_serial_properties[] = { DEFINE_PROP_CHR("chardev", SHSerialState, chr), DEFINE_PROP_UINT8("features", SHSerialState, feat, 0), @@ -456,7 +448,19 @@ static void sh_serial_class_init(ObjectClass *oc, const void *data) device_class_set_props(dc, sh_serial_properties); dc->realize = sh_serial_realize; + dc->unrealize = sh_serial_unrealize; device_class_set_legacy_reset(dc, sh_serial_reset); /* Reason: part of SuperH CPU/SoC, needs to be wired up */ dc->user_creatable = false; } + +static const TypeInfo sh_serial_types[] = { + { + .name = TYPE_SH_SERIAL, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(SHSerialState), + .class_init = sh_serial_class_init, + }, +}; + +DEFINE_TYPES(sh_serial_types) diff --git a/hw/char/sifive_uart.c b/hw/char/sifive_uart.c index 0fc89e7..9bc697a 100644 --- a/hw/char/sifive_uart.c +++ b/hw/char/sifive_uart.c @@ -128,8 +128,10 @@ static void sifive_uart_write_tx_fifo(SiFiveUARTState *s, const uint8_t *buf, s->txfifo |= SIFIVE_UART_TXFIFO_FULL; } - timer_mod(s->fifo_trigger_handle, current_time + - TX_INTERRUPT_TRIGGER_DELAY_NS); + if (!timer_pending(s->fifo_trigger_handle)) { + timer_mod(s->fifo_trigger_handle, current_time + + TX_INTERRUPT_TRIGGER_DELAY_NS); + } } static uint64_t diff --git a/hw/core/loader.c b/hw/core/loader.c index b792a54..e7056ba 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -1333,20 +1333,6 @@ void rom_set_fw(FWCfgState *f) fw_cfg = f; } -void rom_set_order_override(int order) -{ - if (!fw_cfg) - return; - fw_cfg_set_order_override(fw_cfg, order); -} - -void rom_reset_order_override(void) -{ - if (!fw_cfg) - return; - fw_cfg_reset_order_override(fw_cfg); -} - void rom_transaction_begin(void) { Rom *rom; diff --git a/hw/core/machine.c b/hw/core/machine.c index b8ae155..e869821 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -37,7 +37,9 @@ #include "hw/virtio/virtio-iommu.h" #include "audio/audio.h" -GlobalProperty hw_compat_10_0[] = {}; +GlobalProperty hw_compat_10_0[] = { + { "scsi-hd", "dpofua", "off" }, +}; const size_t hw_compat_10_0_len = G_N_ELEMENTS(hw_compat_10_0); GlobalProperty hw_compat_9_2[] = { @@ -283,24 +285,6 @@ GlobalProperty hw_compat_2_6[] = { }; const size_t hw_compat_2_6_len = G_N_ELEMENTS(hw_compat_2_6); -GlobalProperty hw_compat_2_5[] = { - { "isa-fdc", "fallback", "144" }, - { "pvscsi", "x-old-pci-configuration", "on" }, - { "pvscsi", "x-disable-pcie", "on" }, - { "vmxnet3", "x-old-msi-offsets", "on" }, - { "vmxnet3", "x-disable-pcie", "on" }, -}; -const size_t hw_compat_2_5_len = G_N_ELEMENTS(hw_compat_2_5); - -GlobalProperty hw_compat_2_4[] = { - { "e1000", "extra_mac_registers", "off" }, - { "virtio-pci", "x-disable-pcie", "on" }, - { "virtio-pci", "migrate-extra", "off" }, - { "fw_cfg_mem", "dma_enabled", "off" }, - { "fw_cfg_io", "dma_enabled", "off" } -}; -const size_t hw_compat_2_4_len = G_N_ELEMENTS(hw_compat_2_4); - MachineState *current_machine; static char *machine_get_kernel(Object *obj, Error **errp) diff --git a/hw/core/meson.build b/hw/core/meson.build index 547de65..b5a545a 100644 --- a/hw/core/meson.build +++ b/hw/core/meson.build @@ -26,7 +26,7 @@ system_ss.add(when: 'CONFIG_XILINX_AXI', if_true: files('stream.c')) system_ss.add(when: 'CONFIG_PLATFORM_BUS', if_true: files('sysbus-fdt.c')) system_ss.add(when: 'CONFIG_EIF', if_true: [files('eif.c'), zlib, libcbor, gnutls]) -libsystem_ss.add(files( +system_ss.add(files( 'cpu-system.c', 'fw-path-provider.c', 'gpio.c', @@ -46,7 +46,7 @@ libsystem_ss.add(files( 'vm-change-state-handler.c', 'clock-vmstate.c', )) -libuser_ss.add(files( +user_ss.add(files( 'cpu-user.c', 'qdev-user.c', )) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 8e11e63..24e145d 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -145,6 +145,7 @@ static void set_drive_helper(Object *obj, Visitor *v, const char *name, if (ctx != bdrv_get_aio_context(bs)) { error_setg(errp, "Different aio context is not supported for new " "node"); + return; } blk_replace_bs(blk, bs, errp); diff --git a/hw/display/apple-gfx.m b/hw/display/apple-gfx.m index 8dde1f1..174d56a 100644 --- a/hw/display/apple-gfx.m +++ b/hw/display/apple-gfx.m @@ -454,7 +454,7 @@ static void set_cursor_glyph(void *opaque) /* ------ DMA (device reading system memory) ------ */ typedef struct AppleGFXReadMemoryJob { - QemuSemaphore sem; + QemuEvent event; hwaddr physical_address; uint64_t length; void *dst; @@ -470,7 +470,7 @@ static void apple_gfx_do_read_memory(void *opaque) job->dst, job->length, MEMTXATTRS_UNSPECIFIED); job->success = (r == MEMTX_OK); - qemu_sem_post(&job->sem); + qemu_event_set(&job->event); } static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address, @@ -483,11 +483,11 @@ static bool apple_gfx_read_memory(AppleGFXState *s, hwaddr physical_address, trace_apple_gfx_read_memory(physical_address, length, dst); /* Performing DMA requires BQL, so do it in a BH. */ - qemu_sem_init(&job.sem, 0); + qemu_event_init(&job.event, 0); aio_bh_schedule_oneshot(qemu_get_aio_context(), apple_gfx_do_read_memory, &job); - qemu_sem_wait(&job.sem); - qemu_sem_destroy(&job.sem); + qemu_event_wait(&job.event); + qemu_event_destroy(&job.event); return job.success; } diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c index 145a0b3..94ddc01 100644 --- a/hw/display/virtio-gpu-virgl.c +++ b/hw/display/virtio-gpu-virgl.c @@ -970,6 +970,15 @@ void virtio_gpu_virgl_process_cmd(VirtIOGPU *g, } trace_virtio_gpu_fence_ctrl(cmd->cmd_hdr.fence_id, cmd->cmd_hdr.type); +#if VIRGL_VERSION_MAJOR >= 1 + if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_INFO_RING_IDX) { + virgl_renderer_context_create_fence(cmd->cmd_hdr.ctx_id, + VIRGL_RENDERER_FENCE_FLAG_MERGEABLE, + cmd->cmd_hdr.ring_idx, + cmd->cmd_hdr.fence_id); + return; + } +#endif virgl_renderer_create_fence(cmd->cmd_hdr.fence_id, cmd->cmd_hdr.type); } @@ -983,6 +992,11 @@ static void virgl_write_fence(void *opaque, uint32_t fence) * the guest can end up emitting fences out of order * so we should check all fenced cmds not just the first one. */ +#if VIRGL_VERSION_MAJOR >= 1 + if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_INFO_RING_IDX) { + continue; + } +#endif if (cmd->cmd_hdr.fence_id > fence) { continue; } @@ -997,6 +1011,29 @@ static void virgl_write_fence(void *opaque, uint32_t fence) } } +#if VIRGL_VERSION_MAJOR >= 1 +static void virgl_write_context_fence(void *opaque, uint32_t ctx_id, + uint32_t ring_idx, uint64_t fence_id) { + VirtIOGPU *g = opaque; + struct virtio_gpu_ctrl_command *cmd, *tmp; + + QTAILQ_FOREACH_SAFE(cmd, &g->fenceq, next, tmp) { + if (cmd->cmd_hdr.flags & VIRTIO_GPU_FLAG_INFO_RING_IDX && + cmd->cmd_hdr.ctx_id == ctx_id && cmd->cmd_hdr.ring_idx == ring_idx && + cmd->cmd_hdr.fence_id <= fence_id) { + trace_virtio_gpu_fence_resp(cmd->cmd_hdr.fence_id); + virtio_gpu_ctrl_response_nodata(g, cmd, VIRTIO_GPU_RESP_OK_NODATA); + QTAILQ_REMOVE(&g->fenceq, cmd, next); + g_free(cmd); + g->inflight--; + if (virtio_gpu_stats_enabled(g->parent_obj.conf)) { + trace_virtio_gpu_dec_inflight_fences(g->inflight); + } + } + } +} +#endif + static virgl_renderer_gl_context virgl_create_context(void *opaque, int scanout_idx, struct virgl_renderer_gl_ctx_param *params) @@ -1031,11 +1068,18 @@ static int virgl_make_context_current(void *opaque, int scanout_idx, } static struct virgl_renderer_callbacks virtio_gpu_3d_cbs = { +#if VIRGL_VERSION_MAJOR >= 1 + .version = 3, +#else .version = 1, +#endif .write_fence = virgl_write_fence, .create_gl_context = virgl_create_context, .destroy_gl_context = virgl_destroy_context, .make_current = virgl_make_context_current, +#if VIRGL_VERSION_MAJOR >= 1 + .write_context_fence = virgl_write_context_fence, +#endif }; static void virtio_gpu_print_stats(void *opaque) diff --git a/hw/display/vmware_vga.c b/hw/display/vmware_vga.c index 544bb65..bc1a8ed 100644 --- a/hw/display/vmware_vga.c +++ b/hw/display/vmware_vga.c @@ -618,7 +618,7 @@ static void vmsvga_fifo_run(struct vmsvga_state_s *s) uint32_t cmd, colour; int args, len, maxloop = 1024; int x, y, dx, dy, width, height; - struct vmsvga_cursor_definition_s cursor; + QEMU_UNINITIALIZED struct vmsvga_cursor_definition_s cursor; uint32_t cmd_start; len = vmsvga_fifo_length(s); diff --git a/hw/dma/xlnx_csu_dma.c b/hw/dma/xlnx_csu_dma.c index 3db3904..d8c7da1 100644 --- a/hw/dma/xlnx_csu_dma.c +++ b/hw/dma/xlnx_csu_dma.c @@ -287,7 +287,7 @@ static uint32_t xlnx_csu_dma_advance(XlnxCSUDMA *s, uint32_t len) static void xlnx_csu_dma_src_notify(void *opaque) { XlnxCSUDMA *s = XLNX_CSU_DMA(opaque); - unsigned char buf[4 * 1024]; + QEMU_UNINITIALIZED unsigned char buf[4 * 1024]; size_t rlen = 0; ptimer_transaction_begin(s->src_timer); diff --git a/hw/gpio/pca9552.c b/hw/gpio/pca9552.c index d65c0a2..1e10238 100644 --- a/hw/gpio/pca9552.c +++ b/hw/gpio/pca9552.c @@ -76,7 +76,7 @@ static void pca955x_display_pins_status(PCA955xState *s, return; } if (trace_event_get_state_backends(TRACE_PCA955X_GPIO_STATUS)) { - char *buf = g_newa(char, k->pin_count + 1); + char buf[PCA955X_PIN_COUNT_MAX + 1]; for (i = 0; i < k->pin_count; i++) { if (extract32(pins_status, i, 1)) { diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c index 94b0abb..6dbcb2d 100644 --- a/hw/hyperv/hv-balloon.c +++ b/hw/hyperv/hv-balloon.c @@ -67,10 +67,6 @@ * these requests */ -struct HvBalloonClass { - VMBusDeviceClass parent_class; -} HvBalloonClass; - typedef enum State { /* not a real state */ S_NO_CHANGE = 0, @@ -162,8 +158,9 @@ typedef struct HvBalloon { MemoryRegion *mr; } HvBalloon; -OBJECT_DEFINE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, HV_BALLOON, VMBUS_DEVICE, \ - { TYPE_MEMORY_DEVICE }, { }) +OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(HvBalloon, hv_balloon, \ + HV_BALLOON, VMBUS_DEVICE, \ + { TYPE_MEMORY_DEVICE }, { }) #define HV_BALLOON_SET_STATE(hvb, news) \ do { \ diff --git a/hw/hyperv/syndbg.c b/hw/hyperv/syndbg.c index 8b8a147..ac7e15f 100644 --- a/hw/hyperv/syndbg.c +++ b/hw/hyperv/syndbg.c @@ -192,7 +192,7 @@ static uint16_t handle_recv_msg(HvSynDbg *syndbg, uint64_t outgpa, { uint16_t ret; g_assert(MSG_BUFSZ >= qemu_target_page_size()); - uint8_t data_buf[MSG_BUFSZ]; + QEMU_UNINITIALIZED uint8_t data_buf[MSG_BUFSZ]; hwaddr out_len; void *out_data; ssize_t recv_byte_count; diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index d34ce07..14d23e2 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -10,6 +10,11 @@ config SGX bool depends on KVM +config TDX + bool + select X86_FW_OVMF + depends on KVM && X86_64 + config PC bool imply APPLESMC @@ -26,6 +31,7 @@ config PC imply QXL imply SEV imply SGX + imply TDX imply TEST_DEVICES imply TPM_CRB imply TPM_TIS_ISA diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 0775c8f..963aa24 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1426,7 +1426,6 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVIState *s = opaque; AMDVIAddressSpace **iommu_as, *amdvi_dev_as; int bus_num = pci_bus_num(bus); - X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); iommu_as = s->address_spaces[bus_num]; @@ -1486,15 +1485,8 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVI_INT_ADDR_FIRST, &amdvi_dev_as->iommu_ir, 1); - if (!x86_iommu->pt_supported) { - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), - true); - } else { - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), - false); - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, true); - } + memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); + memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), true); } return &iommu_as[devfn]->as; } @@ -1723,6 +1715,14 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) exit(EXIT_FAILURE); } + if (s->xtsup) { + if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) { + error_report("AMD IOMMU xtsup=on requires x2APIC support on " + "the KVM side"); + exit(EXIT_FAILURE); + } + } + pci_setup_iommu(bus, &amdvi_iommu_ops, s); amdvi_init(s); } diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index 39035db..1be9bfe 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -17,6 +17,7 @@ #include "system/hw_accel.h" #include "system/kvm.h" #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic, int reg_id, uint32_t val) @@ -141,6 +142,10 @@ static void kvm_apic_put(CPUState *cs, run_on_cpu_data data) struct kvm_lapic_state kapic; int ret; + if (is_tdx_vm()) { + return; + } + kvm_put_apicbase(s->cpu, s->apicbase); kvm_put_apic_state(s, &kapic); diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c index d03131e..ce73119 100644 --- a/hw/i386/kvm/xen-stubs.c +++ b/hw/i386/kvm/xen-stubs.c @@ -12,7 +12,6 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" #include "xen_evtchn.h" #include "xen_primary_console.h" @@ -38,15 +37,3 @@ void xen_primary_console_create(void) void xen_primary_console_set_be_port(uint16_t port) { } -#ifdef TARGET_I386 -EvtchnInfoList *qmp_xen_event_list(Error **errp) -{ - error_setg(errp, "Xen event channel emulation not enabled"); - return NULL; -} - -void qmp_xen_event_inject(uint32_t port, Error **errp) -{ - error_setg(errp, "Xen event channel emulation not enabled"); -} -#endif diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index b519054..dd566c4 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -19,7 +19,7 @@ #include "monitor/monitor.h" #include "monitor/hmp.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "qobject/qdict.h" #include "qom/object.h" #include "exec/target_page.h" diff --git a/hw/i386/meson.build b/hw/i386/meson.build index 10bdfde..7896f34 100644 --- a/hw/i386/meson.build +++ b/hw/i386/meson.build @@ -32,6 +32,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files( 'port92.c')) i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'), if_false: files('pc_sysfw_ovmf-stubs.c')) +i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c', 'tdvf-hob.c')) subdir('kvm') subdir('xen') diff --git a/hw/i386/monitor.c b/hw/i386/monitor.c index 1921e4d..79df965 100644 --- a/hw/i386/monitor.c +++ b/hw/i386/monitor.c @@ -26,7 +26,7 @@ #include "monitor/monitor.h" #include "qobject/qdict.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "hw/i386/x86.h" #include "hw/rtc/mc146818rtc.h" diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7065615..b211633 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -44,6 +44,7 @@ #include "system/xen.h" #include "system/reset.h" #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" #include "hw/xen/xen.h" #include "qobject/qlist.h" #include "qemu/error-report.h" @@ -259,28 +260,6 @@ GlobalProperty pc_compat_2_6[] = { }; const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6); -GlobalProperty pc_compat_2_5[] = {}; -const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5); - -GlobalProperty pc_compat_2_4[] = { - PC_CPU_MODEL_IDS("2.4.0") - { "Haswell-" TYPE_X86_CPU, "abm", "off" }, - { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, - { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, - { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, - { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, - { TYPE_X86_CPU, "check", "off" }, - { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, - { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, - { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" }, - { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, - { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", } -}; -const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4); - /* * @PC_FW_DATA: * Size of the chunk of memory at the top of RAM for the BIOS ACPI tables @@ -976,21 +955,23 @@ void pc_memory_init(PCMachineState *pcms, /* Initialize PC system firmware */ pc_system_firmware_init(pcms, rom_memory); - option_rom_mr = g_malloc(sizeof(*option_rom_mr)); - if (machine_require_guest_memfd(machine)) { - memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", - PC_ROM_SIZE, &error_fatal); - } else { - memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, - &error_fatal); - if (pcmc->pci_enabled) { - memory_region_set_readonly(option_rom_mr, true); + if (!is_tdx_vm()) { + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + if (machine_require_guest_memfd(machine)) { + memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", + PC_ROM_SIZE, &error_fatal); + } else { + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); + if (pcmc->pci_enabled) { + memory_region_set_readonly(option_rom_mr, true); + } } + memory_region_add_subregion_overlap(rom_memory, + PC_ROM_MIN_VGA, + option_rom_mr, + 1); } - memory_region_add_subregion_overlap(rom_memory, - PC_ROM_MIN_VGA, - option_rom_mr, - 1); fw_cfg = fw_cfg_arch_create(machine, x86ms->boot_cpus, x86ms->apic_id_limit); @@ -999,14 +980,13 @@ void pc_memory_init(PCMachineState *pcms, if (machine->device_memory) { uint64_t *val = g_malloc(sizeof(*val)); - uint64_t res_mem_end = machine->device_memory->base; - - if (!pcmc->broken_reserved_end) { - res_mem_end += memory_region_size(&machine->device_memory->mr); - } + uint64_t res_mem_end; if (pcms->cxl_devices_state.is_enabled) { res_mem_end = cxl_resv_end; + } else { + res_mem_end = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); } *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); @@ -1044,9 +1024,7 @@ uint64_t pc_pci_hole64_start(void) hole64_start = pc_get_cxl_range_end(pcms); } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { pc_get_device_memory_range(pcms, &hole64_start, &size); - if (!pcmc->broken_reserved_end) { - hole64_start += size; - } + hole64_start += size; } else { hole64_start = pc_above_4g_end(pcms); } @@ -1058,7 +1036,6 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) { DeviceState *dev = NULL; - rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA); if (pci_bus) { PCIDevice *pcidev = pci_vga_init(pci_bus); dev = pcidev ? &pcidev->qdev : NULL; @@ -1066,7 +1043,7 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) ISADevice *isadev = isa_vga_init(isa_bus); dev = isadev ? DEVICE(isadev) : NULL; } - rom_reset_order_override(); + return dev; } @@ -1256,8 +1233,6 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) bool default_is_ne2k = g_str_equal(mc->default_nic, TYPE_ISA_NE2000); NICInfo *nd; - rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC); - while ((nd = qemu_find_nic_info(TYPE_ISA_NE2000, default_is_ne2k, NULL))) { pc_init_ne2k_isa(isa_bus, nd, &error_fatal); } @@ -1266,8 +1241,6 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) if (pci_bus) { pci_init_nic_devices(pci_bus, mc->default_nic); } - - rom_reset_order_override(); } void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 0dce512..ea7572e 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -285,6 +285,8 @@ static void pc_init1(MachineState *machine, const char *pci_type) pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); } else { + uint32_t irq; + isa_bus = isa_bus_new(NULL, system_memory, system_io, &error_abort); isa_bus_register_input_irqs(isa_bus, x86ms->gsi); @@ -292,6 +294,9 @@ static void pc_init1(MachineState *machine, const char *pci_type) x86ms->rtc = isa_new(TYPE_MC146818_RTC); qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", + &error_fatal); + isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); i8257_dma_init(OBJECT(machine), isa_bus, 0); pcms->hpet_enabled = false; @@ -778,32 +783,6 @@ static void pc_i440fx_machine_2_6_options(MachineClass *m) DEFINE_I440FX_MACHINE(2, 6); -static void pc_i440fx_machine_2_5_options(MachineClass *m) -{ - X86MachineClass *x86mc = X86_MACHINE_CLASS(m); - - pc_i440fx_machine_2_6_options(m); - x86mc->save_tsc_khz = false; - m->legacy_fw_cfg_order = 1; - compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); - compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); -} - -DEFINE_I440FX_MACHINE(2, 5); - -static void pc_i440fx_machine_2_4_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - pc_i440fx_machine_2_5_options(m); - m->hw_version = "2.4.0"; - pcmc->broken_reserved_end = true; - compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); - compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); -} - -DEFINE_I440FX_MACHINE(2, 4); - #ifdef CONFIG_ISAPC static void isapc_machine_options(MachineClass *m) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index c538b3d..33211b1 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -672,29 +672,3 @@ static void pc_q35_machine_2_6_options(MachineClass *m) } DEFINE_Q35_MACHINE(2, 6); - -static void pc_q35_machine_2_5_options(MachineClass *m) -{ - X86MachineClass *x86mc = X86_MACHINE_CLASS(m); - - pc_q35_machine_2_6_options(m); - x86mc->save_tsc_khz = false; - m->legacy_fw_cfg_order = 1; - compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); - compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); -} - -DEFINE_Q35_MACHINE(2, 5); - -static void pc_q35_machine_2_4_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - pc_q35_machine_2_5_options(m); - m->hw_version = "2.4.0"; - pcmc->broken_reserved_end = true; - compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); - compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); -} - -DEFINE_Q35_MACHINE(2, 4); diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index 1eeb58a..821396c 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -37,6 +37,7 @@ #include "hw/block/flash.h" #include "system/kvm.h" #include "target/i386/sev.h" +#include "kvm/tdx.h" #define FLASH_SECTOR_SIZE 4096 @@ -280,5 +281,11 @@ void x86_firmware_configure(hwaddr gpa, void *ptr, int size) } sev_encrypt_flash(gpa, ptr, size, &error_fatal); + } else if (is_tdx_vm()) { + ret = tdx_parse_tdvf(ptr, size); + if (ret) { + error_report("failed to parse TDVF for TDX VM"); + exit(1); + } } } diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c index 38ff75e..d295e54 100644 --- a/hw/i386/sgx-stub.c +++ b/hw/i386/sgx-stub.c @@ -3,20 +3,20 @@ #include "monitor/hmp-target.h" #include "hw/i386/pc.h" #include "hw/i386/sgx-epc.h" +#include "qapi/qapi-commands-misc-i386.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" void sgx_epc_build_srat(GArray *table_data) { } -SGXInfo *qmp_query_sgx(Error **errp) +SgxInfo *qmp_query_sgx(Error **errp) { error_setg(errp, "SGX support is not compiled in"); return NULL; } -SGXInfo *qmp_query_sgx_capabilities(Error **errp) +SgxInfo *qmp_query_sgx_capabilities(Error **errp) { error_setg(errp, "SGX support is not compiled in"); return NULL; diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index 5685c4f..e280154 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -19,7 +19,7 @@ #include "monitor/hmp-target.h" #include "qapi/error.h" #include "qemu/error-report.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "system/address-spaces.h" #include "system/hw_accel.h" #include "system/reset.h" @@ -84,10 +84,10 @@ static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high) ((high & MAKE_64BIT_MASK(0, 20)) << 32); } -static SGXEPCSectionList *sgx_calc_host_epc_sections(void) +static SgxEpcSectionList *sgx_calc_host_epc_sections(void) { - SGXEPCSectionList *head = NULL, **tail = &head; - SGXEPCSection *section; + SgxEpcSectionList *head = NULL, **tail = &head; + SgxEpcSection *section; uint32_t i, type; uint32_t eax, ebx, ecx, edx; uint32_t j = 0; @@ -104,7 +104,7 @@ static SGXEPCSectionList *sgx_calc_host_epc_sections(void) break; } - section = g_new0(SGXEPCSection, 1); + section = g_new0(SgxEpcSection, 1); section->node = j++; section->size = sgx_calc_section_metric(ecx, edx); QAPI_LIST_APPEND(tail, section); @@ -153,9 +153,9 @@ static void sgx_epc_reset(void *opaque) } } -SGXInfo *qmp_query_sgx_capabilities(Error **errp) +SgxInfo *qmp_query_sgx_capabilities(Error **errp) { - SGXInfo *info = NULL; + SgxInfo *info = NULL; uint32_t eax, ebx, ecx, edx; Error *local_err = NULL; @@ -166,7 +166,7 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp) return NULL; } - info = g_new0(SGXInfo, 1); + info = g_new0(SgxInfo, 1); host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); info->sgx = ebx & (1U << 2) ? true : false; @@ -183,17 +183,17 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp) return info; } -static SGXEPCSectionList *sgx_get_epc_sections_list(void) +static SgxEpcSectionList *sgx_get_epc_sections_list(void) { GSList *device_list = sgx_epc_get_device_list(); - SGXEPCSectionList *head = NULL, **tail = &head; - SGXEPCSection *section; + SgxEpcSectionList *head = NULL, **tail = &head; + SgxEpcSection *section; for (; device_list; device_list = device_list->next) { DeviceState *dev = device_list->data; Object *obj = OBJECT(dev); - section = g_new0(SGXEPCSection, 1); + section = g_new0(SgxEpcSection, 1); section->node = object_property_get_uint(obj, SGX_EPC_NUMA_NODE_PROP, &error_abort); section->size = object_property_get_uint(obj, SGX_EPC_SIZE_PROP, @@ -205,9 +205,9 @@ static SGXEPCSectionList *sgx_get_epc_sections_list(void) return head; } -SGXInfo *qmp_query_sgx(Error **errp) +SgxInfo *qmp_query_sgx(Error **errp) { - SGXInfo *info = NULL; + SgxInfo *info = NULL; X86MachineState *x86ms; PCMachineState *pcms = (PCMachineState *)object_dynamic_cast(qdev_get_machine(), @@ -223,7 +223,7 @@ SGXInfo *qmp_query_sgx(Error **errp) return NULL; } - info = g_new0(SGXInfo, 1); + info = g_new0(SgxInfo, 1); info->sgx = true; info->sgx1 = true; @@ -237,8 +237,8 @@ SGXInfo *qmp_query_sgx(Error **errp) void hmp_info_sgx(Monitor *mon, const QDict *qdict) { Error *err = NULL; - SGXEPCSectionList *section_list, *section; - g_autoptr(SGXInfo) info = qmp_query_sgx(&err); + SgxEpcSectionList *section_list, *section; + g_autoptr(SgxInfo) info = qmp_query_sgx(&err); uint64_t size = 0; if (err) { diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c new file mode 100644 index 0000000..782b3d1 --- /dev/null +++ b/hw/i386/tdvf-hob.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2025 Intel Corporation + * Author: Isaku Yamahata <isaku.yamahata at gmail.com> + * <isaku.yamahata at intel.com> + * Xiaoyao Li <xiaoyao.li@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "standard-headers/uefi/uefi.h" +#include "hw/pci/pcie_host.h" +#include "tdvf-hob.h" + +typedef struct TdvfHob { + hwaddr hob_addr; + void *ptr; + int size; + + /* working area */ + void *current; + void *end; +} TdvfHob; + +static uint64_t tdvf_current_guest_addr(const TdvfHob *hob) +{ + return hob->hob_addr + (hob->current - hob->ptr); +} + +static void tdvf_align(TdvfHob *hob, size_t align) +{ + hob->current = QEMU_ALIGN_PTR_UP(hob->current, align); +} + +static void *tdvf_get_area(TdvfHob *hob, uint64_t size) +{ + void *ret; + + if (hob->current + size > hob->end) { + error_report("TD_HOB overrun, size = 0x%" PRIx64, size); + exit(1); + } + + ret = hob->current; + hob->current += size; + tdvf_align(hob, 8); + return ret; +} + +static void tdvf_hob_add_memory_resources(TdxGuest *tdx, TdvfHob *hob) +{ + EFI_HOB_RESOURCE_DESCRIPTOR *region; + EFI_RESOURCE_ATTRIBUTE_TYPE attr; + EFI_RESOURCE_TYPE resource_type; + + TdxRamEntry *e; + int i; + + for (i = 0; i < tdx->nr_ram_entries; i++) { + e = &tdx->ram_entries[i]; + + if (e->type == TDX_RAM_UNACCEPTED) { + resource_type = EFI_RESOURCE_MEMORY_UNACCEPTED; + attr = EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED; + } else if (e->type == TDX_RAM_ADDED) { + resource_type = EFI_RESOURCE_SYSTEM_MEMORY; + attr = EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE; + } else { + error_report("unknown TDX_RAM_ENTRY type %d", e->type); + exit(1); + } + + region = tdvf_get_area(hob, sizeof(*region)); + *region = (EFI_HOB_RESOURCE_DESCRIPTOR) { + .Header = { + .HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR, + .HobLength = cpu_to_le16(sizeof(*region)), + .Reserved = cpu_to_le32(0), + }, + .Owner = EFI_HOB_OWNER_ZERO, + .ResourceType = cpu_to_le32(resource_type), + .ResourceAttribute = cpu_to_le32(attr), + .PhysicalStart = cpu_to_le64(e->address), + .ResourceLength = cpu_to_le64(e->length), + }; + } +} + +void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob) +{ + TdvfHob hob = { + .hob_addr = td_hob->address, + .size = td_hob->size, + .ptr = td_hob->mem_ptr, + + .current = td_hob->mem_ptr, + .end = td_hob->mem_ptr + td_hob->size, + }; + + EFI_HOB_GENERIC_HEADER *last_hob; + EFI_HOB_HANDOFF_INFO_TABLE *hit; + + /* Note, Efi{Free}Memory{Bottom,Top} are ignored, leave 'em zeroed. */ + hit = tdvf_get_area(&hob, sizeof(*hit)); + *hit = (EFI_HOB_HANDOFF_INFO_TABLE) { + .Header = { + .HobType = EFI_HOB_TYPE_HANDOFF, + .HobLength = cpu_to_le16(sizeof(*hit)), + .Reserved = cpu_to_le32(0), + }, + .Version = cpu_to_le32(EFI_HOB_HANDOFF_TABLE_VERSION), + .BootMode = cpu_to_le32(0), + .EfiMemoryTop = cpu_to_le64(0), + .EfiMemoryBottom = cpu_to_le64(0), + .EfiFreeMemoryTop = cpu_to_le64(0), + .EfiFreeMemoryBottom = cpu_to_le64(0), + .EfiEndOfHobList = cpu_to_le64(0), /* initialized later */ + }; + + tdvf_hob_add_memory_resources(tdx, &hob); + + last_hob = tdvf_get_area(&hob, sizeof(*last_hob)); + *last_hob = (EFI_HOB_GENERIC_HEADER) { + .HobType = EFI_HOB_TYPE_END_OF_HOB_LIST, + .HobLength = cpu_to_le16(sizeof(*last_hob)), + .Reserved = cpu_to_le32(0), + }; + hit->EfiEndOfHobList = tdvf_current_guest_addr(&hob); +} diff --git a/hw/i386/tdvf-hob.h b/hw/i386/tdvf-hob.h new file mode 100644 index 0000000..4fc6a37 --- /dev/null +++ b/hw/i386/tdvf-hob.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef HW_I386_TD_HOB_H +#define HW_I386_TD_HOB_H + +#include "hw/i386/tdvf.h" +#include "target/i386/kvm/tdx.h" + +void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob); + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_TESTED) + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_TESTED) + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_UNCACHEABLE) + +#endif diff --git a/hw/i386/tdvf.c b/hw/i386/tdvf.c new file mode 100644 index 0000000..645d9d1 --- /dev/null +++ b/hw/i386/tdvf.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2025 Intel Corporation + * Author: Isaku Yamahata <isaku.yamahata at gmail.com> + * <isaku.yamahata at intel.com> + * Xiaoyao Li <xiaoyao.li@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" + +#include "hw/i386/pc.h" +#include "hw/i386/tdvf.h" +#include "system/kvm.h" + +#define TDX_METADATA_OFFSET_GUID "e47a6535-984a-4798-865e-4685a7bf8ec2" +#define TDX_METADATA_VERSION 1 +#define TDVF_SIGNATURE 0x46564454 /* TDVF as little endian */ +#define TDVF_ALIGNMENT 4096 + +/* + * the raw structs read from TDVF keeps the name convention in + * TDVF Design Guide spec. + */ +typedef struct { + uint32_t DataOffset; + uint32_t RawDataSize; + uint64_t MemoryAddress; + uint64_t MemoryDataSize; + uint32_t Type; + uint32_t Attributes; +} TdvfSectionEntry; + +typedef struct { + uint32_t Signature; + uint32_t Length; + uint32_t Version; + uint32_t NumberOfSectionEntries; + TdvfSectionEntry SectionEntries[]; +} TdvfMetadata; + +struct tdx_metadata_offset { + uint32_t offset; +}; + +static TdvfMetadata *tdvf_get_metadata(void *flash_ptr, int size) +{ + TdvfMetadata *metadata; + uint32_t offset = 0; + uint8_t *data; + + if ((uint32_t) size != size) { + return NULL; + } + + if (pc_system_ovmf_table_find(TDX_METADATA_OFFSET_GUID, &data, NULL)) { + offset = size - le32_to_cpu(((struct tdx_metadata_offset *)data)->offset); + + if (offset + sizeof(*metadata) > size) { + return NULL; + } + } else { + error_report("Cannot find TDX_METADATA_OFFSET_GUID"); + return NULL; + } + + metadata = flash_ptr + offset; + + /* Finally, verify the signature to determine if this is a TDVF image. */ + metadata->Signature = le32_to_cpu(metadata->Signature); + if (metadata->Signature != TDVF_SIGNATURE) { + error_report("Invalid TDVF signature in metadata!"); + return NULL; + } + + /* Sanity check that the TDVF doesn't overlap its own metadata. */ + metadata->Length = le32_to_cpu(metadata->Length); + if (offset + metadata->Length > size) { + return NULL; + } + + /* Only version 1 is supported/defined. */ + metadata->Version = le32_to_cpu(metadata->Version); + if (metadata->Version != TDX_METADATA_VERSION) { + return NULL; + } + + return metadata; +} + +static int tdvf_parse_and_check_section_entry(const TdvfSectionEntry *src, + TdxFirmwareEntry *entry) +{ + entry->data_offset = le32_to_cpu(src->DataOffset); + entry->data_len = le32_to_cpu(src->RawDataSize); + entry->address = le64_to_cpu(src->MemoryAddress); + entry->size = le64_to_cpu(src->MemoryDataSize); + entry->type = le32_to_cpu(src->Type); + entry->attributes = le32_to_cpu(src->Attributes); + + /* sanity check */ + if (entry->size < entry->data_len) { + error_report("Broken metadata RawDataSize 0x%x MemoryDataSize 0x%"PRIx64, + entry->data_len, entry->size); + return -1; + } + if (!QEMU_IS_ALIGNED(entry->address, TDVF_ALIGNMENT)) { + error_report("MemoryAddress 0x%"PRIx64" not page aligned", entry->address); + return -1; + } + if (!QEMU_IS_ALIGNED(entry->size, TDVF_ALIGNMENT)) { + error_report("MemoryDataSize 0x%"PRIx64" not page aligned", entry->size); + return -1; + } + + switch (entry->type) { + case TDVF_SECTION_TYPE_BFV: + case TDVF_SECTION_TYPE_CFV: + /* The sections that must be copied from firmware image to TD memory */ + if (entry->data_len == 0) { + error_report("%d section with RawDataSize == 0", entry->type); + return -1; + } + break; + case TDVF_SECTION_TYPE_TD_HOB: + case TDVF_SECTION_TYPE_TEMP_MEM: + /* The sections that no need to be copied from firmware image */ + if (entry->data_len != 0) { + error_report("%d section with RawDataSize 0x%x != 0", + entry->type, entry->data_len); + return -1; + } + break; + default: + error_report("TDVF contains unsupported section type %d", entry->type); + return -1; + } + + return 0; +} + +int tdvf_parse_metadata(TdxFirmware *fw, void *flash_ptr, int size) +{ + g_autofree TdvfSectionEntry *sections = NULL; + TdvfMetadata *metadata; + ssize_t entries_size; + int i; + + metadata = tdvf_get_metadata(flash_ptr, size); + if (!metadata) { + return -EINVAL; + } + + /* load and parse metadata entries */ + fw->nr_entries = le32_to_cpu(metadata->NumberOfSectionEntries); + if (fw->nr_entries < 2) { + error_report("Invalid number of fw entries (%u) in TDVF Metadata", + fw->nr_entries); + return -EINVAL; + } + + entries_size = fw->nr_entries * sizeof(TdvfSectionEntry); + if (metadata->Length != sizeof(*metadata) + entries_size) { + error_report("TDVF metadata len (0x%x) mismatch, expected (0x%x)", + metadata->Length, + (uint32_t)(sizeof(*metadata) + entries_size)); + return -EINVAL; + } + + fw->entries = g_new(TdxFirmwareEntry, fw->nr_entries); + sections = g_new(TdvfSectionEntry, fw->nr_entries); + + memcpy(sections, (void *)metadata + sizeof(*metadata), entries_size); + + for (i = 0; i < fw->nr_entries; i++) { + if (tdvf_parse_and_check_section_entry(§ions[i], &fw->entries[i])) { + goto err; + } + } + + fw->mem_ptr = flash_ptr; + return 0; + +err: + fw->entries = 0; + g_free(fw->entries); + return -EINVAL; +} diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c index 1b0671c..b1b5f11 100644 --- a/hw/i386/x86-common.c +++ b/hw/i386/x86-common.c @@ -44,6 +44,7 @@ #include "standard-headers/asm-x86/bootparam.h" #include CONFIG_DEVICES #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" #ifdef CONFIG_XEN_EMU #include "hw/xen/xen.h" @@ -1035,11 +1036,14 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware, if (machine_require_guest_memfd(MACHINE(x86ms))) { memory_region_init_ram_guest_memfd(&x86ms->bios, NULL, "pc.bios", bios_size, &error_fatal); + if (is_tdx_vm()) { + tdx_set_tdvf_region(&x86ms->bios); + } } else { memory_region_init_ram(&x86ms->bios, NULL, "pc.bios", bios_size, &error_fatal); } - if (sev_enabled()) { + if (sev_enabled() || is_tdx_vm()) { /* * The concept of a "reset" simply doesn't exist for * confidential computing guests, we have to destroy and diff --git a/hw/i386/x86.c b/hw/i386/x86.c index e2d0409..f80533d 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -382,7 +382,6 @@ static void x86_machine_class_init(ObjectClass *oc, const void *data) mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; mc->kvm_type = x86_kvm_type; - x86mc->save_tsc_khz = true; x86mc->fwcfg_dma_enabled = true; nc->nmi_monitor_handler = x86_nmi; diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index d18bef4..899f133 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -59,7 +59,7 @@ static const uint8_t gic_id_gicv2[] = { static inline int gic_get_current_cpu(GICState *s) { if (!qtest_enabled() && s->num_cpu > 1) { - return current_cpu->cpu_index; + return current_cpu->cpu_index - s->first_cpu_index; } return 0; } diff --git a/hw/intc/arm_gic_common.c b/hw/intc/arm_gic_common.c index 0f0c48d..ed5be05 100644 --- a/hw/intc/arm_gic_common.c +++ b/hw/intc/arm_gic_common.c @@ -350,6 +350,7 @@ static void arm_gic_common_linux_init(ARMLinuxBootIf *obj, static const Property arm_gic_common_properties[] = { DEFINE_PROP_UINT32("num-cpu", GICState, num_cpu, 1), + DEFINE_PROP_UINT32("first-cpu-index", GICState, first_cpu_index, 0), DEFINE_PROP_UINT32("num-irq", GICState, num_irq, 32), /* Revision can be 1 or 2 for GIC architecture specification * versions 1 or 2, or 0 to indicate the legacy 11MPCore GIC. diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c index 83ff74f..6d85720 100644 --- a/hw/intc/armv7m_nvic.c +++ b/hw/intc/armv7m_nvic.c @@ -988,6 +988,7 @@ static void nvic_nmi_trigger(void *opaque, int n, int level) static uint32_t nvic_readl(NVICState *s, uint32_t offset, MemTxAttrs attrs) { ARMCPU *cpu = s->cpu; + ARMISARegisters *isar = &cpu->isar; uint32_t val; switch (offset) { @@ -1263,17 +1264,17 @@ static uint32_t nvic_readl(NVICState *s, uint32_t offset, MemTxAttrs attrs) if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_pfr0; + return GET_IDREG(isar, ID_PFR0); case 0xd44: /* PFR1. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_pfr1; + return GET_IDREG(isar, ID_PFR1); case 0xd48: /* DFR0. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_dfr0; + return GET_IDREG(isar, ID_DFR0); case 0xd4c: /* AFR0. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; @@ -1283,52 +1284,52 @@ static uint32_t nvic_readl(NVICState *s, uint32_t offset, MemTxAttrs attrs) if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_mmfr0; + return GET_IDREG(isar, ID_MMFR0); case 0xd54: /* MMFR1. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_mmfr1; + return GET_IDREG(isar, ID_MMFR1); case 0xd58: /* MMFR2. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_mmfr2; + return GET_IDREG(isar, ID_MMFR2); case 0xd5c: /* MMFR3. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_mmfr3; + return GET_IDREG(isar, ID_MMFR3); case 0xd60: /* ISAR0. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_isar0; + return GET_IDREG(&cpu->isar, ID_ISAR0); case 0xd64: /* ISAR1. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_isar1; + return GET_IDREG(&cpu->isar, ID_ISAR1); case 0xd68: /* ISAR2. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_isar2; + return GET_IDREG(&cpu->isar, ID_ISAR2); case 0xd6c: /* ISAR3. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_isar3; + return GET_IDREG(&cpu->isar, ID_ISAR3); case 0xd70: /* ISAR4. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_isar4; + return GET_IDREG(&cpu->isar, ID_ISAR4); case 0xd74: /* ISAR5. */ if (!arm_feature(&cpu->env, ARM_FEATURE_M_MAIN)) { goto bad_offset; } - return cpu->isar.id_isar5; + return GET_IDREG(&cpu->isar, ID_ISAR5); case 0xd78: /* CLIDR */ return cpu->clidr; case 0xd7c: /* CTR */ diff --git a/hw/intc/aspeed_intc.c b/hw/intc/aspeed_intc.c index 33fcbe7..5cd786d 100644 --- a/hw/intc/aspeed_intc.c +++ b/hw/intc/aspeed_intc.c @@ -737,6 +737,7 @@ static const MemoryRegionOps aspeed_intc_ops = { .read = aspeed_intc_read, .write = aspeed_intc_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { .min_access_size = 4, .max_access_size = 4, @@ -747,6 +748,7 @@ static const MemoryRegionOps aspeed_intcio_ops = { .read = aspeed_intcio_read, .write = aspeed_intcio_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { .min_access_size = 4, .max_access_size = 4, @@ -757,6 +759,7 @@ static const MemoryRegionOps aspeed_ssp_intc_ops = { .read = aspeed_intc_read, .write = aspeed_ssp_intc_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { .min_access_size = 4, .max_access_size = 4, @@ -767,6 +770,7 @@ static const MemoryRegionOps aspeed_ssp_intcio_ops = { .read = aspeed_intcio_read, .write = aspeed_ssp_intcio_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { .min_access_size = 4, .max_access_size = 4, @@ -777,6 +781,7 @@ static const MemoryRegionOps aspeed_tsp_intc_ops = { .read = aspeed_intc_read, .write = aspeed_tsp_intc_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { .min_access_size = 4, .max_access_size = 4, @@ -787,6 +792,7 @@ static const MemoryRegionOps aspeed_tsp_intcio_ops = { .read = aspeed_intcio_read, .write = aspeed_tsp_intcio_write, .endianness = DEVICE_LITTLE_ENDIAN, + .impl.min_access_size = 4, .valid = { .min_access_size = 4, .max_access_size = 4, @@ -995,7 +1001,8 @@ static AspeedINTCIRQ aspeed_2700ssp_intcio_irqs[ASPEED_INTC_MAX_INPINS] = { {5, 5, 1, R_SSPINT165_EN, R_SSPINT165_STATUS}, }; -static void aspeed_2700ssp_intcio_class_init(ObjectClass *klass, const void *data) +static void aspeed_2700ssp_intcio_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); AspeedINTCClass *aic = ASPEED_INTC_CLASS(klass); @@ -1063,7 +1070,8 @@ static AspeedINTCIRQ aspeed_2700tsp_intcio_irqs[ASPEED_INTC_MAX_INPINS] = { {5, 5, 1, R_TSPINT165_EN, R_TSPINT165_STATUS}, }; -static void aspeed_2700tsp_intcio_class_init(ObjectClass *klass, const void *data) +static void aspeed_2700tsp_intcio_class_init(ObjectClass *klass, + const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); AspeedINTCClass *aic = ASPEED_INTC_CLASS(klass); diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index 7c38c4c..8b8ac6b 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -12,6 +12,7 @@ #include "hw/irq.h" #include "hw/loongarch/virt.h" #include "system/address-spaces.h" +#include "system/kvm.h" #include "hw/intc/loongarch_extioi.h" #include "trace.h" @@ -351,23 +352,29 @@ static void loongarch_extioi_realize(DeviceState *dev, Error **errp) return; } - for (i = 0; i < EXTIOI_IRQS; i++) { - sysbus_init_irq(sbd, &s->irq[i]); - } - - qdev_init_gpio_in(dev, extioi_setirq, EXTIOI_IRQS); - memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, - s, "extioi_system_mem", 0x900); - sysbus_init_mmio(sbd, &s->extioi_system_mem); - if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { - memory_region_init_io(&s->virt_extend, OBJECT(s), &extioi_virt_ops, - s, "extioi_virt", EXTIOI_VIRT_SIZE); - sysbus_init_mmio(sbd, &s->virt_extend); s->features |= EXTIOI_VIRT_HAS_FEATURES; } else { s->status |= BIT(EXTIOI_ENABLE); } + + if (kvm_irqchip_in_kernel()) { + kvm_extioi_realize(dev, errp); + } else { + for (i = 0; i < EXTIOI_IRQS; i++) { + sysbus_init_irq(sbd, &s->irq[i]); + } + + qdev_init_gpio_in(dev, extioi_setirq, EXTIOI_IRQS); + memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, + s, "extioi_system_mem", 0x900); + sysbus_init_mmio(sbd, &s->extioi_system_mem); + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + memory_region_init_io(&s->virt_extend, OBJECT(s), &extioi_virt_ops, + s, "extioi_virt", EXTIOI_VIRT_SIZE); + sysbus_init_mmio(sbd, &s->virt_extend); + } + } } static void loongarch_extioi_unrealize(DeviceState *dev) @@ -384,6 +391,19 @@ static void loongarch_extioi_reset_hold(Object *obj, ResetType type) if (lec->parent_phases.hold) { lec->parent_phases.hold(obj, type); } + + if (kvm_irqchip_in_kernel()) { + kvm_extioi_put(obj, 0); + } +} + +static int vmstate_extioi_pre_save(void *opaque) +{ + if (kvm_irqchip_in_kernel()) { + return kvm_extioi_get(opaque); + } + + return 0; } static int vmstate_extioi_post_load(void *opaque, int version_id) @@ -391,6 +411,10 @@ static int vmstate_extioi_post_load(void *opaque, int version_id) LoongArchExtIOICommonState *s = LOONGARCH_EXTIOI_COMMON(opaque); int i, start_irq; + if (kvm_irqchip_in_kernel()) { + return kvm_extioi_put(opaque, version_id); + } + for (i = 0; i < (EXTIOI_IRQS / 4); i++) { start_irq = i * 4; extioi_update_sw_coremap(s, start_irq, s->coremap[i], false); @@ -416,6 +440,7 @@ static void loongarch_extioi_class_init(ObjectClass *klass, const void *data) &lec->parent_unrealize); resettable_class_set_parent_phases(rc, NULL, loongarch_extioi_reset_hold, NULL, &lec->parent_phases); + lecc->pre_save = vmstate_extioi_pre_save; lecc->post_load = vmstate_extioi_post_load; } diff --git a/hw/intc/loongarch_extioi_kvm.c b/hw/intc/loongarch_extioi_kvm.c new file mode 100644 index 0000000..0133540 --- /dev/null +++ b/hw/intc/loongarch_extioi_kvm.c @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch EXTIOI interrupt kvm support + * + * Copyright (C) 2025 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_extioi.h" +#include "linux/kvm.h" +#include "qapi/error.h" +#include "system/kvm.h" + +static void kvm_extioi_access_reg(int fd, uint64_t addr, void *val, bool write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS, + addr, val, write, &error_abort); +} + +static void kvm_extioi_access_sw_state(int fd, uint64_t addr, + void *val, bool write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS, + addr, val, write, &error_abort); +} + +static void kvm_extioi_access_sw_status(void *opaque, bool write) +{ + LoongArchExtIOICommonState *lecs = LOONGARCH_EXTIOI_COMMON(opaque); + LoongArchExtIOIState *les = LOONGARCH_EXTIOI(opaque); + int addr; + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE; + kvm_extioi_access_sw_state(les->dev_fd, addr, &lecs->status, write); +} + +static void kvm_extioi_access_regs(void *opaque, bool write) +{ + LoongArchExtIOICommonState *lecs = LOONGARCH_EXTIOI_COMMON(opaque); + LoongArchExtIOIState *les = LOONGARCH_EXTIOI(opaque); + int fd = les->dev_fd; + int addr, offset, cpu; + + for (addr = EXTIOI_NODETYPE_START; addr < EXTIOI_NODETYPE_END; addr += 4) { + offset = (addr - EXTIOI_NODETYPE_START) / 4; + kvm_extioi_access_reg(fd, addr, &lecs->nodetype[offset], write); + } + + for (addr = EXTIOI_IPMAP_START; addr < EXTIOI_IPMAP_END; addr += 4) { + offset = (addr - EXTIOI_IPMAP_START) / 4; + kvm_extioi_access_reg(fd, addr, &lecs->ipmap[offset], write); + } + + for (addr = EXTIOI_ENABLE_START; addr < EXTIOI_ENABLE_END; addr += 4) { + offset = (addr - EXTIOI_ENABLE_START) / 4; + kvm_extioi_access_reg(fd, addr, &lecs->enable[offset], write); + } + + for (addr = EXTIOI_BOUNCE_START; addr < EXTIOI_BOUNCE_END; addr += 4) { + offset = (addr - EXTIOI_BOUNCE_START) / 4; + kvm_extioi_access_reg(fd, addr, &lecs->bounce[offset], write); + } + + for (addr = EXTIOI_ISR_START; addr < EXTIOI_ISR_END; addr += 4) { + offset = (addr - EXTIOI_ISR_START) / 4; + kvm_extioi_access_reg(fd, addr, &lecs->isr[offset], write); + } + + for (addr = EXTIOI_COREMAP_START; addr < EXTIOI_COREMAP_END; addr += 4) { + offset = (addr - EXTIOI_COREMAP_START) / 4; + kvm_extioi_access_reg(fd, addr, &lecs->coremap[offset], write); + } + + for (cpu = 0; cpu < lecs->num_cpu; cpu++) { + for (addr = EXTIOI_COREISR_START; + addr < EXTIOI_COREISR_END; addr += 4) { + offset = (addr - EXTIOI_COREISR_START) / 4; + kvm_extioi_access_reg(fd, (cpu << 16) | addr, + &lecs->cpu[cpu].coreisr[offset], write); + } + } +} + +int kvm_extioi_get(void *opaque) +{ + kvm_extioi_access_regs(opaque, false); + kvm_extioi_access_sw_status(opaque, false); + return 0; +} + +int kvm_extioi_put(void *opaque, int version_id) +{ + LoongArchExtIOIState *les = LOONGARCH_EXTIOI(opaque); + int fd = les->dev_fd; + + if (fd == 0) { + return 0; + } + + kvm_extioi_access_regs(opaque, true); + kvm_extioi_access_sw_status(opaque, true); + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED, + NULL, true, &error_abort); + return 0; +} + +void kvm_extioi_realize(DeviceState *dev, Error **errp) +{ + LoongArchExtIOICommonState *lecs = LOONGARCH_EXTIOI_COMMON(dev); + LoongArchExtIOIState *les = LOONGARCH_EXTIOI(dev); + int ret; + + ret = kvm_create_device(kvm_state, KVM_DEV_TYPE_LOONGARCH_EIOINTC, false); + if (ret < 0) { + fprintf(stderr, "create KVM_LOONGARCH_EIOINTC failed: %s\n", + strerror(-ret)); + abort(); + } + + les->dev_fd = ret; + ret = kvm_device_access(les->dev_fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU, + &lecs->num_cpu, true, NULL); + if (ret < 0) { + fprintf(stderr, "KVM_LOONGARCH_EXTIOI_INIT_NUM_CPU failed: %s\n", + strerror(-ret)); + abort(); + } + + ret = kvm_device_access(les->dev_fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE, + &lecs->features, true, NULL); + if (ret < 0) { + fprintf(stderr, "KVM_LOONGARCH_EXTIOI_INIT_FEATURE failed: %s\n", + strerror(-ret)); + abort(); + } +} diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c index 74372a2..fc8005c 100644 --- a/hw/intc/loongarch_ipi.c +++ b/hw/intc/loongarch_ipi.c @@ -11,6 +11,7 @@ #include "qapi/error.h" #include "hw/intc/loongarch_ipi.h" #include "hw/qdev-properties.h" +#include "system/kvm.h" #include "target/loongarch/cpu.h" static AddressSpace *get_iocsr_as(CPUState *cpu) @@ -91,6 +92,10 @@ static void loongarch_ipi_realize(DeviceState *dev, Error **errp) lics->cpu[i].ipi = lics; qdev_init_gpio_out(dev, &lics->cpu[i].irq, 1); } + + if (kvm_irqchip_in_kernel()) { + kvm_ipi_realize(dev, errp); + } } static void loongarch_ipi_reset_hold(Object *obj, ResetType type) @@ -117,6 +122,10 @@ static void loongarch_ipi_reset_hold(Object *obj, ResetType type) core->clear = 0; memset(core->buf, 0, sizeof(core->buf)); } + + if (kvm_irqchip_in_kernel()) { + kvm_ipi_put(obj, 0); + } } static void loongarch_ipi_cpu_plug(HotplugHandler *hotplug_dev, @@ -166,6 +175,24 @@ static void loongarch_ipi_cpu_unplug(HotplugHandler *hotplug_dev, core->cpu = NULL; } +static int loongarch_ipi_pre_save(void *opaque) +{ + if (kvm_irqchip_in_kernel()) { + return kvm_ipi_get(opaque); + } + + return 0; +} + +static int loongarch_ipi_post_load(void *opaque, int version_id) +{ + if (kvm_irqchip_in_kernel()) { + return kvm_ipi_put(opaque, version_id); + } + + return 0; +} + static void loongarch_ipi_class_init(ObjectClass *klass, const void *data) { LoongsonIPICommonClass *licc = LOONGSON_IPI_COMMON_CLASS(klass); @@ -182,6 +209,8 @@ static void loongarch_ipi_class_init(ObjectClass *klass, const void *data) licc->cpu_by_arch_id = loongarch_cpu_by_arch_id; hc->plug = loongarch_ipi_cpu_plug; hc->unplug = loongarch_ipi_cpu_unplug; + licc->pre_save = loongarch_ipi_pre_save; + licc->post_load = loongarch_ipi_post_load; } static const TypeInfo loongarch_ipi_types[] = { diff --git a/hw/intc/loongarch_ipi_kvm.c b/hw/intc/loongarch_ipi_kvm.c new file mode 100644 index 0000000..4cb3acc --- /dev/null +++ b/hw/intc/loongarch_ipi_kvm.c @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch IPI interrupt KVM support + * + * Copyright (C) 2025 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/intc/loongarch_ipi.h" +#include "system/kvm.h" +#include "target/loongarch/cpu.h" + +static void kvm_ipi_access_reg(int fd, uint64_t addr, uint32_t *val, bool write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_IPI_GRP_REGS, + addr, val, write, &error_abort); +} + +static void kvm_ipi_access_regs(void *opaque, bool write) +{ + LoongsonIPICommonState *ipi = (LoongsonIPICommonState *)opaque; + LoongarchIPIState *lis = LOONGARCH_IPI(opaque); + IPICore *core; + uint64_t attr; + int cpu, fd = lis->dev_fd; + + if (fd == 0) { + return; + } + + for (cpu = 0; cpu < ipi->num_cpu; cpu++) { + core = &ipi->cpu[cpu]; + attr = (cpu << 16) | CORE_STATUS_OFF; + kvm_ipi_access_reg(fd, attr, &core->status, write); + + attr = (cpu << 16) | CORE_EN_OFF; + kvm_ipi_access_reg(fd, attr, &core->en, write); + + attr = (cpu << 16) | CORE_SET_OFF; + kvm_ipi_access_reg(fd, attr, &core->set, write); + + attr = (cpu << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_reg(fd, attr, &core->clear, write); + + attr = (cpu << 16) | CORE_BUF_20; + kvm_ipi_access_reg(fd, attr, &core->buf[0], write); + + attr = (cpu << 16) | CORE_BUF_28; + kvm_ipi_access_reg(fd, attr, &core->buf[2], write); + + attr = (cpu << 16) | CORE_BUF_30; + kvm_ipi_access_reg(fd, attr, &core->buf[4], write); + + attr = (cpu << 16) | CORE_BUF_38; + kvm_ipi_access_reg(fd, attr, &core->buf[6], write); + } +} + +int kvm_ipi_get(void *opaque) +{ + kvm_ipi_access_regs(opaque, false); + return 0; +} + +int kvm_ipi_put(void *opaque, int version_id) +{ + kvm_ipi_access_regs(opaque, true); + return 0; +} + +void kvm_ipi_realize(DeviceState *dev, Error **errp) +{ + LoongarchIPIState *lis = LOONGARCH_IPI(dev); + int ret; + + ret = kvm_create_device(kvm_state, KVM_DEV_TYPE_LOONGARCH_IPI, false); + if (ret < 0) { + fprintf(stderr, "IPI KVM_CREATE_DEVICE failed: %s\n", + strerror(-ret)); + abort(); + } + + lis->dev_fd = ret; +} diff --git a/hw/intc/loongarch_pch_msi.c b/hw/intc/loongarch_pch_msi.c index 06eb944..f6d1631 100644 --- a/hw/intc/loongarch_pch_msi.c +++ b/hw/intc/loongarch_pch_msi.c @@ -13,6 +13,7 @@ #include "hw/pci/msi.h" #include "hw/misc/unimp.h" #include "migration/vmstate.h" +#include "system/kvm.h" #include "trace.h" static uint64_t loongarch_msi_mem_read(void *opaque, hwaddr addr, unsigned size) @@ -26,6 +27,15 @@ static void loongarch_msi_mem_write(void *opaque, hwaddr addr, LoongArchPCHMSI *s = (LoongArchPCHMSI *)opaque; int irq_num; + if (kvm_irqchip_in_kernel()) { + MSIMessage msg; + + msg.address = addr; + msg.data = val; + kvm_irqchip_send_msi(kvm_state, msg); + return; + } + /* * vector number is irq number from upper extioi intc * need subtract irq base to get msi vector offset diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c index cbba2fc..c4b242d 100644 --- a/hw/intc/loongarch_pch_pic.c +++ b/hw/intc/loongarch_pch_pic.c @@ -10,6 +10,7 @@ #include "qemu/log.h" #include "hw/irq.h" #include "hw/intc/loongarch_pch_pic.h" +#include "system/kvm.h" #include "trace.h" #include "qapi/error.h" @@ -48,6 +49,11 @@ static void pch_pic_irq_handler(void *opaque, int irq, int level) assert(irq < s->irq_num); trace_loongarch_pch_pic_irq_handler(irq, level); + if (kvm_irqchip_in_kernel()) { + kvm_set_irq(kvm_state, irq, !!level); + return; + } + if (s->intedge & mask) { /* Edge triggered */ if (level) { @@ -82,7 +88,7 @@ static uint64_t pch_pic_read(void *opaque, hwaddr addr, uint64_t field_mask) addr -= offset; switch (addr) { case PCH_PIC_INT_ID: - val = s->id.data; + val = cpu_to_le64(s->id.data); break; case PCH_PIC_INT_MASK: val = s->int_mask; @@ -258,6 +264,10 @@ static void loongarch_pic_reset_hold(Object *obj, ResetType type) if (lpc->parent_phases.hold) { lpc->parent_phases.hold(obj, type); } + + if (kvm_irqchip_in_kernel()) { + kvm_pic_put(obj, 0); + } } static void loongarch_pic_realize(DeviceState *dev, Error **errp) @@ -275,22 +285,49 @@ static void loongarch_pic_realize(DeviceState *dev, Error **errp) qdev_init_gpio_out(dev, s->parent_irq, s->irq_num); qdev_init_gpio_in(dev, pch_pic_irq_handler, s->irq_num); - memory_region_init_io(&s->iomem, OBJECT(dev), - &loongarch_pch_pic_ops, - s, TYPE_LOONGARCH_PIC, VIRT_PCH_REG_SIZE); - sysbus_init_mmio(sbd, &s->iomem); + + if (kvm_irqchip_in_kernel()) { + kvm_pic_realize(dev, errp); + } else { + memory_region_init_io(&s->iomem, OBJECT(dev), + &loongarch_pch_pic_ops, + s, TYPE_LOONGARCH_PIC, VIRT_PCH_REG_SIZE); + sysbus_init_mmio(sbd, &s->iomem); + } +} + +static int loongarch_pic_pre_save(LoongArchPICCommonState *opaque) +{ + if (kvm_irqchip_in_kernel()) { + return kvm_pic_get(opaque); + } + + return 0; +} + +static int loongarch_pic_post_load(LoongArchPICCommonState *opaque, + int version_id) +{ + if (kvm_irqchip_in_kernel()) { + return kvm_pic_put(opaque, version_id); + } + + return 0; } static void loongarch_pic_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); LoongarchPICClass *lpc = LOONGARCH_PIC_CLASS(klass); + LoongArchPICCommonClass *lpcc = LOONGARCH_PIC_COMMON_CLASS(klass); ResettableClass *rc = RESETTABLE_CLASS(klass); resettable_class_set_parent_phases(rc, NULL, loongarch_pic_reset_hold, NULL, &lpc->parent_phases); device_class_set_parent_realize(dc, loongarch_pic_realize, &lpc->parent_realize); + lpcc->pre_save = loongarch_pic_pre_save; + lpcc->post_load = loongarch_pic_post_load; } static const TypeInfo loongarch_pic_types[] = { diff --git a/hw/intc/loongarch_pic_kvm.c b/hw/intc/loongarch_pic_kvm.c new file mode 100644 index 0000000..dd504ec --- /dev/null +++ b/hw/intc/loongarch_pic_kvm.c @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm pch pic interrupt support + * + * Copyright (C) 2025 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/boards.h" +#include "hw/intc/loongarch_pch_pic.h" +#include "hw/loongarch/virt.h" +#include "hw/pci-host/ls7a.h" +#include "system/kvm.h" + +static void kvm_pch_pic_access_reg(int fd, uint64_t addr, void *val, bool write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS, + addr, val, write, &error_abort); +} + +static void kvm_pch_pic_access(void *opaque, bool write) +{ + LoongArchPICCommonState *s = LOONGARCH_PIC_COMMON(opaque); + LoongarchPICState *lps = LOONGARCH_PIC(opaque); + int fd = lps->dev_fd; + int addr, offset; + + if (fd == 0) { + return; + } + + kvm_pch_pic_access_reg(fd, PCH_PIC_INT_MASK, &s->int_mask, write); + kvm_pch_pic_access_reg(fd, PCH_PIC_HTMSI_EN, &s->htmsi_en, write); + kvm_pch_pic_access_reg(fd, PCH_PIC_INT_EDGE, &s->intedge, write); + kvm_pch_pic_access_reg(fd, PCH_PIC_AUTO_CTRL0, &s->auto_crtl0, write); + kvm_pch_pic_access_reg(fd, PCH_PIC_AUTO_CTRL1, &s->auto_crtl1, write); + + for (addr = PCH_PIC_ROUTE_ENTRY; + addr < PCH_PIC_ROUTE_ENTRY_END; addr++) { + offset = addr - PCH_PIC_ROUTE_ENTRY; + kvm_pch_pic_access_reg(fd, addr, &s->route_entry[offset], write); + } + + for (addr = PCH_PIC_HTMSI_VEC; addr < PCH_PIC_HTMSI_VEC_END; addr++) { + offset = addr - PCH_PIC_HTMSI_VEC; + kvm_pch_pic_access_reg(fd, addr, &s->htmsi_vector[offset], write); + } + + kvm_pch_pic_access_reg(fd, PCH_PIC_INT_REQUEST, &s->intirr, write); + kvm_pch_pic_access_reg(fd, PCH_PIC_INT_STATUS, &s->intisr, write); + kvm_pch_pic_access_reg(fd, PCH_PIC_INT_POL, &s->int_polarity, write); +} + +int kvm_pic_get(void *opaque) +{ + kvm_pch_pic_access(opaque, false); + return 0; +} + +int kvm_pic_put(void *opaque, int version_id) +{ + kvm_pch_pic_access(opaque, true); + return 0; +} + +void kvm_pic_realize(DeviceState *dev, Error **errp) +{ + LoongarchPICState *lps = LOONGARCH_PIC(dev); + uint64_t pch_pic_base = VIRT_PCH_REG_BASE; + int ret; + + ret = kvm_create_device(kvm_state, KVM_DEV_TYPE_LOONGARCH_PCHPIC, false); + if (ret < 0) { + fprintf(stderr, "Create KVM_LOONGARCH_PCHPIC failed: %s\n", + strerror(-ret)); + abort(); + } + + lps->dev_fd = ret; + ret = kvm_device_access(lps->dev_fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL, + KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT, + &pch_pic_base, true, NULL); + if (ret < 0) { + fprintf(stderr, "KVM_LOONGARCH_PCH_PIC_INIT failed: %s\n", + strerror(-ret)); + abort(); + } +} diff --git a/hw/intc/loongson_ipi_common.c b/hw/intc/loongson_ipi_common.c index f32661c..8cd78d4 100644 --- a/hw/intc/loongson_ipi_common.c +++ b/hw/intc/loongson_ipi_common.c @@ -11,6 +11,7 @@ #include "hw/irq.h" #include "qemu/log.h" #include "migration/vmstate.h" +#include "system/kvm.h" #include "trace.h" MemTxResult loongson_ipi_core_readl(void *opaque, hwaddr addr, uint64_t *data, @@ -255,6 +256,10 @@ static void loongson_ipi_common_realize(DeviceState *dev, Error **errp) LoongsonIPICommonState *s = LOONGSON_IPI_COMMON(dev); SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + if (kvm_irqchip_in_kernel()) { + return; + } + memory_region_init_io(&s->ipi_iocsr_mem, OBJECT(dev), &loongson_ipi_iocsr_ops, s, "loongson_ipi_iocsr", 0x48); @@ -277,10 +282,38 @@ static void loongson_ipi_common_unrealize(DeviceState *dev) g_free(s->cpu); } +static int loongson_ipi_common_pre_save(void *opaque) +{ + IPICore *ipicore = (IPICore *)opaque; + LoongsonIPICommonState *s = ipicore->ipi; + LoongsonIPICommonClass *licc = LOONGSON_IPI_COMMON_GET_CLASS(s); + + if (licc->pre_save) { + return licc->pre_save(s); + } + + return 0; +} + +static int loongson_ipi_common_post_load(void *opaque, int version_id) +{ + IPICore *ipicore = (IPICore *)opaque; + LoongsonIPICommonState *s = ipicore->ipi; + LoongsonIPICommonClass *licc = LOONGSON_IPI_COMMON_GET_CLASS(s); + + if (licc->post_load) { + return licc->post_load(s, version_id); + } + + return 0; +} + static const VMStateDescription vmstate_ipi_core = { .name = "ipi-single", .version_id = 2, .minimum_version_id = 2, + .pre_save = loongson_ipi_common_pre_save, + .post_load = loongson_ipi_common_post_load, .fields = (const VMStateField[]) { VMSTATE_UINT32(status, IPICore), VMSTATE_UINT32(en, IPICore), diff --git a/hw/intc/meson.build b/hw/intc/meson.build index 602da30..3137521 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -71,6 +71,12 @@ specific_ss.add(when: 'CONFIG_M68K_IRQC', if_true: files('m68k_irqc.c')) specific_ss.add(when: 'CONFIG_LOONGSON_IPI_COMMON', if_true: files('loongson_ipi_common.c')) specific_ss.add(when: 'CONFIG_LOONGSON_IPI', if_true: files('loongson_ipi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_IPI', if_true: files('loongarch_ipi.c')) +specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_LOONGARCH_IPI'], + if_true: files('loongarch_ipi_kvm.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC', if_true: files('loongarch_pch_pic.c', 'loongarch_pic_common.c')) +specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_LOONGARCH_PCH_PIC'], + if_true: files('loongarch_pic_kvm.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_MSI', if_true: files('loongarch_pch_msi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI', if_true: files('loongarch_extioi.c', 'loongarch_extioi_common.c')) +specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_LOONGARCH_EXTIOI'], + if_true: files('loongarch_extioi_kvm.c')) diff --git a/hw/intc/riscv_aclint.c b/hw/intc/riscv_aclint.c index b0139f0..4623cfa0 100644 --- a/hw/intc/riscv_aclint.c +++ b/hw/intc/riscv_aclint.c @@ -28,6 +28,7 @@ #include "qemu/module.h" #include "hw/sysbus.h" #include "target/riscv/cpu.h" +#include "target/riscv/time_helper.h" #include "hw/qdev-properties.h" #include "hw/intc/riscv_aclint.h" #include "qemu/timer.h" @@ -240,6 +241,10 @@ static void riscv_aclint_mtimer_write(void *opaque, hwaddr addr, riscv_aclint_mtimer_write_timecmp(mtimer, RISCV_CPU(cpu), mtimer->hartid_base + i, mtimer->timecmp[i]); + riscv_timer_write_timecmp(env, env->stimer, env->stimecmp, 0, MIP_STIP); + riscv_timer_write_timecmp(env, env->vstimer, env->vstimecmp, + env->htimedelta, MIP_VSTIP); + } return; } diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c index 8bcd9f4..4fa5f75 100644 --- a/hw/intc/riscv_aplic.c +++ b/hw/intc/riscv_aplic.c @@ -962,10 +962,18 @@ static const Property riscv_aplic_properties[] = { DEFINE_PROP_BOOL("mmode", RISCVAPLICState, mmode, 0), }; +static bool riscv_aplic_state_needed(void *opaque) +{ + RISCVAPLICState *aplic = opaque; + + return riscv_use_emulated_aplic(aplic->msimode); +} + static const VMStateDescription vmstate_riscv_aplic = { .name = "riscv_aplic", - .version_id = 2, - .minimum_version_id = 2, + .version_id = 3, + .minimum_version_id = 3, + .needed = riscv_aplic_state_needed, .fields = (const VMStateField[]) { VMSTATE_UINT32(domaincfg, RISCVAPLICState), VMSTATE_UINT32(mmsicfgaddr, RISCVAPLICState), diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c index 2169988..6174e1a 100644 --- a/hw/intc/riscv_imsic.c +++ b/hw/intc/riscv_imsic.c @@ -398,10 +398,16 @@ static const Property riscv_imsic_properties[] = { DEFINE_PROP_UINT32("num-irqs", RISCVIMSICState, num_irqs, 0), }; +static bool riscv_imsic_state_needed(void *opaque) +{ + return !kvm_irqchip_in_kernel(); +} + static const VMStateDescription vmstate_riscv_imsic = { .name = "riscv_imsic", - .version_id = 1, - .minimum_version_id = 1, + .version_id = 2, + .minimum_version_id = 2, + .needed = riscv_imsic_state_needed, .fields = (const VMStateField[]) { VMSTATE_VARRAY_UINT32(eidelivery, RISCVIMSICState, num_pages, 0, diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c index 9b6292e..14d6c52 100644 --- a/hw/loongarch/boot.c +++ b/hw/loongarch/boot.c @@ -35,12 +35,6 @@ struct loongarch_linux_hdr { uint32_t pe_header_offset; } QEMU_PACKED; -struct memmap_entry *memmap_table; -unsigned memmap_entries; - -ram_addr_t initrd_offset; -uint64_t initrd_size; - static const unsigned int slave_boot_code[] = { /* Configure reset ebase. */ 0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */ @@ -94,12 +88,16 @@ static inline void *guidcpy(void *dst, const void *src) return memcpy(dst, src, sizeof(efi_guid_t)); } -static void init_efi_boot_memmap(struct efi_system_table *systab, +static void init_efi_boot_memmap(MachineState *ms, + struct efi_system_table *systab, void *p, void *start) { unsigned i; struct efi_boot_memmap *boot_memmap = p; efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); + struct memmap_entry *memmap_table; + unsigned int memmap_entries; /* efi_configuration_table 1 */ guidcpy(&systab->tables[0].guid, &tbl_guid); @@ -111,6 +109,8 @@ static void init_efi_boot_memmap(struct efi_system_table *systab, boot_memmap->map_size = 0; efi_memory_desc_t *map = p + sizeof(struct efi_boot_memmap); + memmap_table = lvms->memmap_table; + memmap_entries = lvms->memmap_entries; for (i = 0; i < memmap_entries; i++) { map = (void *)boot_memmap + sizeof(*map); map[i].type = memmap_table[i].type; @@ -121,7 +121,8 @@ static void init_efi_boot_memmap(struct efi_system_table *systab, } } -static void init_efi_initrd_table(struct efi_system_table *systab, +static void init_efi_initrd_table(struct loongarch_boot_info *info, + struct efi_system_table *systab, void *p, void *start) { efi_guid_t tbl_guid = LINUX_EFI_INITRD_MEDIA_GUID; @@ -132,8 +133,8 @@ static void init_efi_initrd_table(struct efi_system_table *systab, systab->tables[1].table = (struct efi_configuration_table *)(p - start); systab->nr_tables = 2; - initrd_table->base = initrd_offset; - initrd_table->size = initrd_size; + initrd_table->base = info->initrd_addr; + initrd_table->size = info->initrd_size; } static void init_efi_fdt_table(struct efi_system_table *systab) @@ -146,10 +147,12 @@ static void init_efi_fdt_table(struct efi_system_table *systab) systab->nr_tables = 3; } -static void init_systab(struct loongarch_boot_info *info, void *p, void *start) +static void init_systab(MachineState *ms, + struct loongarch_boot_info *info, void *p, void *start) { void *bp_tables_start; struct efi_system_table *systab = p; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); info->a2 = p - start; @@ -166,10 +169,10 @@ static void init_systab(struct loongarch_boot_info *info, void *p, void *start) systab->tables = p; bp_tables_start = p; - init_efi_boot_memmap(systab, p, start); + init_efi_boot_memmap(ms, systab, p, start); p += ROUND_UP(sizeof(struct efi_boot_memmap) + - sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB); - init_efi_initrd_table(systab, p, start); + sizeof(efi_memory_desc_t) * lvms->memmap_entries, 64 * KiB); + init_efi_initrd_table(info, systab, p, start); p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB); init_efi_fdt_table(systab); @@ -276,8 +279,8 @@ static ram_addr_t alloc_initrd_memory(struct loongarch_boot_info *info, static int64_t load_kernel_info(struct loongarch_boot_info *info) { - uint64_t kernel_entry, kernel_low, kernel_high; - ssize_t kernel_size; + uint64_t kernel_entry, kernel_low, kernel_high, initrd_offset = 0; + ssize_t kernel_size, initrd_size; kernel_size = load_elf(info->kernel_filename, NULL, cpu_loongarch_virt_to_phys, NULL, @@ -313,8 +316,9 @@ static int64_t load_kernel_info(struct loongarch_boot_info *info) info->initrd_filename); exit(1); } - } else { - initrd_size = 0; + + info->initrd_addr = initrd_offset; + info->initrd_size = initrd_size; } return kernel_entry; @@ -369,17 +373,19 @@ static void loongarch_firmware_boot(LoongArchVirtMachineState *lvms, fw_cfg_add_kernel_info(info, lvms->fw_cfg); } -static void init_boot_rom(struct loongarch_boot_info *info, void *p) +static void init_boot_rom(MachineState *ms, + struct loongarch_boot_info *info, void *p) { void *start = p; init_cmdline(info, p, start); p += COMMAND_LINE_SIZE; - init_systab(info, p, start); + init_systab(ms, info, p, start); } -static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) +static void loongarch_direct_kernel_boot(MachineState *ms, + struct loongarch_boot_info *info) { void *p, *bp; int64_t kernel_addr = VIRT_FLASH0_BASE; @@ -397,7 +403,7 @@ static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info) /* Load cmdline and system tables at [0 - 1 MiB] */ p = g_malloc0(1 * MiB); bp = p; - init_boot_rom(info, p); + init_boot_rom(ms, info, p); rom_add_blob_fixed_as("boot_info", bp, 1 * MiB, 0, &address_space_memory); /* Load slave boot code at pflash0 . */ @@ -437,6 +443,6 @@ void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info) if (lvms->bios_loaded) { loongarch_firmware_boot(lvms, info); } else { - loongarch_direct_kernel_boot(info); + loongarch_direct_kernel_boot(ms, info); } } diff --git a/hw/loongarch/virt-acpi-build.c b/hw/loongarch/virt-acpi-build.c index 073b6de..2cd2d9d 100644 --- a/hw/loongarch/virt-acpi-build.c +++ b/hw/loongarch/virt-acpi-build.c @@ -575,8 +575,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) acpi_add_table(table_offsets, tables_blob); { AcpiMcfgInfo mcfg = { - .base = cpu_to_le64(VIRT_PCI_CFG_BASE), - .size = cpu_to_le64(VIRT_PCI_CFG_SIZE), + .base = VIRT_PCI_CFG_BASE, + .size = VIRT_PCI_CFG_SIZE, }; build_mcfg(tables_blob, tables->linker, &mcfg, lvms->oem_id, lvms->oem_table_id); diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c index 1b50404..b15ada2 100644 --- a/hw/loongarch/virt.c +++ b/hw/loongarch/virt.c @@ -136,6 +136,10 @@ static void virt_build_smbios(LoongArchVirtMachineState *lvms) return; } + if (kvm_enabled()) { + product = "KVM Virtual Machine"; + } + smbios_set_defaults("QEMU", product, mc->name); smbios_get_tables(ms, SMBIOS_ENTRY_POINT_TYPE_64, @@ -168,8 +172,15 @@ static void virt_powerdown_req(Notifier *notifier, void *opaque) acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS); } -static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) +static void memmap_add_entry(MachineState *ms, uint64_t address, + uint64_t length, uint32_t type) { + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms); + struct memmap_entry *memmap_table; + unsigned int memmap_entries; + + memmap_table = lvms->memmap_table; + memmap_entries = lvms->memmap_entries; /* Ensure there are no duplicate entries. */ for (unsigned i = 0; i < memmap_entries; i++) { assert(memmap_table[i].address != address); @@ -182,6 +193,8 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) memmap_table[memmap_entries].type = cpu_to_le32(type); memmap_table[memmap_entries].reserved = 0; memmap_entries++; + lvms->memmap_table = memmap_table; + lvms->memmap_entries = memmap_entries; } static DeviceState *create_acpi_ged(DeviceState *pch_pic, @@ -401,12 +414,6 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) lvms->ipi = ipi; sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); - /* IPI iocsr memory region */ - memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); - memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); - /* Create EXTIOI device */ extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); lvms->extioi = extioi; @@ -414,12 +421,6 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) qdev_prop_set_bit(extioi, "has-virtualization-extension", true); } sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); - memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); - if (virt_is_veiointc_enabled(lvms)) { - memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); - } virt_cpu_irq_init(lvms); pch_pic = qdev_new(TYPE_LOONGARCH_PIC); @@ -427,13 +428,6 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); d = SYS_BUS_DEVICE(pch_pic); sysbus_realize_and_unref(d, &error_fatal); - memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, - sysbus_mmio_get_region(d, 0)); - - /* Connect pch_pic irqs to extioi */ - for (i = 0; i < num; i++) { - qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); - } pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); start = num; @@ -443,12 +437,40 @@ static void virt_irq_init(LoongArchVirtMachineState *lvms) d = SYS_BUS_DEVICE(pch_msi); sysbus_realize_and_unref(d, &error_fatal); sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); - for (i = 0; i < num; i++) { - /* Connect pch_msi irqs to extioi */ - qdev_connect_gpio_out(DEVICE(d), i, - qdev_get_gpio_in(extioi, i + start)); - } + if (kvm_irqchip_in_kernel()) { + kvm_loongarch_init_irq_routing(); + } else { + /* IPI iocsr memory region */ + memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); + memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + + /* EXTIOI iocsr memory region */ + memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); + if (virt_is_veiointc_enabled(lvms)) { + memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); + } + + /* PCH_PIC memory region */ + memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(pch_pic), 0)); + + /* Connect pch_pic irqs to extioi */ + for (i = 0; i < VIRT_PCH_PIC_IRQ_NUM; i++) { + qdev_connect_gpio_out(DEVICE(pch_pic), i, + qdev_get_gpio_in(extioi, i)); + } + + for (i = VIRT_PCH_PIC_IRQ_NUM; i < EXTIOI_IRQS; i++) { + /* Connect pch_msi irqs to extioi */ + qdev_connect_gpio_out(DEVICE(pch_msi), i - VIRT_PCH_PIC_IRQ_NUM, + qdev_get_gpio_in(extioi, i)); + } + } virt_devices_init(pch_pic, lvms); } @@ -509,6 +531,10 @@ static MemTxResult virt_iocsr_misc_write(void *opaque, hwaddr addr, switch (addr) { case MISC_FUNC_REG: + if (kvm_irqchip_in_kernel()) { + return MEMTX_OK; + } + if (!virt_is_veiointc_enabled(lvms)) { return MEMTX_OK; } @@ -559,6 +585,10 @@ static MemTxResult virt_iocsr_misc_read(void *opaque, hwaddr addr, ret = 0x303030354133ULL; /* "3A5000" */ break; case MISC_FUNC_REG: + if (kvm_irqchip_in_kernel()) { + return MEMTX_OK; + } + if (!virt_is_veiointc_enabled(lvms)) { ret |= BIT_ULL(IOCSRM_EXTIOI_EN); break; @@ -619,13 +649,13 @@ static void fw_cfg_add_memory(MachineState *ms) } if (size >= gap) { - memmap_add_entry(base, gap, 1); + memmap_add_entry(ms, base, gap, 1); size -= gap; base = VIRT_HIGHMEM_BASE; } if (size) { - memmap_add_entry(base, size, 1); + memmap_add_entry(ms, base, size, 1); base += size; } @@ -640,7 +670,7 @@ static void fw_cfg_add_memory(MachineState *ms) * lowram: [base, +(gap - numa_info[0].node_mem)) * highram: [VIRT_HIGHMEM_BASE, +(ram_size - gap)) */ - memmap_add_entry(base, gap - numa_info[0].node_mem, 1); + memmap_add_entry(ms, base, gap - numa_info[0].node_mem, 1); size = ram_size - gap; base = VIRT_HIGHMEM_BASE; } else { @@ -648,7 +678,7 @@ static void fw_cfg_add_memory(MachineState *ms) } if (size) { - memmap_add_entry(base, size, 1); + memmap_add_entry(ms, base, size, 1); } } @@ -734,8 +764,8 @@ static void virt_init(MachineState *machine) rom_set_fw(lvms->fw_cfg); if (lvms->fw_cfg != NULL) { fw_cfg_add_file(lvms->fw_cfg, "etc/memmap", - memmap_table, - sizeof(struct memmap_entry) * (memmap_entries)); + lvms->memmap_table, + sizeof(struct memmap_entry) * lvms->memmap_entries); } /* Initialize the IO interrupt subsystem */ diff --git a/hw/meson.build b/hw/meson.build index b91f761..791ce21 100644 --- a/hw/meson.build +++ b/hw/meson.build @@ -39,6 +39,7 @@ subdir('uefi') subdir('ufs') subdir('usb') subdir('vfio') +subdir('vfio-user') subdir('virtio') subdir('vmapple') subdir('watchdog') diff --git a/hw/microblaze/petalogix_ml605_mmu.c b/hw/microblaze/petalogix_ml605_mmu.c index bea6b68..6e923c4 100644 --- a/hw/microblaze/petalogix_ml605_mmu.c +++ b/hw/microblaze/petalogix_ml605_mmu.c @@ -80,8 +80,6 @@ petalogix_ml605_init(MachineState *machine) MemoryRegion *phys_lmb_bram = g_new(MemoryRegion, 1); MemoryRegion *phys_ram = g_new(MemoryRegion, 1); qemu_irq irq[32]; - EndianMode endianness = TARGET_BIG_ENDIAN ? ENDIAN_MODE_BIG - : ENDIAN_MODE_LITTLE; /* init CPUs */ cpu = MICROBLAZE_CPU(object_new(TYPE_MICROBLAZE_CPU)); @@ -113,7 +111,7 @@ petalogix_ml605_init(MachineState *machine) dev = qdev_new("xlnx.xps-intc"); - qdev_prop_set_enum(dev, "endianness", endianness); + qdev_prop_set_enum(dev, "endianness", ENDIAN_MODE_LITTLE); qdev_prop_set_uint32(dev, "kind-of-intr", 1 << TIMER_IRQ); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, INTC_BASEADDR); @@ -129,7 +127,7 @@ petalogix_ml605_init(MachineState *machine) /* 2 timers at irq 2 @ 100 Mhz. */ dev = qdev_new("xlnx.xps-timer"); - qdev_prop_set_enum(dev, "endianness", endianness); + qdev_prop_set_enum(dev, "endianness", ENDIAN_MODE_LITTLE); qdev_prop_set_uint32(dev, "one-timer-only", 0); qdev_prop_set_uint32(dev, "clock-frequency", 100 * 1000000); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); @@ -177,7 +175,7 @@ petalogix_ml605_init(MachineState *machine) SSIBus *spi; dev = qdev_new("xlnx.xps-spi"); - qdev_prop_set_enum(dev, "endianness", endianness); + qdev_prop_set_enum(dev, "endianness", ENDIAN_MODE_LITTLE); qdev_prop_set_uint8(dev, "num-ss-bits", NUM_SPI_FLASHES); busdev = SYS_BUS_DEVICE(dev); sysbus_realize_and_unref(busdev, &error_fatal); @@ -218,12 +216,7 @@ petalogix_ml605_init(MachineState *machine) static void petalogix_ml605_machine_init(MachineClass *mc) { - if (TARGET_BIG_ENDIAN) { - mc->desc = "PetaLogix linux refdesign for xilinx ml605 (big endian)"; - mc->deprecation_reason = "big endian support is not tested"; - } else { - mc->desc = "PetaLogix linux refdesign for xilinx ml605 (little endian)"; - } + mc->desc = "PetaLogix linux refdesign for xilinx ml605 (little endian)"; mc->init = petalogix_ml605_init; } diff --git a/hw/microblaze/petalogix_s3adsp1800_mmu.c b/hw/microblaze/petalogix_s3adsp1800_mmu.c index 032f6f7..e8d0ddf 100644 --- a/hw/microblaze/petalogix_s3adsp1800_mmu.c +++ b/hw/microblaze/petalogix_s3adsp1800_mmu.c @@ -58,9 +58,20 @@ #define TYPE_PETALOGIX_S3ADSP1800_MACHINE \ MACHINE_TYPE_NAME("petalogix-s3adsp1800") +struct S3Adsp1800MachineState { + MachineState parent_class; + + EndianMode endianness; +}; + +OBJECT_DECLARE_TYPE(S3Adsp1800MachineState, MachineClass, + PETALOGIX_S3ADSP1800_MACHINE) + + static void petalogix_s3adsp1800_init(MachineState *machine) { + S3Adsp1800MachineState *psms = PETALOGIX_S3ADSP1800_MACHINE(machine); ram_addr_t ram_size = machine->ram_size; DeviceState *dev; MicroBlazeCPU *cpu; @@ -71,13 +82,12 @@ petalogix_s3adsp1800_init(MachineState *machine) MemoryRegion *phys_ram = g_new(MemoryRegion, 1); qemu_irq irq[32]; MemoryRegion *sysmem = get_system_memory(); - EndianMode endianness = TARGET_BIG_ENDIAN ? ENDIAN_MODE_BIG - : ENDIAN_MODE_LITTLE; + EndianMode endianness = psms->endianness; cpu = MICROBLAZE_CPU(object_new(TYPE_MICROBLAZE_CPU)); object_property_set_str(OBJECT(cpu), "version", "7.10.d", &error_abort); object_property_set_bool(OBJECT(cpu), "little-endian", - !TARGET_BIG_ENDIAN, &error_abort); + endianness == ENDIAN_MODE_LITTLE, &error_abort); qdev_realize(DEVICE(cpu), NULL, &error_abort); /* Attach emulated BRAM through the LMB. */ @@ -135,20 +145,41 @@ petalogix_s3adsp1800_init(MachineState *machine) create_unimplemented_device("xps_gpio", GPIO_BASEADDR, 0x10000); - microblaze_load_kernel(cpu, !TARGET_BIG_ENDIAN, ddr_base, ram_size, - machine->initrd_filename, + microblaze_load_kernel(cpu, endianness == ENDIAN_MODE_LITTLE, ddr_base, + ram_size, machine->initrd_filename, BINARY_DEVICE_TREE_FILE, NULL); } +static int machine_get_endianness(Object *obj, Error **errp G_GNUC_UNUSED) +{ + S3Adsp1800MachineState *ms = PETALOGIX_S3ADSP1800_MACHINE(obj); + return ms->endianness; +} + +static void machine_set_endianness(Object *obj, int endianness, Error **errp) +{ + S3Adsp1800MachineState *ms = PETALOGIX_S3ADSP1800_MACHINE(obj); + ms->endianness = endianness; +} + static void petalogix_s3adsp1800_machine_class_init(ObjectClass *oc, const void *data) { MachineClass *mc = MACHINE_CLASS(oc); + ObjectProperty *prop; mc->desc = "PetaLogix linux refdesign for xilinx Spartan 3ADSP1800"; mc->init = petalogix_s3adsp1800_init; mc->is_default = true; + + prop = object_class_property_add_enum(oc, "endianness", "EndianMode", + &EndianMode_lookup, + machine_get_endianness, + machine_set_endianness); + object_property_set_default_str(prop, TARGET_BIG_ENDIAN ? "big" : "little"); + object_class_property_set_description(oc, "endianness", + "Defines whether the machine runs in big or little endian mode"); } static const TypeInfo petalogix_s3adsp1800_machine_types[] = { @@ -156,6 +187,7 @@ static const TypeInfo petalogix_s3adsp1800_machine_types[] = { .name = TYPE_PETALOGIX_S3ADSP1800_MACHINE, .parent = TYPE_MACHINE, .class_init = petalogix_s3adsp1800_machine_class_init, + .instance_size = sizeof(S3Adsp1800MachineState), }, }; diff --git a/hw/microblaze/xlnx-zynqmp-pmu.c b/hw/microblaze/xlnx-zynqmp-pmu.c index ed40b5f..e909802 100644 --- a/hw/microblaze/xlnx-zynqmp-pmu.c +++ b/hw/microblaze/xlnx-zynqmp-pmu.c @@ -181,12 +181,7 @@ static void xlnx_zynqmp_pmu_init(MachineState *machine) static void xlnx_zynqmp_pmu_machine_init(MachineClass *mc) { - if (TARGET_BIG_ENDIAN) { - mc->desc = "Xilinx ZynqMP PMU machine (big endian)"; - mc->deprecation_reason = "big endian support is not tested"; - } else { - mc->desc = "Xilinx ZynqMP PMU machine (little endian)"; - } + mc->desc = "Xilinx ZynqMP PMU machine (little endian)"; mc->init = xlnx_zynqmp_pmu_init; } diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c index f4bff32..726368f 100644 --- a/hw/misc/aspeed_hace.c +++ b/hw/misc/aspeed_hace.c @@ -10,14 +10,17 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/log.h" #include "qemu/error-report.h" +#include "qemu/iov.h" #include "hw/misc/aspeed_hace.h" #include "qapi/error.h" #include "migration/vmstate.h" #include "crypto/hash.h" #include "hw/qdev-properties.h" #include "hw/irq.h" +#include "trace.h" #define R_CRYPT_CMD (0x10 / 4) @@ -27,9 +30,12 @@ #define TAG_IRQ BIT(15) #define R_HASH_SRC (0x20 / 4) -#define R_HASH_DEST (0x24 / 4) +#define R_HASH_DIGEST (0x24 / 4) #define R_HASH_KEY_BUFF (0x28 / 4) #define R_HASH_SRC_LEN (0x2c / 4) +#define R_HASH_SRC_HI (0x90 / 4) +#define R_HASH_DIGEST_HI (0x94 / 4) +#define R_HASH_KEY_BUFF_HI (0x98 / 4) #define R_HASH_CMD (0x30 / 4) /* Hash algorithm selection */ @@ -84,6 +90,42 @@ static const struct { QCRYPTO_HASH_ALGO_SHA256 }, }; +static void hace_hexdump(const char *desc, const char *buf, size_t size) +{ + g_autoptr(GString) str = g_string_sized_new(64); + size_t len; + size_t i; + + for (i = 0; i < size; i += len) { + len = MIN(16, size - i); + g_string_truncate(str, 0); + qemu_hexdump_line(str, buf + i, len, 1, 4); + trace_aspeed_hace_hexdump(desc, i, str->str); + } +} + +static void hace_iov_hexdump(const char *desc, const struct iovec *iov, + const unsigned int iov_cnt) +{ + size_t size = 0; + char *buf; + int i; + + for (i = 0; i < iov_cnt; i++) { + size += iov[i].iov_len; + } + + buf = g_malloc(size); + + if (!buf) { + return; + } + + iov_to_buf(iov, iov_cnt, 0, buf, size); + hace_hexdump(desc, buf, size); + g_free(buf); +} + static int hash_algo_lookup(uint32_t reg) { int i; @@ -142,171 +184,269 @@ static bool has_padding(AspeedHACEState *s, struct iovec *iov, return false; } -static int reconstruct_iov(AspeedHACEState *s, struct iovec *iov, int id, - uint32_t *pad_offset) +static uint64_t hash_get_source_addr(AspeedHACEState *s) { - int i, iov_count; - if (*pad_offset != 0) { - s->iov_cache[s->iov_count].iov_base = iov[id].iov_base; - s->iov_cache[s->iov_count].iov_len = *pad_offset; - ++s->iov_count; - } - for (i = 0; i < s->iov_count; i++) { - iov[i].iov_base = s->iov_cache[i].iov_base; - iov[i].iov_len = s->iov_cache[i].iov_len; + AspeedHACEClass *ahc = ASPEED_HACE_GET_CLASS(s); + uint64_t src_addr = 0; + + src_addr = deposit64(src_addr, 0, 32, s->regs[R_HASH_SRC]); + if (ahc->has_dma64) { + src_addr = deposit64(src_addr, 32, 32, s->regs[R_HASH_SRC_HI]); } - iov_count = s->iov_count; - s->iov_count = 0; - s->total_req_len = 0; - return iov_count; + + return src_addr; } -static void do_hash_operation(AspeedHACEState *s, int algo, bool sg_mode, - bool acc_mode) +static int hash_prepare_direct_iov(AspeedHACEState *s, struct iovec *iov, + bool acc_mode, bool *acc_final_request) { - struct iovec iov[ASPEED_HACE_MAX_SG]; uint32_t total_msg_len; uint32_t pad_offset; - g_autofree uint8_t *digest_buf = NULL; - size_t digest_len = 0; - bool sg_acc_mode_final_request = false; - int i; + uint64_t src; void *haddr; - Error *local_err = NULL; + hwaddr plen; + int iov_idx; + + plen = s->regs[R_HASH_SRC_LEN]; + src = hash_get_source_addr(s); + trace_aspeed_hace_hash_addr("src", src); + haddr = address_space_map(&s->dram_as, src, &plen, false, + MEMTXATTRS_UNSPECIFIED); + if (haddr == NULL) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: Unable to map address, addr=0x%" HWADDR_PRIx + " ,plen=0x%" HWADDR_PRIx "\n", + __func__, src, plen); + return -1; + } - if (acc_mode && s->hash_ctx == NULL) { - s->hash_ctx = qcrypto_hash_new(algo, &local_err); - if (s->hash_ctx == NULL) { - qemu_log_mask(LOG_GUEST_ERROR, "qcrypto hash failed : %s", - error_get_pretty(local_err)); - error_free(local_err); - return; + iov[0].iov_base = haddr; + iov_idx = 1; + + if (acc_mode) { + s->total_req_len += plen; + + if (has_padding(s, &iov[0], plen, &total_msg_len, + &pad_offset)) { + /* Padding being present indicates the final request */ + *acc_final_request = true; + iov[0].iov_len = pad_offset; + } else { + iov[0].iov_len = plen; } + } else { + iov[0].iov_len = plen; } - if (sg_mode) { - uint32_t len = 0; - - for (i = 0; !(len & SG_LIST_LEN_LAST); i++) { - uint32_t addr, src; - hwaddr plen; + return iov_idx; +} - if (i == ASPEED_HACE_MAX_SG) { - qemu_log_mask(LOG_GUEST_ERROR, - "aspeed_hace: guest failed to set end of sg list marker\n"); - break; - } +static int hash_prepare_sg_iov(AspeedHACEState *s, struct iovec *iov, + bool acc_mode, bool *acc_final_request) +{ + uint32_t total_msg_len; + uint32_t pad_offset; + uint32_t len = 0; + uint32_t sg_addr; + uint64_t src; + int iov_idx; + hwaddr plen; + void *haddr; - src = s->regs[R_HASH_SRC] + (i * SG_LIST_ENTRY_SIZE); + src = hash_get_source_addr(s); + for (iov_idx = 0; !(len & SG_LIST_LEN_LAST); iov_idx++) { + if (iov_idx == ASPEED_HACE_MAX_SG) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: Failed to set end of sg list marker\n", + __func__); + return -1; + } - len = address_space_ldl_le(&s->dram_as, src, + len = address_space_ldl_le(&s->dram_as, src, + MEMTXATTRS_UNSPECIFIED, NULL); + sg_addr = address_space_ldl_le(&s->dram_as, src + SG_LIST_LEN_SIZE, MEMTXATTRS_UNSPECIFIED, NULL); + sg_addr &= SG_LIST_ADDR_MASK; + trace_aspeed_hace_hash_sg(iov_idx, src, sg_addr, len); + /* + * To maintain compatibility with older SoCs such as the AST2600, + * the AST2700 HW automatically set bit 34 of the 64-bit sg_addr. + * As a result, the firmware only needs to provide a 32-bit sg_addr + * containing bits [31:0]. This is sufficient for the AST2700, as + * it uses a DRAM offset rather than a DRAM address. + */ + plen = len & SG_LIST_LEN_MASK; + haddr = address_space_map(&s->dram_as, sg_addr, &plen, false, + MEMTXATTRS_UNSPECIFIED); - addr = address_space_ldl_le(&s->dram_as, src + SG_LIST_LEN_SIZE, - MEMTXATTRS_UNSPECIFIED, NULL); - addr &= SG_LIST_ADDR_MASK; + if (haddr == NULL) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: Unable to map address, sg_addr=0x%x, " + "plen=0x%" HWADDR_PRIx "\n", + __func__, sg_addr, plen); + return -1; + } - plen = len & SG_LIST_LEN_MASK; - haddr = address_space_map(&s->dram_as, addr, &plen, false, - MEMTXATTRS_UNSPECIFIED); - if (haddr == NULL) { - qemu_log_mask(LOG_GUEST_ERROR, - "%s: qcrypto failed\n", __func__); - return; - } - iov[i].iov_base = haddr; - if (acc_mode) { - s->total_req_len += plen; - - if (has_padding(s, &iov[i], plen, &total_msg_len, - &pad_offset)) { - /* Padding being present indicates the final request */ - sg_acc_mode_final_request = true; - iov[i].iov_len = pad_offset; - } else { - iov[i].iov_len = plen; - } + src += SG_LIST_ENTRY_SIZE; + + iov[iov_idx].iov_base = haddr; + if (acc_mode) { + s->total_req_len += plen; + + if (has_padding(s, &iov[iov_idx], plen, &total_msg_len, + &pad_offset)) { + /* Padding being present indicates the final request */ + *acc_final_request = true; + iov[iov_idx].iov_len = pad_offset; } else { - iov[i].iov_len = plen; + iov[iov_idx].iov_len = plen; } + } else { + iov[iov_idx].iov_len = plen; } - } else { - hwaddr len = s->regs[R_HASH_SRC_LEN]; + } - haddr = address_space_map(&s->dram_as, s->regs[R_HASH_SRC], - &len, false, MEMTXATTRS_UNSPECIFIED); - if (haddr == NULL) { - qemu_log_mask(LOG_GUEST_ERROR, "%s: qcrypto failed\n", __func__); + return iov_idx; +} + +static uint64_t hash_get_digest_addr(AspeedHACEState *s) +{ + AspeedHACEClass *ahc = ASPEED_HACE_GET_CLASS(s); + uint64_t digest_addr = 0; + + digest_addr = deposit64(digest_addr, 0, 32, s->regs[R_HASH_DIGEST]); + if (ahc->has_dma64) { + digest_addr = deposit64(digest_addr, 32, 32, s->regs[R_HASH_DIGEST_HI]); + } + + return digest_addr; +} + +static void hash_write_digest_and_unmap_iov(AspeedHACEState *s, + struct iovec *iov, + int iov_idx, + uint8_t *digest_buf, + size_t digest_len) +{ + uint64_t digest_addr = 0; + + digest_addr = hash_get_digest_addr(s); + trace_aspeed_hace_hash_addr("digest", digest_addr); + if (address_space_write(&s->dram_as, digest_addr, + MEMTXATTRS_UNSPECIFIED, + digest_buf, digest_len)) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: Failed to write digest to 0x%" HWADDR_PRIx "\n", + __func__, digest_addr); + } + + if (trace_event_get_state_backends(TRACE_ASPEED_HACE_HEXDUMP)) { + hace_hexdump("digest", (char *)digest_buf, digest_len); + } + + for (; iov_idx > 0; iov_idx--) { + address_space_unmap(&s->dram_as, iov[iov_idx - 1].iov_base, + iov[iov_idx - 1].iov_len, false, + iov[iov_idx - 1].iov_len); + } +} + +static void hash_execute_non_acc_mode(AspeedHACEState *s, int algo, + struct iovec *iov, int iov_idx) +{ + g_autofree uint8_t *digest_buf = NULL; + Error *local_err = NULL; + size_t digest_len = 0; + + if (qcrypto_hash_bytesv(algo, iov, iov_idx, &digest_buf, + &digest_len, &local_err) < 0) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: qcrypto hash bytesv failed : %s", + __func__, error_get_pretty(local_err)); + error_free(local_err); + return; + } + + hash_write_digest_and_unmap_iov(s, iov, iov_idx, digest_buf, digest_len); +} + +static void hash_execute_acc_mode(AspeedHACEState *s, int algo, + struct iovec *iov, int iov_idx, + bool final_request) +{ + g_autofree uint8_t *digest_buf = NULL; + Error *local_err = NULL; + size_t digest_len = 0; + + trace_aspeed_hace_hash_execute_acc_mode(final_request); + + if (s->hash_ctx == NULL) { + s->hash_ctx = qcrypto_hash_new(algo, &local_err); + if (s->hash_ctx == NULL) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: qcrypto hash new failed : %s", + __func__, error_get_pretty(local_err)); + error_free(local_err); return; } - iov[0].iov_base = haddr; - iov[0].iov_len = len; - i = 1; - - if (s->iov_count) { - /* - * In aspeed sdk kernel driver, sg_mode is disabled in hash_final(). - * Thus if we received a request with sg_mode disabled, it is - * required to check whether cache is empty. If no, we should - * combine cached iov and the current iov. - */ - s->total_req_len += len; - if (has_padding(s, iov, len, &total_msg_len, &pad_offset)) { - i = reconstruct_iov(s, iov, 0, &pad_offset); - } - } } - if (acc_mode) { - if (qcrypto_hash_updatev(s->hash_ctx, iov, i, &local_err) < 0) { - qemu_log_mask(LOG_GUEST_ERROR, "qcrypto hash update failed : %s", - error_get_pretty(local_err)); + if (qcrypto_hash_updatev(s->hash_ctx, iov, iov_idx, &local_err) < 0) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: qcrypto hash updatev failed : %s", + __func__, error_get_pretty(local_err)); + error_free(local_err); + return; + } + + if (final_request) { + if (qcrypto_hash_finalize_bytes(s->hash_ctx, &digest_buf, + &digest_len, &local_err)) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: qcrypto hash finalize bytes failed : %s", + __func__, error_get_pretty(local_err)); error_free(local_err); - return; + local_err = NULL; } - if (sg_acc_mode_final_request) { - if (qcrypto_hash_finalize_bytes(s->hash_ctx, &digest_buf, - &digest_len, &local_err)) { - qemu_log_mask(LOG_GUEST_ERROR, - "qcrypto hash finalize failed : %s", - error_get_pretty(local_err)); - error_free(local_err); - local_err = NULL; - } + qcrypto_hash_free(s->hash_ctx); + + s->hash_ctx = NULL; + s->total_req_len = 0; + } - qcrypto_hash_free(s->hash_ctx); + hash_write_digest_and_unmap_iov(s, iov, iov_idx, digest_buf, digest_len); +} - s->hash_ctx = NULL; - s->iov_count = 0; - s->total_req_len = 0; - } - } else if (qcrypto_hash_bytesv(algo, iov, i, &digest_buf, - &digest_len, &local_err) < 0) { - qemu_log_mask(LOG_GUEST_ERROR, "qcrypto hash bytesv failed : %s", - error_get_pretty(local_err)); - error_free(local_err); - return; +static void do_hash_operation(AspeedHACEState *s, int algo, bool sg_mode, + bool acc_mode) +{ + QEMU_UNINITIALIZED struct iovec iov[ASPEED_HACE_MAX_SG]; + bool acc_final_request = false; + int iov_idx = -1; + + /* Prepares the iov for hashing operations based on the selected mode */ + if (sg_mode) { + iov_idx = hash_prepare_sg_iov(s, iov, acc_mode, &acc_final_request); + } else { + iov_idx = hash_prepare_direct_iov(s, iov, acc_mode, + &acc_final_request); } - if (address_space_write(&s->dram_as, s->regs[R_HASH_DEST], - MEMTXATTRS_UNSPECIFIED, - digest_buf, digest_len)) { + if (iov_idx <= 0) { qemu_log_mask(LOG_GUEST_ERROR, - "aspeed_hace: address space write failed\n"); + "%s: Failed to prepare iov\n", __func__); + return; } - for (; i > 0; i--) { - address_space_unmap(&s->dram_as, iov[i - 1].iov_base, - iov[i - 1].iov_len, false, - iov[i - 1].iov_len); + if (trace_event_get_state_backends(TRACE_ASPEED_HACE_HEXDUMP)) { + hace_iov_hexdump("plaintext", iov, iov_idx); } - /* - * Set status bits to indicate completion. Testing shows hardware sets - * these irrespective of HASH_IRQ_EN. - */ - s->regs[R_STATUS] |= HASH_IRQ; + /* Executes the hash operation */ + if (acc_mode) { + hash_execute_acc_mode(s, algo, iov, iov_idx, acc_final_request); + } else { + hash_execute_non_acc_mode(s, algo, iov, iov_idx); + } } static uint64_t aspeed_hace_read(void *opaque, hwaddr addr, unsigned int size) @@ -315,12 +455,7 @@ static uint64_t aspeed_hace_read(void *opaque, hwaddr addr, unsigned int size) addr >>= 2; - if (addr >= ASPEED_HACE_NR_REGS) { - qemu_log_mask(LOG_GUEST_ERROR, - "%s: Out-of-bounds read at offset 0x%" HWADDR_PRIx "\n", - __func__, addr << 2); - return 0; - } + trace_aspeed_hace_read(addr << 2, s->regs[addr]); return s->regs[addr]; } @@ -333,12 +468,7 @@ static void aspeed_hace_write(void *opaque, hwaddr addr, uint64_t data, addr >>= 2; - if (addr >= ASPEED_HACE_NR_REGS) { - qemu_log_mask(LOG_GUEST_ERROR, - "%s: Out-of-bounds write at offset 0x%" HWADDR_PRIx "\n", - __func__, addr << 2); - return; - } + trace_aspeed_hace_write(addr << 2, data); switch (addr) { case R_STATUS: @@ -362,7 +492,7 @@ static void aspeed_hace_write(void *opaque, hwaddr addr, uint64_t data, case R_HASH_SRC: data &= ahc->src_mask; break; - case R_HASH_DEST: + case R_HASH_DIGEST: data &= ahc->dest_mask; break; case R_HASH_KEY_BUFF: @@ -390,10 +520,16 @@ static void aspeed_hace_write(void *opaque, hwaddr addr, uint64_t data, qemu_log_mask(LOG_GUEST_ERROR, "%s: Invalid hash algorithm selection 0x%"PRIx64"\n", __func__, data & ahc->hash_mask); - break; + } else { + do_hash_operation(s, algo, data & HASH_SG_EN, + ((data & HASH_HMAC_MASK) == HASH_DIGEST_ACCUM)); } - do_hash_operation(s, algo, data & HASH_SG_EN, - ((data & HASH_HMAC_MASK) == HASH_DIGEST_ACCUM)); + + /* + * Set status bits to indicate completion. Testing shows hardware sets + * these irrespective of HASH_IRQ_EN. + */ + s->regs[R_STATUS] |= HASH_IRQ; if (data & HASH_IRQ_EN) { qemu_irq_raise(s->irq); @@ -410,6 +546,15 @@ static void aspeed_hace_write(void *opaque, hwaddr addr, uint64_t data, } } break; + case R_HASH_SRC_HI: + data &= ahc->src_hi_mask; + break; + case R_HASH_DIGEST_HI: + data &= ahc->dest_hi_mask; + break; + case R_HASH_KEY_BUFF_HI: + data &= ahc->key_hi_mask; + break; default: break; } @@ -430,14 +575,14 @@ static const MemoryRegionOps aspeed_hace_ops = { static void aspeed_hace_reset(DeviceState *dev) { struct AspeedHACEState *s = ASPEED_HACE(dev); + AspeedHACEClass *ahc = ASPEED_HACE_GET_CLASS(s); if (s->hash_ctx != NULL) { qcrypto_hash_free(s->hash_ctx); s->hash_ctx = NULL; } - memset(s->regs, 0, sizeof(s->regs)); - s->iov_count = 0; + memset(s->regs, 0, ahc->nr_regs << 2); s->total_req_len = 0; } @@ -445,11 +590,13 @@ static void aspeed_hace_realize(DeviceState *dev, Error **errp) { AspeedHACEState *s = ASPEED_HACE(dev); SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + AspeedHACEClass *ahc = ASPEED_HACE_GET_CLASS(s); sysbus_init_irq(sbd, &s->irq); + s->regs = g_new(uint32_t, ahc->nr_regs); memory_region_init_io(&s->iomem, OBJECT(s), &aspeed_hace_ops, s, - TYPE_ASPEED_HACE, 0x1000); + TYPE_ASPEED_HACE, ahc->nr_regs << 2); if (!s->dram_mr) { error_setg(errp, TYPE_ASPEED_HACE ": 'dram' link not set"); @@ -469,21 +616,28 @@ static const Property aspeed_hace_properties[] = { static const VMStateDescription vmstate_aspeed_hace = { .name = TYPE_ASPEED_HACE, - .version_id = 1, - .minimum_version_id = 1, + .version_id = 2, + .minimum_version_id = 2, .fields = (const VMStateField[]) { - VMSTATE_UINT32_ARRAY(regs, AspeedHACEState, ASPEED_HACE_NR_REGS), VMSTATE_UINT32(total_req_len, AspeedHACEState), - VMSTATE_UINT32(iov_count, AspeedHACEState), VMSTATE_END_OF_LIST(), } }; +static void aspeed_hace_unrealize(DeviceState *dev) +{ + AspeedHACEState *s = ASPEED_HACE(dev); + + g_free(s->regs); + s->regs = NULL; +} + static void aspeed_hace_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = aspeed_hace_realize; + dc->unrealize = aspeed_hace_unrealize; device_class_set_legacy_reset(dc, aspeed_hace_reset); device_class_set_props(dc, aspeed_hace_properties); dc->vmsd = &vmstate_aspeed_hace; @@ -504,6 +658,7 @@ static void aspeed_ast2400_hace_class_init(ObjectClass *klass, const void *data) dc->desc = "AST2400 Hash and Crypto Engine"; + ahc->nr_regs = 0x64 >> 2; ahc->src_mask = 0x0FFFFFFF; ahc->dest_mask = 0x0FFFFFF8; ahc->key_mask = 0x0FFFFFC0; @@ -523,6 +678,7 @@ static void aspeed_ast2500_hace_class_init(ObjectClass *klass, const void *data) dc->desc = "AST2500 Hash and Crypto Engine"; + ahc->nr_regs = 0x64 >> 2; ahc->src_mask = 0x3fffffff; ahc->dest_mask = 0x3ffffff8; ahc->key_mask = 0x3FFFFFC0; @@ -542,6 +698,7 @@ static void aspeed_ast2600_hace_class_init(ObjectClass *klass, const void *data) dc->desc = "AST2600 Hash and Crypto Engine"; + ahc->nr_regs = 0x64 >> 2; ahc->src_mask = 0x7FFFFFFF; ahc->dest_mask = 0x7FFFFFF8; ahc->key_mask = 0x7FFFFFF8; @@ -561,6 +718,7 @@ static void aspeed_ast1030_hace_class_init(ObjectClass *klass, const void *data) dc->desc = "AST1030 Hash and Crypto Engine"; + ahc->nr_regs = 0x64 >> 2; ahc->src_mask = 0x7FFFFFFF; ahc->dest_mask = 0x7FFFFFF8; ahc->key_mask = 0x7FFFFFF8; @@ -580,17 +738,36 @@ static void aspeed_ast2700_hace_class_init(ObjectClass *klass, const void *data) dc->desc = "AST2700 Hash and Crypto Engine"; + ahc->nr_regs = 0x9C >> 2; ahc->src_mask = 0x7FFFFFFF; ahc->dest_mask = 0x7FFFFFF8; ahc->key_mask = 0x7FFFFFF8; ahc->hash_mask = 0x00147FFF; /* + * The AST2700 supports a maximum DRAM size of 8 GB, with a DRAM + * addressable range from 0x0_0000_0000 to 0x1_FFFF_FFFF. Since this range + * fits within 34 bits, only bits [33:0] are needed to store the DRAM + * offset. To optimize address storage, the high physical address bits + * [1:0] of the source, digest and key buffer addresses are stored as + * dram_offset bits [33:32]. + * + * This approach eliminates the need to reduce the high part of the DRAM + * physical address for DMA operations. Previously, this was calculated as + * (high physical address bits [7:0] - 4), since the DRAM start address is + * 0x4_00000000, making the high part address [7:0] - 4. + */ + ahc->src_hi_mask = 0x00000003; + ahc->dest_hi_mask = 0x00000003; + ahc->key_hi_mask = 0x00000003; + + /* * Currently, it does not support the CRYPT command. Instead, it only * sends an interrupt to notify the firmware that the crypt command * has completed. It is a temporary workaround. */ ahc->raise_crypt_interrupt_workaround = true; + ahc->has_dma64 = true; } static const TypeInfo aspeed_ast2700_hace_info = { diff --git a/hw/misc/aspeed_scu.c b/hw/misc/aspeed_scu.c index 4930e00..a0ab5ee 100644 --- a/hw/misc/aspeed_scu.c +++ b/hw/misc/aspeed_scu.c @@ -91,6 +91,7 @@ #define BMC_DEV_ID TO_REG(0x1A4) #define AST2600_PROT_KEY TO_REG(0x00) +#define AST2600_PROT_KEY2 TO_REG(0x10) #define AST2600_SILICON_REV TO_REG(0x04) #define AST2600_SILICON_REV2 TO_REG(0x14) #define AST2600_SYS_RST_CTRL TO_REG(0x40) @@ -176,6 +177,7 @@ #define AST2700_SCUIO_UARTCLK_GEN TO_REG(0x330) #define AST2700_SCUIO_HUARTCLK_GEN TO_REG(0x334) #define AST2700_SCUIO_CLK_DUTY_MEAS_RST TO_REG(0x388) +#define AST2700_SCUIO_FREQ_CNT_CTL TO_REG(0x3A0) #define SCU_IO_REGION_SIZE 0x1000 @@ -722,6 +724,8 @@ static void aspeed_ast2600_scu_write(void *opaque, hwaddr offset, int reg = TO_REG(offset); /* Truncate here so bitwise operations below behave as expected */ uint32_t data = data64; + bool prot_data_state = data == ASPEED_SCU_PROT_KEY; + bool unlocked = s->regs[AST2600_PROT_KEY] && s->regs[AST2600_PROT_KEY2]; if (reg >= ASPEED_AST2600_SCU_NR_REGS) { qemu_log_mask(LOG_GUEST_ERROR, @@ -730,15 +734,24 @@ static void aspeed_ast2600_scu_write(void *opaque, hwaddr offset, return; } - if (reg > PROT_KEY && !s->regs[PROT_KEY]) { + if ((reg != AST2600_PROT_KEY && reg != AST2600_PROT_KEY2) && !unlocked) { qemu_log_mask(LOG_GUEST_ERROR, "%s: SCU is locked!\n", __func__); + return; } trace_aspeed_scu_write(offset, size, data); switch (reg) { case AST2600_PROT_KEY: - s->regs[reg] = (data == ASPEED_SCU_PROT_KEY) ? 1 : 0; + /* + * Writing a value to SCU000 will modify both protection + * registers to each protection register individually. + */ + s->regs[AST2600_PROT_KEY] = prot_data_state; + s->regs[AST2600_PROT_KEY2] = prot_data_state; + return; + case AST2600_PROT_KEY2: + s->regs[AST2600_PROT_KEY2] = prot_data_state; return; case AST2600_HW_STRAP1: case AST2600_HW_STRAP2: @@ -1022,6 +1035,10 @@ static void aspeed_ast2700_scuio_write(void *opaque, hwaddr offset, s->regs[reg - 1] ^= data; updated = true; break; + case AST2700_SCUIO_FREQ_CNT_CTL: + s->regs[reg] = deposit32(s->regs[reg], 6, 1, !!(data & BIT(1))); + updated = true; + break; default: qemu_log_mask(LOG_GUEST_ERROR, "%s: Unhandled write at offset 0x%" HWADDR_PRIx "\n", @@ -1066,6 +1083,7 @@ static const uint32_t ast2700_a0_resets_io[ASPEED_AST2700_SCU_NR_REGS] = { [AST2700_SCUIO_UARTCLK_GEN] = 0x00014506, [AST2700_SCUIO_HUARTCLK_GEN] = 0x000145c0, [AST2700_SCUIO_CLK_DUTY_MEAS_RST] = 0x0c9100d2, + [AST2700_SCUIO_FREQ_CNT_CTL] = 0x00000080, }; static void aspeed_2700_scuio_class_init(ObjectClass *klass, const void *data) diff --git a/hw/misc/aspeed_sdmc.c b/hw/misc/aspeed_sdmc.c index f04d993..dff7cc3 100644 --- a/hw/misc/aspeed_sdmc.c +++ b/hw/misc/aspeed_sdmc.c @@ -570,6 +570,9 @@ static void aspeed_2700_sdmc_reset(DeviceState *dev) /* Set ram size bit and defaults values */ s->regs[R_MAIN_CONF] = asc->compute_conf(s, 0); + /* Skipping dram init */ + s->regs[R_MAIN_CONTROL] = BIT(16); + if (s->unlocked) { s->regs[R_2700_PROT] = PROT_UNLOCKED; } diff --git a/hw/misc/ivshmem-flat.c b/hw/misc/ivshmem-flat.c index be28c24..fe4be6b 100644 --- a/hw/misc/ivshmem-flat.c +++ b/hw/misc/ivshmem-flat.c @@ -362,7 +362,7 @@ static bool ivshmem_flat_connect_server(DeviceState *dev, Error **errp) * * ivshmem_flat_recv_msg() calls return 'msg' and 'fd'. * - * See ./docs/specs/ivshmem-spec.txt for details on the protocol. + * See docs/specs/ivshmem-spec.rst for details on the protocol. */ /* Step 0 */ diff --git a/hw/misc/stm32_rcc.c b/hw/misc/stm32_rcc.c index 94e8dae..5815b3e 100644 --- a/hw/misc/stm32_rcc.c +++ b/hw/misc/stm32_rcc.c @@ -60,7 +60,7 @@ static void stm32_rcc_write(void *opaque, hwaddr addr, uint32_t value = val64; uint32_t prev_value, new_value, irq_offset; - trace_stm32_rcc_write(value, addr); + trace_stm32_rcc_write(addr, value); if (addr > STM32_RCC_DCKCFGR2) { qemu_log_mask(LOG_GUEST_ERROR, "%s: Bad offset 0x%"HWADDR_PRIx"\n", diff --git a/hw/misc/trace-events b/hw/misc/trace-events index 4383808..e3f64c0 100644 --- a/hw/misc/trace-events +++ b/hw/misc/trace-events @@ -302,6 +302,14 @@ aspeed_peci_read(uint64_t offset, uint64_t data) "offset 0x%" PRIx64 " data 0x%" aspeed_peci_write(uint64_t offset, uint64_t data) "offset 0x%" PRIx64 " data 0x%" PRIx64 aspeed_peci_raise_interrupt(uint32_t ctrl, uint32_t status) "ctrl 0x%" PRIx32 " status 0x%" PRIx32 +# aspeed_hace.c +aspeed_hace_read(uint64_t offset, uint64_t data) "offset 0x%" PRIx64 " data 0x%" PRIx64 +aspeed_hace_write(uint64_t offset, uint64_t data) "offset 0x%" PRIx64 " data 0x%" PRIx64 +aspeed_hace_hash_sg(int index, uint64_t list_addr, uint64_t buf_addr, uint32_t len) "%d: list_addr 0x%" PRIx64 " buf_addr 0x%" PRIx64 " len 0x%" PRIx32 +aspeed_hace_hash_addr(const char *s, uint64_t addr) "%s: 0x%" PRIx64 +aspeed_hace_hash_execute_acc_mode(bool final_request) "final request: %d" +aspeed_hace_hexdump(const char *desc, uint32_t offset, char *s) "%s: 0x%08x: %s" + # bcm2835_property.c bcm2835_mbox_property(uint32_t tag, uint32_t bufsize, size_t resplen) "mbox property tag:0x%08x in_sz:%u out_sz:%zu" diff --git a/hw/net/e1000.c b/hw/net/e1000.c index cba4999..a80a7b0 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -127,10 +127,8 @@ struct E1000State_st { QEMUTimer *flush_queue_timer; /* Compatibility flags for migration to/from qemu 1.3.0 and older */ -#define E1000_FLAG_MAC_BIT 2 #define E1000_FLAG_TSO_BIT 3 #define E1000_FLAG_VET_BIT 4 -#define E1000_FLAG_MAC (1 << E1000_FLAG_MAC_BIT) #define E1000_FLAG_TSO (1 << E1000_FLAG_TSO_BIT) #define E1000_FLAG_VET (1 << E1000_FLAG_VET_BIT) @@ -1212,52 +1210,51 @@ enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) }; enum { MAC_ACCESS_PARTIAL = 1, MAC_ACCESS_FLAG_NEEDED = 2 }; -#define markflag(x) ((E1000_FLAG_##x << 2) | MAC_ACCESS_FLAG_NEEDED) /* In the array below the meaning of the bits is: [f|f|f|f|f|f|n|p] * f - flag bits (up to 6 possible flags) * n - flag needed - * p - partially implenented */ + * p - partially implemented */ static const uint8_t mac_reg_access[0x8000] = { - [IPAV] = markflag(MAC), [WUC] = markflag(MAC), - [IP6AT] = markflag(MAC), [IP4AT] = markflag(MAC), - [FFVT] = markflag(MAC), [WUPM] = markflag(MAC), - [ECOL] = markflag(MAC), [MCC] = markflag(MAC), - [DC] = markflag(MAC), [TNCRS] = markflag(MAC), - [RLEC] = markflag(MAC), [XONRXC] = markflag(MAC), - [XOFFTXC] = markflag(MAC), [RFC] = markflag(MAC), - [TSCTFC] = markflag(MAC), [MGTPRC] = markflag(MAC), - [WUS] = markflag(MAC), [AIT] = markflag(MAC), - [FFLT] = markflag(MAC), [FFMT] = markflag(MAC), - [SCC] = markflag(MAC), [FCRUC] = markflag(MAC), - [LATECOL] = markflag(MAC), [COLC] = markflag(MAC), - [SEQEC] = markflag(MAC), [CEXTERR] = markflag(MAC), - [XONTXC] = markflag(MAC), [XOFFRXC] = markflag(MAC), - [RJC] = markflag(MAC), [RNBC] = markflag(MAC), - [MGTPDC] = markflag(MAC), [MGTPTC] = markflag(MAC), - [RUC] = markflag(MAC), [ROC] = markflag(MAC), - [GORCL] = markflag(MAC), [GORCH] = markflag(MAC), - [GOTCL] = markflag(MAC), [GOTCH] = markflag(MAC), - [BPRC] = markflag(MAC), [MPRC] = markflag(MAC), - [TSCTC] = markflag(MAC), [PRC64] = markflag(MAC), - [PRC127] = markflag(MAC), [PRC255] = markflag(MAC), - [PRC511] = markflag(MAC), [PRC1023] = markflag(MAC), - [PRC1522] = markflag(MAC), [PTC64] = markflag(MAC), - [PTC127] = markflag(MAC), [PTC255] = markflag(MAC), - [PTC511] = markflag(MAC), [PTC1023] = markflag(MAC), - [PTC1522] = markflag(MAC), [MPTC] = markflag(MAC), - [BPTC] = markflag(MAC), - - [TDFH] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [TDFT] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [TDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [TDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [TDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [RDFH] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [RDFT] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [RDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [RDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [RDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL, - [PBM] = markflag(MAC) | MAC_ACCESS_PARTIAL, + [IPAV] = MAC_ACCESS_FLAG_NEEDED, [WUC] = MAC_ACCESS_FLAG_NEEDED, + [IP6AT] = MAC_ACCESS_FLAG_NEEDED, [IP4AT] = MAC_ACCESS_FLAG_NEEDED, + [FFVT] = MAC_ACCESS_FLAG_NEEDED, [WUPM] = MAC_ACCESS_FLAG_NEEDED, + [ECOL] = MAC_ACCESS_FLAG_NEEDED, [MCC] = MAC_ACCESS_FLAG_NEEDED, + [DC] = MAC_ACCESS_FLAG_NEEDED, [TNCRS] = MAC_ACCESS_FLAG_NEEDED, + [RLEC] = MAC_ACCESS_FLAG_NEEDED, [XONRXC] = MAC_ACCESS_FLAG_NEEDED, + [XOFFTXC] = MAC_ACCESS_FLAG_NEEDED, [RFC] = MAC_ACCESS_FLAG_NEEDED, + [TSCTFC] = MAC_ACCESS_FLAG_NEEDED, [MGTPRC] = MAC_ACCESS_FLAG_NEEDED, + [WUS] = MAC_ACCESS_FLAG_NEEDED, [AIT] = MAC_ACCESS_FLAG_NEEDED, + [FFLT] = MAC_ACCESS_FLAG_NEEDED, [FFMT] = MAC_ACCESS_FLAG_NEEDED, + [SCC] = MAC_ACCESS_FLAG_NEEDED, [FCRUC] = MAC_ACCESS_FLAG_NEEDED, + [LATECOL] = MAC_ACCESS_FLAG_NEEDED, [COLC] = MAC_ACCESS_FLAG_NEEDED, + [SEQEC] = MAC_ACCESS_FLAG_NEEDED, [CEXTERR] = MAC_ACCESS_FLAG_NEEDED, + [XONTXC] = MAC_ACCESS_FLAG_NEEDED, [XOFFRXC] = MAC_ACCESS_FLAG_NEEDED, + [RJC] = MAC_ACCESS_FLAG_NEEDED, [RNBC] = MAC_ACCESS_FLAG_NEEDED, + [MGTPDC] = MAC_ACCESS_FLAG_NEEDED, [MGTPTC] = MAC_ACCESS_FLAG_NEEDED, + [RUC] = MAC_ACCESS_FLAG_NEEDED, [ROC] = MAC_ACCESS_FLAG_NEEDED, + [GORCL] = MAC_ACCESS_FLAG_NEEDED, [GORCH] = MAC_ACCESS_FLAG_NEEDED, + [GOTCL] = MAC_ACCESS_FLAG_NEEDED, [GOTCH] = MAC_ACCESS_FLAG_NEEDED, + [BPRC] = MAC_ACCESS_FLAG_NEEDED, [MPRC] = MAC_ACCESS_FLAG_NEEDED, + [TSCTC] = MAC_ACCESS_FLAG_NEEDED, [PRC64] = MAC_ACCESS_FLAG_NEEDED, + [PRC127] = MAC_ACCESS_FLAG_NEEDED, [PRC255] = MAC_ACCESS_FLAG_NEEDED, + [PRC511] = MAC_ACCESS_FLAG_NEEDED, [PRC1023] = MAC_ACCESS_FLAG_NEEDED, + [PRC1522] = MAC_ACCESS_FLAG_NEEDED, [PTC64] = MAC_ACCESS_FLAG_NEEDED, + [PTC127] = MAC_ACCESS_FLAG_NEEDED, [PTC255] = MAC_ACCESS_FLAG_NEEDED, + [PTC511] = MAC_ACCESS_FLAG_NEEDED, [PTC1023] = MAC_ACCESS_FLAG_NEEDED, + [PTC1522] = MAC_ACCESS_FLAG_NEEDED, [MPTC] = MAC_ACCESS_FLAG_NEEDED, + [BPTC] = MAC_ACCESS_FLAG_NEEDED, + + [TDFH] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [TDFT] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [TDFHS] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [TDFTS] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [TDFPC] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [RDFH] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [RDFT] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [RDFHS] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [RDFTS] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [RDFPC] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, + [PBM] = MAC_ACCESS_FLAG_NEEDED | MAC_ACCESS_PARTIAL, }; static void @@ -1419,13 +1416,6 @@ static int e1000_tx_tso_post_load(void *opaque, int version_id) return 0; } -static bool e1000_full_mac_needed(void *opaque) -{ - E1000State *s = opaque; - - return chkflag(MAC); -} - static bool e1000_tso_state_needed(void *opaque) { E1000State *s = opaque; @@ -1451,7 +1441,6 @@ static const VMStateDescription vmstate_e1000_full_mac_state = { .name = "e1000/full_mac_state", .version_id = 1, .minimum_version_id = 1, - .needed = e1000_full_mac_needed, .fields = (const VMStateField[]) { VMSTATE_UINT32_ARRAY(mac_reg, E1000State, 0x8000), VMSTATE_END_OF_LIST() @@ -1679,8 +1668,6 @@ static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp) static const Property e1000_properties[] = { DEFINE_NIC_PROPERTIES(E1000State, conf), - DEFINE_PROP_BIT("extra_mac_registers", E1000State, - compat_flags, E1000_FLAG_MAC_BIT, true), DEFINE_PROP_BIT("migrate_tso_props", E1000State, compat_flags, E1000_FLAG_TSO_BIT, true), DEFINE_PROP_BIT("init-vet", E1000State, diff --git a/hw/net/fsl_etsec/etsec.c b/hw/net/fsl_etsec/etsec.c index d14cb2a..846f6cb 100644 --- a/hw/net/fsl_etsec/etsec.c +++ b/hw/net/fsl_etsec/etsec.c @@ -389,6 +389,7 @@ static void etsec_realize(DeviceState *dev, Error **errp) { eTSEC *etsec = ETSEC_COMMON(dev); + qemu_macaddr_default_if_unset(&etsec->conf.macaddr); etsec->nic = qemu_new_nic(&net_etsec_info, &etsec->conf, object_get_typename(OBJECT(dev)), dev->id, &dev->mem_reentrancy_guard, etsec); diff --git a/hw/net/i82596.c b/hw/net/i82596.c index 64ed3c8..c1ff3e6 100644 --- a/hw/net/i82596.c +++ b/hw/net/i82596.c @@ -5,7 +5,7 @@ * This work is licensed under the GNU GPL license version 2 or later. * * This software was written to be compatible with the specification: - * https://www.intel.com/assets/pdf/general/82596ca.pdf + * https://parisc.docs.kernel.org/en/latest/_downloads/96672be0650d9fc046bbcea40b92482f/82596CA.pdf */ #include "qemu/osdep.h" @@ -177,6 +177,26 @@ static void set_individual_address(I82596State *s, uint32_t addr) trace_i82596_new_mac(nc->info_str); } +static void i82596_configure(I82596State *s, uint32_t addr) +{ + uint8_t byte_cnt; + byte_cnt = get_byte(addr + 8) & 0x0f; + + byte_cnt = MAX(byte_cnt, 4); + byte_cnt = MIN(byte_cnt, sizeof(s->config)); + /* copy byte_cnt max. */ + address_space_read(&address_space_memory, addr + 8, + MEMTXATTRS_UNSPECIFIED, s->config, byte_cnt); + /* config byte according to page 35ff */ + s->config[2] &= 0x82; /* mask valid bits */ + s->config[2] |= 0x40; + s->config[7] &= 0xf7; /* clear zero bit */ + assert(I596_NOCRC_INS == 0); /* do CRC insertion */ + s->config[10] = MAX(s->config[10], 5); /* min frame length */ + s->config[12] &= 0x40; /* only full duplex field valid */ + s->config[13] |= 0x3f; /* set ones in byte 13 */ +} + static void set_multicast_list(I82596State *s, uint32_t addr) { uint16_t mc_count, i; @@ -234,7 +254,6 @@ static void command_loop(I82596State *s) { uint16_t cmd; uint16_t status; - uint8_t byte_cnt; DBG(printf("STARTING COMMAND LOOP cmd_p=%08x\n", s->cmd_p)); @@ -254,20 +273,7 @@ static void command_loop(I82596State *s) set_individual_address(s, s->cmd_p); break; case CmdConfigure: - byte_cnt = get_byte(s->cmd_p + 8) & 0x0f; - byte_cnt = MAX(byte_cnt, 4); - byte_cnt = MIN(byte_cnt, sizeof(s->config)); - /* copy byte_cnt max. */ - address_space_read(&address_space_memory, s->cmd_p + 8, - MEMTXATTRS_UNSPECIFIED, s->config, byte_cnt); - /* config byte according to page 35ff */ - s->config[2] &= 0x82; /* mask valid bits */ - s->config[2] |= 0x40; - s->config[7] &= 0xf7; /* clear zero bit */ - assert(I596_NOCRC_INS == 0); /* do CRC insertion */ - s->config[10] = MAX(s->config[10], 5); /* min frame length */ - s->config[12] &= 0x40; /* only full duplex field valid */ - s->config[13] |= 0x3f; /* set ones in byte 13 */ + i82596_configure(s, s->cmd_p); break; case CmdTDR: /* get signal LINK */ diff --git a/hw/net/rocker/rocker.h b/hw/net/rocker/rocker.h index 6e0962f..ae06c1c 100644 --- a/hw/net/rocker/rocker.h +++ b/hw/net/rocker/rocker.h @@ -36,15 +36,7 @@ static inline G_GNUC_PRINTF(1, 2) int DPRINTF(const char *fmt, ...) } #endif -#define __le16 uint16_t -#define __le32 uint32_t -#define __le64 uint64_t - -#define __be16 uint16_t -#define __be32 uint32_t -#define __be64 uint64_t - -static inline bool ipv4_addr_is_multicast(__be32 addr) +static inline bool ipv4_addr_is_multicast(uint32_t addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } @@ -52,8 +44,8 @@ static inline bool ipv4_addr_is_multicast(__be32 addr) typedef struct ipv6_addr { union { uint8_t addr8[16]; - __be16 addr16[8]; - __be32 addr32[4]; + uint16_t addr16[8]; + uint32_t addr32[4]; }; } Ipv6Addr; diff --git a/hw/net/rocker/rocker_hw.h b/hw/net/rocker/rocker_hw.h index 1786323..7ec6bfb 100644 --- a/hw/net/rocker/rocker_hw.h +++ b/hw/net/rocker/rocker_hw.h @@ -9,10 +9,6 @@ #ifndef ROCKER_HW_H #define ROCKER_HW_H -#define __le16 uint16_t -#define __le32 uint32_t -#define __le64 uint64_t - /* * Return codes */ @@ -124,12 +120,12 @@ enum { */ typedef struct rocker_desc { - __le64 buf_addr; + uint64_t buf_addr; uint64_t cookie; - __le16 buf_size; - __le16 tlv_size; - __le16 rsvd[5]; /* pad to 32 bytes */ - __le16 comp_err; + uint16_t buf_size; + uint16_t tlv_size; + uint16_t rsvd[5]; /* pad to 32 bytes */ + uint16_t comp_err; } __attribute__((packed, aligned(8))) RockerDesc; /* @@ -137,9 +133,9 @@ typedef struct rocker_desc { */ typedef struct rocker_tlv { - __le32 type; - __le16 len; - __le16 rsvd; + uint32_t type; + uint16_t len; + uint16_t rsvd; } __attribute__((packed, aligned(8))) RockerTlv; /* cmd msg */ diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c index 3378f63..4aed178 100644 --- a/hw/net/rocker/rocker_of_dpa.c +++ b/hw/net/rocker/rocker_of_dpa.c @@ -52,10 +52,10 @@ typedef struct of_dpa_flow_key { uint32_t tunnel_id; /* overlay tunnel id */ uint32_t tbl_id; /* table id */ struct { - __be16 vlan_id; /* 0 if no VLAN */ + uint16_t vlan_id; /* 0 if no VLAN */ MACAddr src; /* ethernet source address */ MACAddr dst; /* ethernet destination address */ - __be16 type; /* ethernet frame type */ + uint16_t type; /* ethernet frame type */ } eth; struct { uint8_t proto; /* IP protocol or ARP opcode */ @@ -66,14 +66,14 @@ typedef struct of_dpa_flow_key { union { struct { struct { - __be32 src; /* IP source address */ - __be32 dst; /* IP destination address */ + uint32_t src; /* IP source address */ + uint32_t dst; /* IP destination address */ } addr; union { struct { - __be16 src; /* TCP/UDP/SCTP source port */ - __be16 dst; /* TCP/UDP/SCTP destination port */ - __be16 flags; /* TCP flags */ + uint16_t src; /* TCP/UDP/SCTP source port */ + uint16_t dst; /* TCP/UDP/SCTP destination port */ + uint16_t flags; /* TCP flags */ } tp; struct { MACAddr sha; /* ARP source hardware address */ @@ -86,11 +86,11 @@ typedef struct of_dpa_flow_key { Ipv6Addr src; /* IPv6 source address */ Ipv6Addr dst; /* IPv6 destination address */ } addr; - __be32 label; /* IPv6 flow label */ + uint32_t label; /* IPv6 flow label */ struct { - __be16 src; /* TCP/UDP/SCTP source port */ - __be16 dst; /* TCP/UDP/SCTP destination port */ - __be16 flags; /* TCP flags */ + uint16_t src; /* TCP/UDP/SCTP source port */ + uint16_t dst; /* TCP/UDP/SCTP destination port */ + uint16_t flags; /* TCP flags */ } tp; struct { Ipv6Addr target; /* ND target address */ @@ -112,13 +112,13 @@ typedef struct of_dpa_flow_action { struct { uint32_t group_id; uint32_t tun_log_lport; - __be16 vlan_id; + uint16_t vlan_id; } write; struct { - __be16 new_vlan_id; + uint16_t new_vlan_id; uint32_t out_pport; uint8_t copy_to_cpu; - __be16 vlan_id; + uint16_t vlan_id; } apply; } OfDpaFlowAction; @@ -143,7 +143,7 @@ typedef struct of_dpa_flow { typedef struct of_dpa_flow_pkt_fields { uint32_t tunnel_id; struct eth_header *ethhdr; - __be16 *h_proto; + uint16_t *h_proto; struct vlan_header *vlanhdr; struct ip_header *ipv4hdr; struct ip6_header *ipv6hdr; @@ -180,7 +180,7 @@ typedef struct of_dpa_group { uint32_t group_id; MACAddr src_mac; MACAddr dst_mac; - __be16 vlan_id; + uint16_t vlan_id; } l2_rewrite; struct { uint16_t group_count; @@ -190,13 +190,13 @@ typedef struct of_dpa_group { uint32_t group_id; MACAddr src_mac; MACAddr dst_mac; - __be16 vlan_id; + uint16_t vlan_id; uint8_t ttl_check; } l3_unicast; }; } OfDpaGroup; -static int of_dpa_mask2prefix(__be32 mask) +static int of_dpa_mask2prefix(uint32_t mask) { int i; int count = 32; @@ -451,7 +451,7 @@ static void of_dpa_flow_pkt_parse(OfDpaFlowContext *fc, fc->iovcnt = iovcnt + 2; } -static void of_dpa_flow_pkt_insert_vlan(OfDpaFlowContext *fc, __be16 vlan_id) +static void of_dpa_flow_pkt_insert_vlan(OfDpaFlowContext *fc, uint16_t vlan_id) { OfDpaFlowPktFields *fields = &fc->fields; uint16_t h_proto = fields->ethhdr->h_proto; @@ -486,7 +486,7 @@ static void of_dpa_flow_pkt_strip_vlan(OfDpaFlowContext *fc) static void of_dpa_flow_pkt_hdr_rewrite(OfDpaFlowContext *fc, uint8_t *src_mac, uint8_t *dst_mac, - __be16 vlan_id) + uint16_t vlan_id) { OfDpaFlowPktFields *fields = &fc->fields; diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c index 15b8f75..654a087 100644 --- a/hw/net/rtl8139.c +++ b/hw/net/rtl8139.c @@ -1816,7 +1816,7 @@ static int rtl8139_transmit_one(RTL8139State *s, int descriptor) PCIDevice *d = PCI_DEVICE(s); int txsize = s->TxStatus[descriptor] & 0x1fff; - uint8_t txbuffer[0x2000]; + QEMU_UNINITIALIZED uint8_t txbuffer[0x2000]; DPRINTF("+++ transmit reading %d bytes from host memory at 0x%08x\n", txsize, s->TxAddr[descriptor]); diff --git a/hw/net/tulip.c b/hw/net/tulip.c index 63fe513..319af90 100644 --- a/hw/net/tulip.c +++ b/hw/net/tulip.c @@ -629,7 +629,7 @@ static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n) static void tulip_setup_frame(TULIPState *s, struct tulip_descriptor *desc) { - uint8_t buf[4096]; + QEMU_UNINITIALIZED uint8_t buf[4096]; int len = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK; int i; diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 221252e..eb93607 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1911,9 +1911,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, VirtIONet *n = qemu_get_nic_opaque(nc); VirtIONetQueue *q; VirtIODevice *vdev = VIRTIO_DEVICE(n); - VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE]; - size_t lens[VIRTQUEUE_MAX_SIZE]; - struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + QEMU_UNINITIALIZED VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE]; + QEMU_UNINITIALIZED size_t lens[VIRTQUEUE_MAX_SIZE]; + QEMU_UNINITIALIZED struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; struct virtio_net_hdr_v1_hash extra_hdr; unsigned mhdr_cnt = 0; size_t offset, i, guest_offset, j; diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 83d942a..7c0ca56 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -41,19 +41,9 @@ #define PCI_DEVICE_ID_VMWARE_VMXNET3_REVISION 0x1 #define VMXNET3_MSIX_BAR_SIZE 0x2000 -/* Compatibility flags for migration */ -#define VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS_BIT 0 -#define VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS \ - (1 << VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS_BIT) -#define VMXNET3_COMPAT_FLAG_DISABLE_PCIE_BIT 1 -#define VMXNET3_COMPAT_FLAG_DISABLE_PCIE \ - (1 << VMXNET3_COMPAT_FLAG_DISABLE_PCIE_BIT) - #define VMXNET3_EXP_EP_OFFSET (0x48) -#define VMXNET3_MSI_OFFSET(s) \ - ((s)->compat_flags & VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS ? 0x50 : 0x84) -#define VMXNET3_MSIX_OFFSET(s) \ - ((s)->compat_flags & VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS ? 0 : 0x9c) +#define VMXNET3_MSI_OFFSET (0x84) +#define VMXNET3_MSIX_OFFSET (0x9c) #define VMXNET3_DSN_OFFSET (0x100) #define VMXNET3_BAR0_IDX (0) @@ -61,8 +51,7 @@ #define VMXNET3_MSIX_BAR_IDX (2) #define VMXNET3_OFF_MSIX_TABLE (0x000) -#define VMXNET3_OFF_MSIX_PBA(s) \ - ((s)->compat_flags & VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS ? 0x800 : 0x1000) +#define VMXNET3_OFF_MSIX_PBA (0x1000) /* Link speed in Mbps should be shifted by 16 */ #define VMXNET3_LINK_SPEED (1000 << 16) @@ -2122,8 +2111,8 @@ vmxnet3_init_msix(VMXNET3State *s) &s->msix_bar, VMXNET3_MSIX_BAR_IDX, VMXNET3_OFF_MSIX_TABLE, &s->msix_bar, - VMXNET3_MSIX_BAR_IDX, VMXNET3_OFF_MSIX_PBA(s), - VMXNET3_MSIX_OFFSET(s), NULL); + VMXNET3_MSIX_BAR_IDX, VMXNET3_OFF_MSIX_PBA, + VMXNET3_MSIX_OFFSET, NULL); if (0 > res) { VMW_WRPRN("Failed to initialize MSI-X, error %d", res); @@ -2221,7 +2210,7 @@ static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error **errp) /* Interrupt pin A */ pci_dev->config[PCI_INTERRUPT_PIN] = 0x01; - ret = msi_init(pci_dev, VMXNET3_MSI_OFFSET(s), VMXNET3_MAX_NMSIX_INTRS, + ret = msi_init(pci_dev, VMXNET3_MSI_OFFSET, VMXNET3_MAX_NMSIX_INTRS, VMXNET3_USE_64BIT, VMXNET3_PER_VECTOR_MASK, NULL); /* Any error other than -ENOTSUP(board's MSI support is broken) * is a programming error. Fall back to INTx silently on -ENOTSUP */ @@ -2249,6 +2238,7 @@ static void vmxnet3_instance_init(Object *obj) device_add_bootindex_property(obj, &s->conf.bootindex, "bootindex", "/ethernet-phy@0", DEVICE(obj)); + PCI_DEVICE(obj)->cap_present |= QEMU_PCI_CAP_EXPRESS; } static void vmxnet3_pci_uninit(PCIDevice *pci_dev) @@ -2472,30 +2462,12 @@ static const VMStateDescription vmstate_vmxnet3 = { static const Property vmxnet3_properties[] = { DEFINE_NIC_PROPERTIES(VMXNET3State, conf), - DEFINE_PROP_BIT("x-old-msi-offsets", VMXNET3State, compat_flags, - VMXNET3_COMPAT_FLAG_OLD_MSI_OFFSETS_BIT, false), - DEFINE_PROP_BIT("x-disable-pcie", VMXNET3State, compat_flags, - VMXNET3_COMPAT_FLAG_DISABLE_PCIE_BIT, false), }; -static void vmxnet3_realize(DeviceState *qdev, Error **errp) -{ - VMXNET3Class *vc = VMXNET3_DEVICE_GET_CLASS(qdev); - PCIDevice *pci_dev = PCI_DEVICE(qdev); - VMXNET3State *s = VMXNET3(qdev); - - if (!(s->compat_flags & VMXNET3_COMPAT_FLAG_DISABLE_PCIE)) { - pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; - } - - vc->parent_dc_realize(qdev, errp); -} - static void vmxnet3_class_init(ObjectClass *class, const void *data) { DeviceClass *dc = DEVICE_CLASS(class); PCIDeviceClass *c = PCI_DEVICE_CLASS(class); - VMXNET3Class *vc = VMXNET3_DEVICE_CLASS(class); c->realize = vmxnet3_pci_realize; c->exit = vmxnet3_pci_uninit; @@ -2506,8 +2478,6 @@ static void vmxnet3_class_init(ObjectClass *class, const void *data) c->class_id = PCI_CLASS_NETWORK_ETHERNET; c->subsystem_vendor_id = PCI_VENDOR_ID_VMWARE; c->subsystem_id = PCI_DEVICE_ID_VMWARE_VMXNET3; - device_class_set_parent_realize(dc, vmxnet3_realize, - &vc->parent_dc_realize); dc->desc = "VMWare Paravirtualized Ethernet v3"; device_class_set_legacy_reset(dc, vmxnet3_qdev_reset); dc->vmsd = &vmstate_vmxnet3; diff --git a/hw/net/vmxnet3.h b/hw/net/vmxnet3.h index f9283f9..dbc69d5 100644 --- a/hw/net/vmxnet3.h +++ b/hw/net/vmxnet3.h @@ -63,8 +63,8 @@ * details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * along with this program; if not, see + * <https://www.gnu.org/licenses/>. * * The full GNU General Public License is included in this distribution in * the file called "COPYING". diff --git a/hw/net/xgmac.c b/hw/net/xgmac.c index 9c87c4e..d45f872 100644 --- a/hw/net/xgmac.c +++ b/hw/net/xgmac.c @@ -207,7 +207,7 @@ static void xgmac_enet_send(XgmacState *s) struct desc bd; int frame_size; int len; - uint8_t frame[8192]; + QEMU_UNINITIALIZED uint8_t frame[8192]; uint8_t *ptr; ptr = frame; diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index fd93550..e764ec7 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -22,7 +22,7 @@ * * Usage * ----- - * See docs/system/nvme.rst for extensive documentation. + * See docs/system/devices/nvme.rst for extensive documentation. * * Add options: * -drive file=<file>,if=none,id=<drive_id> @@ -1057,7 +1057,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, */ #define SEG_CHUNK_SIZE 256 - NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; + QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE]; + NvmeSglDescriptor *sgld, *last_sgld; uint64_t nsgld; uint32_t seg_len; uint16_t status; @@ -5128,7 +5129,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { - uint32_t nslist[1024]; + uint32_t nslist[1024] = {}; uint32_t trans_len; int i = 0; uint32_t nsid; @@ -5138,7 +5139,6 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, return NVME_INVALID_FIELD | NVME_DNR; } - memset(nslist, 0x0, sizeof(nslist)); trans_len = MIN(sizeof(nslist) - off, buf_len); while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 237b9f7..aa24050 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -817,62 +817,6 @@ void fw_cfg_modify_i64(FWCfgState *s, uint16_t key, uint64_t value) g_free(old); } -void fw_cfg_set_order_override(FWCfgState *s, int order) -{ - assert(s->fw_cfg_order_override == 0); - s->fw_cfg_order_override = order; -} - -void fw_cfg_reset_order_override(FWCfgState *s) -{ - assert(s->fw_cfg_order_override != 0); - s->fw_cfg_order_override = 0; -} - -/* - * This is the legacy order list. For legacy systems, files are in - * the fw_cfg in the order defined below, by the "order" value. Note - * that some entries (VGA ROMs, NIC option ROMS, etc.) go into a - * specific area, but there may be more than one and they occur in the - * order that the user specifies them on the command line. Those are - * handled in a special manner, using the order override above. - * - * For non-legacy, the files are sorted by filename to avoid this kind - * of complexity in the future. - * - * This is only for x86, other arches don't implement versioning so - * they won't set legacy mode. - */ -static struct { - const char *name; - int order; -} fw_cfg_order[] = { - { "etc/boot-menu-wait", 10 }, - { "bootsplash.jpg", 11 }, - { "bootsplash.bmp", 12 }, - { "etc/boot-fail-wait", 15 }, - { "etc/smbios/smbios-tables", 20 }, - { "etc/smbios/smbios-anchor", 30 }, - { "etc/e820", 40 }, - { "etc/reserved-memory-end", 50 }, - { "genroms/kvmvapic.bin", 55 }, - { "genroms/linuxboot.bin", 60 }, - { }, /* VGA ROMs from pc_vga_init come here, 70. */ - { }, /* NIC option ROMs from pc_nic_init come here, 80. */ - { "etc/system-states", 90 }, - { }, /* User ROMs come here, 100. */ - { }, /* Device FW comes here, 110. */ - { "etc/extra-pci-roots", 120 }, - { "etc/acpi/tables", 130 }, - { "etc/table-loader", 140 }, - { "etc/tpm/log", 150 }, - { "etc/acpi/rsdp", 160 }, - { "bootorder", 170 }, - { "etc/msr_feature_control", 180 }, - -#define FW_CFG_ORDER_OVERRIDE_LAST 200 -}; - /* * Any sub-page size update to these table MRs will be lost during migration, * as we use aligned size in ram_load_precopy() -> qemu_ram_resize() path. @@ -890,29 +834,6 @@ static void fw_cfg_acpi_mr_save(FWCfgState *s, const char *filename, size_t len) } } -static int get_fw_cfg_order(FWCfgState *s, const char *name) -{ - int i; - - if (s->fw_cfg_order_override > 0) { - return s->fw_cfg_order_override; - } - - for (i = 0; i < ARRAY_SIZE(fw_cfg_order); i++) { - if (fw_cfg_order[i].name == NULL) { - continue; - } - - if (strcmp(name, fw_cfg_order[i].name) == 0) { - return fw_cfg_order[i].order; - } - } - - /* Stick unknown stuff at the end. */ - warn_report("Unknown firmware file in legacy mode: %s", name); - return FW_CFG_ORDER_OVERRIDE_LAST; -} - void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, FWCfgCallback select_cb, FWCfgWriteCallback write_cb, @@ -921,7 +842,6 @@ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, { int i, index, count; size_t dsize; - MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); int order = 0; if (!s->files) { @@ -933,22 +853,11 @@ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, count = be32_to_cpu(s->files->count); assert(count < fw_cfg_file_slots(s)); - /* Find the insertion point. */ - if (mc->legacy_fw_cfg_order) { - /* - * Sort by order. For files with the same order, we keep them - * in the sequence in which they were added. - */ - order = get_fw_cfg_order(s, filename); - for (index = count; - index > 0 && order < s->entry_order[index - 1]; - index--); - } else { - /* Sort by file name. */ - for (index = count; - index > 0 && strcmp(filename, s->files->f[index - 1].name) < 0; - index--); - } + /* Find the insertion point, sorting by file name. */ + for (index = count; + index > 0 && strcmp(filename, s->files->f[index - 1].name) < 0; + index--) + ; /* * Move all the entries from the index point and after down one @@ -1058,7 +967,6 @@ bool fw_cfg_add_file_from_generator(FWCfgState *s, static void fw_cfg_machine_reset(void *opaque) { - MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine()); FWCfgState *s = opaque; void *ptr; size_t len; @@ -1068,11 +976,9 @@ static void fw_cfg_machine_reset(void *opaque) ptr = fw_cfg_modify_file(s, "bootorder", (uint8_t *)buf, len); g_free(ptr); - if (!mc->legacy_fw_cfg_order) { - buf = get_boot_devices_lchs_list(&len); - ptr = fw_cfg_modify_file(s, "bios-geometry", (uint8_t *)buf, len); - g_free(ptr); - } + buf = get_boot_devices_lchs_list(&len); + ptr = fw_cfg_modify_file(s, "bios-geometry", (uint8_t *)buf, len); + g_free(ptr); } static void fw_cfg_machine_ready(struct Notifier *n, void *data) diff --git a/hw/pci-host/gt64120.c b/hw/pci-host/gt64120.c index 56a6ef9..b12a256 100644 --- a/hw/pci-host/gt64120.c +++ b/hw/pci-host/gt64120.c @@ -320,38 +320,6 @@ static void gt64120_isd_mapping(GT64120State *s) memory_region_transaction_commit(); } -static void gt64120_update_pci_cfgdata_mapping(GT64120State *s) -{ - /* Indexed on MByteSwap bit, see Table 158: PCI_0 Command, Offset: 0xc00 */ - static const MemoryRegionOps *pci_host_data_ops[] = { - &pci_host_data_be_ops, &pci_host_data_le_ops - }; - PCIHostState *phb = PCI_HOST_BRIDGE(s); - - memory_region_transaction_begin(); - - /* - * The setting of the MByteSwap bit and MWordSwap bit in the PCI Internal - * Command Register determines how data transactions from the CPU to/from - * PCI are handled along with the setting of the Endianness bit in the CPU - * Configuration Register. See: - * - Table 16: 32-bit PCI Transaction Endianness - * - Table 158: PCI_0 Command, Offset: 0xc00 - */ - - if (memory_region_is_mapped(&phb->data_mem)) { - memory_region_del_subregion(&s->ISD_mem, &phb->data_mem); - object_unparent(OBJECT(&phb->data_mem)); - } - memory_region_init_io(&phb->data_mem, OBJECT(phb), - pci_host_data_ops[s->regs[GT_PCI0_CMD] & 1], - s, "pci-conf-data", 4); - memory_region_add_subregion_overlap(&s->ISD_mem, GT_PCI0_CFGDATA << 2, - &phb->data_mem, 1); - - memory_region_transaction_commit(); -} - static void gt64120_pci_mapping(GT64120State *s) { memory_region_transaction_begin(); @@ -645,7 +613,6 @@ static void gt64120_writel(void *opaque, hwaddr addr, case GT_PCI0_CMD: case GT_PCI1_CMD: s->regs[saddr] = val & 0x0401fc0f; - gt64120_update_pci_cfgdata_mapping(s); break; case GT_PCI0_TOR: case GT_PCI0_BS_SCS10: @@ -1024,6 +991,48 @@ static const MemoryRegionOps isd_mem_ops = { }, }; +static bool bswap(const GT64120State *s) +{ + PCIHostState *phb = PCI_HOST_BRIDGE(s); + /*check for bus == 0 && device == 0, Bits 11:15 = Device , Bits 16:23 = Bus*/ + bool is_phb_dev0 = extract32(phb->config_reg, 11, 13) == 0; + bool le_mode = FIELD_EX32(s->regs[GT_PCI0_CMD], GT_PCI0_CMD, MByteSwap); + /* Only swap for non-bridge devices in big-endian mode */ + return !le_mode && !is_phb_dev0; +} + +static uint64_t gt64120_pci_data_read(void *opaque, hwaddr addr, unsigned size) +{ + GT64120State *s = opaque; + uint32_t val = pci_host_data_le_ops.read(opaque, addr, size); + + if (bswap(s)) { + val = bswap32(val); + } + return val; +} + +static void gt64120_pci_data_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + GT64120State *s = opaque; + + if (bswap(s)) { + val = bswap32(val); + } + pci_host_data_le_ops.write(opaque, addr, val, size); +} + +static const MemoryRegionOps gt64120_pci_data_ops = { + .read = gt64120_pci_data_read, + .write = gt64120_pci_data_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + static void gt64120_reset(DeviceState *dev) { GT64120State *s = GT64120_PCI_HOST_BRIDGE(dev); @@ -1178,7 +1187,6 @@ static void gt64120_reset(DeviceState *dev) gt64120_isd_mapping(s); gt64120_pci_mapping(s); - gt64120_update_pci_cfgdata_mapping(s); } static void gt64120_realize(DeviceState *dev, Error **errp) @@ -1202,6 +1210,12 @@ static void gt64120_realize(DeviceState *dev, Error **errp) memory_region_add_subregion_overlap(&s->ISD_mem, GT_PCI0_CFGADDR << 2, &phb->conf_mem, 1); + memory_region_init_io(&phb->data_mem, OBJECT(phb), + >64120_pci_data_ops, + s, "pci-conf-data", 4); + memory_region_add_subregion_overlap(&s->ISD_mem, GT_PCI0_CFGDATA << 2, + &phb->data_mem, 1); + /* * The whole address space decoded by the GT-64120A doesn't generate diff --git a/hw/pci-host/ppce500.c b/hw/pci-host/ppce500.c index e97a515..52269b0 100644 --- a/hw/pci-host/ppce500.c +++ b/hw/pci-host/ppce500.c @@ -16,7 +16,6 @@ #include "qemu/osdep.h" #include "hw/irq.h" -#include "hw/ppc/e500-ccsr.h" #include "hw/qdev-properties.h" #include "migration/vmstate.h" #include "hw/pci/pci_device.h" @@ -418,11 +417,12 @@ static const VMStateDescription vmstate_ppce500_pci = { static void e500_pcihost_bridge_realize(PCIDevice *d, Error **errp) { PPCE500PCIBridgeState *b = PPC_E500_PCI_BRIDGE(d); - PPCE500CCSRState *ccsr = CCSR( + SysBusDevice *ccsr = SYS_BUS_DEVICE( object_resolve_path_component(qdev_get_machine(), "e500-ccsr")); + MemoryRegion *ccsr_space = sysbus_mmio_get_region(ccsr, 0); - memory_region_init_alias(&b->bar0, OBJECT(ccsr), "e500-pci-bar0", &ccsr->ccsr_space, - 0, int128_get64(ccsr->ccsr_space.size)); + memory_region_init_alias(&b->bar0, OBJECT(ccsr), "e500-pci-bar0", + ccsr_space, 0, int128_get64(ccsr_space->size)); pci_register_bar(d, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &b->bar0); } diff --git a/hw/pci-host/raven.c b/hw/pci-host/raven.c index 21f7ca6..f8c0be5 100644 --- a/hw/pci-host/raven.c +++ b/hw/pci-host/raven.c @@ -24,7 +24,6 @@ */ #include "qemu/osdep.h" -#include "qemu/datadir.h" #include "qemu/units.h" #include "qemu/log.h" #include "qapi/error.h" @@ -35,9 +34,7 @@ #include "migration/vmstate.h" #include "hw/intc/i8259.h" #include "hw/irq.h" -#include "hw/loader.h" #include "hw/or-irq.h" -#include "elf.h" #include "qom/object.h" #define TYPE_RAVEN_PCI_DEVICE "raven" @@ -47,10 +44,6 @@ OBJECT_DECLARE_SIMPLE_TYPE(RavenPCIState, RAVEN_PCI_DEVICE) struct RavenPCIState { PCIDevice dev; - - uint32_t elf_machine; - char *bios_name; - MemoryRegion bios; }; typedef struct PRePPCIState PREPPCIState; @@ -75,11 +68,8 @@ struct PRePPCIState { RavenPCIState pci_dev; int contiguous_map; - bool is_legacy_prep; }; -#define BIOS_SIZE (1 * MiB) - #define PCI_IO_BASE_ADDR 0x80000000 /* Physical address on main bus */ static inline uint32_t raven_pci_io_config(hwaddr addr) @@ -243,22 +233,18 @@ static void raven_pcihost_realizefn(DeviceState *d, Error **errp) MemoryRegion *address_space_mem = get_system_memory(); int i; - if (s->is_legacy_prep) { - for (i = 0; i < PCI_NUM_PINS; i++) { - sysbus_init_irq(dev, &s->pci_irqs[i]); - } - } else { - /* According to PReP specification section 6.1.6 "System Interrupt - * Assignments", all PCI interrupts are routed via IRQ 15 */ - s->or_irq = OR_IRQ(object_new(TYPE_OR_IRQ)); - object_property_set_int(OBJECT(s->or_irq), "num-lines", PCI_NUM_PINS, - &error_fatal); - qdev_realize(DEVICE(s->or_irq), NULL, &error_fatal); - sysbus_init_irq(dev, &s->or_irq->out_irq); - - for (i = 0; i < PCI_NUM_PINS; i++) { - s->pci_irqs[i] = qdev_get_gpio_in(DEVICE(s->or_irq), i); - } + /* + * According to PReP specification section 6.1.6 "System Interrupt + * Assignments", all PCI interrupts are routed via IRQ 15 + */ + s->or_irq = OR_IRQ(object_new(TYPE_OR_IRQ)); + object_property_set_int(OBJECT(s->or_irq), "num-lines", PCI_NUM_PINS, + &error_fatal); + qdev_realize(DEVICE(s->or_irq), NULL, &error_fatal); + sysbus_init_irq(dev, &s->or_irq->out_irq); + + for (i = 0; i < PCI_NUM_PINS; i++) { + s->pci_irqs[i] = qdev_get_gpio_in(DEVICE(s->or_irq), i); } qdev_init_gpio_in(d, raven_change_gpio, 1); @@ -338,48 +324,9 @@ static void raven_pcihost_initfn(Object *obj) static void raven_realize(PCIDevice *d, Error **errp) { - RavenPCIState *s = RAVEN_PCI_DEVICE(d); - char *filename; - int bios_size = -1; - d->config[PCI_CACHE_LINE_SIZE] = 0x08; d->config[PCI_LATENCY_TIMER] = 0x10; d->config[PCI_CAPABILITY_LIST] = 0x00; - - if (!memory_region_init_rom_nomigrate(&s->bios, OBJECT(s), "bios", - BIOS_SIZE, errp)) { - return; - } - memory_region_add_subregion(get_system_memory(), (uint32_t)(-BIOS_SIZE), - &s->bios); - if (s->bios_name) { - filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, s->bios_name); - if (filename) { - if (s->elf_machine != EM_NONE) { - bios_size = load_elf(filename, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, - ELFDATA2MSB, s->elf_machine, 0, 0); - } - if (bios_size < 0) { - bios_size = get_image_size(filename); - if (bios_size > 0 && bios_size <= BIOS_SIZE) { - hwaddr bios_addr; - bios_size = (bios_size + 0xfff) & ~0xfff; - bios_addr = (uint32_t)(-BIOS_SIZE); - bios_size = load_image_targphys(filename, bios_addr, - bios_size); - } - } - } - g_free(filename); - if (bios_size < 0 || bios_size > BIOS_SIZE) { - memory_region_del_subregion(get_system_memory(), &s->bios); - error_setg(errp, "Could not load bios image '%s'", s->bios_name); - return; - } - } - - vmstate_register_ram_global(&s->bios); } static const VMStateDescription vmstate_raven = { @@ -422,22 +369,12 @@ static const TypeInfo raven_info = { }, }; -static const Property raven_pcihost_properties[] = { - DEFINE_PROP_UINT32("elf-machine", PREPPCIState, pci_dev.elf_machine, - EM_NONE), - DEFINE_PROP_STRING("bios-name", PREPPCIState, pci_dev.bios_name), - /* Temporary workaround until legacy prep machine is removed */ - DEFINE_PROP_BOOL("is-legacy-prep", PREPPCIState, is_legacy_prep, - false), -}; - static void raven_pcihost_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); dc->realize = raven_pcihost_realizefn; - device_class_set_props(dc, raven_pcihost_properties); dc->fw_name = "pci"; } diff --git a/hw/pci/msix.c b/hw/pci/msix.c index 66f27b9..8c7f670 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -72,7 +72,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) return dev->msix_pba + vector / 8; } -static int msix_is_pending(PCIDevice *dev, int vector) +int msix_is_pending(PCIDevice *dev, unsigned int vector) { return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index f5ab510..c70b5ce 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -32,6 +32,7 @@ #include "hw/pci/pci_host.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/cpr.h" #include "migration/qemu-file-types.h" #include "migration/vmstate.h" #include "net/net.h" @@ -128,6 +129,12 @@ static GSequence *pci_acpi_index_list(void) return used_acpi_index_list; } +static void pci_set_master(PCIDevice *d, bool enable) +{ + memory_region_set_enabled(&d->bus_master_enable_region, enable); + d->is_master = enable; /* cache the status */ +} + static void pci_init_bus_master(PCIDevice *pci_dev) { AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev); @@ -135,7 +142,7 @@ static void pci_init_bus_master(PCIDevice *pci_dev) memory_region_init_alias(&pci_dev->bus_master_enable_region, OBJECT(pci_dev), "bus master", dma_as->root, 0, memory_region_size(dma_as->root)); - memory_region_set_enabled(&pci_dev->bus_master_enable_region, false); + pci_set_master(pci_dev, false); memory_region_add_subregion(&pci_dev->bus_master_container_region, 0, &pci_dev->bus_master_enable_region); } @@ -531,6 +538,10 @@ static void pci_reset_regions(PCIDevice *dev) static void pci_do_device_reset(PCIDevice *dev) { + if ((dev->cap_present & QEMU_PCI_SKIP_RESET_ON_CPR) && cpr_is_incoming()) { + return; + } + pci_device_deassert_intx(dev); assert(dev->irq_state == 0); @@ -804,9 +815,8 @@ static int get_pci_config_device(QEMUFile *f, void *pv, size_t size, pci_bridge_update_mappings(PCI_BRIDGE(s)); } - memory_region_set_enabled(&s->bus_master_enable_region, - pci_get_word(s->config + PCI_COMMAND) - & PCI_COMMAND_MASTER); + pci_set_master(s, pci_get_word(s->config + PCI_COMMAND) + & PCI_COMMAND_MASTER); g_free(config); return 0; @@ -1725,7 +1735,7 @@ static void pci_update_mappings(PCIDevice *d) pci_update_vga(d); } -static inline int pci_irq_disabled(PCIDevice *d) +int pci_irq_disabled(PCIDevice *d) { return pci_get_word(d->config + PCI_COMMAND) & PCI_COMMAND_INTX_DISABLE; } @@ -1787,9 +1797,8 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int if (ranges_overlap(addr, l, PCI_COMMAND, 2)) { pci_update_irq_disabled(d, was_irq_disabled); - memory_region_set_enabled(&d->bus_master_enable_region, - (pci_get_word(d->config + PCI_COMMAND) - & PCI_COMMAND_MASTER) && d->enabled); + pci_set_master(d, (pci_get_word(d->config + PCI_COMMAND) & + PCI_COMMAND_MASTER) && d->enabled); } msi_write_config(d, addr, val_in, l); @@ -2935,6 +2944,23 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) return &address_space_memory; } +int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n, + IOMMUNotify fn, void *opaque) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->init_iotlb_notifier) { + iommu_bus->iommu_ops->init_iotlb_notifier(bus, iommu_bus->iommu_opaque, + devfn, n, fn, opaque); + return 0; + } + + return -ENODEV; +} + bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, Error **errp) { @@ -2966,6 +2992,170 @@ void pci_device_unset_iommu_device(PCIDevice *dev) } } +int pci_pri_request_page(PCIDevice *dev, uint32_t pasid, bool priv_req, + bool exec_req, hwaddr addr, bool lpig, + uint16_t prgi, bool is_read, bool is_write) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + if (!dev->is_master || + ((pasid != PCI_NO_PASID) && !pcie_pasid_enabled(dev))) { + return -EPERM; + } + + if (!pcie_pri_enabled(dev)) { + return -EPERM; + } + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->pri_request_page) { + return iommu_bus->iommu_ops->pri_request_page(bus, + iommu_bus->iommu_opaque, + devfn, pasid, priv_req, + exec_req, addr, lpig, prgi, + is_read, is_write); + } + + return -ENODEV; +} + +int pci_pri_register_notifier(PCIDevice *dev, uint32_t pasid, + IOMMUPRINotifier *notifier) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + if (!dev->is_master || + ((pasid != PCI_NO_PASID) && !pcie_pasid_enabled(dev))) { + return -EPERM; + } + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->pri_register_notifier) { + iommu_bus->iommu_ops->pri_register_notifier(bus, + iommu_bus->iommu_opaque, + devfn, pasid, notifier); + return 0; + } + + return -ENODEV; +} + +void pci_pri_unregister_notifier(PCIDevice *dev, uint32_t pasid) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->pri_unregister_notifier) { + iommu_bus->iommu_ops->pri_unregister_notifier(bus, + iommu_bus->iommu_opaque, + devfn, pasid); + } +} + +ssize_t pci_ats_request_translation(PCIDevice *dev, uint32_t pasid, + bool priv_req, bool exec_req, + hwaddr addr, size_t length, + bool no_write, IOMMUTLBEntry *result, + size_t result_length, + uint32_t *err_count) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + if (!dev->is_master || + ((pasid != PCI_NO_PASID) && !pcie_pasid_enabled(dev))) { + return -EPERM; + } + + if (result_length == 0) { + return -ENOSPC; + } + + if (!pcie_ats_enabled(dev)) { + return -EPERM; + } + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->ats_request_translation) { + return iommu_bus->iommu_ops->ats_request_translation(bus, + iommu_bus->iommu_opaque, + devfn, pasid, priv_req, + exec_req, addr, length, + no_write, result, + result_length, err_count); + } + + return -ENODEV; +} + +int pci_iommu_register_iotlb_notifier(PCIDevice *dev, uint32_t pasid, + IOMMUNotifier *n) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + if ((pasid != PCI_NO_PASID) && !pcie_pasid_enabled(dev)) { + return -EPERM; + } + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->register_iotlb_notifier) { + iommu_bus->iommu_ops->register_iotlb_notifier(bus, + iommu_bus->iommu_opaque, devfn, + pasid, n); + return 0; + } + + return -ENODEV; +} + +int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid, + IOMMUNotifier *n) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + if ((pasid != PCI_NO_PASID) && !pcie_pasid_enabled(dev)) { + return -EPERM; + } + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->unregister_iotlb_notifier) { + iommu_bus->iommu_ops->unregister_iotlb_notifier(bus, + iommu_bus->iommu_opaque, + devfn, pasid, n); + return 0; + } + + return -ENODEV; +} + +int pci_iommu_get_iotlb_info(PCIDevice *dev, uint8_t *addr_width, + uint32_t *min_page_size) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &bus, &iommu_bus, &devfn); + if (iommu_bus && iommu_bus->iommu_ops->get_iotlb_info) { + iommu_bus->iommu_ops->get_iotlb_info(iommu_bus->iommu_opaque, + addr_width, min_page_size); + return 0; + } + + return -ENODEV; +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* @@ -3100,9 +3290,8 @@ void pci_set_enabled(PCIDevice *d, bool state) d->enabled = state; pci_update_mappings(d); - memory_region_set_enabled(&d->bus_master_enable_region, - (pci_get_word(d->config + PCI_COMMAND) - & PCI_COMMAND_MASTER) && d->enabled); + pci_set_master(d, (pci_get_word(d->config + PCI_COMMAND) + & PCI_COMMAND_MASTER) && d->enabled); if (qdev_is_realized(&d->qdev)) { pci_device_reset(d); } diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index abe83bb..7179d99 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -217,12 +217,6 @@ const MemoryRegionOps pci_host_data_le_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -const MemoryRegionOps pci_host_data_be_ops = { - .read = pci_host_data_read, - .write = pci_host_data_write, - .endianness = DEVICE_BIG_ENDIAN, -}; - static bool pci_host_needed(void *opaque) { PCIHostState *s = opaque; diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 1b12db6..eaeb688 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -1214,3 +1214,81 @@ void pcie_acs_reset(PCIDevice *dev) pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); } } + +/* PASID */ +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint8_t pasid_width, + bool exec_perm, bool priv_mod) +{ + static const uint16_t control_reg_rw_mask = 0x07; + uint16_t capability_reg; + + assert(pasid_width <= PCI_EXT_CAP_PASID_MAX_WIDTH); + + pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, PCI_PASID_VER, offset, + PCI_EXT_CAP_PASID_SIZEOF); + + capability_reg = ((uint16_t)pasid_width) << PCI_PASID_CAP_WIDTH_SHIFT; + capability_reg |= exec_perm ? PCI_PASID_CAP_EXEC : 0; + capability_reg |= priv_mod ? PCI_PASID_CAP_PRIV : 0; + pci_set_word(dev->config + offset + PCI_PASID_CAP, capability_reg); + + /* Everything is disabled by default */ + pci_set_word(dev->config + offset + PCI_PASID_CTRL, 0); + + pci_set_word(dev->wmask + offset + PCI_PASID_CTRL, control_reg_rw_mask); + + dev->exp.pasid_cap = offset; +} + +/* PRI */ +void pcie_pri_init(PCIDevice *dev, uint16_t offset, uint32_t outstanding_pr_cap, + bool prg_response_pasid_req) +{ + static const uint16_t control_reg_rw_mask = 0x3; + static const uint16_t status_reg_rw1_mask = 0x3; + static const uint32_t pr_alloc_reg_rw_mask = 0xffffffff; + uint16_t status_reg; + + status_reg = prg_response_pasid_req ? PCI_PRI_STATUS_PASID : 0; + status_reg |= PCI_PRI_STATUS_STOPPED; /* Stopped by default */ + + pcie_add_capability(dev, PCI_EXT_CAP_ID_PRI, PCI_PRI_VER, offset, + PCI_EXT_CAP_PRI_SIZEOF); + /* Disabled by default */ + + pci_set_word(dev->config + offset + PCI_PRI_STATUS, status_reg); + pci_set_long(dev->config + offset + PCI_PRI_MAX_REQ, outstanding_pr_cap); + + pci_set_word(dev->wmask + offset + PCI_PRI_CTRL, control_reg_rw_mask); + pci_set_word(dev->w1cmask + offset + PCI_PRI_STATUS, status_reg_rw1_mask); + pci_set_long(dev->wmask + offset + PCI_PRI_ALLOC_REQ, pr_alloc_reg_rw_mask); + + dev->exp.pri_cap = offset; +} + +bool pcie_pri_enabled(const PCIDevice *dev) +{ + if (!pci_is_express(dev) || !dev->exp.pri_cap) { + return false; + } + return (pci_get_word(dev->config + dev->exp.pri_cap + PCI_PRI_CTRL) & + PCI_PRI_CTRL_ENABLE) != 0; +} + +bool pcie_pasid_enabled(const PCIDevice *dev) +{ + if (!pci_is_express(dev) || !dev->exp.pasid_cap) { + return false; + } + return (pci_get_word(dev->config + dev->exp.pasid_cap + PCI_PASID_CTRL) & + PCI_PASID_CTRL_ENABLE) != 0; +} + +bool pcie_ats_enabled(const PCIDevice *dev) +{ + if (!pci_is_express(dev) || !dev->exp.ats_cap) { + return false; + } + return (pci_get_word(dev->config + dev->exp.ats_cap + PCI_ATS_CTRL) & + PCI_ATS_CTRL_ENABLE) != 0; +} diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 809078a..723c97f 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -79,8 +79,6 @@ #define MPC85XX_ESDHC_IRQ 72 #define RTC_REGS_OFFSET 0x68 -#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000) - struct boot_info { uint32_t dt_base; @@ -120,7 +118,7 @@ static uint32_t *pci_map_create(void *fdt, uint32_t mpic, int first_slot, } static void dt_serial_create(void *fdt, unsigned long long offset, - const char *soc, const char *mpic, + const char *soc, uint32_t freq, const char *mpic, const char *alias, int idx, bool defcon) { char *ser; @@ -131,7 +129,7 @@ static void dt_serial_create(void *fdt, unsigned long long offset, qemu_fdt_setprop_string(fdt, ser, "compatible", "ns16550"); qemu_fdt_setprop_cells(fdt, ser, "reg", offset, 0x100); qemu_fdt_setprop_cell(fdt, ser, "cell-index", idx); - qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", PLATFORM_CLK_FREQ_HZ); + qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", freq); qemu_fdt_setprop_cells(fdt, ser, "interrupts", 42, 2); qemu_fdt_setprop_phandle(fdt, ser, "interrupt-parent", mpic); qemu_fdt_setprop_string(fdt, "/aliases", alias, ser); @@ -382,8 +380,7 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, int fdt_size; void *fdt; uint8_t hypercall[16]; - uint32_t clock_freq = PLATFORM_CLK_FREQ_HZ; - uint32_t tb_freq = PLATFORM_CLK_FREQ_HZ; + uint32_t clock_freq, tb_freq; int i; char compatible_sb[] = "fsl,mpc8544-immr\0simple-bus"; char *soc; @@ -411,7 +408,7 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, if (dtb_file) { char *filename; - filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, dtb_file); + filename = qemu_find_file(QEMU_FILE_TYPE_DTB, dtb_file); if (!filename) { goto out; } @@ -484,6 +481,9 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, if (kvmppc_get_hasidle(env)) { qemu_fdt_setprop(fdt, "/hypervisor", "has-idle", NULL, 0); } + } else { + clock_freq = pmc->clock_freq; + tb_freq = pmc->tb_freq; } /* Create CPU nodes */ @@ -564,12 +564,12 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, */ if (serial_hd(1)) { dt_serial_create(fdt, MPC8544_SERIAL1_REGS_OFFSET, - soc, mpic, "serial1", 1, false); + soc, pmc->clock_freq, mpic, "serial1", 1, false); } if (serial_hd(0)) { dt_serial_create(fdt, MPC8544_SERIAL0_REGS_OFFSET, - soc, mpic, "serial0", 0, true); + soc, pmc->clock_freq, mpic, "serial0", 0, true); } /* i2c */ @@ -931,7 +931,6 @@ void ppce500_init(MachineState *machine) CPUPPCState *firstenv = NULL; MemoryRegion *ccsr_addr_space; SysBusDevice *s; - PPCE500CCSRState *ccsr; I2CBus *i2c; irqs = g_new0(IrqLines, smp_cpus); @@ -968,7 +967,7 @@ void ppce500_init(MachineState *machine) env->spr_cb[SPR_BOOKE_PIR].default_value = cs->cpu_index = i; env->mpic_iack = pmc->ccsrbar_base + MPC8544_MPIC_REGS_OFFSET + 0xa0; - ppc_booke_timers_init(cpu, PLATFORM_CLK_FREQ_HZ, PPC_TIMER_E500); + ppc_booke_timers_init(cpu, pmc->tb_freq, PPC_TIMER_E500); /* Register reset handler */ if (!i) { @@ -993,10 +992,10 @@ void ppce500_init(MachineState *machine) memory_region_add_subregion(address_space_mem, 0, machine->ram); dev = qdev_new("e500-ccsr"); + s = SYS_BUS_DEVICE(dev); object_property_add_child(OBJECT(machine), "e500-ccsr", OBJECT(dev)); - sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); - ccsr = CCSR(dev); - ccsr_addr_space = &ccsr->ccsr_space; + sysbus_realize_and_unref(s, &error_fatal); + ccsr_addr_space = sysbus_mmio_get_region(s, 0); memory_region_add_subregion(address_space_mem, pmc->ccsrbar_base, ccsr_addr_space); @@ -1284,6 +1283,7 @@ static void e500_ccsr_initfn(Object *obj) PPCE500CCSRState *ccsr = CCSR(obj); memory_region_init(&ccsr->ccsr_space, obj, "e500-ccsr", MPC8544_CCSRBAR_SIZE); + sysbus_init_mmio(SYS_BUS_DEVICE(ccsr), &ccsr->ccsr_space); } static const TypeInfo e500_ccsr_info = { diff --git a/hw/ppc/e500.h b/hw/ppc/e500.h index 01db102..00f4905 100644 --- a/hw/ppc/e500.h +++ b/hw/ppc/e500.h @@ -5,6 +5,8 @@ #include "hw/platform-bus.h" #include "qom/object.h" +#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000) + struct PPCE500MachineState { /*< private >*/ MachineState parent_obj; @@ -37,6 +39,8 @@ struct PPCE500MachineClass { hwaddr pci_mmio_base; hwaddr pci_mmio_bus_base; hwaddr spin_base; + uint32_t clock_freq; + uint32_t tb_freq; }; void ppce500_init(MachineState *machine); diff --git a/hw/ppc/e500plat.c b/hw/ppc/e500plat.c index 775b9d8..4f1d659 100644 --- a/hw/ppc/e500plat.c +++ b/hw/ppc/e500plat.c @@ -93,6 +93,8 @@ static void e500plat_machine_class_init(ObjectClass *oc, const void *data) pmc->pci_mmio_base = 0xC00000000ULL; pmc->pci_mmio_bus_base = 0xE0000000ULL; pmc->spin_base = 0xFEF000000ULL; + pmc->clock_freq = PLATFORM_CLK_FREQ_HZ; + pmc->tb_freq = PLATFORM_CLK_FREQ_HZ; mc->desc = "generic paravirt e500 platform"; mc->init = e500plat_init; diff --git a/hw/ppc/mpc8544ds.c b/hw/ppc/mpc8544ds.c index 97fb0f3..5826985 100644 --- a/hw/ppc/mpc8544ds.c +++ b/hw/ppc/mpc8544ds.c @@ -55,6 +55,8 @@ static void mpc8544ds_machine_class_init(ObjectClass *oc, const void *data) pmc->pci_mmio_bus_base = 0xC0000000ULL; pmc->pci_pio_base = 0xE1000000ULL; pmc->spin_base = 0xEF000000ULL; + pmc->clock_freq = PLATFORM_CLK_FREQ_HZ; + pmc->tb_freq = PLATFORM_CLK_FREQ_HZ; mc->desc = "mpc8544ds"; mc->init = mpc8544ds_init; diff --git a/hw/ppc/pnv_occ.c b/hw/ppc/pnv_occ.c index fa6f31c..24b789c 100644 --- a/hw/ppc/pnv_occ.c +++ b/hw/ppc/pnv_occ.c @@ -789,7 +789,7 @@ static bool occ_opal_process_command(PnvOCC *occ, static bool occ_model_tick(PnvOCC *occ) { - struct occ_dynamic_data dynamic_data; + QEMU_UNINITIALIZED struct occ_dynamic_data dynamic_data; if (!occ_read_dynamic_data(occ, &dynamic_data, NULL)) { /* Can't move OCC state field to safe because we can't map it! */ diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index 7395263..982e40e 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -35,6 +35,7 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/log.h" +#include "qemu/datadir.h" #include "hw/loader.h" #include "hw/rtc/mc146818rtc.h" #include "hw/isa/pc87312.h" @@ -55,6 +56,8 @@ #define KERNEL_LOAD_ADDR 0x01000000 #define INITRD_LOAD_ADDR 0x01800000 +#define BIOS_ADDR 0xfff00000 +#define BIOS_SIZE (1 * MiB) #define NVRAM_SIZE 0x2000 static void fw_cfg_boot_set(void *opaque, const char *boot_device, @@ -241,6 +244,9 @@ static void ibm_40p_init(MachineState *machine) ISADevice *isa_dev; ISABus *isa_bus; void *fw_cfg; + MemoryRegion *bios = g_new(MemoryRegion, 1); + char *filename; + ssize_t bios_size = -1; uint32_t kernel_base = 0, initrd_base = 0; long kernel_size = 0, initrd_size = 0; char boot_device; @@ -263,10 +269,27 @@ static void ibm_40p_init(MachineState *machine) cpu_ppc_tb_init(env, 100UL * 1000UL * 1000UL); qemu_register_reset(ppc_prep_reset, cpu); + /* allocate and load firmware */ + filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name); + if (!filename) { + error_report("Could not find bios image '%s'", bios_name); + exit(1); + } + memory_region_init_rom(bios, NULL, "bios", BIOS_SIZE, &error_fatal); + memory_region_add_subregion(get_system_memory(), BIOS_ADDR, bios); + bios_size = load_elf(filename, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + ELFDATA2MSB, PPC_ELF_MACHINE, 0, 0); + if (bios_size < 0) { + bios_size = load_image_targphys(filename, BIOS_ADDR, BIOS_SIZE); + } + if (bios_size < 0 || bios_size > BIOS_SIZE) { + error_report("Could not load bios image '%s'", filename); + return; + } + g_free(filename); + /* PCI host */ dev = qdev_new("raven-pcihost"); - qdev_prop_set_string(dev, "bios-name", bios_name); - qdev_prop_set_uint32(dev, "elf-machine", PPC_ELF_MACHINE); pcihost = SYS_BUS_DEVICE(dev); object_property_add_child(qdev_get_machine(), "raven", OBJECT(dev)); sysbus_realize_and_unref(pcihost, &error_fatal); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 702f774..08615f6 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -577,7 +577,7 @@ static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt, /* * Adds ibm,dynamic-reconfiguration-memory node. - * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation + * Refer to docs/specs/ppc-spapr-hotplug.rst for the documentation * of this device tree node. */ static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr, diff --git a/hw/ppc/spapr_tpm_proxy.c b/hw/ppc/spapr_tpm_proxy.c index 862eeaa..1297b3a 100644 --- a/hw/ppc/spapr_tpm_proxy.c +++ b/hw/ppc/spapr_tpm_proxy.c @@ -41,8 +41,8 @@ static ssize_t tpm_execute(SpaprTpmProxy *tpm_proxy, target_ulong *args) target_ulong data_in_size = args[2]; uint64_t data_out = ppc64_phys_to_real(args[3]); target_ulong data_out_size = args[4]; - uint8_t buf_in[TPM_SPAPR_BUFSIZE]; - uint8_t buf_out[TPM_SPAPR_BUFSIZE]; + QEMU_UNINITIALIZED uint8_t buf_in[TPM_SPAPR_BUFSIZE]; + QEMU_UNINITIALIZED uint8_t buf_out[TPM_SPAPR_BUFSIZE]; ssize_t ret; trace_spapr_tpm_execute(data_in, data_in_size, data_out, data_out_size); diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig index e6a0ac1..fc9c35b 100644 --- a/hw/riscv/Kconfig +++ b/hw/riscv/Kconfig @@ -119,3 +119,12 @@ config SPIKE select HTIF select RISCV_ACLINT select SIFIVE_PLIC + +config XIANGSHAN_KUNMINGHU + bool + default y + depends on RISCV64 + select RISCV_ACLINT + select RISCV_APLIC + select RISCV_IMSIC + select SERIAL_MM diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c index 765b9e2..828a867 100644 --- a/hw/riscv/boot.c +++ b/hw/riscv/boot.c @@ -37,7 +37,7 @@ bool riscv_is_32bit(RISCVHartArrayState *harts) { RISCVCPUClass *mcc = RISCV_CPU_GET_CLASS(&harts->harts[0]); - return mcc->misa_mxl_max == MXL_RV32; + return mcc->def->misa_mxl_max == MXL_RV32; } /* diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build index c22f3a7..2a8d5b1 100644 --- a/hw/riscv/meson.build +++ b/hw/riscv/meson.build @@ -13,5 +13,6 @@ riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c')) riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files( 'riscv-iommu.c', 'riscv-iommu-pci.c', 'riscv-iommu-sys.c', 'riscv-iommu-hpm.c')) riscv_ss.add(when: 'CONFIG_MICROBLAZE_V', if_true: files('microblaze-v-generic.c')) +riscv_ss.add(when: 'CONFIG_XIANGSHAN_KUNMINGHU', if_true: files('xiangshan_kmh.c')) hw_arch += {'riscv': riscv_ss} diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h index 1017d73..47fe01b 100644 --- a/hw/riscv/riscv-iommu-bits.h +++ b/hw/riscv/riscv-iommu-bits.h @@ -79,6 +79,7 @@ struct riscv_iommu_pq_record { #define RISCV_IOMMU_CAP_SV39 BIT_ULL(9) #define RISCV_IOMMU_CAP_SV48 BIT_ULL(10) #define RISCV_IOMMU_CAP_SV57 BIT_ULL(11) +#define RISCV_IOMMU_CAP_SVRSW60T59B BIT_ULL(14) #define RISCV_IOMMU_CAP_SV32X4 BIT_ULL(16) #define RISCV_IOMMU_CAP_SV39X4 BIT_ULL(17) #define RISCV_IOMMU_CAP_SV48X4 BIT_ULL(18) diff --git a/hw/riscv/riscv-iommu-pci.c b/hw/riscv/riscv-iommu-pci.c index 1f44eef..cdb4a7a 100644 --- a/hw/riscv/riscv-iommu-pci.c +++ b/hw/riscv/riscv-iommu-pci.c @@ -68,12 +68,6 @@ typedef struct RISCVIOMMUStatePci { RISCVIOMMUState iommu; /* common IOMMU state */ } RISCVIOMMUStatePci; -struct RISCVIOMMUPciClass { - /*< public >*/ - DeviceRealize parent_realize; - ResettablePhases parent_phases; -}; - /* interrupt delivery callback */ static void riscv_iommu_pci_notify(RISCVIOMMUState *iommu, unsigned vector) { diff --git a/hw/riscv/riscv-iommu-sys.c b/hw/riscv/riscv-iommu-sys.c index 74e76b9..e34d00a 100644 --- a/hw/riscv/riscv-iommu-sys.c +++ b/hw/riscv/riscv-iommu-sys.c @@ -53,12 +53,6 @@ struct RISCVIOMMUStateSys { uint8_t *msix_pba; }; -struct RISCVIOMMUSysClass { - /*< public >*/ - DeviceRealize parent_realize; - ResettablePhases parent_phases; -}; - static uint64_t msix_table_mmio_read(void *opaque, hwaddr addr, unsigned size) { diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c index a877e5d..96a7fbd 100644 --- a/hw/riscv/riscv-iommu.c +++ b/hw/riscv/riscv-iommu.c @@ -1935,11 +1935,7 @@ static void riscv_iommu_process_dbg(RISCVIOMMUState *s) iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10); } else { iova = iotlb.translated_addr & ~iotlb.addr_mask; - iova >>= TARGET_PAGE_BITS; - iova &= RISCV_IOMMU_TR_RESPONSE_PPN; - - /* We do not support superpages (> 4kbs) for now */ - iova &= ~RISCV_IOMMU_TR_RESPONSE_S; + iova = set_field(0, RISCV_IOMMU_TR_RESPONSE_PPN, PPN_DOWN(iova)); } riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova); } @@ -2355,7 +2351,8 @@ static void riscv_iommu_realize(DeviceState *dev, Error **errp) } if (s->enable_g_stage) { s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 | - RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4; + RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4 | + RISCV_IOMMU_CAP_SVRSW60T59B; } if (s->hpm_cntrs > 0) { diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c index 1eef2fb..8b5683d 100644 --- a/hw/riscv/virt-acpi-build.c +++ b/hw/riscv/virt-acpi-build.c @@ -287,7 +287,7 @@ static void build_rhct(GArray *table_data, uint32_t isa_offset, num_rhct_nodes, cmo_offset = 0; RISCVCPU *cpu = &s->soc[0].harts[0]; uint32_t mmu_offset = 0; - uint8_t satp_mode_max; + bool rv32 = riscv_cpu_is_32bit(cpu); g_autofree char *isa = NULL; AcpiTable table = { .sig = "RHCT", .rev = 1, .oem_id = s->oem_id, @@ -307,7 +307,7 @@ static void build_rhct(GArray *table_data, num_rhct_nodes++; } - if (cpu->cfg.satp_mode.supported != 0) { + if (!rv32 && cpu->cfg.max_satp_mode >= VM_1_10_SV39) { num_rhct_nodes++; } @@ -367,22 +367,21 @@ static void build_rhct(GArray *table_data, } /* MMU node structure */ - if (cpu->cfg.satp_mode.supported != 0) { - satp_mode_max = satp_mode_max_from_map(cpu->cfg.satp_mode.map); + if (!rv32 && cpu->cfg.max_satp_mode >= VM_1_10_SV39) { mmu_offset = table_data->len - table.table_offset; build_append_int_noprefix(table_data, 2, 2); /* Type */ build_append_int_noprefix(table_data, 8, 2); /* Length */ build_append_int_noprefix(table_data, 0x1, 2); /* Revision */ build_append_int_noprefix(table_data, 0, 1); /* Reserved */ /* MMU Type */ - if (satp_mode_max == VM_1_10_SV57) { + if (cpu->cfg.max_satp_mode == VM_1_10_SV57) { build_append_int_noprefix(table_data, 2, 1); /* Sv57 */ - } else if (satp_mode_max == VM_1_10_SV48) { + } else if (cpu->cfg.max_satp_mode == VM_1_10_SV48) { build_append_int_noprefix(table_data, 1, 1); /* Sv48 */ - } else if (satp_mode_max == VM_1_10_SV39) { + } else if (cpu->cfg.max_satp_mode == VM_1_10_SV39) { build_append_int_noprefix(table_data, 0, 1); /* Sv39 */ } else { - assert(1); + g_assert_not_reached(); } } diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index 0dcced1..47e573f 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -237,10 +237,10 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int socket, uint32_t cpu_phandle; MachineState *ms = MACHINE(s); bool is_32_bit = riscv_is_32bit(&s->soc[0]); - uint8_t satp_mode_max; for (cpu = s->soc[socket].num_harts - 1; cpu >= 0; cpu--) { RISCVCPU *cpu_ptr = &s->soc[socket].harts[cpu]; + int8_t satp_mode_max = cpu_ptr->cfg.max_satp_mode; g_autofree char *cpu_name = NULL; g_autofree char *core_name = NULL; g_autofree char *intc_name = NULL; @@ -252,8 +252,7 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int socket, s->soc[socket].hartid_base + cpu); qemu_fdt_add_subnode(ms->fdt, cpu_name); - if (cpu_ptr->cfg.satp_mode.supported != 0) { - satp_mode_max = satp_mode_max_from_map(cpu_ptr->cfg.satp_mode.map); + if (satp_mode_max != -1) { sv_name = g_strdup_printf("riscv,%s", satp_mode_str(satp_mode_max, is_32_bit)); qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", sv_name); @@ -312,8 +311,7 @@ static void create_fdt_socket_memory(RISCVVirtState *s, int socket) size = riscv_socket_mem_size(ms, socket); mem_name = g_strdup_printf("/memory@%"HWADDR_PRIx, addr); qemu_fdt_add_subnode(ms->fdt, mem_name); - qemu_fdt_setprop_cells(ms->fdt, mem_name, "reg", - addr >> 32, addr, size >> 32, size); + qemu_fdt_setprop_sized_cells(ms->fdt, mem_name, "reg", 2, addr, 2, size); qemu_fdt_setprop_string(ms->fdt, mem_name, "device_type", "memory"); riscv_socket_fdt_write_id(ms, mem_name, socket); } @@ -325,7 +323,7 @@ static void create_fdt_socket_clint(RISCVVirtState *s, int cpu; g_autofree char *clint_name = NULL; g_autofree uint32_t *clint_cells = NULL; - unsigned long clint_addr; + hwaddr clint_addr; MachineState *ms = MACHINE(s); static const char * const clint_compat[2] = { "sifive,clint0", "riscv,clint0" @@ -341,14 +339,14 @@ static void create_fdt_socket_clint(RISCVVirtState *s, } clint_addr = s->memmap[VIRT_CLINT].base + - (s->memmap[VIRT_CLINT].size * socket); - clint_name = g_strdup_printf("/soc/clint@%lx", clint_addr); + s->memmap[VIRT_CLINT].size * socket; + clint_name = g_strdup_printf("/soc/clint@%"HWADDR_PRIx, clint_addr); qemu_fdt_add_subnode(ms->fdt, clint_name); qemu_fdt_setprop_string_array(ms->fdt, clint_name, "compatible", (char **)&clint_compat, ARRAY_SIZE(clint_compat)); - qemu_fdt_setprop_cells(ms->fdt, clint_name, "reg", - 0x0, clint_addr, 0x0, s->memmap[VIRT_CLINT].size); + qemu_fdt_setprop_sized_cells(ms->fdt, clint_name, "reg", + 2, clint_addr, 2, s->memmap[VIRT_CLINT].size); qemu_fdt_setprop(ms->fdt, clint_name, "interrupts-extended", clint_cells, s->soc[socket].num_harts * sizeof(uint32_t) * 4); riscv_socket_fdt_write_id(ms, clint_name, socket); @@ -389,8 +387,8 @@ static void create_fdt_socket_aclint(RISCVVirtState *s, qemu_fdt_add_subnode(ms->fdt, name); qemu_fdt_setprop_string(ms->fdt, name, "compatible", "riscv,aclint-mswi"); - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, addr, 0x0, RISCV_ACLINT_SWI_SIZE); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", + 2, addr, 2, RISCV_ACLINT_SWI_SIZE); qemu_fdt_setprop(ms->fdt, name, "interrupts-extended", aclint_mswi_cells, aclint_cells_size); qemu_fdt_setprop(ms->fdt, name, "interrupt-controller", NULL, 0); @@ -412,11 +410,11 @@ static void create_fdt_socket_aclint(RISCVVirtState *s, qemu_fdt_add_subnode(ms->fdt, name); qemu_fdt_setprop_string(ms->fdt, name, "compatible", "riscv,aclint-mtimer"); - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, addr + RISCV_ACLINT_DEFAULT_MTIME, - 0x0, size - RISCV_ACLINT_DEFAULT_MTIME, - 0x0, addr + RISCV_ACLINT_DEFAULT_MTIMECMP, - 0x0, RISCV_ACLINT_DEFAULT_MTIME); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", + 2, addr + RISCV_ACLINT_DEFAULT_MTIME, + 2, size - RISCV_ACLINT_DEFAULT_MTIME, + 2, addr + RISCV_ACLINT_DEFAULT_MTIMECMP, + 2, RISCV_ACLINT_DEFAULT_MTIME); qemu_fdt_setprop(ms->fdt, name, "interrupts-extended", aclint_mtimer_cells, aclint_cells_size); riscv_socket_fdt_write_id(ms, name, socket); @@ -430,8 +428,8 @@ static void create_fdt_socket_aclint(RISCVVirtState *s, qemu_fdt_add_subnode(ms->fdt, name); qemu_fdt_setprop_string(ms->fdt, name, "compatible", "riscv,aclint-sswi"); - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, addr, 0x0, s->memmap[VIRT_ACLINT_SSWI].size); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", + 2, addr, 2, s->memmap[VIRT_ACLINT_SSWI].size); qemu_fdt_setprop(ms->fdt, name, "interrupts-extended", aclint_sswi_cells, aclint_cells_size); qemu_fdt_setprop(ms->fdt, name, "interrupt-controller", NULL, 0); @@ -495,8 +493,8 @@ static void create_fdt_socket_plic(RISCVVirtState *s, s->soc[socket].num_harts * sizeof(uint32_t) * 4); } - qemu_fdt_setprop_cells(ms->fdt, plic_name, "reg", - 0x0, plic_addr, 0x0, s->memmap[VIRT_PLIC].size); + qemu_fdt_setprop_sized_cells(ms->fdt, plic_name, "reg", + 2, plic_addr, 2, s->memmap[VIRT_PLIC].size); qemu_fdt_setprop_cell(ms->fdt, plic_name, "riscv,ndev", VIRT_IRQCHIP_NUM_SOURCES - 1); riscv_socket_fdt_write_id(ms, plic_name, socket); @@ -657,8 +655,8 @@ static void create_fdt_one_aplic(RISCVVirtState *s, int socket, qemu_fdt_setprop_cell(ms->fdt, aplic_name, "msi-parent", msi_phandle); } - qemu_fdt_setprop_cells(ms->fdt, aplic_name, "reg", - 0x0, aplic_addr, 0x0, aplic_size); + qemu_fdt_setprop_sized_cells(ms->fdt, aplic_name, "reg", + 2, aplic_addr, 2, aplic_size); qemu_fdt_setprop_cell(ms->fdt, aplic_name, "riscv,num-sources", VIRT_IRQCHIP_NUM_SOURCES); @@ -858,9 +856,7 @@ static void create_fdt_virtio(RISCVVirtState *s, uint32_t irq_virtio_phandle) qemu_fdt_add_subnode(ms->fdt, name); qemu_fdt_setprop_string(ms->fdt, name, "compatible", "virtio,mmio"); - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, addr, - 0x0, size); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", 2, addr, 2, size); qemu_fdt_setprop_cell(ms->fdt, name, "interrupt-parent", irq_virtio_phandle); if (s->aia_type == VIRT_AIA_TYPE_NONE) { @@ -898,8 +894,8 @@ static void create_fdt_pcie(RISCVVirtState *s, if (s->aia_type == VIRT_AIA_TYPE_APLIC_IMSIC) { qemu_fdt_setprop_cell(ms->fdt, name, "msi-parent", msi_pcie_phandle); } - qemu_fdt_setprop_cells(ms->fdt, name, "reg", 0, - s->memmap[VIRT_PCIE_ECAM].base, 0, s->memmap[VIRT_PCIE_ECAM].size); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", 2, + s->memmap[VIRT_PCIE_ECAM].base, 2, s->memmap[VIRT_PCIE_ECAM].size); qemu_fdt_setprop_sized_cells(ms->fdt, name, "ranges", 1, FDT_PCI_RANGE_IOPORT, 2, 0, 2, s->memmap[VIRT_PCIE_PIO].base, 2, s->memmap[VIRT_PCIE_PIO].size, @@ -936,8 +932,9 @@ static void create_fdt_reset(RISCVVirtState *s, uint32_t *phandle) qemu_fdt_setprop_string_array(ms->fdt, name, "compatible", (char **)&compat, ARRAY_SIZE(compat)); } - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, s->memmap[VIRT_TEST].base, 0x0, s->memmap[VIRT_TEST].size); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", + 2, s->memmap[VIRT_TEST].base, + 2, s->memmap[VIRT_TEST].size); qemu_fdt_setprop_cell(ms->fdt, name, "phandle", test_phandle); test_phandle = qemu_fdt_get_phandle(ms->fdt, name); g_free(name); @@ -969,9 +966,9 @@ static void create_fdt_uart(RISCVVirtState *s, s->memmap[VIRT_UART0].base); qemu_fdt_add_subnode(ms->fdt, name); qemu_fdt_setprop_string(ms->fdt, name, "compatible", "ns16550a"); - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, s->memmap[VIRT_UART0].base, - 0x0, s->memmap[VIRT_UART0].size); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", + 2, s->memmap[VIRT_UART0].base, + 2, s->memmap[VIRT_UART0].size); qemu_fdt_setprop_cell(ms->fdt, name, "clock-frequency", 3686400); qemu_fdt_setprop_cell(ms->fdt, name, "interrupt-parent", irq_mmio_phandle); if (s->aia_type == VIRT_AIA_TYPE_NONE) { @@ -995,8 +992,9 @@ static void create_fdt_rtc(RISCVVirtState *s, qemu_fdt_add_subnode(ms->fdt, name); qemu_fdt_setprop_string(ms->fdt, name, "compatible", "google,goldfish-rtc"); - qemu_fdt_setprop_cells(ms->fdt, name, "reg", - 0x0, s->memmap[VIRT_RTC].base, 0x0, s->memmap[VIRT_RTC].size); + qemu_fdt_setprop_sized_cells(ms->fdt, name, "reg", + 2, s->memmap[VIRT_RTC].base, + 2, s->memmap[VIRT_RTC].size); qemu_fdt_setprop_cell(ms->fdt, name, "interrupt-parent", irq_mmio_phandle); if (s->aia_type == VIRT_AIA_TYPE_NONE) { @@ -1090,8 +1088,7 @@ static void create_fdt_iommu_sys(RISCVVirtState *s, uint32_t irq_chip, qemu_fdt_setprop_cell(fdt, iommu_node, "#iommu-cells", 1); qemu_fdt_setprop_cell(fdt, iommu_node, "phandle", iommu_phandle); - qemu_fdt_setprop_cells(fdt, iommu_node, "reg", - addr >> 32, addr, size >> 32, size); + qemu_fdt_setprop_sized_cells(fdt, iommu_node, "reg", 2, addr, 2, size); qemu_fdt_setprop_cell(fdt, iommu_node, "interrupt-parent", irq_chip); qemu_fdt_setprop_cells(fdt, iommu_node, "interrupts", diff --git a/hw/riscv/xiangshan_kmh.c b/hw/riscv/xiangshan_kmh.c new file mode 100644 index 0000000..a95fd61 --- /dev/null +++ b/hw/riscv/xiangshan_kmh.c @@ -0,0 +1,220 @@ +/* + * QEMU RISC-V Board Compatible with the Xiangshan Kunminghu + * FPGA prototype platform + * + * Copyright (c) 2025 Beijing Institute of Open Source Chip (BOSC) + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Provides a board compatible with the Xiangshan Kunminghu + * FPGA prototype platform: + * + * 0) UART (16550A) + * 1) CLINT (Core-Local Interruptor) + * 2) IMSIC (Incoming MSI Controller) + * 3) APLIC (Advanced Platform-Level Interrupt Controller) + * + * More information can be found in our Github repository: + * https://github.com/OpenXiangShan/XiangShan + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "system/address-spaces.h" +#include "hw/boards.h" +#include "hw/char/serial-mm.h" +#include "hw/intc/riscv_aclint.h" +#include "hw/intc/riscv_aplic.h" +#include "hw/intc/riscv_imsic.h" +#include "hw/qdev-properties.h" +#include "hw/riscv/boot.h" +#include "hw/riscv/xiangshan_kmh.h" +#include "hw/riscv/riscv_hart.h" +#include "system/system.h" + +static const MemMapEntry xiangshan_kmh_memmap[] = { + [XIANGSHAN_KMH_ROM] = { 0x1000, 0xF000 }, + [XIANGSHAN_KMH_UART0] = { 0x310B0000, 0x10000 }, + [XIANGSHAN_KMH_CLINT] = { 0x38000000, 0x10000 }, + [XIANGSHAN_KMH_APLIC_M] = { 0x31100000, 0x4000 }, + [XIANGSHAN_KMH_APLIC_S] = { 0x31120000, 0x4000 }, + [XIANGSHAN_KMH_IMSIC_M] = { 0x3A800000, 0x10000 }, + [XIANGSHAN_KMH_IMSIC_S] = { 0x3B000000, 0x80000 }, + [XIANGSHAN_KMH_DRAM] = { 0x80000000, 0x0 }, +}; + +static DeviceState *xiangshan_kmh_create_aia(uint32_t num_harts) +{ + int i; + const MemMapEntry *memmap = xiangshan_kmh_memmap; + hwaddr addr = 0; + DeviceState *aplic_m = NULL; + + /* M-level IMSICs */ + addr = memmap[XIANGSHAN_KMH_IMSIC_M].base; + for (i = 0; i < num_harts; i++) { + riscv_imsic_create(addr + i * IMSIC_HART_SIZE(0), i, true, + 1, XIANGSHAN_KMH_IMSIC_NUM_IDS); + } + + /* S-level IMSICs */ + addr = memmap[XIANGSHAN_KMH_IMSIC_S].base; + for (i = 0; i < num_harts; i++) { + riscv_imsic_create(addr + + i * IMSIC_HART_SIZE(XIANGSHAN_KMH_IMSIC_GUEST_BITS), + i, false, 1 + XIANGSHAN_KMH_IMSIC_GUEST_BITS, + XIANGSHAN_KMH_IMSIC_NUM_IDS); + } + + /* M-level APLIC */ + aplic_m = riscv_aplic_create(memmap[XIANGSHAN_KMH_APLIC_M].base, + memmap[XIANGSHAN_KMH_APLIC_M].size, + 0, 0, XIANGSHAN_KMH_APLIC_NUM_SOURCES, + 1, true, true, NULL); + + /* S-level APLIC */ + riscv_aplic_create(memmap[XIANGSHAN_KMH_APLIC_S].base, + memmap[XIANGSHAN_KMH_APLIC_S].size, + 0, 0, XIANGSHAN_KMH_APLIC_NUM_SOURCES, + 1, true, false, aplic_m); + + return aplic_m; +} + +static void xiangshan_kmh_soc_realize(DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + XiangshanKmhSoCState *s = XIANGSHAN_KMH_SOC(dev); + const MemMapEntry *memmap = xiangshan_kmh_memmap; + MemoryRegion *system_memory = get_system_memory(); + uint32_t num_harts = ms->smp.cpus; + + qdev_prop_set_uint32(DEVICE(&s->cpus), "num-harts", num_harts); + qdev_prop_set_uint32(DEVICE(&s->cpus), "hartid-base", 0); + qdev_prop_set_string(DEVICE(&s->cpus), "cpu-type", + TYPE_RISCV_CPU_XIANGSHAN_KMH); + sysbus_realize(SYS_BUS_DEVICE(&s->cpus), &error_fatal); + + /* AIA */ + s->irqchip = xiangshan_kmh_create_aia(num_harts); + + /* UART */ + serial_mm_init(system_memory, memmap[XIANGSHAN_KMH_UART0].base, 2, + qdev_get_gpio_in(s->irqchip, XIANGSHAN_KMH_UART0_IRQ), + 115200, serial_hd(0), DEVICE_LITTLE_ENDIAN); + + /* CLINT */ + riscv_aclint_swi_create(memmap[XIANGSHAN_KMH_CLINT].base, + 0, num_harts, false); + riscv_aclint_mtimer_create(memmap[XIANGSHAN_KMH_CLINT].base + + RISCV_ACLINT_SWI_SIZE, + RISCV_ACLINT_DEFAULT_MTIMER_SIZE, + 0, num_harts, RISCV_ACLINT_DEFAULT_MTIMECMP, + RISCV_ACLINT_DEFAULT_MTIME, + XIANGSHAN_KMH_CLINT_TIMEBASE_FREQ, true); + + /* ROM */ + memory_region_init_rom(&s->rom, OBJECT(dev), "xiangshan.kunminghu.rom", + memmap[XIANGSHAN_KMH_ROM].size, &error_fatal); + memory_region_add_subregion(system_memory, + memmap[XIANGSHAN_KMH_ROM].base, &s->rom); +} + +static void xiangshan_kmh_soc_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = xiangshan_kmh_soc_realize; + dc->user_creatable = false; +} + +static void xiangshan_kmh_soc_instance_init(Object *obj) +{ + XiangshanKmhSoCState *s = XIANGSHAN_KMH_SOC(obj); + + object_initialize_child(obj, "cpus", &s->cpus, TYPE_RISCV_HART_ARRAY); +} + +static const TypeInfo xiangshan_kmh_soc_info = { + .name = TYPE_XIANGSHAN_KMH_SOC, + .parent = TYPE_DEVICE, + .instance_size = sizeof(XiangshanKmhSoCState), + .instance_init = xiangshan_kmh_soc_instance_init, + .class_init = xiangshan_kmh_soc_class_init, +}; + +static void xiangshan_kmh_soc_register_types(void) +{ + type_register_static(&xiangshan_kmh_soc_info); +} +type_init(xiangshan_kmh_soc_register_types) + +static void xiangshan_kmh_machine_init(MachineState *machine) +{ + XiangshanKmhState *s = XIANGSHAN_KMH_MACHINE(machine); + const MemMapEntry *memmap = xiangshan_kmh_memmap; + MemoryRegion *system_memory = get_system_memory(); + hwaddr start_addr = memmap[XIANGSHAN_KMH_DRAM].base; + + /* Initialize SoC */ + object_initialize_child(OBJECT(machine), "soc", &s->soc, + TYPE_XIANGSHAN_KMH_SOC); + qdev_realize(DEVICE(&s->soc), NULL, &error_fatal); + + /* Register RAM */ + memory_region_add_subregion(system_memory, + memmap[XIANGSHAN_KMH_DRAM].base, + machine->ram); + + /* ROM reset vector */ + riscv_setup_rom_reset_vec(machine, &s->soc.cpus, + start_addr, + memmap[XIANGSHAN_KMH_ROM].base, + memmap[XIANGSHAN_KMH_ROM].size, 0, 0); + if (machine->firmware) { + riscv_load_firmware(machine->firmware, &start_addr, NULL); + } + + /* Note: dtb has been integrated into firmware(OpenSBI) when compiling */ +} + +static void xiangshan_kmh_machine_class_init(ObjectClass *klass, const void *data) +{ + MachineClass *mc = MACHINE_CLASS(klass); + static const char *const valid_cpu_types[] = { + TYPE_RISCV_CPU_XIANGSHAN_KMH, + NULL + }; + + mc->desc = "RISC-V Board compatible with the Xiangshan " \ + "Kunminghu FPGA prototype platform"; + mc->init = xiangshan_kmh_machine_init; + mc->max_cpus = XIANGSHAN_KMH_MAX_CPUS; + mc->default_cpu_type = TYPE_RISCV_CPU_XIANGSHAN_KMH; + mc->valid_cpu_types = valid_cpu_types; + mc->default_ram_id = "xiangshan.kunminghu.ram"; +} + +static const TypeInfo xiangshan_kmh_machine_info = { + .name = TYPE_XIANGSHAN_KMH_MACHINE, + .parent = TYPE_MACHINE, + .instance_size = sizeof(XiangshanKmhState), + .class_init = xiangshan_kmh_machine_class_init, +}; + +static void xiangshan_kmh_machine_register_types(void) +{ + type_register_static(&xiangshan_kmh_machine_info); +} +type_init(xiangshan_kmh_machine_register_types) diff --git a/hw/s390x/ap-stub.c b/hw/s390x/ap-stub.c new file mode 100644 index 0000000..001fe5f --- /dev/null +++ b/hw/s390x/ap-stub.c @@ -0,0 +1,21 @@ +/* + * VFIO based AP matrix device assignment + * + * Copyright 2025 IBM Corp. + * Author(s): Rorie Reyes <rreyes@linux.ibm.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/s390x/ap-bridge.h" + +int ap_chsc_sei_nt0_get_event(void *res) +{ + return EVENT_INFORMATION_NOT_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + return false; +} diff --git a/hw/s390x/ccw-device.c b/hw/s390x/ccw-device.c index 19c2238..8be1813 100644 --- a/hw/s390x/ccw-device.c +++ b/hw/s390x/ccw-device.c @@ -57,7 +57,7 @@ static void ccw_device_set_loadparm(Object *obj, Visitor *v, Error **errp) { CcwDevice *dev = CCW_DEVICE(obj); - char *val; + g_autofree char *val = NULL; int index; index = object_property_get_int(obj, "bootindex", NULL); diff --git a/hw/s390x/cpu-topology.c b/hw/s390x/cpu-topology.c index 7d4e1f5..b513f89 100644 --- a/hw/s390x/cpu-topology.c +++ b/hw/s390x/cpu-topology.c @@ -23,8 +23,8 @@ #include "target/s390x/cpu.h" #include "hw/s390x/s390-virtio-ccw.h" #include "hw/s390x/cpu-topology.h" -#include "qapi/qapi-commands-machine-target.h" -#include "qapi/qapi-events-machine-target.h" +#include "qapi/qapi-commands-machine-s390x.h" +#include "qapi/qapi-events-machine-s390x.h" /* * s390_topology is used to keep the topology information. diff --git a/hw/s390x/event-facility.c b/hw/s390x/event-facility.c index 7b7bf23..fee286e 100644 --- a/hw/s390x/event-facility.c +++ b/hw/s390x/event-facility.c @@ -4,6 +4,7 @@ * handles SCLP event types * - Signal Quiesce - system power down * - ASCII Console Data - VT220 read and write + * - Control-Program Identification - Send OS data from guest to host * * Copyright IBM, Corp. 2012 * @@ -40,6 +41,7 @@ struct SCLPEventFacility { SysBusDevice parent_obj; SCLPEventsBus sbus; SCLPEvent quiesce, cpu_hotplug; + SCLPEventCPI cpi; /* guest's receive mask */ union { uint32_t receive_mask_pieces[2]; diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build index 3bbebfd..1bc8583 100644 --- a/hw/s390x/meson.build +++ b/hw/s390x/meson.build @@ -13,6 +13,7 @@ s390x_ss.add(files( 's390-skeys.c', 's390-stattrib.c', 'sclp.c', + 'sclpcpi.c', 'sclpcpu.c', 'sclpquiesce.c', 'tod.c', @@ -33,6 +34,7 @@ s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files( )) s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c')) +s390x_ss.add(when: 'CONFIG_VFIO_AP', if_false: files('ap-stub.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio-ccw.c')) diff --git a/hw/s390x/s390-skeys.c b/hw/s390x/s390-skeys.c index aedb62b..8eeecfd 100644 --- a/hw/s390x/s390-skeys.c +++ b/hw/s390x/s390-skeys.c @@ -17,7 +17,6 @@ #include "hw/s390x/storage-keys.h" #include "qapi/error.h" #include "qapi/qapi-commands-machine.h" -#include "qapi/qapi-commands-misc-target.h" #include "qobject/qdict.h" #include "qemu/error-report.h" #include "system/memory_mapping.h" diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index f69a4d8..a79bd13 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -260,9 +260,21 @@ static void s390_create_sclpconsole(SCLPDevice *sclp, qdev_realize_and_unref(dev, ev_fac_bus, &error_fatal); } +static void s390_create_sclpcpi(SCLPDevice *sclp) +{ + SCLPEventFacility *ef = sclp->event_facility; + BusState *ev_fac_bus = sclp_get_event_facility_bus(ef); + DeviceState *dev; + + dev = qdev_new(TYPE_SCLP_EVENT_CPI); + object_property_add_child(OBJECT(ef), "sclpcpi", OBJECT(dev)); + qdev_realize_and_unref(dev, ev_fac_bus, &error_fatal); +} + static void ccw_init(MachineState *machine) { MachineClass *mc = MACHINE_GET_CLASS(machine); + S390CcwMachineClass *s390mc = S390_CCW_MACHINE_CLASS(mc); S390CcwMachineState *ms = S390_CCW_MACHINE(machine); int ret; VirtualCssBus *css_bus; @@ -323,6 +335,12 @@ static void ccw_init(MachineState *machine) /* init the TOD clock */ s390_init_tod(); + + /* init SCLP event Control-Program Identification */ + if (s390mc->use_cpi) { + s390_create_sclpcpi(ms->sclp); + } + } static void s390_cpu_plug(HotplugHandler *hotplug_dev, @@ -783,6 +801,7 @@ static void ccw_machine_class_init(ObjectClass *oc, const void *data) DumpSKeysInterface *dsi = DUMP_SKEYS_INTERFACE_CLASS(oc); s390mc->max_threads = 1; + s390mc->use_cpi = true; mc->reset = s390_machine_reset; mc->block_default_type = IF_VIRTIO; mc->no_cdrom = 1; @@ -908,6 +927,9 @@ static void ccw_machine_10_0_instance_options(MachineState *machine) static void ccw_machine_10_0_class_options(MachineClass *mc) { + S390CcwMachineClass *s390mc = S390_CCW_MACHINE_CLASS(mc); + s390mc->use_cpi = false; + ccw_machine_10_1_class_options(mc); compat_props_add(mc->compat_props, hw_compat_10_0, hw_compat_10_0_len); } @@ -1145,20 +1167,6 @@ static void ccw_machine_4_2_class_options(MachineClass *mc) } DEFINE_CCW_MACHINE(4, 2); -static void ccw_machine_4_1_instance_options(MachineState *machine) -{ - static const S390FeatInit qemu_cpu_feat = { S390_FEAT_LIST_QEMU_V4_1 }; - ccw_machine_4_2_instance_options(machine); - s390_set_qemu_cpu_model(0x2964, 13, 2, qemu_cpu_feat); -} - -static void ccw_machine_4_1_class_options(MachineClass *mc) -{ - ccw_machine_4_2_class_options(mc); - compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len); -} -DEFINE_CCW_MACHINE(4, 1); - static void ccw_machine_register_types(void) { type_register_static(&ccw_machine_info); diff --git a/hw/s390x/sclpcpi.c b/hw/s390x/sclpcpi.c new file mode 100644 index 0000000..7aa039d --- /dev/null +++ b/hw/s390x/sclpcpi.c @@ -0,0 +1,212 @@ + /* + * SPDX-License-Identifier: GPL-2.0-or-later + * + * SCLP event type 11 - Control-Program Identification (CPI): + * CPI is used to send program identifiers from the guest to the + * Service-Call Logical Processor (SCLP). It is not sent by the SCLP. + * + * Control-program identifiers provide data about the guest operating + * system. The control-program identifiers are: system type, system name, + * system level and sysplex name. + * + * In Linux, all the control-program identifiers are user configurable. The + * system type, system name, and sysplex name use EBCDIC characters from + * this set: capital A-Z, 0-9, $, @, #, and blank. In Linux, the system + * type, system name and sysplex name are arbitrary free-form texts. + * + * In Linux, the 8-byte hexadecimal system-level has the format + * 0x<a><b><cc><dd><eeee><ff><gg><hh>, where: + * <a>: is a 4-bit digit, its most significant bit indicates hypervisor use + * <b>: is one digit that represents Linux distributions as follows + * 0: generic Linux + * 1: Red Hat Enterprise Linux + * 2: SUSE Linux Enterprise Server + * 3: Canonical Ubuntu + * 4: Fedora + * 5: openSUSE Leap + * 6: Debian GNU/Linux + * 7: Red Hat Enterprise Linux CoreOS + * <cc>: are two digits for a distribution-specific encoding of the major + * version of the distribution + * <dd>: are two digits for a distribution-specific encoding of the minor + * version of the distribution + * <eeee>: are four digits for the patch level of the distribution + * <ff>: are two digits for the major version of the kernel + * <gg>: are two digits for the minor version of the kernel + * <hh>: are two digits for the stable version of the kernel + * (e.g. 74872343805430528, when converted to hex is 0x010a000000060b00). On + * machines prior to z16, some of the values are not available to display. + * + * Sysplex refers to a cluster of logical partitions that communicates and + * co-operates with each other. + * + * The CPI feature is supported since 10.1. + * + * Copyright IBM, Corp. 2024 + * + * Authors: + * Shalini Chellathurai Saroja <shalini@linux.ibm.com> + * + */ + +#include "qemu/osdep.h" +#include "qemu/timer.h" +#include "hw/s390x/event-facility.h" +#include "hw/s390x/ebcdic.h" +#include "qapi/qapi-visit-machine.h" +#include "migration/vmstate.h" + +typedef struct Data { + uint8_t id_format; + uint8_t reserved0; + uint8_t system_type[8]; + uint64_t reserved1; + uint8_t system_name[8]; + uint64_t reserved2; + uint64_t system_level; + uint64_t reserved3; + uint8_t sysplex_name[8]; + uint8_t reserved4[16]; +} QEMU_PACKED Data; + +typedef struct ControlProgramIdMsg { + EventBufferHeader ebh; + Data data; +} QEMU_PACKED ControlProgramIdMsg; + +static bool can_handle_event(uint8_t type) +{ + return type == SCLP_EVENT_CTRL_PGM_ID; +} + +static sccb_mask_t send_mask(void) +{ + return 0; +} + +/* Enable SCLP to accept buffers of event type CPI from the control-program. */ +static sccb_mask_t receive_mask(void) +{ + return SCLP_EVENT_MASK_CTRL_PGM_ID; +} + +static int write_event_data(SCLPEvent *event, EventBufferHeader *evt_buf_hdr) +{ + ControlProgramIdMsg *cpim = container_of(evt_buf_hdr, ControlProgramIdMsg, + ebh); + SCLPEventCPI *e = SCLP_EVENT_CPI(event); + + ascii_put(e->system_type, (char *)cpim->data.system_type, + sizeof(cpim->data.system_type)); + ascii_put(e->system_name, (char *)cpim->data.system_name, + sizeof(cpim->data.system_name)); + ascii_put(e->sysplex_name, (char *)cpim->data.sysplex_name, + sizeof(cpim->data.sysplex_name)); + e->system_level = ldq_be_p(&cpim->data.system_level); + e->timestamp = qemu_clock_get_ns(QEMU_CLOCK_HOST); + + cpim->ebh.flags = SCLP_EVENT_BUFFER_ACCEPTED; + return SCLP_RC_NORMAL_COMPLETION; +} + +static char *get_system_type(Object *obj, Error **errp) +{ + SCLPEventCPI *e = SCLP_EVENT_CPI(obj); + + return g_strndup((char *) e->system_type, sizeof(e->system_type)); +} + +static char *get_system_name(Object *obj, Error **errp) +{ + SCLPEventCPI *e = SCLP_EVENT_CPI(obj); + + return g_strndup((char *) e->system_name, sizeof(e->system_name)); +} + +static char *get_sysplex_name(Object *obj, Error **errp) +{ + SCLPEventCPI *e = SCLP_EVENT_CPI(obj); + + return g_strndup((char *) e->sysplex_name, sizeof(e->sysplex_name)); +} + +static void get_system_level(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + SCLPEventCPI *e = SCLP_EVENT_CPI(obj); + + visit_type_uint64(v, name, &e->system_level, errp); +} + +static void get_timestamp(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + SCLPEventCPI *e = SCLP_EVENT_CPI(obj); + + visit_type_uint64(v, name, &e->timestamp, errp); +} + +static const VMStateDescription vmstate_sclpcpi = { + .name = "s390_control_program_id", + .version_id = 0, + .fields = (const VMStateField[]) { + VMSTATE_UINT8_ARRAY(system_type, SCLPEventCPI, 8), + VMSTATE_UINT8_ARRAY(system_name, SCLPEventCPI, 8), + VMSTATE_UINT64(system_level, SCLPEventCPI), + VMSTATE_UINT8_ARRAY(sysplex_name, SCLPEventCPI, 8), + VMSTATE_UINT64(timestamp, SCLPEventCPI), + VMSTATE_END_OF_LIST() + } +}; + +static void cpi_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SCLPEventClass *k = SCLP_EVENT_CLASS(klass); + + dc->user_creatable = false; + dc->vmsd = &vmstate_sclpcpi; + + k->can_handle_event = can_handle_event; + k->get_send_mask = send_mask; + k->get_receive_mask = receive_mask; + k->write_event_data = write_event_data; + + object_class_property_add_str(klass, "system_type", get_system_type, NULL); + object_class_property_set_description(klass, "system_type", + "operating system e.g. \"LINUX \""); + + object_class_property_add_str(klass, "system_name", get_system_name, NULL); + object_class_property_set_description(klass, "system_name", + "user configurable name of the VM e.g. \"TESTVM \""); + + object_class_property_add_str(klass, "sysplex_name", get_sysplex_name, + NULL); + object_class_property_set_description(klass, "sysplex_name", + "name of the cluster which the VM belongs to, if any" + " e.g. \"PLEX \""); + + object_class_property_add(klass, "system_level", "uint64", get_system_level, + NULL, NULL, NULL); + object_class_property_set_description(klass, "system_level", + "distribution and kernel version in Linux e.g. 74872343805430528"); + + object_class_property_add(klass, "timestamp", "uint64", get_timestamp, + NULL, NULL, NULL); + object_class_property_set_description(klass, "timestamp", + "latest update of CPI data in nanoseconds since the UNIX EPOCH"); +} + +static const TypeInfo sclp_cpi_info = { + .name = TYPE_SCLP_EVENT_CPI, + .parent = TYPE_SCLP_EVENT, + .instance_size = sizeof(SCLPEventCPI), + .class_init = cpi_class_init, +}; + +static void sclp_cpi_register_types(void) +{ + type_register_static(&sclp_cpi_info); +} + +type_init(sclp_cpi_register_types) diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c index f4f2ef3..9ea4aa0 100644 --- a/hw/scsi/lsi53c895a.c +++ b/hw/scsi/lsi53c895a.c @@ -1112,7 +1112,7 @@ bad: static void lsi_memcpy(LSIState *s, uint32_t dest, uint32_t src, int count) { int n; - uint8_t buf[LSI_BUF_SIZE]; + QEMU_UNINITIALIZED uint8_t buf[LSI_BUF_SIZE]; trace_lsi_memcpy(dest, src, count); while (count) { diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index 55cd188..844643d 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -981,13 +981,11 @@ static int megasas_event_wait(MegasasState *s, MegasasCmd *cmd) static int megasas_dcmd_pd_get_list(MegasasState *s, MegasasCmd *cmd) { - struct mfi_pd_list info; - size_t dcmd_size = sizeof(info); + struct mfi_pd_list info = {}; BusChild *kid; uint32_t offset, dcmd_limit, num_pd_disks = 0, max_pd_disks; dma_addr_t residual; - memset(&info, 0, dcmd_size); offset = 8; dcmd_limit = offset + sizeof(struct mfi_pd_address); if (cmd->iov_size < dcmd_limit) { @@ -1429,11 +1427,10 @@ static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd) static int megasas_dcmd_get_properties(MegasasState *s, MegasasCmd *cmd) { - struct mfi_ctrl_props info; + struct mfi_ctrl_props info = {}; size_t dcmd_size = sizeof(info); dma_addr_t residual; - memset(&info, 0x0, dcmd_size); if (cmd->iov_size < dcmd_size) { trace_megasas_dcmd_invalid_xfer_len(cmd->index, cmd->iov_size, dcmd_size); diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index cb4af1b..b4782c6 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -74,7 +74,7 @@ struct SCSIDiskClass { */ DMAIOFunc *dma_readv; DMAIOFunc *dma_writev; - bool (*need_fua_emulation)(SCSICommand *cmd); + bool (*need_fua)(SCSICommand *cmd); void (*update_sense)(SCSIRequest *r); }; @@ -85,7 +85,7 @@ typedef struct SCSIDiskReq { uint32_t sector_count; uint32_t buflen; bool started; - bool need_fua_emulation; + bool need_fua; struct iovec iov; QEMUIOVector qiov; BlockAcctCookie acct; @@ -389,24 +389,6 @@ static bool scsi_is_cmd_fua(SCSICommand *cmd) } } -static void scsi_write_do_fua(SCSIDiskReq *r) -{ - SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); - - assert(r->req.aiocb == NULL); - assert(!r->req.io_canceled); - - if (r->need_fua_emulation) { - block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0, - BLOCK_ACCT_FLUSH); - r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_aio_complete, r); - return; - } - - scsi_req_complete(&r->req, GOOD); - scsi_req_unref(&r->req); -} - static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret) { assert(r->req.aiocb == NULL); @@ -416,12 +398,7 @@ static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret) r->sector += r->sector_count; r->sector_count = 0; - if (r->req.cmd.mode == SCSI_XFER_TO_DEV) { - scsi_write_do_fua(r); - return; - } else { - scsi_req_complete(&r->req, GOOD); - } + scsi_req_complete(&r->req, GOOD); done: scsi_req_unref(&r->req); @@ -564,7 +541,7 @@ static void scsi_read_data(SCSIRequest *req) first = !r->started; r->started = true; - if (first && r->need_fua_emulation) { + if (first && r->need_fua) { block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0, BLOCK_ACCT_FLUSH); r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r); @@ -589,8 +566,7 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret) r->sector += n; r->sector_count -= n; if (r->sector_count == 0) { - scsi_write_do_fua(r); - return; + scsi_req_complete(&r->req, GOOD); } else { scsi_init_iovec(r, SCSI_DMA_BUF_SIZE); trace_scsi_disk_write_complete_noio(r->req.tag, r->qiov.size); @@ -623,6 +599,7 @@ static void scsi_write_data(SCSIRequest *req) SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req); SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); + BlockCompletionFunc *cb; /* No data transfer may already be in progress */ assert(r->req.aiocb == NULL); @@ -648,11 +625,10 @@ static void scsi_write_data(SCSIRequest *req) if (r->req.cmd.buf[0] == VERIFY_10 || r->req.cmd.buf[0] == VERIFY_12 || r->req.cmd.buf[0] == VERIFY_16) { - if (r->req.sg) { - scsi_dma_complete_noio(r, 0); - } else { - scsi_write_complete_noio(r, 0); - } + block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0, + BLOCK_ACCT_FLUSH); + cb = r->req.sg ? scsi_dma_complete : scsi_write_complete; + r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, cb, r); return; } @@ -2391,7 +2367,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf) scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE)); return 0; } - r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd); + r->need_fua = sdc->need_fua(&r->req.cmd); if (r->sector_count == 0) { scsi_req_complete(&r->req, GOOD); } @@ -3137,7 +3113,8 @@ BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov, { SCSIDiskReq *r = opaque; SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); - return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque); + int flags = r->need_fua ? BDRV_REQ_FUA : 0; + return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags, cb, cb_opaque); } static char *scsi_property_get_loadparm(Object *obj, Error **errp) @@ -3186,7 +3163,7 @@ static void scsi_disk_base_class_initfn(ObjectClass *klass, const void *data) device_class_set_legacy_reset(dc, scsi_disk_reset); sdc->dma_readv = scsi_dma_readv; sdc->dma_writev = scsi_dma_writev; - sdc->need_fua_emulation = scsi_is_cmd_fua; + sdc->need_fua = scsi_is_cmd_fua; } static const TypeInfo scsi_disk_base_info = { @@ -3215,7 +3192,7 @@ static const Property scsi_hd_properties[] = { DEFINE_PROP_BIT("removable", SCSIDiskState, features, SCSI_DISK_F_REMOVABLE, false), DEFINE_PROP_BIT("dpofua", SCSIDiskState, features, - SCSI_DISK_F_DPOFUA, false), + SCSI_DISK_F_DPOFUA, true), DEFINE_PROP_UINT64("wwn", SCSIDiskState, qdev.wwn, 0), DEFINE_PROP_UINT64("port_wwn", SCSIDiskState, qdev.port_wwn, 0), DEFINE_PROP_UINT16("port_index", SCSIDiskState, port_index, 0), @@ -3338,7 +3315,7 @@ static void scsi_block_class_initfn(ObjectClass *klass, const void *data) sdc->dma_readv = scsi_block_dma_readv; sdc->dma_writev = scsi_block_dma_writev; sdc->update_sense = scsi_block_update_sense; - sdc->need_fua_emulation = scsi_block_no_fua; + sdc->need_fua = scsi_block_no_fua; dc->desc = "SCSI block device passthrough"; device_class_set_props(dc, scsi_block_properties); dc->vmsd = &vmstate_scsi_disk_state; diff --git a/hw/scsi/vmw_pvscsi.c b/hw/scsi/vmw_pvscsi.c index d5825b6..7c98b1b 100644 --- a/hw/scsi/vmw_pvscsi.c +++ b/hw/scsi/vmw_pvscsi.c @@ -68,18 +68,7 @@ struct PVSCSIClass { OBJECT_DECLARE_TYPE(PVSCSIState, PVSCSIClass, PVSCSI) -/* Compatibility flags for migration */ -#define PVSCSI_COMPAT_OLD_PCI_CONFIGURATION_BIT 0 -#define PVSCSI_COMPAT_OLD_PCI_CONFIGURATION \ - (1 << PVSCSI_COMPAT_OLD_PCI_CONFIGURATION_BIT) -#define PVSCSI_COMPAT_DISABLE_PCIE_BIT 1 -#define PVSCSI_COMPAT_DISABLE_PCIE \ - (1 << PVSCSI_COMPAT_DISABLE_PCIE_BIT) - -#define PVSCSI_USE_OLD_PCI_CONFIGURATION(s) \ - ((s)->compat_flags & PVSCSI_COMPAT_OLD_PCI_CONFIGURATION) -#define PVSCSI_MSI_OFFSET(s) \ - (PVSCSI_USE_OLD_PCI_CONFIGURATION(s) ? 0x50 : 0x7c) +#define PVSCSI_MSI_OFFSET (0x7c) #define PVSCSI_EXP_EP_OFFSET (0x40) typedef struct PVSCSIRingInfo { @@ -129,8 +118,6 @@ struct PVSCSIState { uint8_t msi_used; /* For migration compatibility */ PVSCSIRingInfo rings; /* Data transfer rings manager */ uint32_t resetting; /* Reset in progress */ - - uint32_t compat_flags; }; typedef struct PVSCSIRequest { @@ -1110,7 +1097,7 @@ pvscsi_init_msi(PVSCSIState *s) int res; PCIDevice *d = PCI_DEVICE(s); - res = msi_init(d, PVSCSI_MSI_OFFSET(s), PVSCSI_MSIX_NUM_VECTORS, + res = msi_init(d, PVSCSI_MSI_OFFSET, PVSCSI_MSIX_NUM_VECTORS, PVSCSI_USE_64BIT, PVSCSI_PER_VECTOR_MASK, NULL); if (res < 0) { trace_pvscsi_init_msi_fail(res); @@ -1158,15 +1145,11 @@ pvscsi_realizefn(PCIDevice *pci_dev, Error **errp) trace_pvscsi_state("init"); /* PCI subsystem ID, subsystem vendor ID, revision */ - if (PVSCSI_USE_OLD_PCI_CONFIGURATION(s)) { - pci_set_word(pci_dev->config + PCI_SUBSYSTEM_ID, 0x1000); - } else { - pci_set_word(pci_dev->config + PCI_SUBSYSTEM_VENDOR_ID, - PCI_VENDOR_ID_VMWARE); - pci_set_word(pci_dev->config + PCI_SUBSYSTEM_ID, - PCI_DEVICE_ID_VMWARE_PVSCSI); - pci_config_set_revision(pci_dev->config, 0x2); - } + pci_set_word(pci_dev->config + PCI_SUBSYSTEM_VENDOR_ID, + PCI_VENDOR_ID_VMWARE); + pci_set_word(pci_dev->config + PCI_SUBSYSTEM_ID, + PCI_DEVICE_ID_VMWARE_PVSCSI); + pci_config_set_revision(pci_dev->config, 0x2); /* PCI latency timer = 255 */ pci_dev->config[PCI_LATENCY_TIMER] = 0xff; @@ -1234,21 +1217,8 @@ pvscsi_post_load(void *opaque, int version_id) return 0; } -static bool pvscsi_vmstate_need_pcie_device(void *opaque) -{ - PVSCSIState *s = PVSCSI(opaque); - - return !(s->compat_flags & PVSCSI_COMPAT_DISABLE_PCIE); -} - -static bool pvscsi_vmstate_test_pci_device(void *opaque, int version_id) -{ - return !pvscsi_vmstate_need_pcie_device(opaque); -} - static const VMStateDescription vmstate_pvscsi_pcie_device = { .name = "pvscsi/pcie", - .needed = pvscsi_vmstate_need_pcie_device, .fields = (const VMStateField[]) { VMSTATE_PCI_DEVICE(parent_obj, PVSCSIState), VMSTATE_END_OF_LIST() @@ -1262,9 +1232,6 @@ static const VMStateDescription vmstate_pvscsi = { .pre_save = pvscsi_pre_save, .post_load = pvscsi_post_load, .fields = (const VMStateField[]) { - VMSTATE_STRUCT_TEST(parent_obj, PVSCSIState, - pvscsi_vmstate_test_pci_device, 0, - vmstate_pci_device, PCIDevice), VMSTATE_UINT8(msi_used, PVSCSIState), VMSTATE_UINT32(resetting, PVSCSIState), VMSTATE_UINT64(reg_interrupt_status, PVSCSIState), @@ -1298,30 +1265,17 @@ static const VMStateDescription vmstate_pvscsi = { static const Property pvscsi_properties[] = { DEFINE_PROP_UINT8("use_msg", PVSCSIState, use_msg, 1), - DEFINE_PROP_BIT("x-old-pci-configuration", PVSCSIState, compat_flags, - PVSCSI_COMPAT_OLD_PCI_CONFIGURATION_BIT, false), - DEFINE_PROP_BIT("x-disable-pcie", PVSCSIState, compat_flags, - PVSCSI_COMPAT_DISABLE_PCIE_BIT, false), }; -static void pvscsi_realize(DeviceState *qdev, Error **errp) +static void pvscsi_instance_init(Object *obj) { - PVSCSIClass *pvs_c = PVSCSI_GET_CLASS(qdev); - PCIDevice *pci_dev = PCI_DEVICE(qdev); - PVSCSIState *s = PVSCSI(qdev); - - if (!(s->compat_flags & PVSCSI_COMPAT_DISABLE_PCIE)) { - pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; - } - - pvs_c->parent_dc_realize(qdev, errp); + PCI_DEVICE(obj)->cap_present |= QEMU_PCI_CAP_EXPRESS; } static void pvscsi_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); - PVSCSIClass *pvs_k = PVSCSI_CLASS(klass); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass); k->realize = pvscsi_realizefn; @@ -1330,8 +1284,6 @@ static void pvscsi_class_init(ObjectClass *klass, const void *data) k->device_id = PCI_DEVICE_ID_VMWARE_PVSCSI; k->class_id = PCI_CLASS_STORAGE_SCSI; k->subsystem_id = 0x1000; - device_class_set_parent_realize(dc, pvscsi_realize, - &pvs_k->parent_dc_realize); device_class_set_legacy_reset(dc, pvscsi_reset); dc->vmsd = &vmstate_pvscsi; device_class_set_props(dc, pvscsi_properties); @@ -1346,6 +1298,7 @@ static const TypeInfo pvscsi_info = { .class_size = sizeof(PVSCSIClass), .instance_size = sizeof(PVSCSIState), .class_init = pvscsi_class_init, + .instance_init = pvscsi_instance_init, .interfaces = (const InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, { INTERFACE_PCIE_DEVICE }, diff --git a/hw/scsi/vmw_pvscsi.h b/hw/scsi/vmw_pvscsi.h index 17fcf66..a3ae517 100644 --- a/hw/scsi/vmw_pvscsi.h +++ b/hw/scsi/vmw_pvscsi.h @@ -14,8 +14,8 @@ * details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * along with this program; if not, see + * <https://www.gnu.org/licenses/>. * * Maintained by: Arvind Kumar <arvindkumar@vmware.com> * diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index d1b7bc5..cb48cc1 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -328,16 +328,16 @@ static const VMStateDescription vmstate_hpet_timer = { static const VMStateDescription vmstate_hpet = { .name = "hpet", .version_id = 2, - .minimum_version_id = 1, + .minimum_version_id = 2, .pre_save = hpet_pre_save, .post_load = hpet_post_load, .fields = (const VMStateField[]) { VMSTATE_UINT64(config, HPETState), VMSTATE_UINT64(isr, HPETState), VMSTATE_UINT64(hpet_counter, HPETState), - VMSTATE_UINT8_V(num_timers_save, HPETState, 2), + VMSTATE_UINT8(num_timers_save, HPETState), VMSTATE_VALIDATE("num_timers must match", hpet_validate_num_timers), - VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers, 0, + VMSTATE_STRUCT_VARRAY_UINT8(timer, HPETState, num_timers_save, 0, vmstate_hpet_timer, HPETTimer), VMSTATE_END_OF_LIST() }, @@ -426,30 +426,11 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, uint64_t cur_tick; trace_hpet_ram_read(addr); + addr &= ~4; - /*address range of all TN regs*/ - if (addr >= 0x100 && addr <= 0x3ff) { - uint8_t timer_id = (addr - 0x100) / 0x20; - HPETTimer *timer = &s->timer[timer_id]; - - if (timer_id > s->num_timers) { - trace_hpet_timer_id_out_of_range(timer_id); - return 0; - } - - switch (addr & 0x18) { - case HPET_TN_CFG: // including interrupt capabilities - return timer->config >> shift; - case HPET_TN_CMP: // comparator register - return timer->cmp >> shift; - case HPET_TN_ROUTE: - return timer->fsb >> shift; - default: - trace_hpet_ram_read_invalid(); - break; - } - } else { - switch (addr & ~4) { + /*address range of all global regs*/ + if (addr <= 0xff) { + switch (addr) { case HPET_ID: // including HPET_PERIOD return s->capability >> shift; case HPET_CFG: @@ -468,6 +449,26 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, trace_hpet_ram_read_invalid(); break; } + } else { + uint8_t timer_id = (addr - 0x100) / 0x20; + HPETTimer *timer = &s->timer[timer_id]; + + if (timer_id > s->num_timers) { + trace_hpet_timer_id_out_of_range(timer_id); + return 0; + } + + switch (addr & 0x1f) { + case HPET_TN_CFG: // including interrupt capabilities + return timer->config >> shift; + case HPET_TN_CMP: // comparator register + return timer->cmp >> shift; + case HPET_TN_ROUTE: + return timer->fsb >> shift; + default: + trace_hpet_ram_read_invalid(); + break; + } } return 0; } @@ -482,9 +483,67 @@ static void hpet_ram_write(void *opaque, hwaddr addr, uint64_t old_val, new_val, cleared; trace_hpet_ram_write(addr, value); + addr &= ~4; - /*address range of all TN regs*/ - if (addr >= 0x100 && addr <= 0x3ff) { + /*address range of all global regs*/ + if (addr <= 0xff) { + switch (addr) { + case HPET_ID: + return; + case HPET_CFG: + old_val = s->config; + new_val = deposit64(old_val, shift, len, value); + new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK); + s->config = new_val; + if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) { + /* Enable main counter and interrupt generation. */ + s->hpet_offset = + ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + for (i = 0; i < s->num_timers; i++) { + if (timer_enabled(&s->timer[i]) && (s->isr & (1 << i))) { + update_irq(&s->timer[i], 1); + } + hpet_set_timer(&s->timer[i]); + } + } else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) { + /* Halt main counter and disable interrupt generation. */ + s->hpet_counter = hpet_get_ticks(s); + for (i = 0; i < s->num_timers; i++) { + hpet_del_timer(&s->timer[i]); + } + } + /* i8254 and RTC output pins are disabled + * when HPET is in legacy mode */ + if (activating_bit(old_val, new_val, HPET_CFG_LEGACY)) { + qemu_set_irq(s->pit_enabled, 0); + qemu_irq_lower(s->irqs[0]); + qemu_irq_lower(s->irqs[RTC_ISA_IRQ]); + } else if (deactivating_bit(old_val, new_val, HPET_CFG_LEGACY)) { + qemu_irq_lower(s->irqs[0]); + qemu_set_irq(s->pit_enabled, 1); + qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level); + } + break; + case HPET_STATUS: + new_val = value << shift; + cleared = new_val & s->isr; + for (i = 0; i < s->num_timers; i++) { + if (cleared & (1 << i)) { + update_irq(&s->timer[i], 0); + } + } + break; + case HPET_COUNTER: + if (hpet_enabled(s)) { + trace_hpet_ram_write_counter_write_while_enabled(); + } + s->hpet_counter = deposit64(s->hpet_counter, shift, len, value); + break; + default: + trace_hpet_ram_write_invalid(); + break; + } + } else { uint8_t timer_id = (addr - 0x100) / 0x20; HPETTimer *timer = &s->timer[timer_id]; @@ -550,63 +609,6 @@ static void hpet_ram_write(void *opaque, hwaddr addr, break; } return; - } else { - switch (addr & ~4) { - case HPET_ID: - return; - case HPET_CFG: - old_val = s->config; - new_val = deposit64(old_val, shift, len, value); - new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK); - s->config = new_val; - if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) { - /* Enable main counter and interrupt generation. */ - s->hpet_offset = - ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); - for (i = 0; i < s->num_timers; i++) { - if (timer_enabled(&s->timer[i]) && (s->isr & (1 << i))) { - update_irq(&s->timer[i], 1); - } - hpet_set_timer(&s->timer[i]); - } - } else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) { - /* Halt main counter and disable interrupt generation. */ - s->hpet_counter = hpet_get_ticks(s); - for (i = 0; i < s->num_timers; i++) { - hpet_del_timer(&s->timer[i]); - } - } - /* i8254 and RTC output pins are disabled - * when HPET is in legacy mode */ - if (activating_bit(old_val, new_val, HPET_CFG_LEGACY)) { - qemu_set_irq(s->pit_enabled, 0); - qemu_irq_lower(s->irqs[0]); - qemu_irq_lower(s->irqs[RTC_ISA_IRQ]); - } else if (deactivating_bit(old_val, new_val, HPET_CFG_LEGACY)) { - qemu_irq_lower(s->irqs[0]); - qemu_set_irq(s->pit_enabled, 1); - qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level); - } - break; - case HPET_STATUS: - new_val = value << shift; - cleared = new_val & s->isr; - for (i = 0; i < s->num_timers; i++) { - if (cleared & (1 << i)) { - update_irq(&s->timer[i], 0); - } - } - break; - case HPET_COUNTER: - if (hpet_enabled(s)) { - trace_hpet_ram_write_counter_write_while_enabled(); - } - s->hpet_counter = deposit64(s->hpet_counter, shift, len, value); - break; - default: - trace_hpet_ram_write_invalid(); - break; - } } } @@ -689,8 +691,14 @@ static void hpet_realize(DeviceState *dev, Error **errp) int i; HPETTimer *timer; + if (s->num_timers < HPET_MIN_TIMERS || s->num_timers > HPET_MAX_TIMERS) { + error_setg(errp, "hpet.num_timers must be between %d and %d", + HPET_MIN_TIMERS, HPET_MAX_TIMERS); + return; + } if (!s->intcap) { - warn_report("Hpet's intcap not initialized"); + error_setg(errp, "hpet.hpet-intcap not initialized"); + return; } if (hpet_fw_cfg.count == UINT8_MAX) { /* first instance */ @@ -698,7 +706,7 @@ static void hpet_realize(DeviceState *dev, Error **errp) } if (hpet_fw_cfg.count == 8) { - error_setg(errp, "Only 8 instances of HPET is allowed"); + error_setg(errp, "Only 8 instances of HPET are allowed"); return; } @@ -708,11 +716,6 @@ static void hpet_realize(DeviceState *dev, Error **errp) sysbus_init_irq(sbd, &s->irqs[i]); } - if (s->num_timers < HPET_MIN_TIMERS) { - s->num_timers = HPET_MIN_TIMERS; - } else if (s->num_timers > HPET_MAX_TIMERS) { - s->num_timers = HPET_MAX_TIMERS; - } for (i = 0; i < HPET_MAX_TIMERS; i++) { timer = &s->timer[i]; timer->qemu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, hpet_timer, timer); diff --git a/hw/ufs/lu.c b/hw/ufs/lu.c index 57b307e..2d8ffd7 100644 --- a/hw/ufs/lu.c +++ b/hw/ufs/lu.c @@ -194,7 +194,7 @@ static int ufs_emulate_wlun_inquiry(UfsRequest *req, uint8_t *outbuf, static UfsReqResult ufs_emulate_scsi_cmd(UfsLu *lu, UfsRequest *req) { uint8_t lun = lu->lun; - uint8_t outbuf[4096]; + QEMU_UNINITIALIZED uint8_t outbuf[4096]; uint8_t sense_buf[UFS_SENSE_SIZE]; uint8_t scsi_status; int len = 0; diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c index 71b5491..72a9f9f 100644 --- a/hw/usb/hcd-ohci.c +++ b/hw/usb/hcd-ohci.c @@ -577,7 +577,7 @@ static int ohci_service_iso_td(OHCIState *ohci, struct ohci_ed *ed) USBDevice *dev; USBEndpoint *ep; USBPacket *pkt; - uint8_t buf[8192]; + QEMU_UNINITIALIZED uint8_t buf[8192]; bool int_req; struct ohci_iso_td iso_td; uint32_t addr; diff --git a/hw/vfio-user/Kconfig b/hw/vfio-user/Kconfig new file mode 100644 index 0000000..24bdf7a --- /dev/null +++ b/hw/vfio-user/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config VFIO_USER + bool + default y + depends on VFIO_PCI + diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c new file mode 100644 index 0000000..d318e6a --- /dev/null +++ b/hw/vfio-user/container.c @@ -0,0 +1,361 @@ +/* + * Container for vfio-user IOMMU type: rather than communicating with the kernel + * vfio driver, we communicate over a socket to a server using the vfio-user + * protocol. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include "qemu/osdep.h" + +#include "hw/vfio-user/container.h" +#include "hw/vfio-user/device.h" +#include "hw/vfio-user/trace.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-listener.h" +#include "qapi/error.h" + +/* + * When DMA space is the physical address space, the region add/del listeners + * will fire during memory update transactions. These depend on BQL being held, + * so do any resulting map/demap ops async while keeping BQL. + */ +static void vfio_user_listener_begin(VFIOContainerBase *bcontainer) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + container->proxy->async_ops = true; +} + +static void vfio_user_listener_commit(VFIOContainerBase *bcontainer) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + /* wait here for any async requests sent during the transaction */ + container->proxy->async_ops = false; + vfio_user_wait_reqs(container->proxy); +} + +static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + Error *local_err = NULL; + int ret = 0; + + VFIOUserDMAUnmap *msgp = g_malloc(sizeof(*msgp)); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DMA_UNMAP, sizeof(*msgp), 0); + msgp->argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + msgp->flags = unmap_all ? VFIO_DMA_UNMAP_FLAG_ALL : 0; + msgp->iova = iova; + msgp->size = size; + trace_vfio_user_dma_unmap(msgp->iova, msgp->size, msgp->flags, + container->proxy->async_ops); + + if (container->proxy->async_ops) { + if (!vfio_user_send_nowait(container->proxy, &msgp->hdr, NULL, + 0, &local_err)) { + error_report_err(local_err); + ret = -EFAULT; + } else { + ret = 0; + } + } else { + if (!vfio_user_send_wait(container->proxy, &msgp->hdr, NULL, + 0, &local_err)) { + error_report_err(local_err); + ret = -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + ret = -msgp->hdr.error_reply; + } + + g_free(msgp); + } + + return ret; +} + +static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mrp) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + int fd = memory_region_get_fd(mrp); + Error *local_err = NULL; + int ret; + + VFIOUserFDs *fds = NULL; + VFIOUserDMAMap *msgp = g_malloc0(sizeof(*msgp)); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DMA_MAP, sizeof(*msgp), 0); + msgp->argsz = sizeof(struct vfio_iommu_type1_dma_map); + msgp->flags = VFIO_DMA_MAP_FLAG_READ; + msgp->offset = 0; + msgp->iova = iova; + msgp->size = size; + + /* + * vaddr enters as a QEMU process address; make it either a file offset + * for mapped areas or leave as 0. + */ + if (fd != -1) { + msgp->offset = qemu_ram_block_host_offset(mrp->ram_block, vaddr); + } + + if (!readonly) { + msgp->flags |= VFIO_DMA_MAP_FLAG_WRITE; + } + + trace_vfio_user_dma_map(msgp->iova, msgp->size, msgp->offset, msgp->flags, + container->proxy->async_ops); + + /* + * The async_ops case sends without blocking. They're later waited for in + * vfio_send_wait_reqs. + */ + if (container->proxy->async_ops) { + /* can't use auto variable since we don't block */ + if (fd != -1) { + fds = vfio_user_getfds(1); + fds->send_fds = 1; + fds->fds[0] = fd; + } + + if (!vfio_user_send_nowait(container->proxy, &msgp->hdr, fds, + 0, &local_err)) { + error_report_err(local_err); + ret = -EFAULT; + } else { + ret = 0; + } + } else { + VFIOUserFDs local_fds = { 1, 0, &fd }; + + fds = fd != -1 ? &local_fds : NULL; + + if (!vfio_user_send_wait(container->proxy, &msgp->hdr, fds, + 0, &local_err)) { + error_report_err(local_err); + ret = -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + ret = -msgp->hdr.error_reply; + } + + g_free(msgp); + } + + return ret; +} + +static int +vfio_user_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start, Error **errp) +{ + error_setg_errno(errp, ENOTSUP, "Not supported"); + return -ENOTSUP; +} + +static int vfio_user_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size, Error **errp) +{ + error_setg_errno(errp, ENOTSUP, "Not supported"); + return -ENOTSUP; +} + +static bool vfio_user_setup(VFIOContainerBase *bcontainer, Error **errp) +{ + VFIOUserContainer *container = container_of(bcontainer, VFIOUserContainer, + bcontainer); + + assert(container->proxy->dma_pgsizes != 0); + bcontainer->pgsizes = container->proxy->dma_pgsizes; + bcontainer->dma_max_mappings = container->proxy->max_dma; + + /* No live migration support yet. */ + bcontainer->dirty_pages_supported = false; + bcontainer->max_dirty_bitmap_size = container->proxy->max_bitmap; + bcontainer->dirty_pgsizes = container->proxy->migr_pgsize; + + return true; +} + +static VFIOUserContainer *vfio_user_create_container(VFIODevice *vbasedev, + Error **errp) +{ + VFIOUserContainer *container; + + container = VFIO_IOMMU_USER(object_new(TYPE_VFIO_IOMMU_USER)); + container->proxy = vbasedev->proxy; + return container; +} + +/* + * Try to mirror vfio_container_connect() as much as possible. + */ +static VFIOUserContainer * +vfio_user_container_connect(AddressSpace *as, VFIODevice *vbasedev, + Error **errp) +{ + VFIOContainerBase *bcontainer; + VFIOUserContainer *container; + VFIOAddressSpace *space; + VFIOIOMMUClass *vioc; + int ret; + + space = vfio_address_space_get(as); + + container = vfio_user_create_container(vbasedev, errp); + if (!container) { + goto put_space_exit; + } + + bcontainer = &container->bcontainer; + + ret = ram_block_uncoordinated_discard_disable(true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + goto free_container_exit; + } + + vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + assert(vioc->setup); + + if (!vioc->setup(bcontainer, errp)) { + goto enable_discards_exit; + } + + vfio_address_space_insert(space, bcontainer); + + if (!vfio_listener_register(bcontainer, errp)) { + goto listener_release_exit; + } + + bcontainer->initialized = true; + + return container; + +listener_release_exit: + vfio_listener_unregister(bcontainer); + if (vioc->release) { + vioc->release(bcontainer); + } + +enable_discards_exit: + ram_block_uncoordinated_discard_disable(false); + +free_container_exit: + object_unref(container); + +put_space_exit: + vfio_address_space_put(space); + + return NULL; +} + +static void vfio_user_container_disconnect(VFIOUserContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + VFIOAddressSpace *space = bcontainer->space; + + ram_block_uncoordinated_discard_disable(false); + + vfio_listener_unregister(bcontainer); + if (vioc->release) { + vioc->release(bcontainer); + } + + object_unref(container); + + vfio_address_space_put(space); +} + +static bool vfio_user_device_get(VFIOUserContainer *container, + VFIODevice *vbasedev, Error **errp) +{ + struct vfio_device_info info = { .argsz = sizeof(info) }; + + + if (!vfio_user_get_device_info(vbasedev->proxy, &info, errp)) { + return false; + } + + vbasedev->fd = -1; + + vfio_device_prepare(vbasedev, &container->bcontainer, &info); + + return true; +} + +/* + * vfio_user_device_attach: attach a device to a new container. + */ +static bool vfio_user_device_attach(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + VFIOUserContainer *container; + + container = vfio_user_container_connect(as, vbasedev, errp); + if (container == NULL) { + error_prepend(errp, "failed to connect proxy"); + return false; + } + + return vfio_user_device_get(container, vbasedev, errp); +} + +static void vfio_user_device_detach(VFIODevice *vbasedev) +{ + VFIOUserContainer *container = container_of(vbasedev->bcontainer, + VFIOUserContainer, bcontainer); + + vfio_device_unprepare(vbasedev); + + vfio_user_container_disconnect(container); +} + +static int vfio_user_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + /* ->needs_reset is always false for vfio-user. */ + return 0; +} + +static void vfio_iommu_user_class_init(ObjectClass *klass, const void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->setup = vfio_user_setup; + vioc->listener_begin = vfio_user_listener_begin, + vioc->listener_commit = vfio_user_listener_commit, + vioc->dma_map = vfio_user_dma_map; + vioc->dma_unmap = vfio_user_dma_unmap; + vioc->attach_device = vfio_user_device_attach; + vioc->detach_device = vfio_user_device_detach; + vioc->set_dirty_page_tracking = vfio_user_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_user_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_user_pci_hot_reset; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_USER, + .parent = TYPE_VFIO_IOMMU, + .instance_size = sizeof(VFIOUserContainer), + .class_init = vfio_iommu_user_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio-user/container.h b/hw/vfio-user/container.h new file mode 100644 index 0000000..2bb1fa1 --- /dev/null +++ b/hw/vfio-user/container.h @@ -0,0 +1,23 @@ +/* + * vfio-user specific definitions. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_USER_CONTAINER_H +#define HW_VFIO_USER_CONTAINER_H + +#include "qemu/osdep.h" + +#include "hw/vfio/vfio-container-base.h" +#include "hw/vfio-user/proxy.h" + +/* MMU container sub-class for vfio-user. */ +typedef struct VFIOUserContainer { + VFIOContainerBase bcontainer; + VFIOUserProxy *proxy; +} VFIOUserContainer; + +OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserContainer, VFIO_IOMMU_USER); + +#endif /* HW_VFIO_USER_CONTAINER_H */ diff --git a/hw/vfio-user/device.c b/hw/vfio-user/device.c new file mode 100644 index 0000000..0609a7d --- /dev/null +++ b/hw/vfio-user/device.c @@ -0,0 +1,441 @@ +/* + * vfio protocol over a UNIX socket device handling. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/thread.h" + +#include "hw/vfio-user/device.h" +#include "hw/vfio-user/trace.h" + +/* + * These are to defend against a malign server trying + * to force us to run out of memory. + */ +#define VFIO_USER_MAX_REGIONS 100 +#define VFIO_USER_MAX_IRQS 50 + +bool vfio_user_get_device_info(VFIOUserProxy *proxy, + struct vfio_device_info *info, Error **errp) +{ + VFIOUserDeviceInfo msg; + uint32_t argsz = sizeof(msg) - sizeof(msg.hdr); + + memset(&msg, 0, sizeof(msg)); + vfio_user_request_msg(&msg.hdr, VFIO_USER_DEVICE_GET_INFO, sizeof(msg), 0); + msg.argsz = argsz; + + if (!vfio_user_send_wait(proxy, &msg.hdr, NULL, 0, errp)) { + return false; + } + + if (msg.hdr.flags & VFIO_USER_ERROR) { + error_setg_errno(errp, -msg.hdr.error_reply, + "VFIO_USER_DEVICE_GET_INFO failed"); + return false; + } + + trace_vfio_user_get_info(msg.num_regions, msg.num_irqs); + + memcpy(info, &msg.argsz, argsz); + + /* defend against a malicious server */ + if (info->num_regions > VFIO_USER_MAX_REGIONS || + info->num_irqs > VFIO_USER_MAX_IRQS) { + error_setg_errno(errp, EINVAL, "invalid reply"); + return false; + } + + return true; +} + +void vfio_user_device_reset(VFIOUserProxy *proxy) +{ + Error *local_err = NULL; + VFIOUserHdr hdr; + + vfio_user_request_msg(&hdr, VFIO_USER_DEVICE_RESET, sizeof(hdr), 0); + + if (!vfio_user_send_wait(proxy, &hdr, NULL, 0, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return; + } + + if (hdr.flags & VFIO_USER_ERROR) { + error_printf("reset reply error %d\n", hdr.error_reply); + } +} + +static int vfio_user_get_region_info(VFIOUserProxy *proxy, + struct vfio_region_info *info, + VFIOUserFDs *fds) +{ + g_autofree VFIOUserRegionInfo *msgp = NULL; + Error *local_err = NULL; + uint32_t size; + + /* data returned can be larger than vfio_region_info */ + if (info->argsz < sizeof(*info)) { + error_printf("vfio_user_get_region_info argsz too small\n"); + return -E2BIG; + } + if (fds != NULL && fds->send_fds != 0) { + error_printf("vfio_user_get_region_info can't send FDs\n"); + return -EINVAL; + } + + size = info->argsz + sizeof(VFIOUserHdr); + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_GET_REGION_INFO, + sizeof(*msgp), 0); + msgp->argsz = info->argsz; + msgp->index = info->index; + + if (!vfio_user_send_wait(proxy, &msgp->hdr, fds, size, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } + trace_vfio_user_get_region_info(msgp->index, msgp->flags, msgp->size); + + memcpy(info, &msgp->argsz, info->argsz); + + /* + * If at least one region is directly mapped into the VM, then we can no + * longer rely on the sequential nature of vfio-user request handling to + * ensure that posted writes are completed before a subsequent read. In this + * case, disable posted write support. This is a per-device property, not + * per-region. + */ + if (info->flags & VFIO_REGION_INFO_FLAG_MMAP) { + vfio_user_disable_posted_writes(proxy); + } + + return 0; +} + +static int vfio_user_device_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info, + int *fd) +{ + VFIOUserFDs fds = { 0, 1, fd}; + int ret; + + if (info->index > vbasedev->num_regions) { + return -EINVAL; + } + + ret = vfio_user_get_region_info(vbasedev->proxy, info, &fds); + if (ret) { + return ret; + } + + /* cap_offset in valid area */ + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && + (info->cap_offset < sizeof(*info) || info->cap_offset > info->argsz)) { + return -EINVAL; + } + + return 0; +} + +static int vfio_user_device_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + VFIOUserProxy *proxy = vbasedev->proxy; + Error *local_err = NULL; + VFIOUserIRQInfo msg; + + memset(&msg, 0, sizeof(msg)); + vfio_user_request_msg(&msg.hdr, VFIO_USER_DEVICE_GET_IRQ_INFO, + sizeof(msg), 0); + msg.argsz = info->argsz; + msg.index = info->index; + + if (!vfio_user_send_wait(proxy, &msg.hdr, NULL, 0, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return -EFAULT; + } + + if (msg.hdr.flags & VFIO_USER_ERROR) { + return -msg.hdr.error_reply; + } + trace_vfio_user_get_irq_info(msg.index, msg.flags, msg.count); + + memcpy(info, &msg.argsz, sizeof(*info)); + return 0; +} + +static int irq_howmany(int *fdp, uint32_t cur, uint32_t max) +{ + int n = 0; + + if (fdp[cur] != -1) { + do { + n++; + } while (n < max && fdp[cur + n] != -1); + } else { + do { + n++; + } while (n < max && fdp[cur + n] == -1); + } + + return n; +} + +static int vfio_user_device_io_set_irqs(VFIODevice *vbasedev, + struct vfio_irq_set *irq) +{ + VFIOUserProxy *proxy = vbasedev->proxy; + g_autofree VFIOUserIRQSet *msgp = NULL; + uint32_t size, nfds, send_fds, sent_fds, max; + Error *local_err = NULL; + + if (irq->argsz < sizeof(*irq)) { + error_printf("vfio_user_set_irqs argsz too small\n"); + return -EINVAL; + } + + /* + * Handle simple case + */ + if ((irq->flags & VFIO_IRQ_SET_DATA_EVENTFD) == 0) { + size = sizeof(VFIOUserHdr) + irq->argsz; + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_SET_IRQS, size, 0); + msgp->argsz = irq->argsz; + msgp->flags = irq->flags; + msgp->index = irq->index; + msgp->start = irq->start; + msgp->count = irq->count; + trace_vfio_user_set_irqs(msgp->index, msgp->start, msgp->count, + msgp->flags); + + if (!vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } + + return 0; + } + + /* + * Calculate the number of FDs to send + * and adjust argsz + */ + nfds = (irq->argsz - sizeof(*irq)) / sizeof(int); + irq->argsz = sizeof(*irq); + msgp = g_malloc0(sizeof(*msgp)); + /* + * Send in chunks if over max_send_fds + */ + for (sent_fds = 0; nfds > sent_fds; sent_fds += send_fds) { + VFIOUserFDs *arg_fds, loop_fds; + + /* must send all valid FDs or all invalid FDs in single msg */ + max = nfds - sent_fds; + if (max > proxy->max_send_fds) { + max = proxy->max_send_fds; + } + send_fds = irq_howmany((int *)irq->data, sent_fds, max); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_DEVICE_SET_IRQS, + sizeof(*msgp), 0); + msgp->argsz = irq->argsz; + msgp->flags = irq->flags; + msgp->index = irq->index; + msgp->start = irq->start + sent_fds; + msgp->count = send_fds; + trace_vfio_user_set_irqs(msgp->index, msgp->start, msgp->count, + msgp->flags); + + loop_fds.send_fds = send_fds; + loop_fds.recv_fds = 0; + loop_fds.fds = (int *)irq->data + sent_fds; + arg_fds = loop_fds.fds[0] != -1 ? &loop_fds : NULL; + + if (!vfio_user_send_wait(proxy, &msgp->hdr, arg_fds, 0, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } + } + + return 0; +} + +static int vfio_user_device_io_region_read(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t count, + void *data) +{ + g_autofree VFIOUserRegionRW *msgp = NULL; + VFIOUserProxy *proxy = vbasedev->proxy; + int size = sizeof(*msgp) + count; + Error *local_err = NULL; + + if (count > proxy->max_xfer_size) { + return -EINVAL; + } + + msgp = g_malloc0(size); + vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_READ, sizeof(*msgp), 0); + msgp->offset = off; + msgp->region = index; + msgp->count = count; + trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count); + + if (!vfio_user_send_wait(proxy, &msgp->hdr, NULL, size, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + return -msgp->hdr.error_reply; + } else if (msgp->count > count) { + return -E2BIG; + } else { + memcpy(data, &msgp->data, msgp->count); + } + + return msgp->count; +} + +/* + * If this is a posted write, and VFIO_PROXY_NO_POST is not set, then we are OK + * to send the write to the socket without waiting for the server's reply: + * a subsequent read (of any region) will not pass the posted write, as all + * messages are handled sequentially. + */ +static int vfio_user_device_io_region_write(VFIODevice *vbasedev, uint8_t index, + off_t off, unsigned count, + void *data, bool post) +{ + VFIOUserRegionRW *msgp = NULL; + VFIOUserProxy *proxy = vbasedev->proxy; + int size = sizeof(*msgp) + count; + Error *local_err = NULL; + bool can_multi; + int flags = 0; + int ret; + + if (count > proxy->max_xfer_size) { + return -EINVAL; + } + + if (proxy->flags & VFIO_PROXY_NO_POST) { + post = false; + } + + if (post) { + flags |= VFIO_USER_NO_REPLY; + } + + /* write eligible to be in a WRITE_MULTI msg ? */ + can_multi = (proxy->flags & VFIO_PROXY_USE_MULTI) && post && + count <= VFIO_USER_MULTI_DATA; + + /* + * This should be a rare case, so first check without the lock, + * if we're wrong, vfio_send_queued() will flush any posted writes + * we missed here + */ + if (proxy->wr_multi != NULL || + (proxy->num_outgoing > VFIO_USER_OUT_HIGH && can_multi)) { + + /* + * re-check with lock + * + * if already building a WRITE_MULTI msg, + * add this one if possible else flush pending before + * sending the current one + * + * else if outgoing queue is over the highwater, + * start a new WRITE_MULTI message + */ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + if (proxy->wr_multi != NULL) { + if (can_multi) { + vfio_user_add_multi(proxy, index, off, count, data); + return count; + } + vfio_user_flush_multi(proxy); + } else if (proxy->num_outgoing > VFIO_USER_OUT_HIGH && can_multi) { + vfio_user_create_multi(proxy); + vfio_user_add_multi(proxy, index, off, count, data); + return count; + } + } + } + + msgp = g_malloc0(size); + vfio_user_request_msg(&msgp->hdr, VFIO_USER_REGION_WRITE, size, flags); + msgp->offset = off; + msgp->region = index; + msgp->count = count; + memcpy(&msgp->data, data, count); + trace_vfio_user_region_rw(msgp->region, msgp->offset, msgp->count); + + /* async send will free msg after it's sent */ + if (post) { + if (!vfio_user_send_async(proxy, &msgp->hdr, NULL, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + return -EFAULT; + } + + return count; + } + + if (!vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, &local_err)) { + error_prepend(&local_err, "%s: ", __func__); + error_report_err(local_err); + g_free(msgp); + return -EFAULT; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + ret = -msgp->hdr.error_reply; + } else { + ret = count; + } + + g_free(msgp); + return ret; +} + +/* + * Socket-based io_ops + */ +VFIODeviceIOOps vfio_user_device_io_ops_sock = { + .get_region_info = vfio_user_device_io_get_region_info, + .get_irq_info = vfio_user_device_io_get_irq_info, + .set_irqs = vfio_user_device_io_set_irqs, + .region_read = vfio_user_device_io_region_read, + .region_write = vfio_user_device_io_region_write, + +}; diff --git a/hw/vfio-user/device.h b/hw/vfio-user/device.h new file mode 100644 index 0000000..d183a39 --- /dev/null +++ b/hw/vfio-user/device.h @@ -0,0 +1,24 @@ +#ifndef VFIO_USER_DEVICE_H +#define VFIO_USER_DEVICE_H + +/* + * vfio protocol over a UNIX socket device handling. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "linux/vfio.h" + +#include "hw/vfio-user/proxy.h" + +bool vfio_user_get_device_info(VFIOUserProxy *proxy, + struct vfio_device_info *info, Error **errp); + +void vfio_user_device_reset(VFIOUserProxy *proxy); + +extern VFIODeviceIOOps vfio_user_device_io_ops_sock; + +#endif /* VFIO_USER_DEVICE_H */ diff --git a/hw/vfio-user/meson.build b/hw/vfio-user/meson.build new file mode 100644 index 0000000..2ed0ae5 --- /dev/null +++ b/hw/vfio-user/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +vfio_user_ss = ss.source_set() +vfio_user_ss.add(files( + 'container.c', + 'device.c', + 'pci.c', + 'proxy.c', +)) + +system_ss.add_all(when: 'CONFIG_VFIO_USER', if_true: vfio_user_ss) diff --git a/hw/vfio-user/pci.c b/hw/vfio-user/pci.c new file mode 100644 index 0000000..be71c77 --- /dev/null +++ b/hw/vfio-user/pci.c @@ -0,0 +1,475 @@ +/* + * vfio PCI device over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <sys/ioctl.h> +#include "qemu/osdep.h" +#include "qapi-visit-sockets.h" +#include "qemu/error-report.h" + +#include "hw/qdev-properties.h" +#include "hw/vfio/pci.h" +#include "hw/vfio-user/device.h" +#include "hw/vfio-user/proxy.h" + +#define TYPE_VFIO_USER_PCI "vfio-user-pci" +OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI) + +struct VFIOUserPCIDevice { + VFIOPCIDevice device; + SocketAddress *socket; + bool send_queued; /* all sends are queued */ + uint32_t wait_time; /* timeout for message replies */ + bool no_post; /* all region writes are sync */ +}; + +/* + * The server maintains the device's pending interrupts, + * via its MSIX table and PBA, so we treat these accesses + * like PCI config space and forward them. + */ +static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr, + unsigned size) +{ + VFIOPCIDevice *vdev = opaque; + VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region; + uint64_t data; + + /* server copy is what matters */ + data = vfio_region_read(region, addr + vdev->msix->pba_offset, size); + return data; +} + +static void vfio_user_pba_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + /* dropped */ +} + +static const MemoryRegionOps vfio_user_pba_ops = { + .read = vfio_user_pba_read, + .write = vfio_user_pba_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_user_msix_setup(VFIOPCIDevice *vdev) +{ + MemoryRegion *vfio_reg, *msix_reg, *pba_reg; + + pba_reg = g_new0(MemoryRegion, 1); + vdev->msix->pba_region = pba_reg; + + vfio_reg = vdev->bars[vdev->msix->pba_bar].mr; + msix_reg = &vdev->pdev.msix_pba_mmio; + memory_region_init_io(pba_reg, OBJECT(vdev), &vfio_user_pba_ops, vdev, + "VFIO MSIX PBA", int128_get64(msix_reg->size)); + memory_region_add_subregion_overlap(vfio_reg, vdev->msix->pba_offset, + pba_reg, 1); +} + +static void vfio_user_msix_teardown(VFIOPCIDevice *vdev) +{ + MemoryRegion *mr, *sub; + + mr = vdev->bars[vdev->msix->pba_bar].mr; + sub = vdev->msix->pba_region; + memory_region_del_subregion(mr, sub); + + g_free(vdev->msix->pba_region); + vdev->msix->pba_region = NULL; +} + +static void vfio_user_dma_read(VFIOPCIDevice *vdev, VFIOUserDMARW *msg) +{ + PCIDevice *pdev = &vdev->pdev; + VFIOUserProxy *proxy = vdev->vbasedev.proxy; + VFIOUserDMARW *res; + MemTxResult r; + size_t size; + + if (msg->hdr.size < sizeof(*msg)) { + vfio_user_send_error(proxy, &msg->hdr, EINVAL); + return; + } + if (msg->count > proxy->max_xfer_size) { + vfio_user_send_error(proxy, &msg->hdr, E2BIG); + return; + } + + /* switch to our own message buffer */ + size = msg->count + sizeof(VFIOUserDMARW); + res = g_malloc0(size); + memcpy(res, msg, sizeof(*res)); + g_free(msg); + + r = pci_dma_read(pdev, res->offset, &res->data, res->count); + + switch (r) { + case MEMTX_OK: + if (res->hdr.flags & VFIO_USER_NO_REPLY) { + g_free(res); + return; + } + vfio_user_send_reply(proxy, &res->hdr, size); + break; + case MEMTX_ERROR: + vfio_user_send_error(proxy, &res->hdr, EFAULT); + break; + case MEMTX_DECODE_ERROR: + vfio_user_send_error(proxy, &res->hdr, ENODEV); + break; + case MEMTX_ACCESS_ERROR: + vfio_user_send_error(proxy, &res->hdr, EPERM); + break; + default: + error_printf("vfio_user_dma_read unknown error %d\n", r); + vfio_user_send_error(vdev->vbasedev.proxy, &res->hdr, EINVAL); + } +} + +static void vfio_user_dma_write(VFIOPCIDevice *vdev, VFIOUserDMARW *msg) +{ + PCIDevice *pdev = &vdev->pdev; + VFIOUserProxy *proxy = vdev->vbasedev.proxy; + MemTxResult r; + + if (msg->hdr.size < sizeof(*msg)) { + vfio_user_send_error(proxy, &msg->hdr, EINVAL); + return; + } + /* make sure transfer count isn't larger than the message data */ + if (msg->count > msg->hdr.size - sizeof(*msg)) { + vfio_user_send_error(proxy, &msg->hdr, E2BIG); + return; + } + + r = pci_dma_write(pdev, msg->offset, &msg->data, msg->count); + + switch (r) { + case MEMTX_OK: + if ((msg->hdr.flags & VFIO_USER_NO_REPLY) == 0) { + vfio_user_send_reply(proxy, &msg->hdr, sizeof(msg->hdr)); + } else { + g_free(msg); + } + break; + case MEMTX_ERROR: + vfio_user_send_error(proxy, &msg->hdr, EFAULT); + break; + case MEMTX_DECODE_ERROR: + vfio_user_send_error(proxy, &msg->hdr, ENODEV); + break; + case MEMTX_ACCESS_ERROR: + vfio_user_send_error(proxy, &msg->hdr, EPERM); + break; + default: + error_printf("vfio_user_dma_write unknown error %d\n", r); + vfio_user_send_error(vdev->vbasedev.proxy, &msg->hdr, EINVAL); + } +} + +/* + * Incoming request message callback. + * + * Runs off main loop, so BQL held. + */ +static void vfio_user_pci_process_req(void *opaque, VFIOUserMsg *msg) +{ + VFIOPCIDevice *vdev = opaque; + VFIOUserHdr *hdr = msg->hdr; + + /* no incoming PCI requests pass FDs */ + if (msg->fds != NULL) { + vfio_user_send_error(vdev->vbasedev.proxy, hdr, EINVAL); + vfio_user_putfds(msg); + return; + } + + switch (hdr->command) { + case VFIO_USER_DMA_READ: + vfio_user_dma_read(vdev, (VFIOUserDMARW *)hdr); + break; + case VFIO_USER_DMA_WRITE: + vfio_user_dma_write(vdev, (VFIOUserDMARW *)hdr); + break; + default: + error_printf("vfio_user_pci_process_req unknown cmd %d\n", + hdr->command); + vfio_user_send_error(vdev->vbasedev.proxy, hdr, ENOSYS); + } +} + +/* + * Emulated devices don't use host hot reset + */ +static void vfio_user_compute_needs_reset(VFIODevice *vbasedev) +{ + vbasedev->needs_reset = false; +} + +static Object *vfio_user_pci_get_object(VFIODevice *vbasedev) +{ + VFIOUserPCIDevice *vdev = container_of(vbasedev, VFIOUserPCIDevice, + device.vbasedev); + + return OBJECT(vdev); +} + +static VFIODeviceOps vfio_user_pci_ops = { + .vfio_compute_needs_reset = vfio_user_compute_needs_reset, + .vfio_eoi = vfio_pci_intx_eoi, + .vfio_get_object = vfio_user_pci_get_object, + /* No live migration support yet. */ + .vfio_save_config = NULL, + .vfio_load_config = NULL, +}; + +static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp) +{ + ERRP_GUARD(); + VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; + const char *sock_name; + AddressSpace *as; + SocketAddress addr; + VFIOUserProxy *proxy; + + if (!udev->socket) { + error_setg(errp, "No socket specified"); + error_append_hint(errp, "e.g. -device '{" + "\"driver\":\"vfio-user-pci\", " + "\"socket\": {\"path\": \"/tmp/vfio-user.sock\", " + "\"type\": \"unix\"}'" + "}'\n"); + return; + } + + sock_name = udev->socket->u.q_unix.path; + + vbasedev->name = g_strdup_printf("vfio-user:%s", sock_name); + + memset(&addr, 0, sizeof(addr)); + addr.type = SOCKET_ADDRESS_TYPE_UNIX; + addr.u.q_unix.path = (char *)sock_name; + proxy = vfio_user_connect_dev(&addr, errp); + if (!proxy) { + return; + } + vbasedev->proxy = proxy; + vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev); + + vbasedev->name = g_strdup_printf("vfio-user:%s", sock_name); + + if (udev->send_queued) { + proxy->flags |= VFIO_PROXY_FORCE_QUEUED; + } + + if (udev->no_post) { + proxy->flags |= VFIO_PROXY_NO_POST; + } + + /* user specified or 5 sec default */ + proxy->wait_time = udev->wait_time; + + if (!vfio_user_validate_version(proxy, errp)) { + goto error; + } + + /* + * Use socket-based device I/O instead of vfio kernel driver. + */ + vbasedev->io_ops = &vfio_user_device_io_ops_sock; + + /* + * vfio-user devices are effectively mdevs (don't use a host iommu). + */ + vbasedev->mdev = true; + + /* + * Enable per-region fds. + */ + vbasedev->use_region_fds = true; + + as = pci_device_iommu_address_space(pdev); + if (!vfio_device_attach_by_iommu_type(TYPE_VFIO_IOMMU_USER, + vbasedev->name, vbasedev, + as, errp)) { + goto error; + } + + if (!vfio_pci_populate_device(vdev, errp)) { + goto error; + } + + if (!vfio_pci_config_setup(vdev, errp)) { + goto error; + } + + /* + * vfio_pci_config_setup will have registered the device's BARs + * and setup any MSIX BARs, so errors after it succeeds must + * use out_teardown + */ + + if (!vfio_pci_add_capabilities(vdev, errp)) { + goto out_teardown; + } + + if (vdev->msix != NULL) { + vfio_user_msix_setup(vdev); + } + + if (!vfio_pci_interrupt_setup(vdev, errp)) { + goto out_teardown; + } + + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); + + return; + +out_teardown: + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); +error: + error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); + vfio_pci_put_device(vdev); +} + +static void vfio_user_instance_init(Object *obj) +{ + PCIDevice *pci_dev = PCI_DEVICE(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; + + device_add_bootindex_property(obj, &vdev->bootindex, + "bootindex", NULL, + &pci_dev->qdev); + vdev->host.domain = ~0U; + vdev->host.bus = ~0U; + vdev->host.slot = ~0U; + vdev->host.function = ~0U; + + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_user_pci_ops, + DEVICE(vdev), false); + + vdev->nv_gpudirect_clique = 0xFF; + + /* + * QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command + * line, therefore, no need to wait to realize like other devices. + */ + pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; +} + +static void vfio_user_instance_finalize(Object *obj) +{ + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; + + if (vdev->msix != NULL) { + vfio_user_msix_teardown(vdev); + } + + vfio_pci_put_device(vdev); + + if (vbasedev->proxy != NULL) { + vfio_user_disconnect(vbasedev->proxy); + } +} + +static void vfio_user_pci_reset(DeviceState *dev) +{ + VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + VFIODevice *vbasedev = &vdev->vbasedev; + + vfio_pci_pre_reset(vdev); + + if (vbasedev->reset_works) { + vfio_user_device_reset(vbasedev->proxy); + } + + vfio_pci_post_reset(vdev); +} + +static const Property vfio_user_pci_dev_properties[] = { + DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, + vendor_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, + device_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice, + sub_vendor_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, + sub_device_id, PCI_ANY_ID), + DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false), + DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time, 5000), + DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false), +}; + +static void vfio_user_pci_set_socket(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + VFIOUserPCIDevice *udev = VFIO_USER_PCI(obj); + bool success; + + if (udev->device.vbasedev.proxy) { + error_setg(errp, "Proxy is connected"); + return; + } + + qapi_free_SocketAddress(udev->socket); + + udev->socket = NULL; + + success = visit_type_SocketAddress(v, name, &udev->socket, errp); + + if (!success) { + return; + } + + if (udev->socket->type != SOCKET_ADDRESS_TYPE_UNIX) { + error_setg(errp, "Unsupported socket type %s", + SocketAddressType_str(udev->socket->type)); + qapi_free_SocketAddress(udev->socket); + udev->socket = NULL; + return; + } +} + +static void vfio_user_pci_dev_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + device_class_set_legacy_reset(dc, vfio_user_pci_reset); + device_class_set_props(dc, vfio_user_pci_dev_properties); + + object_class_property_add(klass, "socket", "SocketAddress", NULL, + vfio_user_pci_set_socket, NULL, NULL); + object_class_property_set_description(klass, "socket", + "SocketAddress (UNIX sockets only)"); + + dc->desc = "VFIO over socket PCI device assignment"; + pdc->realize = vfio_user_pci_realize; +} + +static const TypeInfo vfio_user_pci_dev_info = { + .name = TYPE_VFIO_USER_PCI, + .parent = TYPE_VFIO_PCI_BASE, + .instance_size = sizeof(VFIOUserPCIDevice), + .class_init = vfio_user_pci_dev_class_init, + .instance_init = vfio_user_instance_init, + .instance_finalize = vfio_user_instance_finalize, +}; + +static void register_vfio_user_dev_type(void) +{ + type_register_static(&vfio_user_pci_dev_info); +} + + type_init(register_vfio_user_dev_type) diff --git a/hw/vfio-user/protocol.h b/hw/vfio-user/protocol.h new file mode 100644 index 0000000..3249a4a --- /dev/null +++ b/hw/vfio-user/protocol.h @@ -0,0 +1,242 @@ +#ifndef VFIO_USER_PROTOCOL_H +#define VFIO_USER_PROTOCOL_H + +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * Each message has a standard header that describes the command + * being sent, which is almost always a VFIO ioctl(). + * + * The header may be followed by command-specific data, such as the + * region and offset info for read and write commands. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +typedef struct { + uint16_t id; + uint16_t command; + uint32_t size; + uint32_t flags; + uint32_t error_reply; +} VFIOUserHdr; + +/* VFIOUserHdr commands */ +enum vfio_user_command { + VFIO_USER_VERSION = 1, + VFIO_USER_DMA_MAP = 2, + VFIO_USER_DMA_UNMAP = 3, + VFIO_USER_DEVICE_GET_INFO = 4, + VFIO_USER_DEVICE_GET_REGION_INFO = 5, + VFIO_USER_DEVICE_GET_REGION_IO_FDS = 6, + VFIO_USER_DEVICE_GET_IRQ_INFO = 7, + VFIO_USER_DEVICE_SET_IRQS = 8, + VFIO_USER_REGION_READ = 9, + VFIO_USER_REGION_WRITE = 10, + VFIO_USER_DMA_READ = 11, + VFIO_USER_DMA_WRITE = 12, + VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_DIRTY_PAGES = 14, + VFIO_USER_REGION_WRITE_MULTI = 15, + VFIO_USER_MAX, +}; + +/* VFIOUserHdr flags */ +#define VFIO_USER_REQUEST 0x0 +#define VFIO_USER_REPLY 0x1 +#define VFIO_USER_TYPE 0xF + +#define VFIO_USER_NO_REPLY 0x10 +#define VFIO_USER_ERROR 0x20 + + +/* + * VFIO_USER_VERSION + */ +typedef struct { + VFIOUserHdr hdr; + uint16_t major; + uint16_t minor; + char capabilities[]; +} VFIOUserVersion; + +#define VFIO_USER_MAJOR_VER 0 +#define VFIO_USER_MINOR_VER 0 + +#define VFIO_USER_CAP "capabilities" + +/* "capabilities" members */ +#define VFIO_USER_CAP_MAX_FDS "max_msg_fds" +#define VFIO_USER_CAP_MAX_XFER "max_data_xfer_size" +#define VFIO_USER_CAP_PGSIZES "pgsizes" +#define VFIO_USER_CAP_MAP_MAX "max_dma_maps" +#define VFIO_USER_CAP_MIGR "migration" +#define VFIO_USER_CAP_MULTI "write_multiple" + +/* "migration" members */ +#define VFIO_USER_CAP_PGSIZE "pgsize" +#define VFIO_USER_CAP_MAX_BITMAP "max_bitmap_size" + +/* + * Max FDs mainly comes into play when a device supports multiple interrupts + * where each ones uses an eventfd to inject it into the guest. + * It is clamped by the the number of FDs the qio channel supports in a + * single message. + */ +#define VFIO_USER_DEF_MAX_FDS 8 +#define VFIO_USER_MAX_MAX_FDS 16 + +/* + * Max transfer limits the amount of data in region and DMA messages. + * Region R/W will be very small (limited by how much a single instruction + * can process) so just use a reasonable limit here. + */ +#define VFIO_USER_DEF_MAX_XFER (1024 * 1024) +#define VFIO_USER_MAX_MAX_XFER (64 * 1024 * 1024) + +/* + * Default pagesizes supported is 4k. + */ +#define VFIO_USER_DEF_PGSIZE 4096 + +/* + * Default max number of DMA mappings is stolen from the + * linux kernel "dma_entry_limit" + */ +#define VFIO_USER_DEF_MAP_MAX 65535 + +/* + * Default max bitmap size is also take from the linux kernel, + * where usage of signed ints limits the VA range to 2^31 bytes. + * Dividing that by the number of bits per byte yields 256MB + */ +#define VFIO_USER_DEF_MAX_BITMAP (256 * 1024 * 1024) + +/* + * VFIO_USER_DMA_MAP + * imported from struct vfio_iommu_type1_dma_map + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint64_t offset; /* FD offset */ + uint64_t iova; + uint64_t size; +} VFIOUserDMAMap; + +/* + * VFIO_USER_DMA_UNMAP + * imported from struct vfio_iommu_type1_dma_unmap + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint64_t iova; + uint64_t size; +} VFIOUserDMAUnmap; + +/* + * VFIO_USER_DEVICE_GET_INFO + * imported from struct vfio_device_info + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t num_regions; + uint32_t num_irqs; +} VFIOUserDeviceInfo; + +/* + * VFIO_USER_DEVICE_GET_REGION_INFO + * imported from struct vfio_region_info + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t index; + uint32_t cap_offset; + uint64_t size; + uint64_t offset; +} VFIOUserRegionInfo; + +/* + * VFIO_USER_DEVICE_GET_IRQ_INFO + * imported from struct vfio_irq_info + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t index; + uint32_t count; +} VFIOUserIRQInfo; + +/* + * VFIO_USER_DEVICE_SET_IRQS + * imported from struct vfio_irq_set + */ +typedef struct { + VFIOUserHdr hdr; + uint32_t argsz; + uint32_t flags; + uint32_t index; + uint32_t start; + uint32_t count; +} VFIOUserIRQSet; + +/* + * VFIO_USER_REGION_READ + * VFIO_USER_REGION_WRITE + */ +typedef struct { + VFIOUserHdr hdr; + uint64_t offset; + uint32_t region; + uint32_t count; + char data[]; +} VFIOUserRegionRW; + +/* + * VFIO_USER_DMA_READ + * VFIO_USER_DMA_WRITE + */ +typedef struct { + VFIOUserHdr hdr; + uint64_t offset; + uint32_t count; + char data[]; +} VFIOUserDMARW; + +/* imported from struct vfio_bitmap */ +typedef struct { + uint64_t pgsize; + uint64_t size; + char data[]; +} VFIOUserBitmap; + +/* + * VFIO_USER_REGION_WRITE_MULTI + */ +#define VFIO_USER_MULTI_DATA 8 +#define VFIO_USER_MULTI_MAX 200 + +typedef struct { + uint64_t offset; + uint32_t region; + uint32_t count; + char data[VFIO_USER_MULTI_DATA]; +} VFIOUserWROne; + +typedef struct { + VFIOUserHdr hdr; + uint64_t wr_cnt; + VFIOUserWROne wrs[VFIO_USER_MULTI_MAX]; +} VFIOUserWRMulti; + +#endif /* VFIO_USER_PROTOCOL_H */ diff --git a/hw/vfio-user/proxy.c b/hw/vfio-user/proxy.c new file mode 100644 index 0000000..c418954 --- /dev/null +++ b/hw/vfio-user/proxy.c @@ -0,0 +1,1356 @@ +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> + +#include "hw/vfio/vfio-device.h" +#include "hw/vfio-user/proxy.h" +#include "hw/vfio-user/trace.h" +#include "qapi/error.h" +#include "qobject/qbool.h" +#include "qobject/qdict.h" +#include "qobject/qjson.h" +#include "qobject/qnum.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/thread.h" +#include "system/iothread.h" + +static IOThread *vfio_user_iothread; + +static void vfio_user_shutdown(VFIOUserProxy *proxy); +static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds); +static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg); + +static void vfio_user_recv(void *opaque); +static void vfio_user_send(void *opaque); +static void vfio_user_cb(void *opaque); + +static void vfio_user_request(void *opaque); + +static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err) +{ + hdr->flags |= VFIO_USER_ERROR; + hdr->error_reply = err; +} + +/* + * Functions called by main, CPU, or iothread threads + */ + +static void vfio_user_shutdown(VFIOUserProxy *proxy) +{ + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL); + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, NULL, + proxy->ctx, NULL, NULL); +} + +/* + * Same return values as qio_channel_writev_full(): + * + * QIO_CHANNEL_ERR_BLOCK: *errp not set + * -1: *errp will be populated + * otherwise: bytes written + */ +static ssize_t vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg, + Error **errp) +{ + VFIOUserFDs *fds = msg->fds; + struct iovec iov = { + .iov_base = msg->hdr, + .iov_len = msg->hdr->size, + }; + size_t numfds = 0; + int *fdp = NULL; + ssize_t ret; + + if (fds != NULL && fds->send_fds != 0) { + numfds = fds->send_fds; + fdp = fds->fds; + } + + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, 0, errp); + + if (ret == -1) { + vfio_user_set_error(msg->hdr, EIO); + vfio_user_shutdown(proxy); + } + trace_vfio_user_send_write(msg->hdr->id, ret); + + return ret; +} + +static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds) +{ + VFIOUserMsg *msg; + + msg = QTAILQ_FIRST(&proxy->free); + if (msg != NULL) { + QTAILQ_REMOVE(&proxy->free, msg, next); + } else { + msg = g_malloc0(sizeof(*msg)); + qemu_cond_init(&msg->cv); + } + + msg->hdr = hdr; + msg->fds = fds; + return msg; +} + +/* + * Recycle a message list entry to the free list. + */ +static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg) +{ + if (msg->type == VFIO_MSG_NONE) { + error_printf("vfio_user_recycle - freeing free msg\n"); + return; + } + + /* free msg buffer if no one is waiting to consume the reply */ + if (msg->type == VFIO_MSG_NOWAIT || msg->type == VFIO_MSG_ASYNC) { + g_free(msg->hdr); + if (msg->fds != NULL) { + g_free(msg->fds); + } + } + + msg->type = VFIO_MSG_NONE; + msg->hdr = NULL; + msg->fds = NULL; + msg->complete = false; + msg->pending = false; + QTAILQ_INSERT_HEAD(&proxy->free, msg, next); +} + +VFIOUserFDs *vfio_user_getfds(int numfds) +{ + VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int))); + + fds->fds = (int *)((char *)fds + sizeof(*fds)); + + return fds; +} + +/* + * Functions only called by iothread + */ + +/* + * Process a received message. + */ +static void vfio_user_process(VFIOUserProxy *proxy, VFIOUserMsg *msg, + bool isreply) +{ + + /* + * Replies signal a waiter, if none just check for errors + * and free the message buffer. + * + * Requests get queued for the BH. + */ + if (isreply) { + msg->complete = true; + if (msg->type == VFIO_MSG_WAIT) { + qemu_cond_signal(&msg->cv); + } else { + if (msg->hdr->flags & VFIO_USER_ERROR) { + error_printf("vfio_user_process: error reply on async "); + error_printf("request command %x error %s\n", + msg->hdr->command, + strerror(msg->hdr->error_reply)); + } + /* youngest nowait msg has been ack'd */ + if (proxy->last_nowait == msg) { + proxy->last_nowait = NULL; + } + vfio_user_recycle(proxy, msg); + } + } else { + QTAILQ_INSERT_TAIL(&proxy->incoming, msg, next); + qemu_bh_schedule(proxy->req_bh); + } +} + +/* + * Complete a partial message read + */ +static int vfio_user_complete(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg = proxy->part_recv; + size_t msgleft = proxy->recv_left; + bool isreply; + char *data; + int ret; + + data = (char *)msg->hdr + (msg->hdr->size - msgleft); + while (msgleft > 0) { + ret = qio_channel_read(proxy->ioc, data, msgleft, errp); + + /* error or would block */ + if (ret <= 0) { + /* try for rest on next iternation */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + proxy->recv_left = msgleft; + } + return ret; + } + trace_vfio_user_recv_read(msg->hdr->id, ret); + + msgleft -= ret; + data += ret; + } + + /* + * Read complete message, process it. + */ + proxy->part_recv = NULL; + proxy->recv_left = 0; + isreply = (msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REPLY; + vfio_user_process(proxy, msg, isreply); + + /* return positive value */ + return 1; +} + +/* + * Receive and process one incoming message. + * + * For replies, find matching outgoing request and wake any waiters. + * For requests, queue in incoming list and run request BH. + */ +static int vfio_user_recv_one(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg = NULL; + g_autofree int *fdp = NULL; + VFIOUserFDs *reqfds; + VFIOUserHdr hdr; + struct iovec iov = { + .iov_base = &hdr, + .iov_len = sizeof(hdr), + }; + bool isreply = false; + int i, ret; + size_t msgleft, numfds = 0; + char *data = NULL; + char *buf = NULL; + + /* + * Complete any partial reads + */ + if (proxy->part_recv != NULL) { + ret = vfio_user_complete(proxy, errp); + + /* still not complete, try later */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return ret; + } + + if (ret <= 0) { + goto fatal; + } + /* else fall into reading another msg */ + } + + /* + * Read header + */ + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds, 0, + errp); + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return ret; + } + + /* read error or other side closed connection */ + if (ret <= 0) { + goto fatal; + } + + if (ret < sizeof(hdr)) { + error_setg(errp, "short read of header"); + goto fatal; + } + + /* + * Validate header + */ + if (hdr.size < sizeof(VFIOUserHdr)) { + error_setg(errp, "bad header size"); + goto fatal; + } + switch (hdr.flags & VFIO_USER_TYPE) { + case VFIO_USER_REQUEST: + isreply = false; + break; + case VFIO_USER_REPLY: + isreply = true; + break; + default: + error_setg(errp, "unknown message type"); + goto fatal; + } + trace_vfio_user_recv_hdr(proxy->sockname, hdr.id, hdr.command, hdr.size, + hdr.flags); + + /* + * For replies, find the matching pending request. + * For requests, reap incoming FDs. + */ + if (isreply) { + QTAILQ_FOREACH(msg, &proxy->pending, next) { + if (hdr.id == msg->id) { + break; + } + } + if (msg == NULL) { + error_setg(errp, "unexpected reply"); + goto err; + } + QTAILQ_REMOVE(&proxy->pending, msg, next); + + /* + * Process any received FDs + */ + if (numfds != 0) { + if (msg->fds == NULL || msg->fds->recv_fds < numfds) { + error_setg(errp, "unexpected FDs"); + goto err; + } + msg->fds->recv_fds = numfds; + memcpy(msg->fds->fds, fdp, numfds * sizeof(int)); + } + } else { + if (numfds != 0) { + reqfds = vfio_user_getfds(numfds); + memcpy(reqfds->fds, fdp, numfds * sizeof(int)); + } else { + reqfds = NULL; + } + } + + /* + * Put the whole message into a single buffer. + */ + if (isreply) { + if (hdr.size > msg->rsize) { + error_setg(errp, "reply larger than recv buffer"); + goto err; + } + *msg->hdr = hdr; + data = (char *)msg->hdr + sizeof(hdr); + } else { + if (hdr.size > proxy->max_xfer_size + sizeof(VFIOUserDMARW)) { + error_setg(errp, "vfio_user_recv request larger than max"); + goto err; + } + buf = g_malloc0(hdr.size); + memcpy(buf, &hdr, sizeof(hdr)); + data = buf + sizeof(hdr); + msg = vfio_user_getmsg(proxy, (VFIOUserHdr *)buf, reqfds); + msg->type = VFIO_MSG_REQ; + } + + /* + * Read rest of message. + */ + msgleft = hdr.size - sizeof(hdr); + while (msgleft > 0) { + ret = qio_channel_read(proxy->ioc, data, msgleft, errp); + + /* prepare to complete read on next iternation */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + proxy->part_recv = msg; + proxy->recv_left = msgleft; + return ret; + } + + if (ret <= 0) { + goto fatal; + } + trace_vfio_user_recv_read(hdr.id, ret); + + msgleft -= ret; + data += ret; + } + + vfio_user_process(proxy, msg, isreply); + return 0; + + /* + * fatal means the other side closed or we don't trust the stream + * err means this message is corrupt + */ +fatal: + vfio_user_shutdown(proxy); + proxy->state = VFIO_PROXY_ERROR; + + /* set error if server side closed */ + if (ret == 0) { + error_setg(errp, "server closed socket"); + } + +err: + for (i = 0; i < numfds; i++) { + close(fdp[i]); + } + if (isreply && msg != NULL) { + /* force an error to keep sending thread from hanging */ + vfio_user_set_error(msg->hdr, EINVAL); + msg->complete = true; + qemu_cond_signal(&msg->cv); + } + return -1; +} + +static void vfio_user_recv(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + if (proxy->state == VFIO_PROXY_CONNECTED) { + Error *local_err = NULL; + + while (vfio_user_recv_one(proxy, &local_err) == 0) { + ; + } + + if (local_err != NULL) { + error_report_err(local_err); + } + } +} + +/* + * Send a single message, same return semantics as vfio_user_send_qio(). + * + * Sent async messages are freed, others are moved to pending queue. + */ +static ssize_t vfio_user_send_one(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg; + ssize_t ret; + + msg = QTAILQ_FIRST(&proxy->outgoing); + ret = vfio_user_send_qio(proxy, msg, errp); + if (ret < 0) { + return ret; + } + + QTAILQ_REMOVE(&proxy->outgoing, msg, next); + proxy->num_outgoing--; + if (msg->type == VFIO_MSG_ASYNC) { + vfio_user_recycle(proxy, msg); + } else { + QTAILQ_INSERT_TAIL(&proxy->pending, msg, next); + msg->pending = true; + } + + return ret; +} + +/* + * Send messages from outgoing queue when the socket buffer has space. + * If we deplete 'outgoing', remove ourselves from the poll list. + */ +static void vfio_user_send(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + if (proxy->state == VFIO_PROXY_CONNECTED) { + while (!QTAILQ_EMPTY(&proxy->outgoing)) { + Error *local_err = NULL; + int ret; + + ret = vfio_user_send_one(proxy, &local_err); + + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return; + } else if (ret == -1) { + error_report_err(local_err); + return; + } + } + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, NULL, NULL, proxy); + + /* queue empty - send any pending multi write msgs */ + if (proxy->wr_multi != NULL) { + vfio_user_flush_multi(proxy); + } + } +} + +static void vfio_user_cb(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + proxy->state = VFIO_PROXY_CLOSED; + qemu_cond_signal(&proxy->close_cv); +} + + +/* + * Functions called by main or CPU threads + */ + +/* + * Process incoming requests. + * + * The bus-specific callback has the form: + * request(opaque, msg) + * where 'opaque' was specified in vfio_user_set_handler + * and 'msg' is the inbound message. + * + * The callback is responsible for disposing of the message buffer, + * usually by re-using it when calling vfio_send_reply or vfio_send_error, + * both of which free their message buffer when the reply is sent. + * + * If the callback uses a new buffer, it needs to free the old one. + */ +static void vfio_user_request(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + VFIOUserMsgQ new, free; + VFIOUserMsg *msg, *m1; + + /* reap all incoming */ + QTAILQ_INIT(&new); + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + QTAILQ_FOREACH_SAFE(msg, &proxy->incoming, next, m1) { + QTAILQ_REMOVE(&proxy->incoming, msg, next); + QTAILQ_INSERT_TAIL(&new, msg, next); + } + } + + /* process list */ + QTAILQ_INIT(&free); + QTAILQ_FOREACH_SAFE(msg, &new, next, m1) { + QTAILQ_REMOVE(&new, msg, next); + trace_vfio_user_recv_request(msg->hdr->command); + proxy->request(proxy->req_arg, msg); + QTAILQ_INSERT_HEAD(&free, msg, next); + } + + /* free list */ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + QTAILQ_FOREACH_SAFE(msg, &free, next, m1) { + vfio_user_recycle(proxy, msg); + } + } +} + +/* + * Messages are queued onto the proxy's outgoing list. + * + * It handles 3 types of messages: + * + * async messages - replies and posted writes + * + * There will be no reply from the server, so message + * buffers are freed after they're sent. + * + * nowait messages - map/unmap during address space transactions + * + * These are also sent async, but a reply is expected so that + * vfio_wait_reqs() can wait for the youngest nowait request. + * They transition from the outgoing list to the pending list + * when sent, and are freed when the reply is received. + * + * wait messages - all other requests + * + * The reply to these messages is waited for by their caller. + * They also transition from outgoing to pending when sent, but + * the message buffer is returned to the caller with the reply + * contents. The caller is responsible for freeing these messages. + * + * As an optimization, if the outgoing list and the socket send + * buffer are empty, the message is sent inline instead of being + * added to the outgoing list. The rest of the transitions are + * unchanged. + */ +static bool vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg, + Error **errp) +{ + int ret; + + /* older coalesced writes go first */ + if (proxy->wr_multi != NULL && + ((msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REQUEST)) { + vfio_user_flush_multi(proxy); + } + + /* + * Unsent outgoing msgs - add to tail + */ + if (!QTAILQ_EMPTY(&proxy->outgoing)) { + QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next); + proxy->num_outgoing++; + return true; + } + + /* + * Try inline - if blocked, queue it and kick send poller + */ + if (proxy->flags & VFIO_PROXY_FORCE_QUEUED) { + ret = QIO_CHANNEL_ERR_BLOCK; + } else { + ret = vfio_user_send_qio(proxy, msg, errp); + } + + if (ret == QIO_CHANNEL_ERR_BLOCK) { + QTAILQ_INSERT_HEAD(&proxy->outgoing, msg, next); + proxy->num_outgoing = 1; + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, proxy->ctx, + vfio_user_send, proxy); + return true; + } + if (ret == -1) { + return false; + } + + /* + * Sent - free async, add others to pending + */ + if (msg->type == VFIO_MSG_ASYNC) { + vfio_user_recycle(proxy, msg); + } else { + QTAILQ_INSERT_TAIL(&proxy->pending, msg, next); + msg->pending = true; + } + + return true; +} + +/* + * nowait send - vfio_wait_reqs() can wait for it later + * + * Returns false if we did not successfully receive a reply message, in which + * case @errp will be populated. + * + * In either case, ownership of @hdr and @fds is taken, and the caller must + * *not* free them itself. + */ +bool vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize, Error **errp) +{ + VFIOUserMsg *msg; + + QEMU_LOCK_GUARD(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = rsize ? rsize : hdr->size; + msg->type = VFIO_MSG_NOWAIT; + + if (hdr->flags & VFIO_USER_NO_REPLY) { + error_setg_errno(errp, EINVAL, "%s on NO_REPLY message", __func__); + vfio_user_recycle(proxy, msg); + return false; + } + + if (!vfio_user_send_queued(proxy, msg, errp)) { + vfio_user_recycle(proxy, msg); + return false; + } + + proxy->last_nowait = msg; + + return true; +} + +/* + * Returns false if we did not successfully receive a reply message, in which + * case @errp will be populated. + * + * In either case, the caller must free @hdr and @fds if needed. + */ +bool vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize, Error **errp) +{ + VFIOUserMsg *msg; + bool ok = false; + + if (hdr->flags & VFIO_USER_NO_REPLY) { + error_setg_errno(errp, EINVAL, "%s on NO_REPLY message", __func__); + return false; + } + + qemu_mutex_lock(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = rsize ? rsize : hdr->size; + msg->type = VFIO_MSG_WAIT; + + ok = vfio_user_send_queued(proxy, msg, errp); + + if (ok) { + while (!msg->complete) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, + proxy->wait_time)) { + VFIOUserMsgQ *list; + + list = msg->pending ? &proxy->pending : &proxy->outgoing; + QTAILQ_REMOVE(list, msg, next); + error_setg_errno(errp, ETIMEDOUT, + "timed out waiting for reply"); + ok = false; + break; + } + } + } + + vfio_user_recycle(proxy, msg); + + qemu_mutex_unlock(&proxy->lock); + + return ok; +} + +/* + * async send - msg can be queued, but will be freed when sent + * + * Returns false on failure, in which case @errp will be populated. + * + * In either case, ownership of @hdr and @fds is taken, and the caller must + * *not* free them itself. + */ +bool vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, Error **errp) +{ + VFIOUserMsg *msg; + + QEMU_LOCK_GUARD(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = 0; + msg->type = VFIO_MSG_ASYNC; + + if (!(hdr->flags & (VFIO_USER_NO_REPLY | VFIO_USER_REPLY))) { + error_setg_errno(errp, EINVAL, "%s on sync message", __func__); + vfio_user_recycle(proxy, msg); + return false; + } + + if (!vfio_user_send_queued(proxy, msg, errp)) { + vfio_user_recycle(proxy, msg); + return false; + } + + return true; +} + +void vfio_user_wait_reqs(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + + /* + * Any DMA map/unmap requests sent in the middle + * of a memory region transaction were sent nowait. + * Wait for them here. + */ + qemu_mutex_lock(&proxy->lock); + if (proxy->last_nowait != NULL) { + /* + * Change type to WAIT to wait for reply + */ + msg = proxy->last_nowait; + msg->type = VFIO_MSG_WAIT; + proxy->last_nowait = NULL; + while (!msg->complete) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, + proxy->wait_time)) { + VFIOUserMsgQ *list; + + list = msg->pending ? &proxy->pending : &proxy->outgoing; + QTAILQ_REMOVE(list, msg, next); + error_printf("vfio_wait_reqs - timed out\n"); + break; + } + } + + if (msg->hdr->flags & VFIO_USER_ERROR) { + error_printf("vfio_user_wait_reqs - error reply on async "); + error_printf("request: command %x error %s\n", msg->hdr->command, + strerror(msg->hdr->error_reply)); + } + + /* + * Change type back to NOWAIT to free + */ + msg->type = VFIO_MSG_NOWAIT; + vfio_user_recycle(proxy, msg); + } + + qemu_mutex_unlock(&proxy->lock); +} + +/* + * Reply to an incoming request. + */ +void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size) +{ + Error *local_err = NULL; + + if (size < sizeof(VFIOUserHdr)) { + error_printf("%s: size too small", __func__); + g_free(hdr); + return; + } + + /* + * convert header to associated reply + */ + hdr->flags = VFIO_USER_REPLY; + hdr->size = size; + + if (!vfio_user_send_async(proxy, hdr, NULL, &local_err)) { + error_report_err(local_err); + } +} + +/* + * Send an error reply to an incoming request. + */ +void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error) +{ + Error *local_err = NULL; + + /* + * convert header to associated reply + */ + hdr->flags = VFIO_USER_REPLY; + hdr->flags |= VFIO_USER_ERROR; + hdr->error_reply = error; + hdr->size = sizeof(*hdr); + + if (!vfio_user_send_async(proxy, hdr, NULL, &local_err)) { + error_report_err(local_err); + } +} + +/* + * Close FDs erroneously received in an incoming request. + */ +void vfio_user_putfds(VFIOUserMsg *msg) +{ + VFIOUserFDs *fds = msg->fds; + int i; + + for (i = 0; i < fds->recv_fds; i++) { + close(fds->fds[i]); + } + g_free(fds); + msg->fds = NULL; +} + +void +vfio_user_disable_posted_writes(VFIOUserProxy *proxy) +{ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + proxy->flags |= VFIO_PROXY_NO_POST; + } +} + +static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = + QLIST_HEAD_INITIALIZER(vfio_user_sockets); + +VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp) +{ + VFIOUserProxy *proxy; + QIOChannelSocket *sioc; + QIOChannel *ioc; + char *sockname; + + if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) { + error_setg(errp, "vfio_user_connect - bad address family"); + return NULL; + } + sockname = addr->u.q_unix.path; + + sioc = qio_channel_socket_new(); + ioc = QIO_CHANNEL(sioc); + if (qio_channel_socket_connect_sync(sioc, addr, errp)) { + object_unref(OBJECT(ioc)); + return NULL; + } + qio_channel_set_blocking(ioc, false, NULL); + + proxy = g_malloc0(sizeof(VFIOUserProxy)); + proxy->sockname = g_strdup_printf("unix:%s", sockname); + proxy->ioc = ioc; + + /* init defaults */ + proxy->max_xfer_size = VFIO_USER_DEF_MAX_XFER; + proxy->max_send_fds = VFIO_USER_DEF_MAX_FDS; + proxy->max_dma = VFIO_USER_DEF_MAP_MAX; + proxy->dma_pgsizes = VFIO_USER_DEF_PGSIZE; + proxy->max_bitmap = VFIO_USER_DEF_MAX_BITMAP; + proxy->migr_pgsize = VFIO_USER_DEF_PGSIZE; + + proxy->flags = VFIO_PROXY_CLIENT; + proxy->state = VFIO_PROXY_CONNECTED; + + qemu_mutex_init(&proxy->lock); + qemu_cond_init(&proxy->close_cv); + + if (vfio_user_iothread == NULL) { + vfio_user_iothread = iothread_create("VFIO user", errp); + } + + proxy->ctx = iothread_get_aio_context(vfio_user_iothread); + proxy->req_bh = qemu_bh_new(vfio_user_request, proxy); + + QTAILQ_INIT(&proxy->outgoing); + QTAILQ_INIT(&proxy->incoming); + QTAILQ_INIT(&proxy->free); + QTAILQ_INIT(&proxy->pending); + QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next); + + return proxy; +} + +void vfio_user_set_handler(VFIODevice *vbasedev, + void (*handler)(void *opaque, VFIOUserMsg *msg), + void *req_arg) +{ + VFIOUserProxy *proxy = vbasedev->proxy; + + proxy->request = handler; + proxy->req_arg = req_arg; + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, NULL, NULL, proxy); +} + +void vfio_user_disconnect(VFIOUserProxy *proxy) +{ + VFIOUserMsg *r1, *r2; + + qemu_mutex_lock(&proxy->lock); + + /* our side is quitting */ + if (proxy->state == VFIO_PROXY_CONNECTED) { + vfio_user_shutdown(proxy); + if (!QTAILQ_EMPTY(&proxy->pending)) { + error_printf("vfio_user_disconnect: outstanding requests\n"); + } + } + object_unref(OBJECT(proxy->ioc)); + proxy->ioc = NULL; + qemu_bh_delete(proxy->req_bh); + proxy->req_bh = NULL; + + proxy->state = VFIO_PROXY_CLOSING; + QTAILQ_FOREACH_SAFE(r1, &proxy->outgoing, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->outgoing, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->incoming, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->incoming, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->pending, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->free, r1, next); + g_free(r1); + } + + /* + * Make sure the iothread isn't blocking anywhere + * with a ref to this proxy by waiting for a BH + * handler to run after the proxy fd handlers were + * deleted above. + */ + aio_bh_schedule_oneshot(proxy->ctx, vfio_user_cb, proxy); + qemu_cond_wait(&proxy->close_cv, &proxy->lock); + + /* we now hold the only ref to proxy */ + qemu_mutex_unlock(&proxy->lock); + qemu_cond_destroy(&proxy->close_cv); + qemu_mutex_destroy(&proxy->lock); + + QLIST_REMOVE(proxy, next); + if (QLIST_EMPTY(&vfio_user_sockets)) { + iothread_destroy(vfio_user_iothread); + vfio_user_iothread = NULL; + } + + g_free(proxy->sockname); + g_free(proxy); +} + +void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags) +{ + static uint16_t next_id; + + hdr->id = qatomic_fetch_inc(&next_id); + hdr->command = cmd; + hdr->size = size; + hdr->flags = (flags & ~VFIO_USER_TYPE) | VFIO_USER_REQUEST; + hdr->error_reply = 0; +} + +struct cap_entry { + const char *name; + bool (*check)(VFIOUserProxy *proxy, QObject *qobj, Error **errp); +}; + +static bool caps_parse(VFIOUserProxy *proxy, QDict *qdict, + struct cap_entry caps[], Error **errp) +{ + QObject *qobj; + struct cap_entry *p; + + for (p = caps; p->name != NULL; p++) { + qobj = qdict_get(qdict, p->name); + if (qobj != NULL) { + if (!p->check(proxy, qobj, errp)) { + return false; + } + qdict_del(qdict, p->name); + } + } + + /* warning, for now */ + if (qdict_size(qdict) != 0) { + warn_report("spurious capabilities"); + } + return true; +} + +static bool check_migr_pgsize(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t pgsize; + + if (qn == NULL || !qnum_get_try_uint(qn, &pgsize)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZE); + return false; + } + + /* must be larger than default */ + if (pgsize & (VFIO_USER_DEF_PGSIZE - 1)) { + error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsize); + return false; + } + + proxy->migr_pgsize = pgsize; + return true; +} + +static bool check_bitmap(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t bitmap_size; + + if (qn == NULL || !qnum_get_try_uint(qn, &bitmap_size)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_BITMAP); + return false; + } + + /* can only lower it */ + if (bitmap_size > VFIO_USER_DEF_MAX_BITMAP) { + error_setg(errp, "%s too large", VFIO_USER_CAP_MAX_BITMAP); + return false; + } + + proxy->max_bitmap = bitmap_size; + return true; +} + +static struct cap_entry caps_migr[] = { + { VFIO_USER_CAP_PGSIZE, check_migr_pgsize }, + { VFIO_USER_CAP_MAX_BITMAP, check_bitmap }, + { NULL } +}; + +static bool check_max_fds(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_send_fds; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_send_fds) || + max_send_fds > VFIO_USER_MAX_MAX_FDS) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS); + return false; + } + proxy->max_send_fds = max_send_fds; + return true; +} + +static bool check_max_xfer(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_xfer_size; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_xfer_size) || + max_xfer_size > VFIO_USER_MAX_MAX_XFER) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_XFER); + return false; + } + proxy->max_xfer_size = max_xfer_size; + return true; +} + +static bool check_pgsizes(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t pgsizes; + + if (qn == NULL || !qnum_get_try_uint(qn, &pgsizes)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZES); + return false; + } + + /* must be larger than default */ + if (pgsizes & (VFIO_USER_DEF_PGSIZE - 1)) { + error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsizes); + return false; + } + + proxy->dma_pgsizes = pgsizes; + return true; +} + +static bool check_max_dma(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_dma; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_dma)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAP_MAX); + return false; + } + + /* can only lower it */ + if (max_dma > VFIO_USER_DEF_MAP_MAX) { + error_setg(errp, "%s too large", VFIO_USER_CAP_MAP_MAX); + return false; + } + + proxy->max_dma = max_dma; + return true; +} + +static bool check_migr(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QDict *qdict = qobject_to(QDict, qobj); + + if (qdict == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS); + return true; + } + return caps_parse(proxy, qdict, caps_migr, errp); +} + +static bool check_multi(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QBool *qb = qobject_to(QBool, qobj); + + if (qb == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MULTI); + return false; + } + if (qbool_get_bool(qb)) { + proxy->flags |= VFIO_PROXY_USE_MULTI; + } + return true; +} + +static struct cap_entry caps_cap[] = { + { VFIO_USER_CAP_MAX_FDS, check_max_fds }, + { VFIO_USER_CAP_MAX_XFER, check_max_xfer }, + { VFIO_USER_CAP_PGSIZES, check_pgsizes }, + { VFIO_USER_CAP_MAP_MAX, check_max_dma }, + { VFIO_USER_CAP_MIGR, check_migr }, + { VFIO_USER_CAP_MULTI, check_multi }, + { NULL } +}; + +static bool check_cap(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QDict *qdict = qobject_to(QDict, qobj); + + if (qdict == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP); + return false; + } + return caps_parse(proxy, qdict, caps_cap, errp); +} + +static struct cap_entry ver_0_0[] = { + { VFIO_USER_CAP, check_cap }, + { NULL } +}; + +static bool caps_check(VFIOUserProxy *proxy, int minor, const char *caps, + Error **errp) +{ + QObject *qobj; + QDict *qdict; + bool ret; + + qobj = qobject_from_json(caps, NULL); + if (qobj == NULL) { + error_setg(errp, "malformed capabilities %s", caps); + return false; + } + qdict = qobject_to(QDict, qobj); + if (qdict == NULL) { + error_setg(errp, "capabilities %s not an object", caps); + qobject_unref(qobj); + return false; + } + ret = caps_parse(proxy, qdict, ver_0_0, errp); + + qobject_unref(qobj); + return ret; +} + +static GString *caps_json(void) +{ + QDict *dict = qdict_new(); + QDict *capdict = qdict_new(); + QDict *migdict = qdict_new(); + GString *str; + + qdict_put_int(migdict, VFIO_USER_CAP_PGSIZE, VFIO_USER_DEF_PGSIZE); + qdict_put_int(migdict, VFIO_USER_CAP_MAX_BITMAP, VFIO_USER_DEF_MAX_BITMAP); + qdict_put_obj(capdict, VFIO_USER_CAP_MIGR, QOBJECT(migdict)); + + qdict_put_int(capdict, VFIO_USER_CAP_MAX_FDS, VFIO_USER_MAX_MAX_FDS); + qdict_put_int(capdict, VFIO_USER_CAP_MAX_XFER, VFIO_USER_DEF_MAX_XFER); + qdict_put_int(capdict, VFIO_USER_CAP_PGSIZES, VFIO_USER_DEF_PGSIZE); + qdict_put_int(capdict, VFIO_USER_CAP_MAP_MAX, VFIO_USER_DEF_MAP_MAX); + qdict_put_bool(capdict, VFIO_USER_CAP_MULTI, true); + + qdict_put_obj(dict, VFIO_USER_CAP, QOBJECT(capdict)); + + str = qobject_to_json(QOBJECT(dict)); + qobject_unref(dict); + return str; +} + +bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp) +{ + g_autofree VFIOUserVersion *msgp = NULL; + GString *caps; + char *reply; + int size, caplen; + + caps = caps_json(); + caplen = caps->len + 1; + size = sizeof(*msgp) + caplen; + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_VERSION, size, 0); + msgp->major = VFIO_USER_MAJOR_VER; + msgp->minor = VFIO_USER_MINOR_VER; + memcpy(&msgp->capabilities, caps->str, caplen); + g_string_free(caps, true); + trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); + + if (!vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, errp)) { + return false; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + error_setg_errno(errp, msgp->hdr.error_reply, "version reply"); + return false; + } + + if (msgp->major != VFIO_USER_MAJOR_VER || + msgp->minor > VFIO_USER_MINOR_VER) { + error_setg(errp, "incompatible server version"); + return false; + } + + reply = msgp->capabilities; + if (reply[msgp->hdr.size - sizeof(*msgp) - 1] != '\0') { + error_setg(errp, "corrupt version reply"); + return false; + } + + if (!caps_check(proxy, msgp->minor, reply, errp)) { + return false; + } + + trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); + return true; +} + +void vfio_user_flush_multi(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + VFIOUserWRMulti *wm = proxy->wr_multi; + Error *local_err = NULL; + + proxy->wr_multi = NULL; + + /* adjust size for actual # of writes */ + wm->hdr.size -= (VFIO_USER_MULTI_MAX - wm->wr_cnt) * sizeof(VFIOUserWROne); + + msg = vfio_user_getmsg(proxy, &wm->hdr, NULL); + msg->id = wm->hdr.id; + msg->rsize = 0; + msg->type = VFIO_MSG_ASYNC; + trace_vfio_user_wrmulti("flush", wm->wr_cnt); + + if (!vfio_user_send_queued(proxy, msg, &local_err)) { + error_report_err(local_err); + vfio_user_recycle(proxy, msg); + } +} + +void vfio_user_create_multi(VFIOUserProxy *proxy) +{ + VFIOUserWRMulti *wm; + + wm = g_malloc0(sizeof(*wm)); + vfio_user_request_msg(&wm->hdr, VFIO_USER_REGION_WRITE_MULTI, + sizeof(*wm), VFIO_USER_NO_REPLY); + proxy->wr_multi = wm; +} + +void vfio_user_add_multi(VFIOUserProxy *proxy, uint8_t index, + off_t offset, uint32_t count, void *data) +{ + VFIOUserWRMulti *wm = proxy->wr_multi; + VFIOUserWROne *w1 = &wm->wrs[wm->wr_cnt]; + + w1->offset = offset; + w1->region = index; + w1->count = count; + memcpy(&w1->data, data, count); + + wm->wr_cnt++; + trace_vfio_user_wrmulti("add", wm->wr_cnt); + if (wm->wr_cnt == VFIO_USER_MULTI_MAX || + proxy->num_outgoing < VFIO_USER_OUT_LOW) { + vfio_user_flush_multi(proxy); + } +} diff --git a/hw/vfio-user/proxy.h b/hw/vfio-user/proxy.h new file mode 100644 index 0000000..61e64a0 --- /dev/null +++ b/hw/vfio-user/proxy.h @@ -0,0 +1,135 @@ +#ifndef VFIO_USER_PROXY_H +#define VFIO_USER_PROXY_H + +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "io/channel.h" +#include "io/channel-socket.h" + +#include "qemu/queue.h" +#include "qemu/sockets.h" +#include "qemu/thread.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio-user/protocol.h" + +typedef struct { + int send_fds; + int recv_fds; + int *fds; +} VFIOUserFDs; + +enum msg_type { + VFIO_MSG_NONE, + VFIO_MSG_ASYNC, + VFIO_MSG_WAIT, + VFIO_MSG_NOWAIT, + VFIO_MSG_REQ, +}; + +typedef struct VFIOUserMsg { + QTAILQ_ENTRY(VFIOUserMsg) next; + VFIOUserHdr *hdr; + VFIOUserFDs *fds; + uint32_t rsize; + uint32_t id; + QemuCond cv; + bool complete; + bool pending; + enum msg_type type; +} VFIOUserMsg; + + +enum proxy_state { + VFIO_PROXY_CONNECTED = 1, + VFIO_PROXY_ERROR = 2, + VFIO_PROXY_CLOSING = 3, + VFIO_PROXY_CLOSED = 4, +}; + +typedef QTAILQ_HEAD(VFIOUserMsgQ, VFIOUserMsg) VFIOUserMsgQ; + +typedef struct VFIOUserProxy { + QLIST_ENTRY(VFIOUserProxy) next; + char *sockname; + struct QIOChannel *ioc; + void (*request)(void *opaque, VFIOUserMsg *msg); + void *req_arg; + uint64_t max_xfer_size; + uint64_t max_send_fds; + uint64_t max_dma; + uint64_t dma_pgsizes; + uint64_t max_bitmap; + uint64_t migr_pgsize; + int flags; + uint32_t wait_time; + QemuCond close_cv; + AioContext *ctx; + QEMUBH *req_bh; + bool async_ops; + + /* + * above only changed when BQL is held + * below are protected by per-proxy lock + */ + QemuMutex lock; + VFIOUserMsgQ free; + VFIOUserMsgQ pending; + VFIOUserMsgQ incoming; + VFIOUserMsgQ outgoing; + VFIOUserMsg *last_nowait; + VFIOUserMsg *part_recv; + size_t recv_left; + VFIOUserWRMulti *wr_multi; + int num_outgoing; + enum proxy_state state; +} VFIOUserProxy; + +/* VFIOProxy flags */ +#define VFIO_PROXY_CLIENT 0x1 +#define VFIO_PROXY_FORCE_QUEUED 0x4 +#define VFIO_PROXY_NO_POST 0x8 +#define VFIO_PROXY_USE_MULTI 0x16 + +/* coalescing high and low water marks for VFIOProxy num_outgoing */ +#define VFIO_USER_OUT_HIGH 1024 +#define VFIO_USER_OUT_LOW 128 + +typedef struct VFIODevice VFIODevice; + +VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp); +void vfio_user_disconnect(VFIOUserProxy *proxy); +void vfio_user_set_handler(VFIODevice *vbasedev, + void (*handler)(void *opaque, VFIOUserMsg *msg), + void *reqarg); +bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp); + +VFIOUserFDs *vfio_user_getfds(int numfds); +void vfio_user_putfds(VFIOUserMsg *msg); + +void vfio_user_disable_posted_writes(VFIOUserProxy *proxy); + +void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags); +void vfio_user_wait_reqs(VFIOUserProxy *proxy); +bool vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize, Error **errp); +bool vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize, Error **errp); +bool vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, Error **errp); + +void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size); +void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error); + +void vfio_user_flush_multi(VFIOUserProxy *proxy); +void vfio_user_create_multi(VFIOUserProxy *proxy); +void vfio_user_add_multi(VFIOUserProxy *proxy, uint8_t index, + off_t offset, uint32_t count, void *data); + +#endif /* VFIO_USER_PROXY_H */ diff --git a/hw/vfio-user/trace-events b/hw/vfio-user/trace-events new file mode 100644 index 0000000..abb67f4 --- /dev/null +++ b/hw/vfio-user/trace-events @@ -0,0 +1,20 @@ +# See docs/devel/tracing.rst for syntax documentation. +# +# SPDX-License-Identifier: GPL-2.0-or-later + +# common.c +vfio_user_recv_hdr(const char *name, uint16_t id, uint16_t cmd, uint32_t size, uint32_t flags) " (%s) id 0x%x cmd 0x%x size 0x%x flags 0x%x" +vfio_user_recv_read(uint16_t id, int read) " id 0x%x read 0x%x" +vfio_user_recv_request(uint16_t cmd) " command 0x%x" +vfio_user_send_write(uint16_t id, int wrote) " id 0x%x wrote 0x%x" +vfio_user_version(uint16_t major, uint16_t minor, const char *caps) " major %d minor %d caps: %s" +vfio_user_get_info(uint32_t nregions, uint32_t nirqs) " #regions %d #irqs %d" +vfio_user_get_region_info(uint32_t index, uint32_t flags, uint64_t size) " index %d flags 0x%x size 0x%"PRIx64 +vfio_user_region_rw(uint32_t region, uint64_t off, uint32_t count) " region %d offset 0x%"PRIx64" count %d" +vfio_user_get_irq_info(uint32_t index, uint32_t flags, uint32_t count) " index %d flags 0x%x count %d" +vfio_user_set_irqs(uint32_t index, uint32_t start, uint32_t count, uint32_t flags) " index %d start %d count %d flags 0x%x" +vfio_user_wrmulti(const char *s, uint64_t wr_cnt) " %s count 0x%"PRIx64 + +# container.c +vfio_user_dma_map(uint64_t iova, uint64_t size, uint64_t off, uint32_t flags, bool async_ops) " iova 0x%"PRIx64" size 0x%"PRIx64" off 0x%"PRIx64" flags 0x%x async_ops %d" +vfio_user_dma_unmap(uint64_t iova, uint64_t size, uint32_t flags, bool async_ops) " iova 0x%"PRIx64" size 0x%"PRIx64" flags 0x%x async_ops %d" diff --git a/hw/vfio-user/trace.h b/hw/vfio-user/trace.h new file mode 100644 index 0000000..9cf02d9 --- /dev/null +++ b/hw/vfio-user/trace.h @@ -0,0 +1,4 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include "trace/trace-hw_vfio_user.h" diff --git a/hw/vfio/Kconfig b/hw/vfio/Kconfig index 7cdba05..91d9023 100644 --- a/hw/vfio/Kconfig +++ b/hw/vfio/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + config VFIO bool depends on LINUX diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 785c0a0..7719f24 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -10,6 +10,7 @@ * directory. */ +#include <stdbool.h> #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include <linux/vfio.h> @@ -18,8 +19,10 @@ #include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/ap-device.h" +#include "hw/s390x/css.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" +#include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/option.h" @@ -37,8 +40,23 @@ struct VFIOAPDevice { APDevice apdev; VFIODevice vdev; EventNotifier req_notifier; + EventNotifier cfg_notifier; }; +typedef struct APConfigChgEvent { + QTAILQ_ENTRY(APConfigChgEvent) next; +} APConfigChgEvent; + +static QTAILQ_HEAD(, APConfigChgEvent) cfg_chg_events = + QTAILQ_HEAD_INITIALIZER(cfg_chg_events); + +static QemuMutex cfg_chg_events_lock; + +static void __attribute__((constructor)) vfio_ap_global_init(void) +{ + qemu_mutex_init(&cfg_chg_events_lock); +} + OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE) static void vfio_ap_compute_needs_reset(VFIODevice *vdev) @@ -70,6 +88,57 @@ static void vfio_ap_req_notifier_handler(void *opaque) } } +static void vfio_ap_cfg_chg_notifier_handler(void *opaque) +{ + APConfigChgEvent *cfg_chg_event; + VFIOAPDevice *vapdev = opaque; + + if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) { + return; + } + + cfg_chg_event = g_new0(APConfigChgEvent, 1); + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + QTAILQ_INSERT_TAIL(&cfg_chg_events, cfg_chg_event, next); + } + + css_generate_css_crws(0); + +} + +int ap_chsc_sei_nt0_get_event(void *res) +{ + ChscSeiNt0Res *nt0_res = (ChscSeiNt0Res *)res; + APConfigChgEvent *cfg_chg_event; + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + if (QTAILQ_EMPTY(&cfg_chg_events)) { + return EVENT_INFORMATION_NOT_STORED; + } + + cfg_chg_event = QTAILQ_FIRST(&cfg_chg_events); + QTAILQ_REMOVE(&cfg_chg_events, cfg_chg_event, next); + } + + memset(nt0_res, 0, sizeof(*nt0_res)); + g_free(cfg_chg_event); + nt0_res->flags |= PENDING_EVENT_INFO_BITMASK; + nt0_res->length = sizeof(ChscSeiNt0Res); + nt0_res->code = NT0_RES_RESPONSE_CODE; + nt0_res->nt = NT0_RES_NT_DEFAULT; + nt0_res->rs = NT0_RES_RS_AP_CHANGE; + nt0_res->cc = NT0_RES_CC_AP_CHANGE; + + return EVENT_INFORMATION_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + QEMU_LOCK_GUARD(&cfg_chg_events_lock); + return !QTAILQ_EMPTY(&cfg_chg_events); +} + static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { @@ -85,6 +154,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, notifier = &vapdev->req_notifier; fd_read = vfio_ap_req_notifier_handler; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + fd_read = vfio_ap_cfg_chg_notifier_handler; + break; default: error_setg(errp, "vfio: Unsupported device irq(%d)", irq); return false; @@ -137,6 +210,9 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, case VFIO_AP_REQ_IRQ_INDEX: notifier = &vapdev->req_notifier; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + break; default: error_report("vfio: Unsupported device irq(%d)", irq); return; @@ -176,11 +252,20 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) warn_report_err(err); } + if (!vfio_ap_register_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX, &err)) + { + /* + * Report this error, but do not make it a failing condition. + * Lack of this IRQ in the host does not prevent normal operation. + */ + warn_report_err(err); + } + return; error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); } static void vfio_ap_unrealize(DeviceState *dev) @@ -188,8 +273,9 @@ static void vfio_ap_unrealize(DeviceState *dev) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX); + vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX); vfio_device_detach(&vapdev->vdev); - g_free(vapdev->vdev.name); + vfio_device_free_name(&vapdev->vdev); } static const Property vfio_ap_properties[] = { diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index cea9d6e..9560b8d 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -619,7 +619,7 @@ out_io_notifier_err: out_region_err: vfio_device_detach(vbasedev); out_attach_dev_err: - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); out_unrealize: if (cdc->unrealize) { cdc->unrealize(cdev); @@ -637,7 +637,7 @@ static void vfio_ccw_unrealize(DeviceState *dev) vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); vfio_ccw_put_region(vcdev); vfio_device_detach(&vcdev->vdev); - g_free(vcdev->vdev.name); + vfio_device_free_name(&vcdev->vdev); if (cdc->unrealize) { cdc->unrealize(cdev); diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 1c6ca94..5630497 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -75,12 +75,21 @@ void vfio_address_space_insert(VFIOAddressSpace *space, int vfio_container_dma_map(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - void *vaddr, bool readonly) + void *vaddr, bool readonly, MemoryRegion *mr) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + RAMBlock *rb = mr->ram_block; + int mfd = rb ? qemu_ram_get_fd(rb) : -1; + if (mfd >= 0 && vioc->dma_map_file) { + unsigned long start = vaddr - qemu_ram_get_host_addr(rb); + unsigned long offset = qemu_ram_get_fd_offset(rb); + + return vioc->dma_map_file(bcontainer, iova, size, mfd, start + offset, + readonly); + } g_assert(vioc->dma_map); - return vioc->dma_map(bcontainer, iova, size, vaddr, readonly); + return vioc->dma_map(bcontainer, iova, size, vaddr, readonly, mr); } int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index a9f0dba..3e13fea 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -31,10 +31,11 @@ #include "system/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/blocker.h" #include "pci.h" #include "hw/vfio/vfio-container.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" @@ -135,6 +136,8 @@ static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer, int ret; Error *local_err = NULL; + g_assert(!cpr_is_incoming()); + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && bcontainer->dirty_pages_supported) { @@ -207,7 +210,8 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, } static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -425,7 +429,12 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, return NULL; } - if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { + /* + * During CPR, just set the container type and skip the ioctls, as the + * container and group are already configured in the kernel. + */ + if (!cpr_is_incoming() && + !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { return NULL; } @@ -592,6 +601,11 @@ static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_group_add_kvm_device(group); + /* + * Remember the container fd for each group, so we can attach to the same + * container after CPR. + */ + cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); return true; } @@ -601,6 +615,7 @@ static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group) group->container = NULL; vfio_group_del_kvm_device(group); vfio_ram_block_discard_disable(container, false); + cpr_delete_fd("vfio_container_for_group", group->groupid); } static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, @@ -615,17 +630,34 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, bool group_was_added = false; space = vfio_address_space_get(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOContainer, bcontainer); - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - return vfio_container_group_add(container, group, errp); + if (!cpr_is_incoming()) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + return vfio_container_group_add(container, group, errp); + } } - } - fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); - if (fd < 0) { - goto fail; + fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); + if (fd < 0) { + goto fail; + } + } else { + /* + * For incoming CPR, the group is already attached in the kernel. + * If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, + * create the container struct and group list. + */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + + if (vfio_cpr_container_match(container, group, fd)) { + return vfio_container_group_add(container, group, errp); + } + } } ret = ioctl(fd, VFIO_GET_API_VERSION); @@ -642,7 +674,7 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, new_container = true; bcontainer = &container->bcontainer; - if (!vfio_cpr_register_container(bcontainer, errp)) { + if (!vfio_legacy_cpr_register_container(container, errp)) { goto fail; } @@ -660,8 +692,17 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, } group_was_added = true; - if (!vfio_listener_register(bcontainer, errp)) { - goto fail; + /* + * If CPR, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * dma_map to supply the new vaddr, and the calls will match the mappings + * remembered by the kernel. + */ + if (!cpr_is_incoming()) { + if (!vfio_listener_register(bcontainer, errp)) { + goto fail; + } } bcontainer->initialized = true; @@ -669,7 +710,9 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, return true; fail: - vfio_listener_unregister(bcontainer); + if (new_container) { + vfio_listener_unregister(bcontainer); + } if (group_was_added) { vfio_container_group_del(container, group); @@ -678,7 +721,7 @@ fail: vioc->release(bcontainer); } if (new_container) { - vfio_cpr_unregister_container(bcontainer); + vfio_legacy_cpr_unregister_container(container); object_unref(container); } if (fd >= 0) { @@ -697,6 +740,7 @@ static void vfio_container_disconnect(VFIOGroup *group) QLIST_REMOVE(group, container_next); group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); /* * Explicitly release the listener first before unset container, @@ -719,7 +763,7 @@ static void vfio_container_disconnect(VFIOGroup *group) VFIOAddressSpace *space = bcontainer->space; trace_vfio_container_disconnect(container->fd); - vfio_cpr_unregister_container(bcontainer); + vfio_legacy_cpr_unregister_container(container); close(container->fd); object_unref(container); @@ -750,7 +794,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) group = g_malloc0(sizeof(*group)); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open(path, O_RDWR, errp); + group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); if (group->fd < 0) { goto free_group_exit; } @@ -782,6 +826,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) return group; close_fd_exit: + cpr_delete_fd("vfio_group", groupid); close(group->fd); free_group_exit: @@ -803,6 +848,7 @@ static void vfio_group_put(VFIOGroup *group) vfio_container_disconnect(group); QLIST_REMOVE(group, next); trace_vfio_group_put(group->fd); + cpr_delete_fd("vfio_group", group->groupid); close(group->fd); g_free(group); } @@ -813,7 +859,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, g_autofree struct vfio_device_info *info = NULL; int fd; - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + fd = vfio_cpr_group_get_device_fd(group->fd, name); if (fd < 0) { error_setg_errno(errp, errno, "error getting device from group %d", group->groupid); @@ -826,8 +872,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, info = vfio_get_device_info(fd); if (!info) { error_setg_errno(errp, errno, "error getting device info"); - close(fd); - return false; + goto fail; } /* @@ -841,8 +886,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, if (!QLIST_EMPTY(&group->device_list)) { error_setg(errp, "Inconsistent setting of support for discarding " "RAM (e.g., balloon) within group"); - close(fd); - return false; + goto fail; } if (!group->ram_block_discard_allowed) { @@ -860,6 +904,11 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); return true; + +fail: + close(fd); + cpr_delete_fd(name, 0); + return false; } static void vfio_device_put(VFIODevice *vbasedev) @@ -870,6 +919,7 @@ static void vfio_device_put(VFIODevice *vbasedev) QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_device_put(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); close(vbasedev->fd); } @@ -939,8 +989,19 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, goto device_put_exit; } + if (vbasedev->mdev) { + error_setg(&vbasedev->cpr.mdev_blocker, + "CPR does not support vfio mdev %s", vbasedev->name); + if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) < 0) { + goto hiod_unref_exit; + } + } + return true; +hiod_unref_exit: + object_unref(vbasedev->hiod); device_put_exit: vfio_device_put(vbasedev); group_put_exit: @@ -956,6 +1017,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) vfio_device_unprepare(vbasedev); + migrate_del_blocker(&vbasedev->cpr.mdev_blocker); object_unref(vbasedev->hiod); vfio_device_put(vbasedev); vfio_group_put(group); diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c new file mode 100644 index 0000000..148a06d --- /dev/null +++ b/hw/vfio/cpr-iommufd.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2024-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/vfio-device.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "system/iommufd.h" +#include "vfio-iommufd.h" +#include "trace.h" + +typedef struct CprVFIODevice { + char *name; + unsigned int namelen; + uint32_t ioas_id; + int devid; + uint32_t hwpt_id; + QLIST_ENTRY(CprVFIODevice) next; +} CprVFIODevice; + +static const VMStateDescription vmstate_cpr_vfio_device = { + .name = "cpr vfio device", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(namelen, CprVFIODevice), + VMSTATE_VBUFFER_ALLOC_UINT32(name, CprVFIODevice, 0, NULL, namelen), + VMSTATE_INT32(devid, CprVFIODevice), + VMSTATE_UINT32(ioas_id, CprVFIODevice), + VMSTATE_UINT32(hwpt_id, CprVFIODevice), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_cpr_vfio_devices = { + .name = CPR_STATE "/vfio devices", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]){ + VMSTATE_QLIST_V(vfio_devices, CprState, 1, vmstate_cpr_vfio_device, + CprVFIODevice, next), + VMSTATE_END_OF_LIST() + } +}; + +static void vfio_cpr_save_device(VFIODevice *vbasedev) +{ + CprVFIODevice *elem = g_new0(CprVFIODevice, 1); + + elem->name = g_strdup(vbasedev->name); + elem->namelen = strlen(vbasedev->name) + 1; + elem->ioas_id = vbasedev->cpr.ioas_id; + elem->devid = vbasedev->devid; + elem->hwpt_id = vbasedev->cpr.hwpt_id; + QLIST_INSERT_HEAD(&cpr_state.vfio_devices, elem, next); +} + +static CprVFIODevice *find_device(const char *name) +{ + CprVFIODeviceList *head = &cpr_state.vfio_devices; + CprVFIODevice *elem; + + QLIST_FOREACH(elem, head, next) { + if (!strcmp(elem->name, name)) { + return elem; + } + } + return NULL; +} + +static void vfio_cpr_delete_device(const char *name) +{ + CprVFIODevice *elem = find_device(name); + + if (elem) { + QLIST_REMOVE(elem, next); + g_free(elem->name); + g_free(elem); + } +} + +static bool vfio_cpr_find_device(VFIODevice *vbasedev) +{ + CprVFIODevice *elem = find_device(vbasedev->name); + + if (elem) { + vbasedev->cpr.ioas_id = elem->ioas_id; + vbasedev->devid = elem->devid; + vbasedev->cpr.hwpt_id = elem->hwpt_id; + trace_vfio_cpr_find_device(elem->ioas_id, elem->devid, elem->hwpt_id); + return true; + } + return false; +} + +static bool vfio_cpr_supported(IOMMUFDBackend *be, Error **errp) +{ + if (!iommufd_change_process_capable(be)) { + if (errp) { + error_setg(errp, "vfio iommufd backend does not support " + "IOMMU_IOAS_CHANGE_PROCESS"); + } + return false; + } + return true; +} + +static int iommufd_cpr_pre_save(void *opaque) +{ + IOMMUFDBackend *be = opaque; + + /* + * The process has not changed yet, but proactively try the ioctl, + * and it will fail if any DMA mappings are not supported. + */ + if (!iommufd_change_process_capable(be)) { + error_report("some memory regions do not support " + "IOMMU_IOAS_CHANGE_PROCESS"); + return -1; + } + return 0; +} + +static int iommufd_cpr_post_load(void *opaque, int version_id) +{ + IOMMUFDBackend *be = opaque; + Error *local_err = NULL; + + if (!iommufd_change_process(be, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + +static const VMStateDescription iommufd_cpr_vmstate = { + .name = "iommufd", + .version_id = 0, + .minimum_version_id = 0, + .pre_save = iommufd_cpr_pre_save, + .post_load = iommufd_cpr_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +bool vfio_iommufd_cpr_register_iommufd(IOMMUFDBackend *be, Error **errp) +{ + Error **cpr_blocker = &be->cpr_blocker; + + if (!vfio_cpr_supported(be, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) == 0; + } + + vmstate_register(NULL, -1, &iommufd_cpr_vmstate, be); + + return true; +} + +void vfio_iommufd_cpr_unregister_iommufd(IOMMUFDBackend *be) +{ + vmstate_unregister(NULL, &iommufd_cpr_vmstate, be); + migrate_del_blocker(&be->cpr_blocker); +} + +bool vfio_iommufd_cpr_register_container(VFIOIOMMUFDContainer *container, + Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + vfio_cpr_add_kvm_notifier(); + + return true; +} + +void vfio_iommufd_cpr_unregister_container(VFIOIOMMUFDContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); +} + +void vfio_iommufd_cpr_register_device(VFIODevice *vbasedev) +{ + if (!cpr_is_incoming()) { + /* + * Beware fd may have already been saved by vfio_device_set_fd, + * so call resave to avoid a duplicate entry. + */ + cpr_resave_fd(vbasedev->name, 0, vbasedev->fd); + vfio_cpr_save_device(vbasedev); + } +} + +void vfio_iommufd_cpr_unregister_device(VFIODevice *vbasedev) +{ + cpr_delete_fd(vbasedev->name, 0); + vfio_cpr_delete_device(vbasedev->name); +} + +void vfio_cpr_load_device(VFIODevice *vbasedev) +{ + if (cpr_is_incoming()) { + bool ret = vfio_cpr_find_device(vbasedev); + g_assert(ret); + + if (vbasedev->fd < 0) { + vbasedev->fd = cpr_find_fd(vbasedev->name, 0); + } + } +} diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c new file mode 100644 index 0000000..553b203 --- /dev/null +++ b/hw/vfio/cpr-legacy.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2021-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include "qemu/osdep.h" +#include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-listener.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, + .iova = 0, + .size = 0, + }; + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); + return false; + } + container->cpr.vaddr_unmapped = true; + return true; +} + +/* + * Set the new @vaddr for any mappings registered during cpr load. + * The incoming state is cleared thereafter. + */ +static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, void *vaddr, + bool readonly, MemoryRegion *mr) +{ + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_VADDR, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + g_assert(cpr_is_incoming()); + + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + return -errno; + } + + return 0; +} + +static void vfio_region_remap(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + cpr.remap_listener); + vfio_container_region_add(&container->bcontainer, section, true); +} + +static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) +{ + if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { + error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); + return false; + + } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { + error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL"); + return false; + + } else { + return true; + } +} + +static int vfio_container_pre_save(void *opaque) +{ + VFIOContainer *container = opaque; + Error *local_err = NULL; + + if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + +static int vfio_container_post_load(void *opaque, int version_id) +{ + VFIOContainer *container = opaque; + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + dma_map_fn saved_dma_map = vioc->dma_map; + Error *local_err = NULL; + + /* During incoming CPR, divert calls to dma_map. */ + vioc->dma_map = vfio_legacy_cpr_dma_map; + + if (!vfio_listener_register(bcontainer, &local_err)) { + error_report_err(local_err); + return -1; + } + + /* Restore original dma_map function */ + vioc->dma_map = saved_dma_map; + + return 0; +} + +static const VMStateDescription vfio_container_vmstate = { + .name = "vfio-container", + .version_id = 0, + .minimum_version_id = 0, + .priority = MIG_PRI_LOW, /* Must happen after devices and groups */ + .pre_save = vfio_container_pre_save, + .post_load = vfio_container_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) +{ + VFIOContainer *container = + container_of(notifier, VFIOContainer, cpr.transfer_notifier); + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (e->type != MIG_EVENT_PRECOPY_FAILED) { + return 0; + } + + if (container->cpr.vaddr_unmapped) { + /* + * Force a call to vfio_region_remap for each mapped section by + * temporarily registering a listener, and temporarily diverting + * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr. + */ + + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + dma_map_fn saved_dma_map = vioc->dma_map; + vioc->dma_map = vfio_legacy_cpr_dma_map; + + container->cpr.remap_listener = (MemoryListener) { + .name = "vfio cpr recover", + .region_add = vfio_region_remap + }; + memory_listener_register(&container->cpr.remap_listener, + bcontainer->space->as); + memory_listener_unregister(&container->cpr.remap_listener); + container->cpr.vaddr_unmapped = false; + vioc->dma_map = saved_dma_map; + } + return 0; +} + +bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + Error **cpr_blocker = &container->cpr.blocker; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + if (!vfio_cpr_supported(container, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) == 0; + } + + vfio_cpr_add_kvm_notifier(); + + vmstate_register(NULL, -1, &vfio_container_vmstate, container); + + migration_add_notifier_mode(&container->cpr.transfer_notifier, + vfio_cpr_fail_notifier, + MIG_MODE_CPR_TRANSFER); + return true; +} + +void vfio_legacy_cpr_unregister_container(VFIOContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + migrate_del_blocker(&container->cpr.blocker); + vmstate_unregister(NULL, &vfio_container_vmstate, container); + migration_remove_notifier(&container->cpr.transfer_notifier); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a giommu. + * + * The giommu already exists. Find it and replay it, which calls + * vfio_legacy_cpr_dma_map further down the stack. + */ +void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu = NULL; + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) && + giommu->iommu_offset == iommu_offset) { + break; + } + } + g_assert(giommu); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a RamDiscardManager. + * + * The ram discard listener already exists. Call its populate function + * directly, which calls vfio_legacy_cpr_dma_map. + */ +bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); + + g_assert(vrdl); + return vrdl->listener.notify_populate(&vrdl->listener, section) == 0; +} + +int vfio_cpr_group_get_device_fd(int d, const char *name) +{ + const int id = 0; + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +static bool same_device(int fd1, int fd2) +{ + struct stat st1, st2; + + return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; +} + +bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group, + int fd) +{ + if (container->fd == fd) { + return true; + } + if (!same_device(container->fd, fd)) { + return false; + } + /* + * Same device, different fd. This occurs when the container fd is + * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS + * produces duplicates. De-dup it. + */ + cpr_delete_fd("vfio_container_for_group", group->groupid); + close(fd); + cpr_save_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 3214184..af0f12a 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -7,13 +7,16 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" -#include "migration/misc.h" +#include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/pci.h" +#include "hw/pci/msix.h" +#include "hw/pci/msi.h" +#include "migration/cpr.h" #include "qapi/error.h" #include "system/runstate.h" -#include "vfio-cpr.h" -static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, - MigrationEvent *e, Error **errp) +int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) { if (e->type == MIG_EVENT_PRECOPY_SETUP && !runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) { @@ -26,15 +29,172 @@ static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, return 0; } -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp) +#define STRDUP_VECTOR_FD_NAME(vdev, name) \ + g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name)) + +void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr, + int fd) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + cpr_save_fd(fdname, nr, fd); +} + +int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr) +{ + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + return cpr_find_fd(fdname, nr); +} + +void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr) { - migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, - vfio_cpr_reboot_notifier, - MIG_MODE_CPR_REBOOT); - return true; + g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name); + cpr_delete_fd(fdname, nr); +} + +static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, + bool msix) +{ + int i, fd; + bool pending = false; + PCIDevice *pdev = &vdev->pdev; + + vdev->nr_vectors = nr_vectors; + vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors); + vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI; + + vfio_pci_prepare_kvm_msi_virq_batch(vdev); + + for (i = 0; i < nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + + fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i); + if (fd >= 0) { + vfio_pci_vector_init(vdev, i); + vfio_pci_msi_set_handler(vdev, i); + } + + if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) { + vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix); + } else { + vdev->msi_vectors[i].virq = -1; + } + + if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) { + set_bit(i, vdev->msix->pending); + pending = true; + } + } + + vfio_pci_commit_kvm_msi_virq_batch(vdev); + + if (msix) { + memory_region_set_enabled(&pdev->msix_pba_mmio, pending); + } } -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. + */ +static int vfio_cpr_pci_pre_load(void *opaque) { - migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +static int vfio_cpr_pci_post_load(void *opaque, int version_id) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int nr_vectors; + + if (msix_enabled(pdev)) { + vfio_pci_msix_set_notifiers(vdev); + nr_vectors = vdev->msix->entries; + vfio_cpr_claim_vectors(vdev, nr_vectors, true); + + } else if (msi_enabled(pdev)) { + nr_vectors = msi_nr_vectors_allocated(pdev); + vfio_cpr_claim_vectors(vdev, nr_vectors, false); + + } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { + Error *local_err = NULL; + if (!vfio_pci_intx_enable(vdev, &local_err)) { + error_report_err(local_err); + return -1; + } + } + + return 0; +} + +static bool pci_msix_present(void *opaque, int version_id) +{ + PCIDevice *pdev = opaque; + + return msix_present(pdev); +} + +static const VMStateDescription vfio_intx_vmstate = { + .name = "vfio-cpr-intx", + .version_id = 0, + .minimum_version_id = 0, + .fields = (VMStateField[]) { + VMSTATE_BOOL(pending, VFIOINTx), + VMSTATE_UINT32(route.mode, VFIOINTx), + VMSTATE_INT32(route.irq, VFIOINTx), + VMSTATE_END_OF_LIST() + } +}; + +#define VMSTATE_VFIO_INTX(_field, _state) { \ + .name = (stringify(_field)), \ + .size = sizeof(VFIOINTx), \ + .vmsd = &vfio_intx_vmstate, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, VFIOINTx), \ +} + +const VMStateDescription vfio_cpr_pci_vmstate = { + .name = "vfio-cpr-pci", + .version_id = 0, + .minimum_version_id = 0, + .pre_load = vfio_cpr_pci_pre_load, + .post_load = vfio_cpr_pci_post_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present), + VMSTATE_VFIO_INTX(intx, VFIOPCIDevice), + VMSTATE_END_OF_LIST() + } +}; + +static NotifierWithReturn kvm_close_notifier; + +static int vfio_cpr_kvm_close_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, + Error **errp) +{ + if (e->type == MIG_EVENT_PRECOPY_DONE) { + vfio_kvm_device_close(); + } + return 0; +} + +void vfio_cpr_add_kvm_notifier(void) +{ + if (!kvm_close_notifier.notify) { + migration_add_notifier_mode(&kvm_close_notifier, + vfio_cpr_kvm_close_notifier, + MIG_MODE_CPR_TRANSFER); + } } diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 9fba2c7..96cf214 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -28,6 +28,8 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/units.h" +#include "migration/cpr.h" +#include "migration/blocker.h" #include "monitor/monitor.h" #include "vfio-helpers.h" @@ -200,6 +202,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; int ret; /* check cache */ @@ -214,7 +217,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - ret = vbasedev->io_ops->get_region_info(vbasedev, *info); + ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd); if (ret != 0) { g_free(*info); *info = NULL; @@ -225,15 +228,30 @@ retry: argsz = (*info)->argsz; *info = g_realloc(*info, argsz); + if (fd != -1) { + close(fd); + fd = -1; + } + goto retry; } /* fill cache */ vbasedev->reginfo[index] = *info; + if (vbasedev->region_fds != NULL) { + vbasedev->region_fds[index] = fd; + } return 0; } +int vfio_device_get_region_fd(VFIODevice *vbasedev, int index) +{ + return vbasedev->region_fds ? + vbasedev->region_fds[index] : + vbasedev->fd; +} + int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, uint32_t subtype, struct vfio_region_info **info) { @@ -300,28 +318,40 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) error_setg(errp, "Use FD passing only with iommufd backend"); return false; } - /* - * Give a name with fd so any function printing out vbasedev->name - * will not break. - */ if (!vbasedev->name) { - vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + + if (vbasedev->dev->id) { + vbasedev->name = g_strdup(vbasedev->dev->id); + return true; + } else { + /* + * Assign a name so any function printing it will not break. + * The fd number changes across processes, so this cannot be + * used as an invariant name for CPR. + */ + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + error_setg(&vbasedev->cpr.id_blocker, + "vfio device with fd=%d needs an id property", + vbasedev->fd); + return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker, + errp, MIG_MODE_CPR_TRANSFER, + -1) == 0; + } } } return true; } -void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +void vfio_device_free_name(VFIODevice *vbasedev) { - ERRP_GUARD(); - int fd = monitor_fd_param(monitor_cur(), str, errp); + g_clear_pointer(&vbasedev->name, g_free); + migrate_del_blocker(&vbasedev->cpr.id_blocker); +} - if (fd < 0) { - error_prepend(errp, "Could not parse remote object fd %s:", str); - return; - } - vbasedev->fd = fd; +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +{ + vbasedev->fd = cpr_get_fd_param(vbasedev->dev->id, str, 0, errp); } static VFIODeviceIOOps vfio_device_io_ops_ioctl; @@ -334,6 +364,7 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, vbasedev->io_ops = &vfio_device_io_ops_ioctl; vbasedev->dev = dev; vbasedev->fd = -1; + vbasedev->use_region_fds = false; vbasedev->ram_block_discard_allowed = ram_discard; } @@ -444,6 +475,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, vbasedev->reginfo = g_new0(struct vfio_region_info *, vbasedev->num_regions); + if (vbasedev->use_region_fds) { + vbasedev->region_fds = g_new0(int, vbasedev->num_regions); + } } void vfio_device_unprepare(VFIODevice *vbasedev) @@ -452,9 +486,14 @@ void vfio_device_unprepare(VFIODevice *vbasedev) for (i = 0; i < vbasedev->num_regions; i++) { g_free(vbasedev->reginfo[i]); + if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) { + close(vbasedev->region_fds[i]); + } + } - g_free(vbasedev->reginfo); - vbasedev->reginfo = NULL; + + g_clear_pointer(&vbasedev->reginfo, g_free); + g_clear_pointer(&vbasedev->region_fds, g_free); QLIST_REMOVE(vbasedev, container_next); QLIST_REMOVE(vbasedev, global_next); @@ -476,10 +515,13 @@ static int vfio_device_io_device_feature(VFIODevice *vbasedev, } static int vfio_device_io_get_region_info(VFIODevice *vbasedev, - struct vfio_region_info *info) + struct vfio_region_info *info, + int *fd) { int ret; + *fd = -1; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); return ret < 0 ? -errno : ret; @@ -522,7 +564,8 @@ static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, } static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, - off_t off, uint32_t size, void *data) + off_t off, uint32_t size, void *data, + bool post) { struct vfio_region_info *info; int ret; diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index d0dbab1..9a5f621 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -117,6 +117,17 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, int vfio_kvm_device_fd = -1; #endif +void vfio_kvm_device_close(void) +{ +#ifdef CONFIG_KVM + kvm_close(); + if (vfio_kvm_device_fd != -1) { + close(vfio_kvm_device_fd); + vfio_kvm_device_fd = -1; + } +#endif +} + int vfio_kvm_device_add_fd(int fd, Error **errp) { #ifdef CONFIG_KVM diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index e7952d1..e7a9d1f 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -187,23 +187,21 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, } static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, - struct vfio_region_info **opregion, - Error **errp) + struct vfio_region_info **opregion) { int ret; - /* Hotplugging is not supported for opregion access */ - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); - return false; - } - ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { - error_setg_errno(errp, -ret, - "Device does not supports IGD OpRegion feature"); + return false; + } + + /* Hotplugging is not supported for opregion access */ + if (vdev->pdev.qdev.hotplugged) { + warn_report("IGD device detected, but OpRegion is not supported " + "on hotplugged device."); return false; } @@ -524,7 +522,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) } /* IGD device always comes with OpRegion */ - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { return true; } info_report("OpRegion detected on Intel display %x.", vdev->device_id); @@ -695,7 +693,7 @@ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) return true; } - if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + if (!vfio_pci_igd_opregion_detect(vdev, &opregion)) { /* Should never reach here, KVMGT always emulates OpRegion */ return false; } diff --git a/hw/vfio/iommufd-stubs.c b/hw/vfio/iommufd-stubs.c new file mode 100644 index 0000000..0be5276 --- /dev/null +++ b/hw/vfio/iommufd-stubs.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "migration/cpr.h" +#include "migration/vmstate.h" + +const VMStateDescription vmstate_cpr_vfio_devices = { + .name = CPR_STATE "/vfio devices", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]){ + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index af1c7ab..48c590b 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -21,20 +21,22 @@ #include "qapi/error.h" #include "system/iommufd.h" #include "hw/qdev-core.h" +#include "hw/vfio/vfio-cpr.h" #include "system/reset.h" #include "qemu/cutils.h" #include "qemu/chardev_open.h" +#include "migration/cpr.h" #include "pci.h" #include "vfio-iommufd.h" #include "vfio-helpers.h" -#include "vfio-cpr.h" #include "vfio-listener.h" #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) + ram_addr_t size, void *vaddr, bool readonly, + MemoryRegion *mr) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); @@ -44,6 +46,18 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, iova, size, vaddr, readonly); } +static int iommufd_cdev_map_file(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + int fd, unsigned long start, bool readonly) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_file_dma(container->be, + container->ioas_id, + iova, size, fd, start, readonly); +} + static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb, bool unmap_all) @@ -108,6 +122,10 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) goto err_kvm_device_add; } + if (cpr_is_incoming()) { + goto skip_bind; + } + /* Bind device to iommufd */ bind.iommufd = iommufd->fd; if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) { @@ -119,6 +137,8 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) vbasedev->devid = bind.out_devid; trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, vbasedev->fd, vbasedev->devid); + +skip_bind: return true; err_bind: iommufd_cdev_kvm_device_del(vbasedev); @@ -312,7 +332,14 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, /* Try to find a domain */ QLIST_FOREACH(hwpt, &container->hwpt_list, next) { - ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (!cpr_is_incoming()) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + } else if (vbasedev->cpr.hwpt_id == hwpt->hwpt_id) { + ret = 0; + } else { + continue; + } + if (ret) { /* -EINVAL means the domain is incompatible with the device. */ if (ret == -EINVAL) { @@ -329,6 +356,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, return false; } else { vbasedev->hwpt = hwpt; + vbasedev->cpr.hwpt_id = hwpt->hwpt_id; QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); return true; @@ -351,6 +379,11 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; } + if (cpr_is_incoming()) { + hwpt_id = vbasedev->cpr.hwpt_id; + goto skip_alloc; + } + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, container->ioas_id, flags, IOMMU_HWPT_DATA_NONE, 0, NULL, @@ -358,19 +391,20 @@ static bool iommufd_cdev_autodomains_get(VFIODevice *vbasedev, return false; } + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt_id); + return false; + } + +skip_alloc: hwpt = g_malloc0(sizeof(*hwpt)); hwpt->hwpt_id = hwpt_id; hwpt->hwpt_flags = flags; QLIST_INIT(&hwpt->device_list); - ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); - if (ret) { - iommufd_backend_free_id(container->be, hwpt->hwpt_id); - g_free(hwpt); - return false; - } - vbasedev->hwpt = hwpt; + vbasedev->cpr.hwpt_id = hwpt->hwpt_id; vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); @@ -408,7 +442,9 @@ static bool iommufd_cdev_attach_container(VFIODevice *vbasedev, return iommufd_cdev_autodomains_get(vbasedev, container, errp); } - return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); + /* If CPR, we are already attached to ioas_id. */ + return cpr_is_incoming() || + !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); } static void iommufd_cdev_detach_container(VFIODevice *vbasedev, @@ -433,7 +469,7 @@ static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) if (!QLIST_EMPTY(&bcontainer->device_list)) { return; } - vfio_cpr_unregister_container(bcontainer); + vfio_iommufd_cpr_unregister_container(container); vfio_listener_unregister(bcontainer); iommufd_backend_free_id(container->be, container->ioas_id); object_unref(container); @@ -497,11 +533,14 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, VFIOAddressSpace *space; struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; int ret, devfd; + bool res; uint32_t ioas_id; Error *err = NULL; const VFIOIOMMUClass *iommufd_vioc = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + vfio_cpr_load_device(vbasedev); + if (vbasedev->fd < 0) { devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); if (devfd < 0) { @@ -525,7 +564,16 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, vbasedev->iommufd != container->be) { continue; } - if (!iommufd_cdev_attach_container(vbasedev, container, &err)) { + + if (!cpr_is_incoming()) { + res = iommufd_cdev_attach_container(vbasedev, container, &err); + } else if (vbasedev->cpr.ioas_id == container->ioas_id) { + res = true; + } else { + continue; + } + + if (!res) { const char *msg = error_get_pretty(err); trace_iommufd_cdev_fail_attach_existing_container(msg); @@ -542,6 +590,11 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, } } + if (cpr_is_incoming()) { + ioas_id = vbasedev->cpr.ioas_id; + goto skip_ioas_alloc; + } + /* Need to allocate a new dedicated container */ if (!iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp)) { goto err_alloc_ioas; @@ -549,10 +602,12 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); +skip_ioas_alloc: container = VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD)); container->be = vbasedev->iommufd; container->ioas_id = ioas_id; QLIST_INIT(&container->hwpt_list); + vbasedev->cpr.ioas_id = ioas_id; bcontainer = &container->bcontainer; vfio_address_space_insert(space, bcontainer); @@ -579,7 +634,7 @@ static bool iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, goto err_listener_register; } - if (!vfio_cpr_register_container(bcontainer, errp)) { + if (!vfio_iommufd_cpr_register_container(container, errp)) { goto err_listener_register; } @@ -592,6 +647,10 @@ found_container: goto err_listener_register; } + /* + * Do not move this code before attachment! The nested IOMMU support + * needs device and hwpt id which are generated only after attachment. + */ if (!vfio_device_hiod_create_and_realize(vbasedev, TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, errp)) { goto err_listener_register; @@ -606,6 +665,7 @@ found_container: } vfio_device_prepare(vbasedev, bcontainer, &dev_info); + vfio_iommufd_cpr_register_device(vbasedev); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); @@ -643,6 +703,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) iommufd_cdev_container_destroy(container); vfio_address_space_put(space); + vfio_iommufd_cpr_unregister_device(vbasedev); iommufd_cdev_unbind_and_disconnect(vbasedev); close(vbasedev->fd); } @@ -802,6 +863,7 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); vioc->dma_map = iommufd_cdev_map; + vioc->dma_map_file = iommufd_cdev_map_file; vioc->dma_unmap = iommufd_cdev_unmap; vioc->attach_device = iommufd_cdev_attach; vioc->detach_device = iommufd_cdev_detach; @@ -810,21 +872,38 @@ static void vfio_iommu_iommufd_class_init(ObjectClass *klass, const void *data) vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; }; +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, Error **errp) { VFIODevice *vdev = opaque; + HostIOMMUDeviceIOMMUFD *idev; HostIOMMUDeviceCaps *caps = &hiod->caps; + VendorCaps *vendor_caps = &caps->vendor_caps; enum iommu_hw_info_type type; - union { - struct iommu_hw_info_vtd vtd; - } data; uint64_t hw_caps; hiod->agent = opaque; - if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, - &type, &data, sizeof(data), + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, &type, + vendor_caps, sizeof(*vendor_caps), &hw_caps, errp)) { return false; } @@ -833,6 +912,11 @@ static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, caps->type = type; caps->hw_caps = hw_caps; + idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->hwpt_id = vdev->hwpt->hwpt_id; + return true; } @@ -858,10 +942,14 @@ hiod_iommufd_vfio_get_page_size_mask(HostIOMMUDevice *hiod) static void hiod_iommufd_vfio_class_init(ObjectClass *oc, const void *data) { HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); hiodc->realize = hiod_iommufd_vfio_realize; hiodc->get_iova_ranges = hiod_iommufd_vfio_get_iova_ranges; hiodc->get_page_size_mask = hiod_iommufd_vfio_get_page_size_mask; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; }; static const TypeInfo types[] = { diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index bfacb3d..f498e23 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -90,16 +90,17 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -/* Called with rcu_read_lock held. */ -static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - Error **errp) +/* + * Called with rcu_read_lock held. + * The returned MemoryRegion must not be accessed after calling rcu_read_unlock. + */ +static MemoryRegion *vfio_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp) { - bool ret, mr_has_discard_manager; + MemoryRegion *mr; - ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only, - &mr_has_discard_manager, errp); - if (ret && mr_has_discard_manager) { + mr = memory_translate_iotlb(iotlb, xlat_p, errp); + if (mr && memory_region_has_ram_discard_manager(mr)) { /* * Malicious VMs might trigger discarding of IOMMU-mapped memory. The * pages will remain pinned inside vfio until unmapped, resulting in a @@ -118,7 +119,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, " intended via an IOMMU. It's possible to mitigate " " by setting/adjusting RLIMIT_MEMLOCK."); } - return ret; + return mr; } static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) @@ -126,6 +127,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; + MemoryRegion *mr; + hwaddr xlat; void *vaddr; int ret; Error *local_err = NULL; @@ -150,10 +153,14 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); goto out; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + /* * vaddr is only valid until rcu_read_unlock(). But after * vfio_dma_map has set up the mapping the pages will be @@ -163,7 +170,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) */ ret = vfio_container_dma_map(bcontainer, iova, iotlb->addr_mask + 1, vaddr, - read_only); + read_only, mr); if (ret) { error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -233,7 +240,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, vaddr = memory_region_get_ram_ptr(section->mr) + start; ret = vfio_container_dma_map(bcontainer, iova, next - start, - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -430,7 +437,7 @@ static void vfio_listener_commit(MemoryListener *listener) listener); void (*listener_commit)(VFIOContainerBase *bcontainer); - listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; if (listener_commit) { listener_commit(bcontainer); @@ -449,11 +456,38 @@ static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) } } +VFIORamDiscardListener *vfio_find_ram_discard_listener( + VFIOContainerBase *bcontainer, MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + /* does not return */ + } + return vrdl; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); + vfio_container_region_add(bcontainer, section, false); +} + +void vfio_container_region_add(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + bool cpr_remap) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -489,6 +523,11 @@ static void vfio_listener_region_add(MemoryListener *listener, int iommu_idx; trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + + if (cpr_remap) { + vfio_cpr_giommu_remap(bcontainer, section); + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -531,7 +570,12 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_ram_discard_register_listener(bcontainer, section); + if (!cpr_remap) { + vfio_ram_discard_register_listener(bcontainer, section); + } else if (!vfio_cpr_ram_discard_register_listener(bcontainer, + section)) { + goto fail; + } return; } @@ -557,7 +601,7 @@ static void vfio_listener_region_add(MemoryListener *listener, } ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), - vaddr, section->readonly); + vaddr, section->readonly, section->mr); if (ret) { error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", @@ -1010,6 +1054,8 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) ram_addr_t translated_addr; Error *local_err = NULL; int ret = -EINVAL; + MemoryRegion *mr; + hwaddr xlat; trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); @@ -1021,9 +1067,11 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } rcu_read_lock(); - if (!vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL, &local_err)) { + mr = vfio_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { goto out_unlock; } + translated_addr = memory_region_get_ram_addr(mr) + xlat; ret = vfio_container_query_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr, &local_err); @@ -1075,19 +1123,8 @@ vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); - VFIORamDiscardListener *vrdl = NULL; - - QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { - if (vrdl->mr == section->mr && - vrdl->offset_within_address_space == - section->offset_within_address_space) { - break; - } - } - - if (!vrdl) { - hw_error("vfio: Trying to sync missing RAM discard listener"); - } + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); /* * We only want/can synchronize the bitmap for actually mapped parts - diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bccb050..bfaf6be 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + vfio_ss = ss.source_set() vfio_ss.add(files( 'listener.c', @@ -21,6 +23,7 @@ system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) system_ss.add(when: 'CONFIG_VFIO', if_true: files( 'cpr.c', + 'cpr-legacy.c', 'device.c', 'migration.c', 'migration-multifd.c', @@ -28,7 +31,9 @@ system_ss.add(when: 'CONFIG_VFIO', if_true: files( )) system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( 'iommufd.c', + 'cpr-iommufd.c', )) +system_ss.add(when: 'CONFIG_IOMMUFD', if_false: files('iommufd-stubs.c')) system_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', )) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a1bfdfe..1093b28 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -29,7 +29,9 @@ #include "hw/pci/pci_bridge.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "hw/vfio/vfio-cpr.h" #include "migration/vmstate.h" +#include "migration/cpr.h" #include "qobject/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -56,6 +58,36 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +/* Create new or reuse existing eventfd */ +static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr, Error **errp) +{ + int fd, ret; + + fd = vfio_cpr_load_vector_fd(vdev, name, nr); + if (fd >= 0) { + event_notifier_init_fd(e, fd); + return true; + } + + ret = event_notifier_init(e, 0); + if (ret) { + error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + return false; + } + + fd = event_notifier_get_fd(e); + vfio_cpr_save_vector_fd(vdev, name, nr, fd); + return true; +} + +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + vfio_cpr_delete_vector_fd(vdev, name, nr); + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -103,7 +135,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_pci_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -111,7 +143,7 @@ static void vfio_intx_eoi(VFIODevice *vbasedev) return; } - trace_vfio_intx_eoi(vbasedev->name); + trace_vfio_pci_intx_eoi(vbasedev->name); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); @@ -136,8 +168,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { goto fail; } @@ -169,7 +200,7 @@ fail_vfio: kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); @@ -179,6 +210,36 @@ fail: #endif } +static bool vfio_cpr_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) +{ +#ifdef CONFIG_KVM + if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || + vdev->intx.route.mode != PCI_INTX_ENABLED || + !kvm_resamplefds_enabled()) { + return true; + } + + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { + return false; + } + + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, + &vdev->intx.interrupt, + &vdev->intx.unmask, + vdev->intx.route.irq)) { + error_setg_errno(errp, errno, "failed to setup resample irqfd"); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); + return false; + } + + vdev->intx.kvm_accel = true; + trace_vfio_intx_enable_kvm(vdev->vbasedev.name); + return true; +#else + return true; +#endif +} + static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) { #ifdef CONFIG_KVM @@ -201,7 +262,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. */ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -236,7 +297,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) } /* Re-enable the interrupt in cased we missed an EOI */ - vfio_intx_eoi(&vdev->vbasedev); + vfio_pci_intx_eoi(&vdev->vbasedev); } static void vfio_intx_routing_notifier(PCIDevice *pdev) @@ -268,14 +329,19 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); Error *err = NULL; int32_t fd; - int ret; if (!pin) { return true; } - vfio_disable_interrupts(vdev); + /* + * Do not alter interrupt state during vfio_realize and cpr load. + * The incoming state is cleared thereafter. + */ + if (!cpr_is_incoming()) { + vfio_disable_interrupts(vdev); + } vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ pci_config_set_interrupt_pin(vdev->pdev.config, pin); @@ -291,18 +357,25 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); - if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0, + errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); + + if (cpr_is_incoming()) { + if (!vfio_cpr_intx_enable_kvm(vdev, &err)) { + warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } + goto skip_signaling; + } + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return false; } @@ -310,6 +383,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } +skip_signaling: vdev->interrupt = VFIO_INT_INTx; trace_vfio_intx_enable(vdev->vbasedev.name); @@ -329,13 +403,18 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; trace_vfio_intx_disable(vdev->vbasedev.name); } +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp) +{ + return vfio_intx_enable(vdev, errp); +} + /* * MSI/X */ @@ -374,6 +453,14 @@ static void vfio_msi_interrupt(void *opaque) notify(&vdev->pdev, nr); } +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + int fd = event_notifier_get_fd(&vector->interrupt); + + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector); +} + /* * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid * fd to kernel. @@ -460,8 +547,8 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) return ret; } -static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix) { if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; @@ -471,13 +558,16 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, vector_n, &vdev->pdev); } -static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) { + const char *name = "kvm_interrupt"; + if (vector->virq < 0) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr, + NULL)) { goto fail_notifier; } @@ -489,19 +579,20 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) return; fail_kvm: - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr); fail_notifier: kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -511,6 +602,43 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, kvm_irqchip_commit_routes(kvm_state); } +static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, + unsigned int nr) +{ + Error *err = NULL; + int32_t fd; + + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (!vfio_device_irq_set_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, + fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name); + } +} + +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + Error *local_err = NULL; + + vector->vdev = vdev; + vector->virq = -1; + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr, + &local_err)) { + error_report_err(local_err); + } + vector->use = true; + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_vector_use(pdev, nr); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -524,13 +652,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_pci_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -542,19 +664,19 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } } else { if (msg) { if (vdev->defer_kvm_irq_routing) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); } else { vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); - vfio_connect_kvm_msi_virq(vector); + vfio_connect_kvm_msi_virq(vector, nr); } } } @@ -583,21 +705,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, strerror(-ret)); } } else { - Error *err = NULL; - int32_t fd; - - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); - } else { - fd = event_notifier_get_fd(&vector->interrupt); - } - - if (!vfio_device_irq_set_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, - &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } + set_irq_signalling(&vdev->vbasedev, vector, nr); } } @@ -615,6 +723,15 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, static int vfio_msix_vector_use(PCIDevice *pdev, unsigned int nr, MSIMessage msg) { + /* + * Ignore the callback from msix_set_vector_notifiers during resume. + * The necessary subset of these actions is called from + * vfio_cpr_claim_vectors during post load. + */ + if (cpr_is_incoming()) { + return 0; + } + return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt); } @@ -645,14 +762,20 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev) +{ + msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, + vfio_msix_vector_release, NULL); +} + +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); vdev->defer_kvm_irq_routing = true; vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); } -static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { int i; @@ -662,7 +785,7 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) kvm_irqchip_commit_route_changes(&vfio_route_change); for (i = 0; i < vdev->nr_vectors; i++) { - vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i); } } @@ -682,14 +805,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * routes once rather than per vector provides a substantial * performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); @@ -733,19 +856,21 @@ retry: * Deferring to commit the KVM routes once rather than per vector * provides a substantial performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i, + &local_err)) { + error_report_err(local_err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -755,10 +880,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vfio_add_kvm_msi_virq(vdev, vector, i, false); + vfio_pci_add_kvm_msi_virq(vdev, vector, i, false); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; @@ -801,11 +926,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -984,7 +1109,7 @@ static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, { return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, - offset, size, data); + offset, size, data, false); } static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) @@ -1738,7 +1863,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev) { msi_uninit(&vdev->pdev); @@ -1788,6 +1913,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) @@ -1834,7 +1962,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_pci_bars_exit(VFIOPCIDevice *vdev) { int i; @@ -2425,7 +2553,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) g_free(config); } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -2701,7 +2829,7 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_intx_eoi, + .vfio_eoi = vfio_pci_intx_eoi, .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, @@ -2772,7 +2900,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info = NULL; @@ -2818,7 +2946,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return false; } - trace_vfio_populate_device_config(vdev->vbasedev.name, + trace_vfio_pci_populate_device_config(vdev->vbasedev.name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -2840,7 +2968,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(-ret)); + trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret)); } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2852,11 +2980,23 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { + vfio_display_finalize(vdev); + vfio_bars_finalize(vdev); + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); + /* + * XXX Leaking igd_opregion is not an oversight, we can't remove the + * fw_cfg entry therefore leaking this allocation seems like the safest + * option. + * + * g_free(vdev->igd_opregion); + */ + vfio_device_detach(&vdev->vbasedev); - g_free(vdev->vbasedev.name); + vfio_device_free_name(&vdev->vbasedev); g_free(vdev->msix); } @@ -2888,7 +3028,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2897,8 +3037,9 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Unable to init event notifier for error detection"); + if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0, + &err)) { + error_report_err(err); vdev->pci_aer = false; return; } @@ -2906,11 +3047,16 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->err_notifier); qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (cpr_is_incoming()) { + return; + } + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2929,7 +3075,7 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -2947,7 +3093,7 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) { struct vfio_irq_info irq_info; Error *err = NULL; @@ -2964,19 +3110,26 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { - error_report("vfio: Unable to init event notifier for device request"); + if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0, + &err)) { + error_report_err(err); return; } fd = event_notifier_get_fd(&vdev->req_notifier); qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev); + /* Do not alter irq_signaling during vfio_realize for cpr */ + if (cpr_is_incoming()) { + vdev->req_enabled = true; + return; + } + if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -2996,15 +3149,28 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } -static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; + uint32_t config_space_size; + int ret; + + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); + + /* Get a copy of config space */ + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? -ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); + return false; + } /* vfio emulates a lot for us, but some bits need extra love */ vdev->emulated_config_bits = g_malloc0(vdev->config_size); @@ -3094,7 +3260,7 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -3116,7 +3282,13 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) vfio_intx_routing_notifier); vdev->irqchip_change_notifier.notify = vfio_irqchip_change; kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); - if (!vfio_intx_enable(vdev, errp)) { + + /* + * During CPR, do not call vfio_intx_enable at this time. Instead, + * call it from vfio_pci_post_load after the intx routing data has + * been loaded from vmstate. + */ + if (!cpr_is_incoming() && !vfio_intx_enable(vdev, errp)) { timer_free(vdev->intx.mmap_timer); pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); @@ -3126,15 +3298,14 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_realize(PCIDevice *pdev, Error **errp) +static void vfio_pci_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - int i, ret; + int i; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; - uint32_t config_space_size; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -3185,18 +3356,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } - if (!vfio_populate_device(vdev, errp)) { - goto error; - } - - config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); - - /* Get a copy of config space */ - ret = vfio_pci_config_space_read(vdev, 0, config_space_size, - vdev->pdev.config); - if (ret < (int)config_space_size) { - ret = ret < 0 ? -ret : EFAULT; - error_setg_errno(errp, ret, "failed to read device config space"); + if (!vfio_pci_populate_device(vdev, errp)) { goto error; } @@ -3210,7 +3370,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto out_teardown; } - if (!vfio_add_capabilities(vdev, errp)) { + if (!vfio_pci_add_capabilities(vdev, errp)) { goto out_unset_idev; } @@ -3226,7 +3386,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bar_quirk_setup(vdev, i); } - if (!vfio_interrupt_setup(vdev, errp)) { + if (!vfio_pci_interrupt_setup(vdev, errp)) { goto out_unset_idev; } @@ -3270,8 +3430,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - vfio_register_err_notifier(vdev); - vfio_register_req_notifier(vdev); + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); return; @@ -3292,8 +3452,8 @@ out_unset_idev: pci_device_unset_iommu_device(pdev); } out_teardown: - vfio_teardown_msi(vdev); - vfio_bars_exit(vdev); + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } @@ -3302,17 +3462,6 @@ static void vfio_instance_finalize(Object *obj) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); - vfio_display_finalize(vdev); - vfio_bars_finalize(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - /* - * XXX Leaking igd_opregion is not an oversight, we can't remove the - * fw_cfg entry therefore leaking this allocation seems like the safest - * option. - * - * g_free(vdev->igd_opregion); - */ vfio_pci_put_device(vdev); } @@ -3331,9 +3480,9 @@ static void vfio_exitfn(PCIDevice *pdev) if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } - vfio_teardown_msi(vdev); + vfio_pci_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); - vfio_bars_exit(vdev); + vfio_pci_bars_exit(vdev); vfio_migration_exit(vbasedev); if (!vbasedev->mdev) { pci_device_unset_iommu_device(pdev); @@ -3344,6 +3493,11 @@ static void vfio_pci_reset(DeviceState *dev) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (cpr_is_incoming()) { + return; + } + trace_vfio_pci_reset(vdev->vbasedev.name); vfio_pci_pre_reset(vdev); @@ -3401,6 +3555,13 @@ static void vfio_instance_init(Object *obj) /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command * line, therefore, no need to wait to realize like other devices */ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + + /* + * A device that is resuming for cpr is already configured, so do not + * reset it during qemu_system_reset prior to cpr load, else interrupts + * may be lost. + */ + pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; } static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) @@ -3418,7 +3579,7 @@ static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_base_dev_info = { .name = TYPE_VFIO_PCI_BASE, .parent = TYPE_PCI_DEVICE, - .instance_size = 0, + .instance_size = sizeof(VFIOPCIDevice), .abstract = true, .class_init = vfio_pci_base_dev_class_init, .interfaces = (const InterfaceInfo[]) { @@ -3513,8 +3674,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif + dc->vmsd = &vfio_cpr_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; - pdc->realize = vfio_realize; + pdc->realize = vfio_pci_realize; object_class_property_set_description(klass, /* 1.3 */ "host", @@ -3640,7 +3802,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, .parent = TYPE_VFIO_PCI_BASE, - .instance_size = sizeof(VFIOPCIDevice), .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 5ce0fb9..495fae7 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -116,6 +116,7 @@ typedef struct VFIOMSIXInfo { uint32_t pba_offset; unsigned long *pending; bool noresize; + MemoryRegion *pba_region; } VFIOMSIXInfo; /* @@ -210,6 +211,16 @@ static inline bool vfio_is_vga(VFIOPCIDevice *vdev) return class == PCI_CLASS_DISPLAY_VGA; } +/* MSI/MSI-X/INTx */ +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr); +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix); +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev); +void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr); + uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); @@ -248,4 +259,15 @@ void vfio_display_finalize(VFIOPCIDevice *vdev); extern const VMStateDescription vfio_display_vmstate; +void vfio_pci_bars_exit(VFIOPCIDevice *vdev); +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_intx_eoi(VFIODevice *vbasedev); +void vfio_pci_put_device(VFIOPCIDevice *vdev); +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev); +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev); +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 9a21f2e..5c1795a 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -530,7 +530,7 @@ static bool vfio_base_device_init(VFIODevice *vbasedev, Error **errp) { /* @fd takes precedence over @sysfsdev which takes precedence over @host */ if (vbasedev->fd < 0 && vbasedev->sysfsdev) { - g_free(vbasedev->name); + vfio_device_free_name(vbasedev); vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); } else if (vbasedev->fd < 0) { if (!vbasedev->name || strchr(vbasedev->name, '/')) { diff --git a/hw/vfio/region.c b/hw/vfio/region.c index 34752c3..d04c57d 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -66,7 +66,7 @@ void vfio_region_write(void *opaque, hwaddr addr, } ret = vbasedev->io_ops->region_write(vbasedev, region->nr, - addr, size, &buf); + addr, size, &buf, region->post_wr); if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 ",%d) failed: %s", @@ -200,6 +200,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->size = info->size; region->fd_offset = info->offset; region->nr = index; + region->post_wr = false; if (region->size) { region->mem = g_new0(MemoryRegion, 1); @@ -241,6 +242,7 @@ int vfio_region_mmap(VFIORegion *region) { int i, ret, prot = 0; char *name; + int fd; if (!region->mem) { return 0; @@ -271,14 +273,15 @@ int vfio_region_mmap(VFIORegion *region) goto no_mmap; } + fd = vfio_device_get_region_fd(region->vbasedev, region->nr); + map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); munmap(map_base, map_align - map_base); munmap(map_align + region->mmaps[i].size, align - (map_align - map_base)); region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, - MAP_SHARED | MAP_FIXED, - region->vbasedev->fd, + MAP_SHARED | MAP_FIXED, fd, region->fd_offset + region->mmaps[i].offset); if (region->mmaps[i].mmap == MAP_FAILED) { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index e90ec9b..8ec0ad0 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -1,8 +1,10 @@ # See docs/devel/tracing.rst for syntax documentation. +# +# SPDX-License-Identifier: GPL-2.0-or-later # pci.c vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c" -vfio_intx_eoi(const char *name) " (%s) EOI" +vfio_pci_intx_eoi(const char *name) " (%s) EOI" vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled" vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled" vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" @@ -35,8 +37,8 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s" vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:" vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d" vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s" -vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" -vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" +vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" +vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d" vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x" vfio_pci_reset(const char *name) " (%s)" @@ -195,6 +197,9 @@ iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD con iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" +# cpr-iommufd.c +vfio_cpr_find_device(uint32_t ioas_id, int devid, uint32_t hwpt_id) "ioas_id %u, devid %d, hwpt_id %u" + # device.c vfio_device_get_region_info_type(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" vfio_device_reset_handler(void) "" diff --git a/hw/vfio/trace.h b/hw/vfio/trace.h index 5a343aa..b34b61d 100644 --- a/hw/vfio/trace.h +++ b/hw/vfio/trace.h @@ -1 +1,4 @@ +/* + * SPDX-License-Identifier: GPL-2.0-or-later + */ #include "trace/trace-hw_vfio.h" diff --git a/hw/vfio/vfio-cpr.h b/hw/vfio/vfio-cpr.h deleted file mode 100644 index 134b83a..0000000 --- a/hw/vfio/vfio-cpr.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * VFIO CPR - * - * Copyright (c) 2025 Oracle and/or its affiliates. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#ifndef HW_VFIO_CPR_H -#define HW_VFIO_CPR_H - -bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp); -void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer); - -#endif /* HW_VFIO_CPR_H */ diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 1ab2c11..7061b6e 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -209,6 +209,8 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) int ret; Int128 llend; Error *local_err = NULL; + MemoryRegion *mr; + hwaddr xlat; if (iotlb->target_as != &address_space_memory) { error_report("Wrong target AS \"%s\", only system memory is allowed", @@ -228,11 +230,14 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { bool read_only; - if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL, - &local_err)) { + mr = memory_translate_iotlb(iotlb, &xlat, &local_err); + if (!mr) { error_report_err(local_err); return; } + vaddr = memory_region_get_ram_ptr(mr) + xlat; + read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly; + ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova, iotlb->addr_mask + 1, vaddr, read_only); if (ret) { @@ -594,6 +599,36 @@ static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v) v->shadow_vqs = g_steal_pointer(&shadow_vqs); } +static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + + uint64_t features; + uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | + 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | + 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | + 0x1ULL << VHOST_BACKEND_F_SUSPEND; + int r; + + if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { + return -EFAULT; + } + + features &= f; + + if (vhost_vdpa_first_dev(dev)) { + r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); + if (r) { + return -EFAULT; + } + } + + dev->backend_cap = features; + v->shared->backend_cap = features; + + return 0; +} + static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) { struct vhost_vdpa *v = opaque; @@ -603,7 +638,12 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) v->dev = dev; dev->opaque = opaque ; - v->shared->listener = vhost_vdpa_memory_listener; + + ret = vhost_vdpa_set_backend_cap(dev); + if (unlikely(ret != 0)) { + return ret; + } + vhost_vdpa_init_svq(dev, v); error_propagate(&dev->migration_blocker, v->migration_blocker); @@ -639,6 +679,7 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER); + v->shared->listener = vhost_vdpa_memory_listener; return 0; } @@ -841,36 +882,6 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); } -static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) -{ - struct vhost_vdpa *v = dev->opaque; - - uint64_t features; - uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | - 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | - 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | - 0x1ULL << VHOST_BACKEND_F_SUSPEND; - int r; - - if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { - return -EFAULT; - } - - features &= f; - - if (vhost_vdpa_first_dev(dev)) { - r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); - if (r) { - return -EFAULT; - } - } - - dev->backend_cap = features; - v->shared->backend_cap = features; - - return 0; -} - static int vhost_vdpa_get_device_id(struct vhost_dev *dev, uint32_t *device_id) { @@ -888,8 +899,14 @@ static int vhost_vdpa_reset_device(struct vhost_dev *dev) ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); trace_vhost_vdpa_reset_device(dev); + if (ret) { + return ret; + } + + memory_listener_unregister(&v->shared->listener); + v->shared->listener_registered = false; v->suspended = false; - return ret; + return 0; } static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) @@ -1373,7 +1390,15 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) "IOMMU and try again"); return -1; } - memory_listener_register(&v->shared->listener, dev->vdev->dma_as); + if (v->shared->listener_registered && + dev->vdev->dma_as != v->shared->listener.address_space) { + memory_listener_unregister(&v->shared->listener); + v->shared->listener_registered = false; + } + if (!v->shared->listener_registered) { + memory_listener_register(&v->shared->listener, dev->vdev->dma_as); + v->shared->listener_registered = true; + } return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); } @@ -1383,8 +1408,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) static void vhost_vdpa_reset_status(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; - if (!vhost_vdpa_last_dev(dev)) { return; } @@ -1392,7 +1415,6 @@ static void vhost_vdpa_reset_status(struct vhost_dev *dev) vhost_vdpa_reset_device(dev); vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER); - memory_listener_unregister(&v->shared->listener); } static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, @@ -1526,12 +1548,27 @@ static int vhost_vdpa_get_features(struct vhost_dev *dev, static int vhost_vdpa_set_owner(struct vhost_dev *dev) { + int r; + struct vhost_vdpa *v; + if (!vhost_vdpa_first_dev(dev)) { return 0; } trace_vhost_vdpa_set_owner(dev); - return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL); + r = vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL); + if (unlikely(r < 0)) { + return r; + } + + /* + * Being optimistic and listening address space memory. If the device + * uses vIOMMU, it is changed at vhost_vdpa_dev_start. + */ + v = dev->opaque; + memory_listener_register(&v->shared->listener, &address_space_memory); + v->shared->listener_registered = true; + return 0; } static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev, @@ -1563,7 +1600,6 @@ const VhostOps vdpa_ops = { .vhost_set_vring_kick = vhost_vdpa_set_vring_kick, .vhost_set_vring_call = vhost_vdpa_set_vring_call, .vhost_get_features = vhost_vdpa_get_features, - .vhost_set_backend_cap = vhost_vdpa_set_backend_cap, .vhost_set_owner = vhost_vdpa_set_owner, .vhost_set_vring_endian = NULL, .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit, diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index a3d1a67..c46f6f9 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -244,28 +244,6 @@ static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg, return ret; } -/* - * Adjust the memory section to cover the intersection with the given range. - * - * Returns false if the intersection is empty, otherwise returns true. - */ -static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s, - uint64_t offset, uint64_t size) -{ - uint64_t start = MAX(s->offset_within_region, offset); - uint64_t end = MIN(s->offset_within_region + int128_get64(s->size), - offset + size); - - if (end <= start) { - return false; - } - - s->offset_within_address_space += start - s->offset_within_region; - s->offset_within_region = start; - s->size = int128_make64(end - start); - return true; -} - typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg); static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, @@ -287,7 +265,7 @@ static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, first_bit + 1) - 1; size = (last_bit - first_bit + 1) * vmem->block_size; - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { + if (!memory_region_section_intersect_range(&tmp, offset, size)) { break; } ret = cb(&tmp, arg); @@ -319,7 +297,7 @@ static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem, first_bit + 1) - 1; size = (last_bit - first_bit + 1) * vmem->block_size; - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { + if (!memory_region_section_intersect_range(&tmp, offset, size)) { break; } ret = cb(&tmp, arg); @@ -355,7 +333,7 @@ static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, QLIST_FOREACH(rdl, &vmem->rdl_list, next) { MemoryRegionSection tmp = *rdl->section; - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { + if (!memory_region_section_intersect_range(&tmp, offset, size)) { continue; } rdl->notify_discard(rdl, &tmp); @@ -371,7 +349,7 @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, QLIST_FOREACH(rdl, &vmem->rdl_list, next) { MemoryRegionSection tmp = *rdl->section; - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { + if (!memory_region_section_intersect_range(&tmp, offset, size)) { continue; } ret = rdl->notify_populate(rdl, &tmp); @@ -388,7 +366,7 @@ static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, if (rdl2 == rdl) { break; } - if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) { + if (!memory_region_section_intersect_range(&tmp, offset, size)) { continue; } rdl2->notify_discard(rdl2, &tmp); @@ -1070,6 +1048,17 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) } /* + * Set ourselves as RamDiscardManager before the plug handler maps the + * memory region and exposes it via an address space. + */ + if (memory_region_set_ram_discard_manager(&vmem->memdev->mr, + RAM_DISCARD_MANAGER(vmem))) { + error_setg(errp, "Failed to set RamDiscardManager"); + ram_block_coordinated_discard_require(false); + return; + } + + /* * We don't know at this point whether shared RAM is migrated using * QEMU or migrated using the file content. "x-ignore-shared" will be * configured after realizing the device. So in case we have an @@ -1083,6 +1072,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); if (ret) { error_setg_errno(errp, -ret, "Unexpected error discarding RAM"); + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); ram_block_coordinated_discard_require(false); return; } @@ -1144,13 +1134,6 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) vmem->system_reset = VIRTIO_MEM_SYSTEM_RESET(obj); vmem->system_reset->vmem = vmem; qemu_register_resettable(obj); - - /* - * Set ourselves as RamDiscardManager before the plug handler maps the - * memory region and exposes it via an address space. - */ - memory_region_set_ram_discard_manager(&vmem->memdev->mr, - RAM_DISCARD_MANAGER(vmem)); } static void virtio_mem_device_unrealize(DeviceState *dev) @@ -1158,12 +1141,6 @@ static void virtio_mem_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOMEM *vmem = VIRTIO_MEM(dev); - /* - * The unplug handler unmapped the memory region, it cannot be - * found via an address space anymore. Unset ourselves. - */ - memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); - qemu_unregister_resettable(OBJECT(vmem->system_reset)); object_unref(OBJECT(vmem->system_reset)); @@ -1176,6 +1153,11 @@ static void virtio_mem_device_unrealize(DeviceState *dev) virtio_del_queue(vdev, 0); virtio_cleanup(vdev); g_free(vmem->bitmap); + /* + * The unplug handler unmapped the memory region, it cannot be + * found via an address space anymore. Unset ourselves. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); ram_block_coordinated_discard_require(false); } @@ -1750,7 +1732,7 @@ static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm, } struct VirtIOMEMReplayData { - void *fn; + ReplayRamDiscardState fn; void *opaque; }; @@ -1758,12 +1740,12 @@ static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg) { struct VirtIOMEMReplayData *data = arg; - return ((ReplayRamPopulate)data->fn)(s, data->opaque); + return data->fn(s, data->opaque); } static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm, MemoryRegionSection *s, - ReplayRamPopulate replay_fn, + ReplayRamDiscardState replay_fn, void *opaque) { const VirtIOMEM *vmem = VIRTIO_MEM(rdm); @@ -1782,14 +1764,13 @@ static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s, { struct VirtIOMEMReplayData *data = arg; - ((ReplayRamDiscard)data->fn)(s, data->opaque); - return 0; + return data->fn(s, data->opaque); } -static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm, - MemoryRegionSection *s, - ReplayRamDiscard replay_fn, - void *opaque) +static int virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm, + MemoryRegionSection *s, + ReplayRamDiscardState replay_fn, + void *opaque) { const VirtIOMEM *vmem = VIRTIO_MEM(rdm); struct VirtIOMEMReplayData data = { @@ -1798,8 +1779,8 @@ static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm, }; g_assert(s->mr == &vmem->memdev->mr); - virtio_mem_for_each_unplugged_section(vmem, s, &data, - virtio_mem_rdm_replay_discarded_cb); + return virtio_mem_for_each_unplugged_section(vmem, s, &data, + virtio_mem_rdm_replay_discarded_cb); } static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm, diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 9b48aa8..fba2372 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -146,9 +146,7 @@ static const VMStateDescription vmstate_virtio_pci = { static bool virtio_pci_has_extra_state(DeviceState *d) { - VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); - - return proxy->flags & VIRTIO_PCI_FLAG_MIGRATE_EXTRA; + return true; } static void virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f) @@ -1215,7 +1213,12 @@ static int virtio_pci_set_guest_notifier(DeviceState *d, int n, bool assign, static bool virtio_pci_query_guest_notifiers(DeviceState *d) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); - return msix_enabled(&proxy->pci_dev); + + if (msix_enabled(&proxy->pci_dev)) { + return true; + } else { + return pci_irq_disabled(&proxy->pci_dev); + } } static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) @@ -2363,12 +2366,8 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type) static const Property virtio_pci_properties[] = { DEFINE_PROP_BIT("virtio-pci-bus-master-bug-migration", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT, false), - DEFINE_PROP_BIT("migrate-extra", VirtIOPCIProxy, flags, - VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, true), DEFINE_PROP_BIT("modern-pio-notify", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, false), - DEFINE_PROP_BIT("x-disable-pcie", VirtIOPCIProxy, flags, - VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, false), DEFINE_PROP_BIT("page-per-vq", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_PAGE_PER_VQ_BIT, false), DEFINE_PROP_BOOL("x-ignore-backend-features", VirtIOPCIProxy, @@ -2397,8 +2396,7 @@ static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev); PCIDevice *pci_dev = &proxy->pci_dev; - if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) && - virtio_pci_modern(proxy)) { + if (virtio_pci_modern(proxy)) { pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 2e98cec..82a285a 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -205,6 +205,15 @@ static const char *virtio_id_to_name(uint16_t device_id) return name; } +static void virtio_check_indirect_feature(VirtIODevice *vdev) +{ + if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Device %s: indirect_desc was not negotiated!\n", + vdev->name); + } +} + /* Called within call_rcu(). */ static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) { @@ -1680,8 +1689,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) VirtIODevice *vdev = vq->vdev; VirtQueueElement *elem = NULL; unsigned out_num, in_num, elem_entries; - hwaddr addr[VIRTQUEUE_MAX_SIZE]; - struct iovec iov[VIRTQUEUE_MAX_SIZE]; + hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE]; + struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE]; VRingDesc desc; int rc; @@ -1733,6 +1742,7 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) virtio_error(vdev, "Invalid size for indirect buffer table"); goto done; } + virtio_check_indirect_feature(vdev); /* loop over the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, @@ -1826,8 +1836,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) VirtIODevice *vdev = vq->vdev; VirtQueueElement *elem = NULL; unsigned out_num, in_num, elem_entries; - hwaddr addr[VIRTQUEUE_MAX_SIZE]; - struct iovec iov[VIRTQUEUE_MAX_SIZE]; + hwaddr QEMU_UNINITIALIZED addr[VIRTQUEUE_MAX_SIZE]; + struct iovec QEMU_UNINITIALIZED iov[VIRTQUEUE_MAX_SIZE]; VRingPackedDesc desc; uint16_t id; int rc; @@ -1870,6 +1880,7 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) virtio_error(vdev, "Invalid size for indirect buffer table"); goto done; } + virtio_check_indirect_feature(vdev); /* loop over the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, |