diff options
author | Alexey Kardashevskiy <aik@ozlabs.ru> | 2019-03-12 19:21:03 +1100 |
---|---|---|
committer | David Gibson <david@gibson.dropbear.id.au> | 2019-04-26 10:41:23 +1000 |
commit | ec132efaa81f09861a3bd6afad94827e74543b3f (patch) | |
tree | 0faa60ac303942814073ddec70e98ff0c524d2ad /hw/vfio | |
parent | 3284aa128153750f14a61e8a96fd085e6f2999b6 (diff) | |
download | qemu-ec132efaa81f09861a3bd6afad94827e74543b3f.zip qemu-ec132efaa81f09861a3bd6afad94827e74543b3f.tar.gz qemu-ec132efaa81f09861a3bd6afad94827e74543b3f.tar.bz2 |
spapr: Support NVIDIA V100 GPU with NVLink2
NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
implements special regions for such GPUs and emulates an NVLink bridge.
NVLink2-enabled POWER9 CPUs also provide address translation services
which include an ATS shootdown (ATSD) register exported via the NVLink
bridge device.
This adds a quirk to VFIO to map the GPU memory and create an MR;
the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses
this to get the MR and map it to the system address space.
Another quirk does the same for ATSD.
This adds additional steps to sPAPR PHB setup:
1. Search for specific GPUs and NPUs, collect findings in
sPAPRPHBState::nvgpus, manage system address space mappings;
2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
"memory-block", "link-speed" to advertise the NVLink2 function to
the guest;
3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
4. Add new memory blocks (with extra "linux,memory-usable" to prevent
the guest OS from accessing the new memory until it is onlined) and
npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
uses it for link discovery.
This allocates space for GPU RAM and ATSD like we do for MMIOs by
adding 2 new parameters to the phb_placement() hook. Older machine types
set these to zero.
This puts new memory nodes in a separate NUMA node, as the GPU RAM
needs to be configured equally distant from any other node in the system.
Unlike the host setup, which assigns NUMA ids from 255 downwards, this
adds new NUMA nodes after the user-configured nodes, or from 1 if none
were configured.
This adds a requirement similar to EEH - one IOMMU group per vPHB.
The reason for this is that ATSD registers belong to a physical NPU
so they cannot invalidate translations on GPUs attached to another NPU.
It is guaranteed by the host platform as it does not mix NVLink bridges
or GPUs from different NPUs in the same IOMMU group. If more than one
IOMMU group is detected on a vPHB, this disables ATSD support for that
vPHB and prints a warning.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for vfio portions]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20190312082103.130561-1-aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Diffstat (limited to 'hw/vfio')
-rw-r--r-- | hw/vfio/pci-quirks.c | 131 | ||||
-rw-r--r-- | hw/vfio/pci.c | 14 | ||||
-rw-r--r-- | hw/vfio/pci.h | 2 | ||||
-rw-r--r-- | hw/vfio/trace-events | 4 |
4 files changed, 151 insertions, 0 deletions
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 40a1200..29b2697 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -2180,3 +2180,134 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp) return 0; } + +static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v, + const char *name, + void *opaque, Error **errp) +{ + uint64_t tgt = (uintptr_t) opaque; + visit_type_uint64(v, name, &tgt, errp); +} + +static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v, + const char *name, + void *opaque, Error **errp) +{ + uint32_t link_speed = (uint32_t)(uintptr_t) opaque; + visit_type_uint32(v, name, &link_speed, errp); +} + +int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp) +{ + int ret; + void *p; + struct vfio_region_info *nv2reg = NULL; + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_nvlink2_ssatgt *cap; + VFIOQuirk *quirk; + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | + PCI_VENDOR_ID_NVIDIA, + VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, + &nv2reg); + if (ret) { + return ret; + } + + hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT); + if (!hdr) { + ret = -ENODEV; + goto free_exit; + } + cap = (void *) hdr; + + p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset); + if (p == MAP_FAILED) { + ret = -errno; + goto free_exit; + } + + quirk = vfio_quirk_alloc(1); + memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr", + nv2reg->size, p); + QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next); + + object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64", + vfio_pci_nvlink2_get_tgt, NULL, NULL, + (void *) (uintptr_t) cap->tgt, NULL); + trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt, + nv2reg->size); +free_exit: + g_free(nv2reg); + + return ret; +} + +int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp) +{ + int ret; + void *p; 
+ struct vfio_region_info *atsdreg = NULL; + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_nvlink2_ssatgt *captgt; + struct vfio_region_info_cap_nvlink2_lnkspd *capspeed; + VFIOQuirk *quirk; + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | + PCI_VENDOR_ID_IBM, + VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, + &atsdreg); + if (ret) { + return ret; + } + + hdr = vfio_get_region_info_cap(atsdreg, + VFIO_REGION_INFO_CAP_NVLINK2_SSATGT); + if (!hdr) { + ret = -ENODEV; + goto free_exit; + } + captgt = (void *) hdr; + + hdr = vfio_get_region_info_cap(atsdreg, + VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD); + if (!hdr) { + ret = -ENODEV; + goto free_exit; + } + capspeed = (void *) hdr; + + /* Some NVLink bridges may not have assigned ATSD */ + if (atsdreg->size) { + p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset); + if (p == MAP_FAILED) { + ret = -errno; + goto free_exit; + } + + quirk = vfio_quirk_alloc(1); + memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev), + "nvlink2-atsd-mr", atsdreg->size, p); + QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next); + } + + object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64", + vfio_pci_nvlink2_get_tgt, NULL, NULL, + (void *) (uintptr_t) captgt->tgt, NULL); + trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt, + atsdreg->size); + + object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32", + vfio_pci_nvlink2_get_link_speed, NULL, NULL, + (void *) (uintptr_t) capspeed->link_speed, NULL); + trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name, + capspeed->link_speed); +free_exit: + g_free(atsdreg); + + return ret; +} diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0142819..8cecb53 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3086,6 +3086,20 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } + if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) { + ret = 
vfio_pci_nvidia_v100_ram_init(vdev, errp); + if (ret && ret != -ENODEV) { + error_report("Failed to setup NVIDIA V100 GPU RAM"); + } + } + + if (vdev->vendor_id == PCI_VENDOR_ID_IBM) { + ret = vfio_pci_nvlink2_init(vdev, errp); + if (ret && ret != -ENODEV) { + error_report("Failed to setup NVlink2 bridge"); + } + } + vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index c11c3f1..cfcd1a8 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -196,6 +196,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, struct vfio_region_info *info, Error **errp); +int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp); +int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp); void vfio_display_reset(VFIOPCIDevice *vdev); int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index eb58993..b1ef55a 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -86,6 +86,10 @@ vfio_pci_igd_opregion_enabled(const char *name) "%s" vfio_pci_igd_host_bridge_enabled(const char *name) "%s" vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s" +vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64 +vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64 +vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) "%s link_speed=0x%x" + # common.c vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 |