diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2025-05-15 13:41:56 -0400 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2025-05-15 13:41:56 -0400 |
commit | 21596064081e8d0c0153f68714981c7f0e040973 (patch) | |
tree | 4598685872d139d53c780aebfa9f4b972e2bff7b | |
parent | 92fbc2ffc92f387c2ccb00b38ac800ca924c079a (diff) | |
parent | 28931c2e1591deb4bfaaf744fdc8813e96c230f1 (diff) | |
download | qemu-21596064081e8d0c0153f68714981c7f0e040973.zip qemu-21596064081e8d0c0153f68714981c7f0e040973.tar.gz qemu-21596064081e8d0c0153f68714981c7f0e040973.tar.bz2 |
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pci,pc: fixes, features
vhost-scsi now supports scsi hotplug
cxl gained a bag of new operations, motably media operations
virtio-net now supports SR-IOV emulation
pci-testdev now supports backing memory bar with host memory
amd iommu now supports migration
fixes all over the place
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# -----BEGIN PGP SIGNATURE-----
#
# iQFDBAABCgAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmgkg0UPHG1zdEByZWRo
# YXQuY29tAAoJECgfDbjSjVRpcDIH+wbrq7DzG+BVOraYtmD69BQCzYszby1mAWry
# 2OUYuAx9Oh+DsAwbzwbBdh9+SmJoi1oJ/d8rzSK328hdDrpCaPmc7bcBdAWJ3YcB
# bGNPyJ+9eJLRXtlceGIhfAOMLIB0ugXGkHLQ61zlVCTg4Xwnj7/dQp2tAQ1BkTwW
# Azc7ujBoJOBF3WVpa1Pqw0t1m3K74bwanOlkIg/JUWXk27sgP2YMnyrcpOu9Iz1T
# VazgobyHo5y15V0wvd05w4Bk7cJSHwgW+y3DtgTtIffetIaAbSRgl3Pl5Ic1yKcX
# ofg9aDFN6m0S8tv4WgFc+rT3Xaa/aPue9awjD5sEEldRasWKKNo=
# =847R
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 14 May 2025 07:49:25 EDT
# gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg: issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (27 commits)
hw/i386/amd_iommu: Allow migration when explicitly create the AMDVI-PCI device
hw/i386/amd_iommu: Isolate AMDVI-PCI from amd-iommu device to allow full control over the PCI device creation
intel_iommu: Take locks when looking for and creating address spaces
intel_iommu: Use BQL_LOCK_GUARD to manage cleanup automatically
virtio: Move virtio_reset()
virtio: Call set_features during reset
vhost-scsi: support VIRTIO_SCSI_F_HOTPLUG
vhost-user: return failure if backend crash when live migration
vhost: return failure if stop virtqueue failed in vhost_dev_stop
system/runstate: add VM state change cb with return value
pci-testdev.c: Add membar-backed option for backing membar
pcie_sriov: Make a PCI device with user-created VF ARI-capable
docs: Document composable SR-IOV device
virtio-net: Implement SR-IOV VF
virtio-pci: Implement SR-IOV PF
pcie_sriov: Allow user to create SR-IOV device
pcie_sriov: Check PCI Express for SR-IOV PF
pcie_sriov: Ensure PF and VF are mutually exclusive
hw/pci: Fix SR-IOV VF number calculation
hw/pci: Do not add ROM BAR for SR-IOV VF
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
55 files changed, 1437 insertions, 356 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 6dacd6d..b579358 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2058,6 +2058,7 @@ F: hw/pci-bridge/* F: qapi/pci.json F: docs/pci* F: docs/specs/*pci* +F: docs/system/sriov.rst PCIE DOE M: Huai-Cheng Kuo <hchkuo@avery-design.com.tw> diff --git a/backends/vhost-user.c b/backends/vhost-user.c index 94274a6..4284532 100644 --- a/backends/vhost-user.c +++ b/backends/vhost-user.c @@ -97,30 +97,28 @@ err_host_notifiers: vhost_dev_disable_notifiers(&b->dev, b->vdev); } -void +int vhost_user_backend_stop(VhostUserBackend *b) { BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(b->vdev))); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - int ret = 0; + int ret; if (!b->started) { - return; + return 0; } - vhost_dev_stop(&b->dev, b->vdev, true); + ret = vhost_dev_stop(&b->dev, b->vdev, true); - if (k->set_guest_notifiers) { - ret = k->set_guest_notifiers(qbus->parent, - b->dev.nvqs, false); - if (ret < 0) { - error_report("vhost guest notifier cleanup failed: %d", ret); - } + if (k->set_guest_notifiers && + k->set_guest_notifiers(qbus->parent, b->dev.nvqs, false) < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return -1; } - assert(ret >= 0); vhost_dev_disable_notifiers(&b->dev, b->vdev); b->started = false; + return ret; } static void set_chardev(Object *obj, const char *value, Error **errp) diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst index 882b036..e307caf 100644 --- a/docs/system/devices/cxl.rst +++ b/docs/system/devices/cxl.rst @@ -308,7 +308,7 @@ A very simple setup with just one directly attached CXL Type 3 Persistent Memory -object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \ -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \ - -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \ + -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0,sn=0x1 \ -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G A very simple setup with just one directly attached CXL Type 3 Volatile Memory device:: @@ -349,13 +349,13 @@ the CXL Type3 device directly attached (no switches).:: -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \ -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2 \ -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \ - -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \ + -device cxl-type3,bus=root_port13,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0,sn=0x1 \ -device cxl-rp,port=1,bus=cxl.1,id=root_port14,chassis=0,slot=3 \ - -device cxl-type3,bus=root_port14,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \ + -device cxl-type3,bus=root_port14,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1,sn=0x2 \ -device cxl-rp,port=0,bus=cxl.2,id=root_port15,chassis=0,slot=5 \ - -device cxl-type3,bus=root_port15,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \ + -device cxl-type3,bus=root_port15,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2,sn=0x3 \ -device cxl-rp,port=1,bus=cxl.2,id=root_port16,chassis=0,slot=6 \ - -device cxl-type3,bus=root_port16,persistent-memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \ + -device cxl-type3,bus=root_port16,persistent-memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3,sn=0x4 \ -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.targets.1=cxl.2,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=8k An example of 4 devices below a switch suitable for 1, 2 or 4 way interleave:: @@ -375,13 +375,13 @@ An example of 4 devices below a switch suitable for 1, 2 or 4 way interleave:: -device cxl-rp,port=1,bus=cxl.1,id=root_port1,chassis=0,slot=1 \ -device cxl-upstream,bus=root_port0,id=us0 \ -device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \ - -device cxl-type3,bus=swport0,persistent-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0 \ + -device cxl-type3,bus=swport0,persistent-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-pmem0,sn=0x1 \ -device cxl-downstream,port=1,bus=us0,id=swport1,chassis=0,slot=5 \ - -device cxl-type3,bus=swport1,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1 \ + -device cxl-type3,bus=swport1,persistent-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem1,sn=0x2 \ -device cxl-downstream,port=2,bus=us0,id=swport2,chassis=0,slot=6 \ - -device cxl-type3,bus=swport2,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2 \ + -device cxl-type3,bus=swport2,persistent-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem2,sn=0x3 \ -device cxl-downstream,port=3,bus=us0,id=swport3,chassis=0,slot=7 \ - -device cxl-type3,bus=swport3,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3 \ + -device cxl-type3,bus=swport3,persistent-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem3,sn=0x4 \ -M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k Deprecations diff --git a/docs/system/index.rst b/docs/system/index.rst index c21065e..718e9d3 100644 --- a/docs/system/index.rst +++ b/docs/system/index.rst @@ -39,3 +39,4 @@ or Hypervisor.Framework. multi-process confidential-guest-support vm-templating + sriov diff --git a/docs/system/sriov.rst b/docs/system/sriov.rst new file mode 100644 index 0000000..d12178f --- /dev/null +++ b/docs/system/sriov.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Compsable SR-IOV device +======================= + +SR-IOV (Single Root I/O Virtualization) is an optional extended capability of a +PCI Express device. It allows a single physical function (PF) to appear as +multiple virtual functions (VFs) for the main purpose of eliminating software +overhead in I/O from virtual machines. + +There are devices with predefined SR-IOV configurations, but it is also possible +to compose an SR-IOV device yourself. Composing an SR-IOV device is currently +only supported by virtio-net-pci. + +Users can configure an SR-IOV-capable virtio-net device by adding +virtio-net-pci functions to a bus. Below is a command line example: + +.. code-block:: shell + + -netdev user,id=n -netdev user,id=o + -netdev user,id=p -netdev user,id=q + -device pcie-root-port,id=b + -device virtio-net-pci,bus=b,addr=0x0.0x3,netdev=q,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x2,netdev=p,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x1,netdev=o,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x0,netdev=n,id=f + +The VFs specify the paired PF with ``sriov-pf`` property. The PF must be +added after all VFs. It is the user's responsibility to ensure that VFs have +function numbers larger than one of the PF, and that the function numbers +have a consistent stride. Both the PF and VFs are ARI-capable so you can have +255 VFs at maximum. + +You may also need to perform additional steps to activate the SR-IOV feature on +your guest. For Linux, refer to [1]_. + +.. [1] https://docs.kernel.org/PCI/pci-iov-howto.html diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 4bb5ed2..0eebbcd 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -204,7 +204,7 @@ err_host_notifiers: return ret; } -static void vhost_user_blk_stop(VirtIODevice *vdev) +static int vhost_user_blk_stop(VirtIODevice *vdev) { VHostUserBlk *s = VHOST_USER_BLK(vdev); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -212,26 +212,26 @@ static void vhost_user_blk_stop(VirtIODevice *vdev) int ret; if (!s->started_vu) { - return; + return 0; } s->started_vu = false; if (!k->set_guest_notifiers) { - return; + return 0; } - vhost_dev_stop(&s->dev, vdev, true); + ret = vhost_dev_stop(&s->dev, vdev, true); - ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); - if (ret < 0) { + if (k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false) < 0) { error_report("vhost guest notifier cleanup failed: %d", ret); - return; + return -1; } vhost_dev_disable_notifiers(&s->dev, vdev); + return ret; } -static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) +static int vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserBlk *s = VHOST_USER_BLK(vdev); bool should_start = virtio_device_should_start(vdev, status); @@ -239,11 +239,11 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) int ret; if (!s->connected) { - return; + return -1; } if (vhost_dev_is_started(&s->dev) == should_start) { - return; + return 0; } if (should_start) { @@ -253,9 +253,12 @@ static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) qemu_chr_fe_disconnect(&s->chardev); } } else { - vhost_user_blk_stop(vdev); + ret = vhost_user_blk_stop(vdev); + if (ret < 0) { + return ret; + } } - + return 0; } static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev, diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index b54d01d..9bab271 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1270,7 +1270,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features, return features; } -static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) +static int virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) { VirtIOBlock *s = VIRTIO_BLK(vdev); @@ -1279,7 +1279,7 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) } if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { - return; + return 0; } /* A guest that supports VIRTIO_BLK_F_CONFIG_WCE must be able to send @@ -1302,6 +1302,7 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) virtio_vdev_has_feature(vdev, VIRTIO_BLK_F_WCE)); } + return 0; } static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f) @@ -1802,7 +1803,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) * called after ->start_ioeventfd() has already set blk's AioContext. */ s->change = - qdev_add_vm_change_state_handler(dev, virtio_blk_dma_restart_cb, s); + qdev_add_vm_change_state_handler(dev, virtio_blk_dma_restart_cb, NULL, s); blk_ram_registrar_init(&s->blk_ram_registrar, s->blk); blk_set_dev_ops(s->blk, &virtio_block_ops, s); diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index eb79f52..673c50f 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -622,7 +622,7 @@ static void guest_reset(VirtIOSerial *vser) } } -static void set_status(VirtIODevice *vdev, uint8_t status) +static int set_status(VirtIODevice *vdev, uint8_t status) { VirtIOSerial *vser; VirtIOSerialPort *port; @@ -650,6 +650,7 @@ static void set_status(VirtIODevice *vdev, uint8_t status) vsc->enable_backend(port, vdev->vm_running); } } + return 0; } static void vser_reset(VirtIODevice *vdev) diff --git a/hw/core/vm-change-state-handler.c b/hw/core/vm-change-state-handler.c index 7064995..99c642b 100644 --- a/hw/core/vm-change-state-handler.c +++ b/hw/core/vm-change-state-handler.c @@ -40,6 +40,7 @@ static int qdev_get_dev_tree_depth(DeviceState *dev) * qdev_add_vm_change_state_handler: * @dev: the device that owns this handler * @cb: the callback function to be invoked + * @cb_ret: the callback function with return value to be invoked * @opaque: user data passed to the callback function * * This function works like qemu_add_vm_change_state_handler() except callbacks @@ -50,25 +51,30 @@ static int qdev_get_dev_tree_depth(DeviceState *dev) * controller's callback is invoked before the children on its bus when the VM * starts running. The order is reversed when the VM stops running. * + * Note that the parameter `cb` and `cb_ret` are mutually exclusive. + * * Returns: an entry to be freed with qemu_del_vm_change_state_handler() */ VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev, VMChangeStateHandler *cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque) { - return qdev_add_vm_change_state_handler_full(dev, cb, NULL, opaque); + assert(!cb || !cb_ret); + return qdev_add_vm_change_state_handler_full(dev, cb, NULL, cb_ret, opaque); } /* * Exactly like qdev_add_vm_change_state_handler() but passes a prepare_cb - * argument too. + * and the cb_ret arguments too. */ VMChangeStateEntry *qdev_add_vm_change_state_handler_full( - DeviceState *dev, VMChangeStateHandler *cb, - VMChangeStateHandler *prepare_cb, void *opaque) + DeviceState *dev, VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque) { int depth = qdev_get_dev_tree_depth(dev); - return qemu_add_vm_change_state_handler_prio_full(cb, prepare_cb, opaque, - depth); + assert(!cb || !cb_ret); + return qemu_add_vm_change_state_handler_prio_full(cb, prepare_cb, cb_ret, + opaque, depth); } diff --git a/hw/cxl/cxl-device-utils.c b/hw/cxl/cxl-device-utils.c index 52ad1e4..e150d74 100644 --- a/hw/cxl/cxl-device-utils.c +++ b/hw/cxl/cxl-device-utils.c @@ -95,11 +95,15 @@ static uint64_t mailbox_reg_read(void *opaque, hwaddr offset, unsigned size) } if (offset == A_CXL_DEV_MAILBOX_STS) { uint64_t status_reg = cxl_dstate->mbox_reg_state64[offset / size]; - if (cci->bg.complete_pct) { - status_reg = FIELD_DP64(status_reg, CXL_DEV_MAILBOX_STS, BG_OP, - 0); - cxl_dstate->mbox_reg_state64[offset / size] = status_reg; - } + int bgop; + + qemu_mutex_lock(&cci->bg.lock); + bgop = !(cci->bg.complete_pct == 100 || cci->bg.aborted); + + status_reg = FIELD_DP64(status_reg, CXL_DEV_MAILBOX_STS, BG_OP, + bgop); + cxl_dstate->mbox_reg_state64[offset / size] = status_reg; + qemu_mutex_unlock(&cci->bg.lock); } return cxl_dstate->mbox_reg_state64[offset / size]; default: diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 516c01d..299f232 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -7,6 +7,8 @@ * COPYING file in the top-level directory. */ +#include <math.h> + #include "qemu/osdep.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" @@ -26,6 +28,11 @@ #define CXL_DC_EVENT_LOG_SIZE 8 #define CXL_NUM_EXTENTS_SUPPORTED 512 #define CXL_NUM_TAGS_SUPPORTED 0 +#define CXL_ALERTS_LIFE_USED_WARN_THRESH (1 << 0) +#define CXL_ALERTS_OVER_TEMP_WARN_THRESH (1 << 1) +#define CXL_ALERTS_UNDER_TEMP_WARN_THRESH (1 << 2) +#define CXL_ALERTS_COR_VMEM_ERR_WARN_THRESH (1 << 3) +#define CXL_ALERTS_COR_PMEM_ERR_WARN_THRESH (1 << 4) /* * How to add a new command, example. The command set FOO, with cmd BAR. @@ -56,6 +63,9 @@ enum { INFOSTAT = 0x00, #define IS_IDENTIFY 0x1 #define BACKGROUND_OPERATION_STATUS 0x2 + #define GET_RESPONSE_MSG_LIMIT 0x3 + #define SET_RESPONSE_MSG_LIMIT 0x4 + #define BACKGROUND_OPERATION_ABORT 0x5 EVENTS = 0x01, #define GET_RECORDS 0x0 #define CLEAR_RECORDS 0x1 @@ -81,9 +91,13 @@ enum { #define GET_PARTITION_INFO 0x0 #define GET_LSA 0x2 #define SET_LSA 0x3 + HEALTH_INFO_ALERTS = 0x42, + #define GET_ALERT_CONFIG 0x1 + #define SET_ALERT_CONFIG 0x2 SANITIZE = 0x44, #define OVERWRITE 0x0 #define SECURE_ERASE 0x1 + #define MEDIA_OPERATIONS 0x2 PERSISTENT_MEM = 0x45, #define GET_SECURITY_STATE 0x0 MEDIA_AND_POISON = 0x43, @@ -412,12 +426,58 @@ static CXLRetCode cmd_infostat_identify(const struct cxl_cmd *cmd, is_identify->component_type = 0x3; /* Type 3 */ } - /* TODO: Allow this to vary across different CCIs */ - is_identify->max_message_size = 9; /* 512 bytes - MCTP_CXL_MAILBOX_BYTES */ + is_identify->max_message_size = (uint8_t)log2(cci->payload_max); *len_out = sizeof(*is_identify); return CXL_MBOX_SUCCESS; } +/* CXL r3.1 section 8.2.9.1.3: Get Response Message Limit (Opcode 0003h) */ +static CXLRetCode cmd_get_response_msg_limit(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t rsp_limit; + } QEMU_PACKED *get_rsp_msg_limit = (void *)payload_out; + QEMU_BUILD_BUG_ON(sizeof(*get_rsp_msg_limit) != 1); + + get_rsp_msg_limit->rsp_limit = (uint8_t)log2(cci->payload_max); + + *len_out = sizeof(*get_rsp_msg_limit); + return CXL_MBOX_SUCCESS; +} + +/* CXL r3.1 section 8.2.9.1.4: Set Response Message Limit (Opcode 0004h) */ +static CXLRetCode cmd_set_response_msg_limit(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t rsp_limit; + } QEMU_PACKED *in = (void *)payload_in; + QEMU_BUILD_BUG_ON(sizeof(*in) != 1); + struct { + uint8_t rsp_limit; + } QEMU_PACKED *out = (void *)payload_out; + QEMU_BUILD_BUG_ON(sizeof(*out) != 1); + + if (in->rsp_limit < 8 || in->rsp_limit > 10) { + return CXL_MBOX_INVALID_INPUT; + } + + cci->payload_max = 1 << in->rsp_limit; + out->rsp_limit = in->rsp_limit; + + *len_out = sizeof(*out); + return CXL_MBOX_SUCCESS; +} + static void cxl_set_dsp_active_bm(PCIBus *b, PCIDevice *d, void *private) { @@ -636,6 +696,41 @@ static CXLRetCode cmd_infostat_bg_op_sts(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* + * CXL r3.1 Section 8.2.9.1.5: + * Request Abort Background Operation (Opcode 0005h) + */ +static CXLRetCode cmd_infostat_bg_op_abort(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + int bg_set = cci->bg.opcode >> 8; + int bg_cmd = cci->bg.opcode & 0xff; + const struct cxl_cmd *bg_c = &cci->cxl_cmd_set[bg_set][bg_cmd]; + + if (!(bg_c->effect & CXL_MBOX_BACKGROUND_OPERATION_ABORT)) { + return CXL_MBOX_REQUEST_ABORT_NOTSUP; + } + + qemu_mutex_lock(&cci->bg.lock); + if (cci->bg.runtime) { + /* operation is near complete, let it finish */ + if (cci->bg.complete_pct < 85) { + timer_del(cci->bg.timer); + cci->bg.ret_code = CXL_MBOX_ABORTED; + cci->bg.starttime = 0; + cci->bg.runtime = 0; + cci->bg.aborted = true; + } + } + qemu_mutex_unlock(&cci->bg.lock); + + return CXL_MBOX_SUCCESS; +} + #define CXL_FW_SLOTS 2 #define CXL_FW_SIZE 0x02000000 /* 32 mb */ @@ -1523,6 +1618,97 @@ static CXLRetCode cmd_ccls_set_lsa(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* CXL r3.2 Section 8.2.10.9.3.2 Get Alert Configuration (Opcode 4201h) */ +static CXLRetCode cmd_get_alert_config(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLAlertConfig *out = (CXLAlertConfig *)payload_out; + + memcpy(out, &ct3d->alert_config, sizeof(ct3d->alert_config)); + *len_out = sizeof(ct3d->alert_config); + + return CXL_MBOX_SUCCESS; +} + +/* CXL r3.2 Section 8.2.10.9.3.3 Set Alert Configuration (Opcode 4202h) */ +static CXLRetCode cmd_set_alert_config(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLAlertConfig *alert_config = &ct3d->alert_config; + struct { + uint8_t valid_alert_actions; + uint8_t enable_alert_actions; + uint8_t life_used_warn_thresh; + uint8_t rsvd; + uint16_t over_temp_warn_thresh; + uint16_t under_temp_warn_thresh; + uint16_t cor_vmem_err_warn_thresh; + uint16_t cor_pmem_err_warn_thresh; + } QEMU_PACKED *in = (void *)payload_in; + + if (in->valid_alert_actions & CXL_ALERTS_LIFE_USED_WARN_THRESH) { + /* + * CXL r3.2 Table 8-149 The life used warning threshold shall be + * less than the life used critical alert value. + */ + if (in->life_used_warn_thresh >= + alert_config->life_used_crit_alert_thresh) { + return CXL_MBOX_INVALID_INPUT; + } + alert_config->life_used_warn_thresh = in->life_used_warn_thresh; + alert_config->enable_alerts |= CXL_ALERTS_LIFE_USED_WARN_THRESH; + } + + if (in->valid_alert_actions & CXL_ALERTS_OVER_TEMP_WARN_THRESH) { + /* + * CXL r3.2 Table 8-149 The Device Over-Temperature Warning Threshold + * shall be less than the the Device Over-Temperature Critical + * Alert Threshold. + */ + if (in->over_temp_warn_thresh >= + alert_config->over_temp_crit_alert_thresh) { + return CXL_MBOX_INVALID_INPUT; + } + alert_config->over_temp_warn_thresh = in->over_temp_warn_thresh; + alert_config->enable_alerts |= CXL_ALERTS_OVER_TEMP_WARN_THRESH; + } + + if (in->valid_alert_actions & CXL_ALERTS_UNDER_TEMP_WARN_THRESH) { + /* + * CXL r3.2 Table 8-149 The Device Under-Temperature Warning Threshold + * shall be higher than the the Device Under-Temperature Critical + * Alert Threshold. + */ + if (in->under_temp_warn_thresh <= + alert_config->under_temp_crit_alert_thresh) { + return CXL_MBOX_INVALID_INPUT; + } + alert_config->under_temp_warn_thresh = in->under_temp_warn_thresh; + alert_config->enable_alerts |= CXL_ALERTS_UNDER_TEMP_WARN_THRESH; + } + + if (in->valid_alert_actions & CXL_ALERTS_COR_VMEM_ERR_WARN_THRESH) { + alert_config->cor_vmem_err_warn_thresh = in->cor_vmem_err_warn_thresh; + alert_config->enable_alerts |= CXL_ALERTS_COR_VMEM_ERR_WARN_THRESH; + } + + if (in->valid_alert_actions & CXL_ALERTS_COR_PMEM_ERR_WARN_THRESH) { + alert_config->cor_pmem_err_warn_thresh = in->cor_pmem_err_warn_thresh; + alert_config->enable_alerts |= CXL_ALERTS_COR_PMEM_ERR_WARN_THRESH; + } + return CXL_MBOX_SUCCESS; +} + /* Perform the actual device zeroing */ static void __do_sanitization(CXLType3Dev *ct3d) { @@ -1553,34 +1739,10 @@ static void __do_sanitization(CXLType3Dev *ct3d) cxl_discard_all_event_records(&ct3d->cxl_dstate); } -/* - * CXL r3.1 Section 8.2.9.9.5.1: Sanitize (Opcode 4400h) - * - * Once the Sanitize command has started successfully, the device shall be - * placed in the media disabled state. If the command fails or is interrupted - * by a reset or power failure, it shall remain in the media disabled state - * until a successful Sanitize command has been completed. During this state: - * - * 1. Memory writes to the device will have no effect, and all memory reads - * will return random values (no user data returned, even for locations that - * the failed Sanitize operation didn’t sanitize yet). - * - * 2. Mailbox commands shall still be processed in the disabled state, except - * that commands that access Sanitized areas shall fail with the Media Disabled - * error code. - */ -static CXLRetCode cmd_sanitize_overwrite(const struct cxl_cmd *cmd, - uint8_t *payload_in, - size_t len_in, - uint8_t *payload_out, - size_t *len_out, - CXLCCI *cci) +static int get_sanitize_duration(uint64_t total_mem) { - CXLType3Dev *ct3d = CXL_TYPE3(cci->d); - uint64_t total_mem; /* in Mb */ - int secs; + int secs = 0; - total_mem = (ct3d->cxl_dstate.vmem_size + ct3d->cxl_dstate.pmem_size) >> 20; if (total_mem <= 512) { secs = 4; } else if (total_mem <= 1024) { @@ -1609,6 +1771,39 @@ static CXLRetCode cmd_sanitize_overwrite(const struct cxl_cmd *cmd, secs = 240 * 60; /* max 4 hrs */ } + return secs; +} + +/* + * CXL r3.1 Section 8.2.9.9.5.1: Sanitize (Opcode 4400h) + * + * Once the Sanitize command has started successfully, the device shall be + * placed in the media disabled state. If the command fails or is interrupted + * by a reset or power failure, it shall remain in the media disabled state + * until a successful Sanitize command has been completed. During this state: + * + * 1. Memory writes to the device will have no effect, and all memory reads + * will return random values (no user data returned, even for locations that + * the failed Sanitize operation didn’t sanitize yet). + * + * 2. Mailbox commands shall still be processed in the disabled state, except + * that commands that access Sanitized areas shall fail with the Media Disabled + * error code. + */ +static CXLRetCode cmd_sanitize_overwrite(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + uint64_t total_mem; /* in Mb */ + int secs; + + total_mem = (ct3d->cxl_dstate.vmem_size + ct3d->cxl_dstate.pmem_size) >> 20; + secs = get_sanitize_duration(total_mem); + /* EBUSY other bg cmds as of now */ cci->bg.runtime = secs * 1000UL; *len_out = 0; @@ -1619,6 +1814,324 @@ static CXLRetCode cmd_sanitize_overwrite(const struct cxl_cmd *cmd, return CXL_MBOX_BG_STARTED; } +struct dpa_range_list_entry { + uint64_t starting_dpa; + uint64_t length; +} QEMU_PACKED; + +struct CXLSanitizeInfo { + uint32_t dpa_range_count; + uint8_t fill_value; + struct dpa_range_list_entry dpa_range_list[]; +} QEMU_PACKED; + +static uint64_t get_vmr_size(CXLType3Dev *ct3d, MemoryRegion **vmr) +{ + MemoryRegion *mr; + if (ct3d->hostvmem) { + mr = host_memory_backend_get_memory(ct3d->hostvmem); + if (vmr) { + *vmr = mr; + } + return memory_region_size(mr); + } + return 0; +} + +static uint64_t get_pmr_size(CXLType3Dev *ct3d, MemoryRegion **pmr) +{ + MemoryRegion *mr; + if (ct3d->hostpmem) { + mr = host_memory_backend_get_memory(ct3d->hostpmem); + if (pmr) { + *pmr = mr; + } + return memory_region_size(mr); + } + return 0; +} + +static uint64_t get_dc_size(CXLType3Dev *ct3d, MemoryRegion **dc_mr) +{ + MemoryRegion *mr; + if (ct3d->dc.host_dc) { + mr = host_memory_backend_get_memory(ct3d->dc.host_dc); + if (dc_mr) { + *dc_mr = mr; + } + return memory_region_size(mr); + } + return 0; +} + +static int validate_dpa_addr(CXLType3Dev *ct3d, uint64_t dpa_addr, + size_t length) +{ + uint64_t vmr_size, pmr_size, dc_size; + + if ((dpa_addr % CXL_CACHE_LINE_SIZE) || + (length % CXL_CACHE_LINE_SIZE) || + (length <= 0)) { + return -EINVAL; + } + + vmr_size = get_vmr_size(ct3d, NULL); + pmr_size = get_pmr_size(ct3d, NULL); + dc_size = get_dc_size(ct3d, NULL); + + if (dpa_addr + length > vmr_size + pmr_size + dc_size) { + return -EINVAL; + } + + if (dpa_addr > vmr_size + pmr_size) { + if (!ct3_test_region_block_backed(ct3d, dpa_addr, length)) { + return -ENODEV; + } + } + + return 0; +} + +static int sanitize_range(CXLType3Dev *ct3d, uint64_t dpa_addr, size_t length, + uint8_t fill_value) +{ + + uint64_t vmr_size, pmr_size; + AddressSpace *as = NULL; + MemTxAttrs mem_attrs = {}; + + vmr_size = get_vmr_size(ct3d, NULL); + pmr_size = get_pmr_size(ct3d, NULL); + + if (dpa_addr < vmr_size) { + as = &ct3d->hostvmem_as; + } else if (dpa_addr < vmr_size + pmr_size) { + as = &ct3d->hostpmem_as; + } else { + if (!ct3_test_region_block_backed(ct3d, dpa_addr, length)) { + return -ENODEV; + } + as = &ct3d->dc.host_dc_as; + } + + return address_space_set(as, dpa_addr, fill_value, length, mem_attrs); +} + +/* Perform the actual device zeroing */ +static void __do_sanitize(CXLType3Dev *ct3d) +{ + struct CXLSanitizeInfo *san_info = ct3d->media_op_sanitize; + int dpa_range_count = san_info->dpa_range_count; + int rc = 0; + int i; + + for (i = 0; i < dpa_range_count; i++) { + rc = sanitize_range(ct3d, san_info->dpa_range_list[i].starting_dpa, + san_info->dpa_range_list[i].length, + san_info->fill_value); + if (rc) { + goto exit; + } + } +exit: + g_free(ct3d->media_op_sanitize); + ct3d->media_op_sanitize = NULL; + return; +} + +enum { + MEDIA_OP_CLASS_GENERAL = 0x0, + #define MEDIA_OP_GEN_SUBC_DISCOVERY 0x0 + MEDIA_OP_CLASS_SANITIZE = 0x1, + #define MEDIA_OP_SAN_SUBC_SANITIZE 0x0 + #define MEDIA_OP_SAN_SUBC_ZERO 0x1 +}; + +struct media_op_supported_list_entry { + uint8_t media_op_class; + uint8_t media_op_subclass; +}; + +struct media_op_discovery_out_pl { + uint64_t dpa_range_granularity; + uint16_t total_supported_operations; + uint16_t num_of_supported_operations; + struct media_op_supported_list_entry entry[]; +} QEMU_PACKED; + +static const struct media_op_supported_list_entry media_op_matrix[] = { + { MEDIA_OP_CLASS_GENERAL, MEDIA_OP_GEN_SUBC_DISCOVERY }, + { MEDIA_OP_CLASS_SANITIZE, MEDIA_OP_SAN_SUBC_SANITIZE }, + { MEDIA_OP_CLASS_SANITIZE, MEDIA_OP_SAN_SUBC_ZERO }, +}; + +static CXLRetCode media_operations_discovery(uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out) +{ + struct { + uint8_t media_operation_class; + uint8_t media_operation_subclass; + uint8_t rsvd[2]; + uint32_t dpa_range_count; + struct { + uint16_t start_index; + uint16_t num_ops; + } discovery_osa; + } QEMU_PACKED *media_op_in_disc_pl = (void *)payload_in; + struct media_op_discovery_out_pl *media_out_pl = + (struct media_op_discovery_out_pl *)payload_out; + int num_ops, start_index, i; + int count = 0; + + if (len_in < sizeof(*media_op_in_disc_pl)) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } + + num_ops = media_op_in_disc_pl->discovery_osa.num_ops; + start_index = media_op_in_disc_pl->discovery_osa.start_index; + + /* + * As per spec CXL r3.2 8.2.10.9.5.3 dpa_range_count should be zero and + * start index should not exceed the total number of entries for discovery + * sub class command. + */ + if (media_op_in_disc_pl->dpa_range_count || + start_index > ARRAY_SIZE(media_op_matrix)) { + return CXL_MBOX_INVALID_INPUT; + } + + media_out_pl->dpa_range_granularity = CXL_CACHE_LINE_SIZE; + media_out_pl->total_supported_operations = + ARRAY_SIZE(media_op_matrix); + if (num_ops > 0) { + for (i = start_index; i < start_index + num_ops; i++) { + media_out_pl->entry[count].media_op_class = + media_op_matrix[i].media_op_class; + media_out_pl->entry[count].media_op_subclass = + media_op_matrix[i].media_op_subclass; + count++; + if (count == num_ops) { + break; + } + } + } + + media_out_pl->num_of_supported_operations = count; + *len_out = sizeof(*media_out_pl) + count * sizeof(*media_out_pl->entry); + return CXL_MBOX_SUCCESS; +} + +static CXLRetCode media_operations_sanitize(CXLType3Dev *ct3d, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + uint8_t fill_value, + CXLCCI *cci) +{ + struct media_operations_sanitize { + uint8_t media_operation_class; + uint8_t media_operation_subclass; + uint8_t rsvd[2]; + uint32_t dpa_range_count; + struct dpa_range_list_entry dpa_range_list[]; + } QEMU_PACKED *media_op_in_sanitize_pl = (void *)payload_in; + uint32_t dpa_range_count = media_op_in_sanitize_pl->dpa_range_count; + uint64_t total_mem = 0; + size_t dpa_range_list_size; + int secs = 0, i; + + if (dpa_range_count == 0) { + return CXL_MBOX_SUCCESS; + } + + dpa_range_list_size = dpa_range_count * sizeof(struct dpa_range_list_entry); + if (len_in < (sizeof(*media_op_in_sanitize_pl) + dpa_range_list_size)) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } + + for (i = 0; i < dpa_range_count; i++) { + uint64_t start_dpa = + media_op_in_sanitize_pl->dpa_range_list[i].starting_dpa; + uint64_t length = media_op_in_sanitize_pl->dpa_range_list[i].length; + + if (validate_dpa_addr(ct3d, start_dpa, length)) { + return CXL_MBOX_INVALID_INPUT; + } + total_mem += length; + } + ct3d->media_op_sanitize = g_malloc0(sizeof(struct CXLSanitizeInfo) + + dpa_range_list_size); + + ct3d->media_op_sanitize->dpa_range_count = dpa_range_count; + ct3d->media_op_sanitize->fill_value = fill_value; + memcpy(ct3d->media_op_sanitize->dpa_range_list, + media_op_in_sanitize_pl->dpa_range_list, + dpa_range_list_size); + secs = get_sanitize_duration(total_mem >> 20); + + /* EBUSY other bg cmds as of now */ + cci->bg.runtime = secs * 1000UL; + *len_out = 0; + /* + * media op sanitize is targeted so no need to disable media or + * clear event logs + */ + return CXL_MBOX_BG_STARTED; +} + +static CXLRetCode cmd_media_operations(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t media_operation_class; + uint8_t media_operation_subclass; + uint8_t rsvd[2]; + uint32_t dpa_range_count; + } QEMU_PACKED *media_op_in_common_pl = (void *)payload_in; + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + uint8_t media_op_cl = 0; + uint8_t media_op_subclass = 0; + + if (len_in < sizeof(*media_op_in_common_pl)) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } + + media_op_cl = media_op_in_common_pl->media_operation_class; + media_op_subclass = media_op_in_common_pl->media_operation_subclass; + + switch (media_op_cl) { + case MEDIA_OP_CLASS_GENERAL: + if (media_op_subclass != MEDIA_OP_GEN_SUBC_DISCOVERY) { + return CXL_MBOX_UNSUPPORTED; + } + + return media_operations_discovery(payload_in, len_in, payload_out, + len_out); + case MEDIA_OP_CLASS_SANITIZE: + switch (media_op_subclass) { + case MEDIA_OP_SAN_SUBC_SANITIZE: + return media_operations_sanitize(ct3d, payload_in, len_in, + payload_out, len_out, 0xF, + cci); + case MEDIA_OP_SAN_SUBC_ZERO: + return media_operations_sanitize(ct3d, payload_in, len_in, + payload_out, len_out, 0, + cci); + default: + return CXL_MBOX_UNSUPPORTED; + } + default: + return CXL_MBOX_UNSUPPORTED; + } +} + static CXLRetCode cmd_get_security_state(const struct cxl_cmd *cmd, uint8_t *payload_in, size_t len_in, @@ -2715,6 +3228,8 @@ static CXLRetCode cmd_dcd_release_dyn_cap(const struct cxl_cmd *cmd, } static const struct cxl_cmd cxl_cmd_set[256][256] = { + [INFOSTAT][BACKGROUND_OPERATION_ABORT] = { "BACKGROUND_OPERATION_ABORT", + cmd_infostat_bg_op_abort, 0, 0 }, [EVENTS][GET_RECORDS] = { "EVENTS_GET_RECORDS", cmd_events_get_records, 1, 0 }, [EVENTS][CLEAR_RECORDS] = { "EVENTS_CLEAR_RECORDS", @@ -2727,9 +3242,11 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { [FIRMWARE_UPDATE][GET_INFO] = { "FIRMWARE_UPDATE_GET_INFO", cmd_firmware_update_get_info, 0, 0 }, [FIRMWARE_UPDATE][TRANSFER] = { "FIRMWARE_UPDATE_TRANSFER", - cmd_firmware_update_transfer, ~0, CXL_MBOX_BACKGROUND_OPERATION }, + cmd_firmware_update_transfer, ~0, + CXL_MBOX_BACKGROUND_OPERATION | CXL_MBOX_BACKGROUND_OPERATION_ABORT }, [FIRMWARE_UPDATE][ACTIVATE] = { "FIRMWARE_UPDATE_ACTIVATE", - cmd_firmware_update_activate, 2, CXL_MBOX_BACKGROUND_OPERATION }, + cmd_firmware_update_activate, 2, + CXL_MBOX_BACKGROUND_OPERATION | CXL_MBOX_BACKGROUND_OPERATION_ABORT }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, @@ -2755,9 +3272,20 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 8, 0 }, [CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa, ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE | CXL_MBOX_IMMEDIATE_DATA_CHANGE }, + [HEALTH_INFO_ALERTS][GET_ALERT_CONFIG] = { + "HEALTH_INFO_ALERTS_GET_ALERT_CONFIG", + cmd_get_alert_config, 0, 0 }, + [HEALTH_INFO_ALERTS][SET_ALERT_CONFIG] = { + "HEALTH_INFO_ALERTS_SET_ALERT_CONFIG", + cmd_set_alert_config, 12, CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, [SANITIZE][OVERWRITE] = { "SANITIZE_OVERWRITE", cmd_sanitize_overwrite, 0, (CXL_MBOX_IMMEDIATE_DATA_CHANGE | CXL_MBOX_SECURITY_STATE_CHANGE | + CXL_MBOX_BACKGROUND_OPERATION | + CXL_MBOX_BACKGROUND_OPERATION_ABORT)}, + [SANITIZE][MEDIA_OPERATIONS] = { "MEDIA_OPERATIONS", cmd_media_operations, + ~0, + (CXL_MBOX_IMMEDIATE_DATA_CHANGE | CXL_MBOX_BACKGROUND_OPERATION)}, [PERSISTENT_MEM][GET_SECURITY_STATE] = { "GET_SECURITY_STATE", cmd_get_security_state, 0, 0 }, @@ -2771,7 +3299,8 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES", cmd_media_get_scan_media_capabilities, 16, 0 }, [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA", - cmd_media_scan_media, 17, CXL_MBOX_BACKGROUND_OPERATION }, + cmd_media_scan_media, 17, + (CXL_MBOX_BACKGROUND_OPERATION | CXL_MBOX_BACKGROUND_OPERATION_ABORT)}, [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS", cmd_media_get_scan_media_results, 0, 0 }, @@ -2795,6 +3324,8 @@ static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { [INFOSTAT][IS_IDENTIFY] = { "IDENTIFY", cmd_infostat_identify, 0, 0 }, [INFOSTAT][BACKGROUND_OPERATION_STATUS] = { "BACKGROUND_OPERATION_STATUS", cmd_infostat_bg_op_sts, 0, 0 }, + [INFOSTAT][BACKGROUND_OPERATION_ABORT] = { "BACKGROUND_OPERATION_ABORT", + cmd_infostat_bg_op_abort, 0, 0 }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, @@ -2881,6 +3412,7 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, cci->bg.opcode = (set << 8) | cmd; cci->bg.complete_pct = 0; + cci->bg.aborted = false; cci->bg.ret_code = 0; now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); @@ -2894,10 +3426,12 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, static void bg_timercb(void *opaque) { CXLCCI *cci = opaque; - uint64_t now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); - uint64_t total_time = cci->bg.starttime + cci->bg.runtime; + uint64_t now, total_time; - assert(cci->bg.runtime > 0); + qemu_mutex_lock(&cci->bg.lock); + + now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); + total_time = cci->bg.starttime + cci->bg.runtime; if (now >= total_time) { /* we are done */ uint16_t ret = CXL_MBOX_SUCCESS; @@ -2916,6 +3450,12 @@ static void bg_timercb(void *opaque) cxl_dev_enable_media(&ct3d->cxl_dstate); } break; + case 0x4402: /* Media Operations sanitize */ + { + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + __do_sanitize(ct3d); + } + break; case 0x4304: /* scan media */ { CXLType3Dev *ct3d = CXL_TYPE3(cci->d); @@ -2950,6 +3490,8 @@ static void bg_timercb(void *opaque) msi_notify(pdev, cxl_dstate->mbox_msi_n); } } + + qemu_mutex_unlock(&cci->bg.lock); } static void cxl_rebuild_cel(CXLCCI *cci) @@ -2978,12 +3520,21 @@ void cxl_init_cci(CXLCCI *cci, size_t payload_max) cci->bg.complete_pct = 0; cci->bg.starttime = 0; cci->bg.runtime = 0; + cci->bg.aborted = false; cci->bg.timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, bg_timercb, cci); + qemu_mutex_init(&cci->bg.lock); memset(&cci->fw, 0, sizeof(cci->fw)); cci->fw.active_slot = 1; cci->fw.slot[cci->fw.active_slot - 1] = true; + cci->initialized = true; +} + +void cxl_destroy_cci(CXLCCI *cci) +{ + qemu_mutex_destroy(&cci->bg.lock); + cci->initialized = false; } static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmds)[256]) @@ -3047,6 +3598,10 @@ void cxl_initialize_t3_ld_cci(CXLCCI *cci, DeviceState *d, DeviceState *intf, static const struct cxl_cmd cxl_cmd_set_t3_fm_owned_ld_mctp[256][256] = { [INFOSTAT][IS_IDENTIFY] = { "IDENTIFY", cmd_infostat_identify, 0, 0}, + [INFOSTAT][GET_RESPONSE_MSG_LIMIT] = { "GET_RESPONSE_MSG_LIMIT", + cmd_get_response_msg_limit, 0, 0 }, + [INFOSTAT][SET_RESPONSE_MSG_LIMIT] = { "SET_RESPONSE_MSG_LIMIT", + cmd_set_response_msg_limit, 1, 0 }, [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c index 43d4c08..9fc6bbc 100644 --- a/hw/display/vhost-user-gpu.c +++ b/hw/display/vhost-user-gpu.c @@ -516,7 +516,7 @@ vhost_user_gpu_set_config(VirtIODevice *vdev, } } -static void +static int vhost_user_gpu_set_status(VirtIODevice *vdev, uint8_t val) { VhostUserGPU *g = VHOST_USER_GPU(vdev); @@ -525,18 +525,24 @@ vhost_user_gpu_set_status(VirtIODevice *vdev, uint8_t val) if (val & VIRTIO_CONFIG_S_DRIVER_OK && vdev->vm_running) { if (!vhost_user_gpu_do_set_socket(g, &err)) { error_report_err(err); - return; + return 0; } vhost_user_backend_start(g->vhost); } else { + int ret; + /* unblock any wait and stop processing */ if (g->vhost_gpu_fd != -1) { vhost_user_gpu_update_blocked(g, true); qemu_chr_fe_deinit(&g->vhost_chr, true); g->vhost_gpu_fd = -1; } - vhost_user_backend_stop(g->vhost); + ret = vhost_user_backend_stop(g->vhost); + if (ret < 0) { + return ret; + } } + return 0; } static bool diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index f40ad06..61851cc 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2333,10 +2333,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, build_append_int_noprefix(table_data, ivhd_blob->len + 24, 2); /* DeviceID */ build_append_int_noprefix(table_data, - object_property_get_int(OBJECT(&s->pci), "addr", + object_property_get_int(OBJECT(s->pci), "addr", &error_abort), 2); /* Capability offset */ - build_append_int_noprefix(table_data, s->pci.capab_offset, 2); + build_append_int_noprefix(table_data, s->pci->capab_offset, 2); /* IOMMU base address */ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); /* PCI Segment Group */ @@ -2368,10 +2368,10 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, build_append_int_noprefix(table_data, ivhd_blob->len + 40, 2); /* DeviceID */ build_append_int_noprefix(table_data, - object_property_get_int(OBJECT(&s->pci), "addr", + object_property_get_int(OBJECT(s->pci), "addr", &error_abort), 2); /* Capability offset */ - build_append_int_noprefix(table_data, s->pci.capab_offset, 2); + build_append_int_noprefix(table_data, s->pci->capab_offset, 2); /* IOMMU base address */ build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); /* PCI Segment Group */ diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 2cf7e24..0775c8f 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -167,11 +167,11 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s) { MSIMessage msg = {}; MemTxAttrs attrs = { - .requester_id = pci_requester_id(&s->pci.dev) + .requester_id = pci_requester_id(&s->pci->dev) }; - if (msi_enabled(&s->pci.dev)) { - msg = msi_get_message(&s->pci.dev, 0); + if (msi_enabled(&s->pci->dev)) { + msg = msi_get_message(&s->pci->dev, 0); address_space_stl_le(&address_space_memory, msg.address, msg.data, attrs, NULL); } @@ -239,7 +239,7 @@ static void amdvi_page_fault(AMDVIState *s, uint16_t devid, info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; amdvi_encode_event(evt, devid, addr, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } /* @@ -256,7 +256,7 @@ static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, amdvi_encode_event(evt, devid, devtab, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } /* log an event trying to access command buffer @@ -269,7 +269,7 @@ static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) amdvi_encode_event(evt, 0, addr, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } /* log an illegal command event @@ -310,7 +310,7 @@ static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; amdvi_encode_event(evt, devid, addr, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } @@ -1607,26 +1607,92 @@ static void amdvi_sysbus_reset(DeviceState *dev) { AMDVIState *s = AMD_IOMMU_DEVICE(dev); - msi_reset(&s->pci.dev); + msi_reset(&s->pci->dev); amdvi_init(s); } +static const VMStateDescription vmstate_amdvi_sysbus_migratable = { + .name = "amd-iommu", + .version_id = 1, + .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, + .fields = (VMStateField[]) { + /* Updated in amdvi_handle_control_write() */ + VMSTATE_BOOL(enabled, AMDVIState), + VMSTATE_BOOL(ga_enabled, AMDVIState), + VMSTATE_BOOL(ats_enabled, AMDVIState), + VMSTATE_BOOL(cmdbuf_enabled, AMDVIState), + VMSTATE_BOOL(completion_wait_intr, AMDVIState), + VMSTATE_BOOL(evtlog_enabled, AMDVIState), + VMSTATE_BOOL(evtlog_intr, AMDVIState), + /* Updated in amdvi_handle_devtab_write() */ + VMSTATE_UINT64(devtab, AMDVIState), + VMSTATE_UINT64(devtab_len, AMDVIState), + /* Updated in amdvi_handle_cmdbase_write() */ + VMSTATE_UINT64(cmdbuf, AMDVIState), + VMSTATE_UINT64(cmdbuf_len, AMDVIState), + /* Updated in amdvi_handle_cmdhead_write() */ + VMSTATE_UINT32(cmdbuf_head, AMDVIState), + /* Updated in amdvi_handle_cmdtail_write() */ + VMSTATE_UINT32(cmdbuf_tail, AMDVIState), + /* Updated in amdvi_handle_evtbase_write() */ + VMSTATE_UINT64(evtlog, AMDVIState), + VMSTATE_UINT32(evtlog_len, AMDVIState), + /* Updated in amdvi_handle_evthead_write() */ + VMSTATE_UINT32(evtlog_head, AMDVIState), + /* Updated in amdvi_handle_evttail_write() */ + VMSTATE_UINT32(evtlog_tail, AMDVIState), + /* Updated in amdvi_handle_pprbase_write() */ + VMSTATE_UINT64(ppr_log, AMDVIState), + VMSTATE_UINT32(pprlog_len, AMDVIState), + /* Updated in amdvi_handle_pprhead_write() */ + VMSTATE_UINT32(pprlog_head, AMDVIState), + /* Updated in amdvi_handle_tailhead_write() */ + VMSTATE_UINT32(pprlog_tail, AMDVIState), + /* MMIO registers */ + VMSTATE_UINT8_ARRAY(mmior, AMDVIState, AMDVI_MMIO_SIZE), + VMSTATE_UINT8_ARRAY(romask, AMDVIState, AMDVI_MMIO_SIZE), + VMSTATE_UINT8_ARRAY(w1cmask, AMDVIState, AMDVI_MMIO_SIZE), + VMSTATE_END_OF_LIST() + } +}; + static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) { + DeviceClass *dc = (DeviceClass *) object_get_class(OBJECT(dev)); AMDVIState *s = AMD_IOMMU_DEVICE(dev); MachineState *ms = MACHINE(qdev_get_machine()); PCMachineState *pcms = PC_MACHINE(ms); X86MachineState *x86ms = X86_MACHINE(ms); PCIBus *bus = pcms->pcibus; - s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, - amdvi_uint64_equal, g_free, g_free); + if (s->pci_id) { + PCIDevice *pdev = NULL; + int ret = pci_qdev_find_device(s->pci_id, &pdev); - /* This device should take care of IOMMU PCI properties */ - if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) { - return; + if (ret) { + error_report("Cannot find PCI device '%s'", s->pci_id); + return; + } + + if (!object_dynamic_cast(OBJECT(pdev), TYPE_AMD_IOMMU_PCI)) { + error_report("Device '%s' must be an AMDVI-PCI device type", s->pci_id); + return; + } + + s->pci = AMD_IOMMU_PCI(pdev); + dc->vmsd = &vmstate_amdvi_sysbus_migratable; + } else { + s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI)); + /* This device should take care of IOMMU PCI properties */ + if (!qdev_realize(DEVICE(s->pci), &bus->qbus, errp)) { + return; + } } + s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, + amdvi_uint64_equal, g_free, g_free); + /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); @@ -1663,6 +1729,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) static const Property amdvi_properties[] = { DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), + DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), }; static const VMStateDescription vmstate_amdvi_sysbus = { @@ -1670,13 +1737,6 @@ static const VMStateDescription vmstate_amdvi_sysbus = { .unmigratable = 1 }; -static void amdvi_sysbus_instance_init(Object *klass) -{ - AMDVIState *s = AMD_IOMMU_DEVICE(klass); - - object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI); -} - static void amdvi_sysbus_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -1696,7 +1756,6 @@ static const TypeInfo amdvi_sysbus = { .name = TYPE_AMD_IOMMU_DEVICE, .parent = TYPE_X86_IOMMU_DEVICE, .instance_size = sizeof(AMDVIState), - .instance_init = amdvi_sysbus_instance_init, .class_init = amdvi_sysbus_class_init }; diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 2812513..5672bde 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -315,7 +315,8 @@ struct AMDVIPCIState { struct AMDVIState { X86IOMMUState iommu; /* IOMMU bus device */ - AMDVIPCIState pci; /* IOMMU PCI device */ + AMDVIPCIState *pci; /* IOMMU PCI device */ + char *pci_id; /* ID of AMDVI-PCI device, if user created */ uint32_t version; @@ -328,7 +329,7 @@ struct AMDVIState { bool excl_enabled; hwaddr devtab; /* base address device table */ - size_t devtab_len; /* device table length */ + uint64_t devtab_len; /* device table length */ hwaddr cmdbuf; /* command buffer base address */ uint64_t cmdbuf_len; /* command buffer length */ diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 5f8ed12..69d72ad 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1728,8 +1728,6 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as) static bool vtd_switch_address_space(VTDAddressSpace *as) { bool use_iommu, pt; - /* Whether we need to take the BQL on our own */ - bool take_bql = !bql_locked(); assert(as); @@ -1746,9 +1744,7 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) * from vtd_pt_enable_fast_path(). However the memory APIs need * it. We'd better make sure we have had it already, or, take it. */ - if (take_bql) { - bql_lock(); - } + BQL_LOCK_GUARD(); /* Turn off first then on the other */ if (use_iommu) { @@ -1801,10 +1797,6 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) memory_region_set_enabled(&as->iommu_ir_fault, false); } - if (take_bql) { - bql_unlock(); - } - return use_iommu; } @@ -4213,9 +4205,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, VTDAddressSpace *vtd_dev_as; char name[128]; + vtd_iommu_lock(s); vtd_dev_as = g_hash_table_lookup(s->vtd_address_spaces, &key); + vtd_iommu_unlock(s); + if (!vtd_dev_as) { - struct vtd_as_key *new_key = g_malloc(sizeof(*new_key)); + struct vtd_as_key *new_key; + /* Slow path */ + + /* + * memory_region_add_subregion_overlap requires the bql, + * make sure we own it. + */ + BQL_LOCK_GUARD(); + vtd_iommu_lock(s); + + /* Check again as we released the lock for a moment */ + vtd_dev_as = g_hash_table_lookup(s->vtd_address_spaces, &key); + if (vtd_dev_as) { + vtd_iommu_unlock(s); + return vtd_dev_as; + } + + /* Still nothing, allocate a new address space */ + new_key = g_malloc(sizeof(*new_key)); new_key->bus = bus; new_key->devfn = devfn; @@ -4306,6 +4319,8 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, vtd_switch_address_space(vtd_dev_as); g_hash_table_insert(s->vtd_address_spaces, new_key, vtd_dev_as); + + vtd_iommu_unlock(s); } return vtd_dev_as; } diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c index 1818cbd..a3f554f 100644 --- a/hw/input/virtio-input.c +++ b/hw/input/virtio-input.c @@ -189,7 +189,7 @@ static uint64_t virtio_input_get_features(VirtIODevice *vdev, uint64_t f, return f; } -static void virtio_input_set_status(VirtIODevice *vdev, uint8_t val) +static int virtio_input_set_status(VirtIODevice *vdev, uint8_t val) { VirtIOInputClass *vic = VIRTIO_INPUT_GET_CLASS(vdev); VirtIOInput *vinput = VIRTIO_INPUT(vdev); @@ -202,6 +202,7 @@ static void virtio_input_set_status(VirtIODevice *vdev, uint8_t val) } } } + return 0; } static void virtio_input_reset(VirtIODevice *vdev) diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index bba923f..94e7274 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -843,6 +843,19 @@ static DOEProtocol doe_cdat_prot[] = { { } }; +/* Initialize CXL device alerts with default threshold values. */ +static void init_alert_config(CXLType3Dev *ct3d) +{ + ct3d->alert_config = (CXLAlertConfig) { + .life_used_crit_alert_thresh = 75, + .life_used_warn_thresh = 40, + .over_temp_crit_alert_thresh = 35, + .under_temp_crit_alert_thresh = 10, + .over_temp_warn_thresh = 25, + .under_temp_warn_thresh = 20 + }; +} + static void ct3_realize(PCIDevice *pci_dev, Error **errp) { ERRP_GUARD(); @@ -910,6 +923,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) goto err_msix_uninit; } + init_alert_config(ct3d); pcie_cap_deverr_init(pci_dev); /* Leave a bit of room for expansion */ rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, errp); @@ -969,6 +983,7 @@ static void ct3_exit(PCIDevice *pci_dev) cxl_doe_cdat_release(cxl_cstate); msix_uninit_exclusive_bar(pci_dev); g_free(regs->special_ops); + cxl_destroy_cci(&ct3d->cci); if (ct3d->dc.host_dc) { cxl_destroy_dc_regions(ct3d); address_space_destroy(&ct3d->dc.host_dc_as); @@ -1224,12 +1239,17 @@ static void ct3d_reset(DeviceState *dev) * Bring up an endpoint to target with MCTP over VDM. * This device is emulating an MLD with single LD for now. */ + if (ct3d->vdm_fm_owned_ld_mctp_cci.initialized) { + cxl_destroy_cci(&ct3d->vdm_fm_owned_ld_mctp_cci); + } cxl_initialize_t3_fm_owned_ld_mctpcci(&ct3d->vdm_fm_owned_ld_mctp_cci, DEVICE(ct3d), DEVICE(ct3d), 512); /* Max payload made up */ + if (ct3d->ld0_cci.initialized) { + cxl_destroy_cci(&ct3d->ld0_cci); + } cxl_initialize_t3_ld_cci(&ct3d->ld0_cci, DEVICE(ct3d), DEVICE(ct3d), 512); /* Max payload made up */ - } static const Property ct3_props[] = { diff --git a/hw/misc/pci-testdev.c b/hw/misc/pci-testdev.c index 3f6a8bb..ba71c50 100644 --- a/hw/misc/pci-testdev.c +++ b/hw/misc/pci-testdev.c @@ -90,6 +90,7 @@ struct PCITestDevState { int current; uint64_t membar_size; + bool membar_backed; MemoryRegion membar; }; @@ -258,8 +259,14 @@ static void pci_testdev_realize(PCIDevice *pci_dev, Error **errp) pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->portio); if (d->membar_size) { - memory_region_init(&d->membar, OBJECT(d), "pci-testdev-membar", - d->membar_size); + if (d->membar_backed) + memory_region_init_ram(&d->membar, OBJECT(d), + "pci-testdev-membar-backed", + d->membar_size, NULL); + else + memory_region_init(&d->membar, OBJECT(d), + "pci-testdev-membar", + d->membar_size); pci_register_bar(pci_dev, 2, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_PREFETCH | @@ -321,6 +328,7 @@ static void qdev_pci_testdev_reset(DeviceState *dev) static const Property pci_testdev_properties[] = { DEFINE_PROP_SIZE("membar", PCITestDevState, membar_size, 0), + DEFINE_PROP_BOOL("membar-backed", PCITestDevState, membar_backed, false), }; static void pci_testdev_class_init(ObjectClass *klass, const void *data) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 2de037c..221252e 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -382,7 +382,7 @@ static void virtio_net_drop_tx_queue_data(VirtIODevice *vdev, VirtQueue *vq) } } -static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) +static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) { VirtIONet *n = VIRTIO_NET(vdev); VirtIONetQueue *q; @@ -437,6 +437,7 @@ static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) } } } + return 0; } static void virtio_net_set_link_status(NetClientState *nc) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 352b3d1..f5ab510 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -94,6 +94,7 @@ static const Property pci_props[] = { QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), + DEFINE_PROP_STRING("sriov-pf", PCIDevice, sriov_pf), DEFINE_PROP_BIT("x-pcie-ext-tag", PCIDevice, cap_present, QEMU_PCIE_EXT_TAG_BITNR, true), { .name = "busnr", .info = &prop_pci_busnr }, @@ -1105,13 +1106,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; } - /* - * With SR/IOV and ARI, a device at function 0 need not be a multifunction - * device, as it may just be a VF that ended up with function 0 in - * the legacy PCI interpretation. Avoid failing in such cases: - */ - if (pci_is_vf(dev) && - dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + /* SR/IOV is not handled here. */ + if (pci_is_vf(dev)) { return; } @@ -1144,7 +1140,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) } /* function 0 indicates single function, so function > 0 must be NULL */ for (func = 1; func < PCI_FUNC_MAX; ++func) { - if (bus->devices[PCI_DEVFN(slot, func)]) { + PCIDevice *device = bus->devices[PCI_DEVFN(slot, func)]; + if (device && !pci_is_vf(device)) { error_setg(errp, "PCI: %x.0 indicates single function, " "but %x.%x is already populated.", slot, slot, func); @@ -1432,6 +1429,7 @@ static void pci_qdev_unrealize(DeviceState *dev) pci_unregister_io_regions(pci_dev); pci_del_option_rom(pci_dev); + pcie_sriov_unregister_device(pci_dev); if (pc->exit) { pc->exit(pci_dev); @@ -1463,7 +1461,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, pcibus_t size = memory_region_size(memory); uint8_t hdr_type; - assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */ assert(region_num >= 0); assert(region_num < PCI_NUM_REGIONS); assert(is_power_of_2(size)); @@ -1475,7 +1472,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, r = &pci_dev->io_regions[region_num]; assert(!r->size); - r->addr = PCI_BAR_UNMAPPED; r->size = size; r->type = type; r->memory = memory; @@ -1483,22 +1479,35 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, ? pci_get_bus(pci_dev)->address_space_io : pci_get_bus(pci_dev)->address_space_mem; - wmask = ~(size - 1); - if (region_num == PCI_ROM_SLOT) { - /* ROM enable bit is writable */ - wmask |= PCI_ROM_ADDRESS_ENABLE; - } + if (pci_is_vf(pci_dev)) { + PCIDevice *pf = pci_dev->exp.sriov_vf.pf; + assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]); - addr = pci_bar(pci_dev, region_num); - pci_set_long(pci_dev->config + addr, type); - - if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) && - r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { - pci_set_quad(pci_dev->wmask + addr, wmask); - pci_set_quad(pci_dev->cmask + addr, ~0ULL); + r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size); + if (r->addr != PCI_BAR_UNMAPPED) { + memory_region_add_subregion_overlap(r->address_space, + r->addr, r->memory, 1); + } } else { - pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff); - pci_set_long(pci_dev->cmask + addr, 0xffffffff); + r->addr = PCI_BAR_UNMAPPED; + + wmask = ~(size - 1); + if (region_num == PCI_ROM_SLOT) { + /* ROM enable bit is writable */ + wmask |= PCI_ROM_ADDRESS_ENABLE; + } + + addr = pci_bar(pci_dev, region_num); + pci_set_long(pci_dev->config + addr, type); + + if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) && + r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + pci_set_quad(pci_dev->wmask + addr, wmask); + pci_set_quad(pci_dev->cmask + addr, ~0ULL); + } else { + pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff); + pci_set_long(pci_dev->cmask + addr, 0xffffffff); + } } } @@ -1587,7 +1596,11 @@ static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg, pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_OFFSET); uint16_t vf_stride = pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_STRIDE); - uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride; + uint32_t vf_num = d->devfn - (pf->devfn + vf_offset); + + if (vf_num) { + vf_num /= vf_stride; + } if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { new_addr = pci_get_quad(pf->config + bar); @@ -2261,6 +2274,11 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) } } + if (!pcie_sriov_register_device(pci_dev, errp)) { + pci_qdev_unrealize(DEVICE(pci_dev)); + return; + } + /* * A PCIe Downstream Port that do not have ARI Forwarding enabled must * associate only Device 0 with the device attached to the bus @@ -2515,6 +2533,14 @@ static void pci_add_option_rom(PCIDevice *pdev, bool is_default_rom, return; } + if (pci_is_vf(pdev)) { + if (pdev->rom_bar > 0) { + error_setg(errp, "ROM BAR cannot be enabled for SR-IOV VF"); + } + + return; + } + if (load_file || pdev->romsize == UINT32_MAX) { path = qemu_find_file(QEMU_FILE_TYPE_BIOS, pdev->romfile); if (path == NULL) { diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 1eb4358..3ad1874 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -15,11 +15,12 @@ #include "hw/pci/pcie.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" -#include "qemu/error-report.h" #include "qemu/range.h" #include "qapi/error.h" #include "trace.h" +static GHashTable *pfs; + static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) { for (uint16_t i = 0; i < total_vfs; i++) { @@ -31,17 +32,57 @@ static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) dev->exp.sriov_pf.vf = NULL; } -bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, - const char *vfname, uint16_t vf_dev_id, - uint16_t init_vfs, uint16_t total_vfs, - uint16_t vf_offset, uint16_t vf_stride, - Error **errp) +static void register_vfs(PCIDevice *dev) +{ + uint16_t num_vfs; + uint16_t i; + uint16_t sriov_cap = dev->exp.sriov_cap; + + assert(sriov_cap > 0); + num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + + trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), num_vfs); + for (i = 0; i < num_vfs; i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], true); + } + + pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_NUM_VF, 0); +} + +static void unregister_vfs(PCIDevice *dev) +{ + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + uint16_t i; + + trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], false); + } + + pci_set_word(dev->wmask + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0xffff); +} + +static bool pcie_sriov_pf_init_common(PCIDevice *dev, uint16_t offset, + uint16_t vf_dev_id, uint16_t init_vfs, + uint16_t total_vfs, uint16_t vf_offset, + uint16_t vf_stride, Error **errp) { - BusState *bus = qdev_get_parent_bus(&dev->qdev); int32_t devfn = dev->devfn + vf_offset; uint8_t *cfg = dev->config + offset; uint8_t *wmask; + if (!pci_is_express(dev)) { + error_setg(errp, "PCI Express is required for SR-IOV PF"); + return false; + } + + if (pci_is_vf(dev)) { + error_setg(errp, "a device cannot be a SR-IOV PF and a VF at the same time"); + return false; + } + if (total_vfs && (uint32_t)devfn + (uint32_t)(total_vfs - 1) * vf_stride >= PCI_DEVFN_MAX) { error_setg(errp, "VF addr overflows"); @@ -84,6 +125,28 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, qdev_prop_set_bit(&dev->qdev, "multifunction", true); + return true; +} + +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, + const char *vfname, uint16_t vf_dev_id, + uint16_t init_vfs, uint16_t total_vfs, + uint16_t vf_offset, uint16_t vf_stride, + Error **errp) +{ + BusState *bus = qdev_get_parent_bus(&dev->qdev); + int32_t devfn = dev->devfn + vf_offset; + + if (pfs && g_hash_table_contains(pfs, dev->qdev.id)) { + error_setg(errp, "attaching user-created SR-IOV VF unsupported"); + return false; + } + + if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, init_vfs, + total_vfs, vf_offset, vf_stride, errp)) { + return false; + } + dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs); for (uint16_t i = 0; i < total_vfs; i++) { @@ -113,7 +176,22 @@ void pcie_sriov_pf_exit(PCIDevice *dev) { uint8_t *cfg = dev->config + dev->exp.sriov_cap; - unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); + if (dev->exp.sriov_pf.vf_user_created) { + uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); + uint16_t total_vfs = pci_get_word(dev->config + PCI_SRIOV_TOTAL_VF); + uint16_t vf_dev_id = pci_get_word(dev->config + PCI_SRIOV_VF_DID); + + unregister_vfs(dev); + + for (uint16_t i = 0; i < total_vfs; i++) { + dev->exp.sriov_pf.vf[i]->exp.sriov_vf.pf = NULL; + + pci_config_set_vendor_id(dev->exp.sriov_pf.vf[i]->config, ven_id); + pci_config_set_device_id(dev->exp.sriov_pf.vf[i]->config, vf_dev_id); + } + } else { + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); + } } void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, @@ -146,69 +224,179 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, MemoryRegion *memory) { - PCIIORegion *r; - PCIBus *bus = pci_get_bus(dev); uint8_t type; - pcibus_t size = memory_region_size(memory); - assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */ - assert(region_num >= 0); - assert(region_num < PCI_NUM_REGIONS); + assert(dev->exp.sriov_vf.pf); type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; - if (!is_power_of_2(size)) { - error_report("%s: PCI region size must be a power" - " of two - type=0x%x, size=0x%"FMT_PCIBUS, - __func__, type, size); - exit(1); - } + return pci_register_bar(dev, region_num, type, memory); +} - r = &dev->io_regions[region_num]; - r->memory = memory; - r->address_space = - type & PCI_BASE_ADDRESS_SPACE_IO - ? bus->address_space_io - : bus->address_space_mem; - r->size = size; - r->type = type; - - r->addr = pci_bar_address(dev, region_num, r->type, r->size); - if (r->addr != PCI_BAR_UNMAPPED) { - memory_region_add_subregion_overlap(r->address_space, - r->addr, r->memory, 1); - } +static gint compare_vf_devfns(gconstpointer a, gconstpointer b) +{ + return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn; } -static void register_vfs(PCIDevice *dev) +int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev, + uint16_t offset, + Error **errp) { - uint16_t num_vfs; + GPtrArray *pf; + PCIDevice **vfs; + BusState *bus = qdev_get_parent_bus(DEVICE(dev)); + uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); + uint16_t size = PCI_EXT_CAP_SRIOV_SIZEOF; + uint16_t vf_dev_id; + uint16_t vf_offset; + uint16_t vf_stride; uint16_t i; - uint16_t sriov_cap = dev->exp.sriov_cap; - assert(sriov_cap > 0); - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (!pfs || !dev->qdev.id) { + return 0; + } - trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), num_vfs); - for (i = 0; i < num_vfs; i++) { - pci_set_enabled(dev->exp.sriov_pf.vf[i], true); + pf = g_hash_table_lookup(pfs, dev->qdev.id); + if (!pf) { + return 0; } - pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_NUM_VF, 0); + if (pf->len > UINT16_MAX) { + error_setg(errp, "too many VFs"); + return -1; + } + + g_ptr_array_sort(pf, compare_vf_devfns); + vfs = (void *)pf->pdata; + + if (vfs[0]->devfn <= dev->devfn) { + error_setg(errp, "a VF function number is less than the PF function number"); + return -1; + } + + vf_dev_id = pci_get_word(vfs[0]->config + PCI_DEVICE_ID); + vf_offset = vfs[0]->devfn - dev->devfn; + vf_stride = pf->len < 2 ? 0 : vfs[1]->devfn - vfs[0]->devfn; + + for (i = 0; i < pf->len; i++) { + if (bus != qdev_get_parent_bus(&vfs[i]->qdev)) { + error_setg(errp, "SR-IOV VF parent bus mismatches with PF"); + return -1; + } + + if (ven_id != pci_get_word(vfs[i]->config + PCI_VENDOR_ID)) { + error_setg(errp, "SR-IOV VF vendor ID mismatches with PF"); + return -1; + } + + if (vf_dev_id != pci_get_word(vfs[i]->config + PCI_DEVICE_ID)) { + error_setg(errp, "inconsistent SR-IOV VF device IDs"); + return -1; + } + + for (size_t j = 0; j < PCI_NUM_REGIONS; j++) { + if (vfs[i]->io_regions[j].size != vfs[0]->io_regions[j].size || + vfs[i]->io_regions[j].type != vfs[0]->io_regions[j].type) { + error_setg(errp, "inconsistent SR-IOV BARs"); + return -1; + } + } + + if (vfs[i]->devfn - vfs[0]->devfn != vf_stride * i) { + error_setg(errp, "inconsistent SR-IOV stride"); + return -1; + } + } + + if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, pf->len, + pf->len, vf_offset, vf_stride, errp)) { + return -1; + } + + if (!pcie_find_capability(dev, PCI_EXT_CAP_ID_ARI)) { + pcie_ari_init(dev, offset + size); + size += PCI_ARI_SIZEOF; + } + + for (i = 0; i < pf->len; i++) { + vfs[i]->exp.sriov_vf.pf = dev; + vfs[i]->exp.sriov_vf.vf_number = i; + + /* set vid/did according to sr/iov spec - they are not used */ + pci_config_set_vendor_id(vfs[i]->config, 0xffff); + pci_config_set_device_id(vfs[i]->config, 0xffff); + } + + dev->exp.sriov_pf.vf = vfs; + dev->exp.sriov_pf.vf_user_created = true; + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + PCIIORegion *region = &vfs[0]->io_regions[i]; + + if (region->size) { + pcie_sriov_pf_init_vf_bar(dev, i, region->type, region->size); + } + } + + return size; } -static void unregister_vfs(PCIDevice *dev) +bool pcie_sriov_register_device(PCIDevice *dev, Error **errp) { - uint8_t *cfg = dev->config + dev->exp.sriov_cap; - uint16_t i; + if (!dev->exp.sriov_pf.vf && dev->qdev.id && + pfs && g_hash_table_contains(pfs, dev->qdev.id)) { + error_setg(errp, "attaching user-created SR-IOV VF unsupported"); + return false; + } - trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { - pci_set_enabled(dev->exp.sriov_pf.vf[i], false); + if (dev->sriov_pf) { + PCIDevice *pci_pf; + GPtrArray *pf; + + if (!PCI_DEVICE_GET_CLASS(dev)->sriov_vf_user_creatable) { + error_setg(errp, "user cannot create SR-IOV VF with this device type"); + return false; + } + + if (!pci_is_express(dev)) { + error_setg(errp, "PCI Express is required for SR-IOV VF"); + return false; + } + + if (!pci_qdev_find_device(dev->sriov_pf, &pci_pf)) { + error_setg(errp, "PCI device specified as SR-IOV PF already exists"); + return false; + } + + if (!pfs) { + pfs = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); + } + + pf = g_hash_table_lookup(pfs, dev->sriov_pf); + if (!pf) { + pf = g_ptr_array_new(); + g_hash_table_insert(pfs, g_strdup(dev->sriov_pf), pf); + } + + g_ptr_array_add(pf, dev); } - pci_set_word(dev->wmask + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0xffff); + return true; +} + +void pcie_sriov_unregister_device(PCIDevice *dev) +{ + if (dev->sriov_pf && pfs) { + GPtrArray *pf = g_hash_table_lookup(pfs, dev->sriov_pf); + + if (pf) { + g_ptr_array_remove_fast(pf, dev); + + if (!pf->len) { + g_hash_table_remove(pfs, dev->sriov_pf); + g_ptr_array_free(pf, FALSE); + } + } + } } void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, @@ -304,7 +492,7 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize) uint16_t pcie_sriov_vf_number(PCIDevice *dev) { - assert(pci_is_vf(dev)); + assert(dev->exp.sriov_vf.pf); return dev->exp.sriov_vf.vf_number; } diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index 70be4a7..9b12ee7 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -400,7 +400,7 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp) return; } dev->vmsentry = qdev_add_vm_change_state_handler(DEVICE(dev), - scsi_dma_restart_cb, dev); + scsi_dma_restart_cb, NULL, dev); } static void scsi_qdev_unrealize(DeviceState *qdev) diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c index 4c86370..43525ba 100644 --- a/hw/scsi/vhost-scsi-common.c +++ b/hw/scsi/vhost-scsi-common.c @@ -101,24 +101,25 @@ err_host_notifiers: return ret; } -void vhost_scsi_common_stop(VHostSCSICommon *vsc) +int vhost_scsi_common_stop(VHostSCSICommon *vsc) { VirtIODevice *vdev = VIRTIO_DEVICE(vsc); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); int ret = 0; - vhost_dev_stop(&vsc->dev, vdev, true); + ret = vhost_dev_stop(&vsc->dev, vdev, true); if (k->set_guest_notifiers) { - ret = k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false); - if (ret < 0) { - error_report("vhost guest notifier cleanup failed: %d", ret); + int r = k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false); + if (r < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return r; } } - assert(ret >= 0); vhost_dev_disable_notifiers(&vsc->dev, vdev); + return ret; } uint64_t vhost_scsi_common_get_features(VirtIODevice *vdev, uint64_t features, diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 10fde8e..cdf405b 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -114,7 +114,7 @@ static void vhost_scsi_stop(VHostSCSI *s) vhost_scsi_common_stop(vsc); } -static void vhost_scsi_set_status(VirtIODevice *vdev, uint8_t val) +static int vhost_scsi_set_status(VirtIODevice *vdev, uint8_t val) { VHostSCSI *s = VHOST_SCSI(vdev); VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); @@ -125,7 +125,7 @@ static void vhost_scsi_set_status(VirtIODevice *vdev, uint8_t val) } if (vhost_dev_is_started(&vsc->dev) == start) { - return; + return 0; } if (start) { @@ -139,6 +139,7 @@ static void vhost_scsi_set_status(VirtIODevice *vdev, uint8_t val) } else { vhost_scsi_stop(s); } + return 0; } static void vhost_dummy_handle_output(VirtIODevice *vdev, VirtQueue *vq) @@ -358,6 +359,9 @@ static const Property vhost_scsi_properties[] = { DEFINE_PROP_BIT64("t10_pi", VHostSCSICommon, host_features, VIRTIO_SCSI_F_T10_PI, false), + DEFINE_PROP_BIT64("hotplug", VHostSCSICommon, host_features, + VIRTIO_SCSI_F_HOTPLUG, + false), DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false), DEFINE_PROP_BOOL("worker_per_virtqueue", VirtIOSCSICommon, conf.worker_per_virtqueue, false), diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index 8298e8c..25f2d89 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -52,19 +52,19 @@ static int vhost_user_scsi_start(VHostUserSCSI *s, Error **errp) return ret; } -static void vhost_user_scsi_stop(VHostUserSCSI *s) +static int vhost_user_scsi_stop(VHostUserSCSI *s) { VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); if (!s->started_vu) { - return; + return 0; } s->started_vu = false; - vhost_scsi_common_stop(vsc); + return vhost_scsi_common_stop(vsc); } -static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) +static int vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserSCSI *s = (VHostUserSCSI *)vdev; DeviceState *dev = DEVICE(vdev); @@ -75,11 +75,11 @@ static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) int ret; if (!s->connected) { - return; + return -1; } if (vhost_dev_is_started(&vsc->dev) == should_start) { - return; + return 0; } if (should_start) { @@ -91,8 +91,12 @@ static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) qemu_chr_fe_disconnect(&vs->conf.chardev); } } else { - vhost_user_scsi_stop(s); + ret = vhost_user_scsi_stop(s); + if (ret) { + return ret; + } } + return 0; } static void vhost_user_scsi_handle_output(VirtIODevice *vdev, VirtQueue *vq) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 1dceab1..b76697bd 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -1016,7 +1016,7 @@ static int vfio_migration_init(VFIODevice *vbasedev) vfio_vmstate_change_prepare : NULL; migration->vm_state = qdev_add_vm_change_state_handler_full( - vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev); + vbasedev->dev, vfio_vmstate_change, prepare_cb, NULL, vbasedev); migration_add_notifier(&migration->migration_state, vfio_migration_state_notifier); diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index dd8837c..d1da40a 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -312,7 +312,7 @@ static void vhost_vdpa_device_stop(VirtIODevice *vdev) vhost_dev_disable_notifiers(&s->dev, vdev); } -static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) +static int vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) { VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); bool should_start = virtio_device_started(vdev, status); @@ -324,7 +324,7 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) } if (s->started == should_start) { - return; + return 0; } if (should_start) { @@ -335,6 +335,7 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) } else { vhost_vdpa_device_stop(vdev); } + return 0; } static const Property vhost_vdpa_device_properties[] = { diff --git a/hw/virtio/vhost-user-base.c b/hw/virtio/vhost-user-base.c index 7714332..ff67a02 100644 --- a/hw/virtio/vhost-user-base.c +++ b/hw/virtio/vhost-user-base.c @@ -66,7 +66,7 @@ err_host_notifiers: vhost_dev_disable_notifiers(&vub->vhost_dev, vdev); } -static void vub_stop(VirtIODevice *vdev) +static int vub_stop(VirtIODevice *vdev) { VHostUserBase *vub = VHOST_USER_BASE(vdev); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -74,34 +74,39 @@ static void vub_stop(VirtIODevice *vdev) int ret; if (!k->set_guest_notifiers) { - return; + return 0; } - vhost_dev_stop(&vub->vhost_dev, vdev, true); + ret = vhost_dev_stop(&vub->vhost_dev, vdev, true); - ret = k->set_guest_notifiers(qbus->parent, vub->vhost_dev.nvqs, false); - if (ret < 0) { + if (k->set_guest_notifiers(qbus->parent, vub->vhost_dev.nvqs, false) < 0) { error_report("vhost guest notifier cleanup failed: %d", ret); - return; + return -1; } vhost_dev_disable_notifiers(&vub->vhost_dev, vdev); + return ret; } -static void vub_set_status(VirtIODevice *vdev, uint8_t status) +static int vub_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserBase *vub = VHOST_USER_BASE(vdev); bool should_start = virtio_device_should_start(vdev, status); if (vhost_dev_is_started(&vub->vhost_dev) == should_start) { - return; + return 0; } if (should_start) { vub_start(vdev); } else { - vub_stop(vdev); + int ret; + ret = vub_stop(vdev); + if (ret < 0) { + return ret; + } } + return 0; } /* diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index f6d1fc8..e77c69e 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -100,7 +100,7 @@ err_host_notifiers: vhost_dev_disable_notifiers(&fs->vhost_dev, vdev); } -static void vuf_stop(VirtIODevice *vdev) +static int vuf_stop(VirtIODevice *vdev) { VHostUserFS *fs = VHOST_USER_FS(vdev); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -108,34 +108,39 @@ static void vuf_stop(VirtIODevice *vdev) int ret; if (!k->set_guest_notifiers) { - return; + return 0; } - vhost_dev_stop(&fs->vhost_dev, vdev, true); + ret = vhost_dev_stop(&fs->vhost_dev, vdev, true); - ret = k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, false); - if (ret < 0) { + if (k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, false) < 0) { error_report("vhost guest notifier cleanup failed: %d", ret); - return; + return -1; } vhost_dev_disable_notifiers(&fs->vhost_dev, vdev); + return ret; } -static void vuf_set_status(VirtIODevice *vdev, uint8_t status) +static int vuf_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserFS *fs = VHOST_USER_FS(vdev); bool should_start = virtio_device_should_start(vdev, status); if (vhost_dev_is_started(&fs->vhost_dev) == should_start) { - return; + return 0; } if (should_start) { vuf_start(vdev); } else { - vuf_stop(vdev); + int ret; + ret = vuf_stop(vdev); + if (ret < 0) { + return ret; + } } + return 0; } static uint64_t vuf_get_features(VirtIODevice *vdev, diff --git a/hw/virtio/vhost-user-scmi.c b/hw/virtio/vhost-user-scmi.c index 7a0f622..f9264c4 100644 --- a/hw/virtio/vhost-user-scmi.c +++ b/hw/virtio/vhost-user-scmi.c @@ -83,7 +83,7 @@ err_host_notifiers: return ret; } -static void vu_scmi_stop(VirtIODevice *vdev) +static int vu_scmi_stop(VirtIODevice *vdev) { VHostUserSCMI *scmi = VHOST_USER_SCMI(vdev); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -93,41 +93,46 @@ static void vu_scmi_stop(VirtIODevice *vdev) /* vhost_dev_is_started() check in the callers is not fully reliable. */ if (!scmi->started_vu) { - return; + return 0; } scmi->started_vu = false; if (!k->set_guest_notifiers) { - return; + return 0; } - vhost_dev_stop(vhost_dev, vdev, true); + ret = vhost_dev_stop(vhost_dev, vdev, true); - ret = k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, false); - if (ret < 0) { + if (k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, false) < 0) { error_report("vhost guest notifier cleanup failed: %d", ret); - return; + return -1; } vhost_dev_disable_notifiers(vhost_dev, vdev); + return ret; } -static void vu_scmi_set_status(VirtIODevice *vdev, uint8_t status) +static int vu_scmi_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserSCMI *scmi = VHOST_USER_SCMI(vdev); bool should_start = virtio_device_should_start(vdev, status); if (!scmi->connected) { - return; + return -1; } if (vhost_dev_is_started(&scmi->vhost_dev) == should_start) { - return; + return 0; } if (should_start) { vu_scmi_start(vdev); } else { - vu_scmi_stop(vdev); + int ret; + ret = vu_scmi_stop(vdev); + if (ret < 0) { + return ret; + } } + return 0; } static uint64_t vu_scmi_get_features(VirtIODevice *vdev, uint64_t features, diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c index 2776792..993c287 100644 --- a/hw/virtio/vhost-user-vsock.c +++ b/hw/virtio/vhost-user-vsock.c @@ -54,23 +54,28 @@ const VhostDevConfigOps vsock_ops = { .vhost_dev_config_notifier = vuv_handle_config_change, }; -static void vuv_set_status(VirtIODevice *vdev, uint8_t status) +static int vuv_set_status(VirtIODevice *vdev, uint8_t status) { VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); bool should_start = virtio_device_should_start(vdev, status); + int ret; if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) { - return; + return 0; } if (should_start) { - int ret = vhost_vsock_common_start(vdev); + ret = vhost_vsock_common_start(vdev); if (ret < 0) { - return; + return ret; } } else { - vhost_vsock_common_stop(vdev); + ret = vhost_vsock_common_stop(vdev); + if (ret < 0) { + return ret; + } } + return 0; } static uint64_t vuv_get_features(VirtIODevice *vdev, diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c index 4b4fbb4..c6c44d8 100644 --- a/hw/virtio/vhost-vsock-common.c +++ b/hw/virtio/vhost-vsock-common.c @@ -95,7 +95,7 @@ err_host_notifiers: return ret; } -void vhost_vsock_common_stop(VirtIODevice *vdev) +int vhost_vsock_common_stop(VirtIODevice *vdev) { VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -103,18 +103,18 @@ void vhost_vsock_common_stop(VirtIODevice *vdev) int ret; if (!k->set_guest_notifiers) { - return; + return 0; } - vhost_dev_stop(&vvc->vhost_dev, vdev, true); + ret = vhost_dev_stop(&vvc->vhost_dev, vdev, true); - ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false); - if (ret < 0) { + if (k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false) < 0) { error_report("vhost guest notifier cleanup failed: %d", ret); - return; + return -1; } vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev); + return ret; } diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c index b73dc72..6e40888 100644 --- a/hw/virtio/vhost-vsock.c +++ b/hw/virtio/vhost-vsock.c @@ -67,37 +67,38 @@ static int vhost_vsock_set_running(VirtIODevice *vdev, int start) } -static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status) +static int vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status) { VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); bool should_start = virtio_device_should_start(vdev, status); int ret; if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) { - return; + return 0; } if (should_start) { ret = vhost_vsock_common_start(vdev); if (ret < 0) { - return; + return 0; } ret = vhost_vsock_set_running(vdev, 1); if (ret < 0) { vhost_vsock_common_stop(vdev); error_report("Error starting vhost vsock: %d", -ret); - return; + return 0; } } else { ret = vhost_vsock_set_running(vdev, 0); if (ret < 0) { error_report("vhost vsock set running failed: %d", ret); - return; + return 0; } vhost_vsock_common_stop(vdev); } + return 0; } static uint64_t vhost_vsock_get_features(VirtIODevice *vdev, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 4cae7c1..fc43853 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1367,10 +1367,10 @@ fail_alloc_desc: return r; } -void vhost_virtqueue_stop(struct vhost_dev *dev, - struct VirtIODevice *vdev, - struct vhost_virtqueue *vq, - unsigned idx) +int vhost_virtqueue_stop(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + unsigned idx) { int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); struct vhost_vring_state state = { @@ -1380,7 +1380,7 @@ void vhost_virtqueue_stop(struct vhost_dev *dev, if (virtio_queue_get_desc_addr(vdev, idx) == 0) { /* Don't stop the virtqueue which might have not been started */ - return; + return 0; } r = dev->vhost_ops->vhost_get_vring_base(dev, &state); @@ -1411,6 +1411,7 @@ void vhost_virtqueue_stop(struct vhost_dev *dev, 0, virtio_queue_get_avail_size(vdev, idx)); vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), 0, virtio_queue_get_desc_size(vdev, idx)); + return r; } static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, @@ -2135,9 +2136,10 @@ fail_features: } /* Host notifiers must be enabled at this point. */ -void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +int vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) { int i; + int rc = 0; /* should only be called after backend is connected */ assert(hdev->vhost_ops); @@ -2156,10 +2158,10 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) vhost_dev_set_vring_enable(hdev, false); } for (i = 0; i < hdev->nvqs; ++i) { - vhost_virtqueue_stop(hdev, - vdev, - hdev->vqs + i, - hdev->vq_index + i); + rc |= vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); } if (hdev->vhost_ops->vhost_reset_status) { hdev->vhost_ops->vhost_reset_status(hdev); @@ -2176,6 +2178,7 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) hdev->started = false; vdev->vhost_started = false; hdev->vdev = NULL; + return rc; } int vhost_net_set_backend(struct vhost_dev *hdev, diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 91510ec..db787d0 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -958,7 +958,7 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev) s->poison_val = 0; } -static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) +static int virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); @@ -988,6 +988,7 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) qemu_mutex_unlock(&s->free_page_lock); } } + return 0; } static ResettableState *virtio_balloon_get_reset_state(Object *obj) diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index e24d691..517f208 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -1197,11 +1197,12 @@ static void virtio_crypto_vhost_status(VirtIOCrypto *c, uint8_t status) } } -static void virtio_crypto_set_status(VirtIODevice *vdev, uint8_t status) +static int virtio_crypto_set_status(VirtIODevice *vdev, uint8_t status) { VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); virtio_crypto_vhost_status(vcrypto, status); + return 0; } static void virtio_crypto_guest_notifier_mask(VirtIODevice *vdev, int idx, diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 5406098..3500f1b 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -1522,9 +1522,10 @@ static void virtio_iommu_device_reset_exit(Object *obj, ResetType type) NULL, NULL, virtio_iommu_put_endpoint); } -static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status) +static int virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status) { trace_virtio_iommu_device_status(status); + return 0; } static void virtio_iommu_instance_init(Object *obj) diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c index 8cf9788..f857a84 100644 --- a/hw/virtio/virtio-net-pci.c +++ b/hw/virtio/virtio-net-pci.c @@ -74,6 +74,7 @@ static void virtio_net_pci_class_init(ObjectClass *klass, const void *data) k->device_id = PCI_DEVICE_ID_VIRTIO_NET; k->revision = VIRTIO_PCI_ABI_VERSION; k->class_id = PCI_CLASS_NETWORK_ETHERNET; + k->sriov_vf_user_creatable = true; set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); device_class_set_props(dc, virtio_net_properties); vpciklass->realize = virtio_net_pci_realize; diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 0fa8fe4..9b48aa8 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1962,6 +1962,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) uint8_t *config; uint32_t size; VirtIODevice *vdev = virtio_bus_get_device(bus); + int16_t res; /* * Virtio capabilities present without @@ -2109,6 +2110,18 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) pci_register_bar(&proxy->pci_dev, proxy->legacy_io_bar_idx, PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar); } + + if (pci_is_vf(&proxy->pci_dev)) { + pcie_ari_init(&proxy->pci_dev, proxy->last_pcie_cap_offset); + proxy->last_pcie_cap_offset += PCI_ARI_SIZEOF; + } else { + res = pcie_sriov_pf_init_from_user_created_vfs( + &proxy->pci_dev, proxy->last_pcie_cap_offset, errp); + if (res > 0) { + proxy->last_pcie_cap_offset += res; + virtio_add_feature(&vdev->host_features, VIRTIO_F_SR_IOV); + } + } } static void virtio_pci_device_unplugged(DeviceState *d) @@ -2199,7 +2212,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) if (pcie_port && pci_is_express(pci_dev)) { int pos; - uint16_t last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; + proxy->last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); @@ -2216,9 +2229,9 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3); if (proxy->flags & VIRTIO_PCI_FLAG_AER) { - pcie_aer_init(pci_dev, PCI_ERR_VER, last_pcie_cap_offset, + pcie_aer_init(pci_dev, PCI_ERR_VER, proxy->last_pcie_cap_offset, PCI_ERR_SIZEOF, NULL); - last_pcie_cap_offset += PCI_ERR_SIZEOF; + proxy->last_pcie_cap_offset += PCI_ERR_SIZEOF; } if (proxy->flags & VIRTIO_PCI_FLAG_INIT_DEVERR) { @@ -2243,9 +2256,9 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) } if (proxy->flags & VIRTIO_PCI_FLAG_ATS) { - pcie_ats_init(pci_dev, last_pcie_cap_offset, + pcie_ats_init(pci_dev, proxy->last_pcie_cap_offset, proxy->flags & VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED); - last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; + proxy->last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; } if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) { @@ -2273,6 +2286,7 @@ static void virtio_pci_exit(PCIDevice *pci_dev) !pci_bus_is_root(pci_get_bus(pci_dev)); bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + pcie_sriov_pf_exit(&proxy->pci_dev); msix_uninit_exclusive_bar(pci_dev); if (proxy->flags & VIRTIO_PCI_FLAG_AER && pcie_port && pci_is_express(pci_dev)) { diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c index dcb3c71..3df5d25 100644 --- a/hw/virtio/virtio-rng.c +++ b/hw/virtio/virtio-rng.c @@ -159,17 +159,18 @@ static void check_rate_limit(void *opaque) vrng->activate_timer = true; } -static void virtio_rng_set_status(VirtIODevice *vdev, uint8_t status) +static int virtio_rng_set_status(VirtIODevice *vdev, uint8_t status) { VirtIORNG *vrng = VIRTIO_RNG(vdev); if (!vdev->vm_running) { - return; + return 0; } vdev->status = status; /* Something changed, try to process buffers */ virtio_rng_process(vrng); + return 0; } static void virtio_rng_device_realize(DeviceState *dev, Error **errp) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 480c2e5..2e98cec 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2221,12 +2221,12 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); trace_virtio_set_status(vdev, val); + int ret = 0; if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) && val & VIRTIO_CONFIG_S_FEATURES_OK) { - int ret = virtio_validate_features(vdev); - + ret = virtio_validate_features(vdev); if (ret) { return ret; } @@ -2239,11 +2239,15 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) } if (k->set_status) { - k->set_status(vdev, val); + ret = k->set_status(vdev, val); + if (ret) { + qemu_log("set %s status to %d failed, old status: %d\n", + vdev->name, val, vdev->status); + } } vdev->status = val; - return 0; + return ret; } static enum virtio_device_endian virtio_default_endian(void) @@ -2316,49 +2320,6 @@ void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index) } } -void virtio_reset(void *opaque) -{ - VirtIODevice *vdev = opaque; - VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); - int i; - - virtio_set_status(vdev, 0); - if (current_cpu) { - /* Guest initiated reset */ - vdev->device_endian = virtio_current_cpu_endian(); - } else { - /* System reset */ - vdev->device_endian = virtio_default_endian(); - } - - if (k->get_vhost) { - struct vhost_dev *hdev = k->get_vhost(vdev); - /* Only reset when vhost back-end is connected */ - if (hdev && hdev->vhost_ops) { - vhost_reset_device(hdev); - } - } - - if (k->reset) { - k->reset(vdev); - } - - vdev->start_on_kick = false; - vdev->started = false; - vdev->broken = false; - vdev->guest_features = 0; - vdev->queue_sel = 0; - vdev->status = 0; - vdev->disabled = false; - qatomic_set(&vdev->isr, 0); - vdev->config_vector = VIRTIO_NO_VECTOR; - virtio_notify_vector(vdev, vdev->config_vector); - - for(i = 0; i < VIRTIO_QUEUE_MAX; i++) { - __virtio_queue_reset(vdev, i); - } -} - void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr) { if (!vdev->vq[n].vring.num) { @@ -3169,6 +3130,49 @@ int virtio_set_features(VirtIODevice *vdev, uint64_t val) return ret; } +void virtio_reset(void *opaque) +{ + VirtIODevice *vdev = opaque; + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + int i; + + virtio_set_status(vdev, 0); + if (current_cpu) { + /* Guest initiated reset */ + vdev->device_endian = virtio_current_cpu_endian(); + } else { + /* System reset */ + vdev->device_endian = virtio_default_endian(); + } + + if (k->get_vhost) { + struct vhost_dev *hdev = k->get_vhost(vdev); + /* Only reset when vhost back-end is connected */ + if (hdev && hdev->vhost_ops) { + vhost_reset_device(hdev); + } + } + + if (k->reset) { + k->reset(vdev); + } + + vdev->start_on_kick = false; + vdev->started = false; + vdev->broken = false; + virtio_set_features_nocheck(vdev, 0); + vdev->queue_sel = 0; + vdev->status = 0; + vdev->disabled = false; + qatomic_set(&vdev->isr, 0); + vdev->config_vector = VIRTIO_NO_VECTOR; + virtio_notify_vector(vdev, vdev->config_vector); + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + __virtio_queue_reset(vdev, i); + } +} + static void virtio_device_check_notification_compatibility(VirtIODevice *vdev, Error **errp) { @@ -3419,7 +3423,7 @@ void virtio_cleanup(VirtIODevice *vdev) qemu_del_vm_change_state_handler(vdev->vmstate); } -static void virtio_vmstate_change(void *opaque, bool running, RunState state) +static int virtio_vmstate_change(void *opaque, bool running, RunState state) { VirtIODevice *vdev = opaque; BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); @@ -3436,8 +3440,12 @@ static void virtio_vmstate_change(void *opaque, bool running, RunState state) } if (!backend_run) { - virtio_set_status(vdev, vdev->status); + int ret = virtio_set_status(vdev, vdev->status); + if (ret) { + return ret; + } } + return 0; } void virtio_instance_init_common(Object *proxy_obj, void *data, @@ -3489,7 +3497,7 @@ void virtio_init(VirtIODevice *vdev, uint16_t device_id, size_t config_size) vdev->config = NULL; } vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev), - virtio_vmstate_change, vdev); + NULL, virtio_vmstate_change, vdev); vdev->device_endian = virtio_default_endian(); vdev->use_guest_notifier_mask = true; } diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 3a0ee7e..ed6cd50 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -176,10 +176,12 @@ typedef struct CXLCCI { uint16_t opcode; uint16_t complete_pct; uint16_t ret_code; /* Current value of retcode */ + bool aborted; uint64_t starttime; /* set by each bg cmd, cleared by the bg_timer when complete */ uint64_t runtime; QEMUTimer *timer; + QemuMutex lock; /* serializes mbox abort vs timer cb */ } bg; /* firmware update */ @@ -201,6 +203,7 @@ typedef struct CXLCCI { DeviceState *d; /* Pointer to the device hosting the protocol conversion */ DeviceState *intf; + bool initialized; } CXLCCI; typedef struct cxl_device_state { @@ -316,6 +319,7 @@ void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, size_t payload_max); void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState *d, size_t payload_max); void cxl_init_cci(CXLCCI *cci, size_t payload_max); +void cxl_destroy_cci(CXLCCI *cci); void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmd_set)[256], size_t payload_max); int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, @@ -536,6 +540,21 @@ typedef struct CXLSetFeatureInfo { size_t data_size; } CXLSetFeatureInfo; +struct CXLSanitizeInfo; + +typedef struct CXLAlertConfig { + uint8_t valid_alerts; + uint8_t enable_alerts; + uint8_t life_used_crit_alert_thresh; + uint8_t life_used_warn_thresh; + uint16_t over_temp_crit_alert_thresh; + uint16_t under_temp_crit_alert_thresh; + uint16_t over_temp_warn_thresh; + uint16_t under_temp_warn_thresh; + uint16_t cor_vmem_err_warn_thresh; + uint16_t cor_pmem_err_warn_thresh; +} QEMU_PACKED CXLAlertConfig; + struct CXLType3Dev { /* Private */ PCIDevice parent_obj; @@ -557,6 +576,8 @@ struct CXLType3Dev { CXLCCI vdm_fm_owned_ld_mctp_cci; CXLCCI ld0_cci; + CXLAlertConfig alert_config; + /* PCIe link characteristics */ PCIExpLinkSpeed speed; PCIExpLinkWidth width; @@ -602,6 +623,8 @@ struct CXLType3Dev { uint8_t num_regions; /* 0-8 regions */ CXLDCRegion regions[DCD_MAX_NUM_REGION]; } dc; + + struct CXLSanitizeInfo *media_op_sanitize; }; #define TYPE_CXL_TYPE3 "cxl-type3" diff --git a/include/hw/cxl/cxl_mailbox.h b/include/hw/cxl/cxl_mailbox.h index beb0480..9008402 100644 --- a/include/hw/cxl/cxl_mailbox.h +++ b/include/hw/cxl/cxl_mailbox.h @@ -14,5 +14,6 @@ #define CXL_MBOX_IMMEDIATE_LOG_CHANGE (1 << 4) #define CXL_MBOX_SECURITY_STATE_CHANGE (1 << 5) #define CXL_MBOX_BACKGROUND_OPERATION (1 << 6) +#define CXL_MBOX_BACKGROUND_OPERATION_ABORT (1 << 7) #endif diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index 345b12e..e41d95b 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -38,6 +38,8 @@ struct PCIDeviceClass { uint16_t subsystem_id; /* only for header type = 0 */ const char *romfile; /* rom bar */ + + bool sriov_vf_user_creatable; }; enum PCIReqIDType { @@ -177,6 +179,8 @@ struct PCIDevice { * realizing the device. */ uint32_t max_bounce_buffer_size; + + char *sriov_pf; }; static inline int pci_intx(PCIDevice *pci_dev) @@ -209,7 +213,7 @@ static inline int pci_is_express_downstream_port(const PCIDevice *d) static inline int pci_is_vf(const PCIDevice *d) { - return d->exp.sriov_vf.pf != NULL; + return d->sriov_pf || d->exp.sriov_vf.pf != NULL; } static inline uint32_t pci_config_size(const PCIDevice *d) diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index c5d2d31..aeaa38c 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -18,6 +18,7 @@ typedef struct PCIESriovPF { uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */ PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */ + bool vf_user_created; /* If VFs are created by user */ } PCIESriovPF; typedef struct PCIESriovVF { @@ -40,6 +41,26 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, MemoryRegion *memory); +/** + * pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created + * VFs, adding ARI to PF + * @dev: A PCIe device being realized. + * @offset: The offset of the SR-IOV capability. + * @errp: pointer to Error*, to store an error if it happens. + * + * Initializes a PF with user-created VFs, adding the ARI extended capability to + * the PF. The VFs should call pcie_ari_init() to form an ARI device. + * + * Return: The size of added capabilities. 0 if the user did not create VFs. + * -1 if failed. + */ +int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev, + uint16_t offset, + Error **errp); + +bool pcie_sriov_register_device(PCIDevice *dev, Error **errp); +void pcie_sriov_unregister_device(PCIDevice *dev); + /* * Default (minimal) page size support values * as required by the SR/IOV standard: diff --git a/include/hw/virtio/vhost-scsi-common.h b/include/hw/virtio/vhost-scsi-common.h index c5d2c09..d54d9c9 100644 --- a/include/hw/virtio/vhost-scsi-common.h +++ b/include/hw/virtio/vhost-scsi-common.h @@ -40,7 +40,7 @@ struct VHostSCSICommon { }; int vhost_scsi_common_start(VHostSCSICommon *vsc, Error **errp); -void vhost_scsi_common_stop(VHostSCSICommon *vsc); +int vhost_scsi_common_stop(VHostSCSICommon *vsc); char *vhost_scsi_common_get_fw_dev_path(FWPathProvider *p, BusState *bus, DeviceState *dev); void vhost_scsi_common_set_config(VirtIODevice *vdev, const uint8_t *config); diff --git a/include/hw/virtio/vhost-vsock-common.h b/include/hw/virtio/vhost-vsock-common.h index 75a74e8..01bf606 100644 --- a/include/hw/virtio/vhost-vsock-common.h +++ b/include/hw/virtio/vhost-vsock-common.h @@ -42,7 +42,7 @@ struct VHostVSockCommon { }; int vhost_vsock_common_start(VirtIODevice *vdev); -void vhost_vsock_common_stop(VirtIODevice *vdev); +int vhost_vsock_common_stop(VirtIODevice *vdev); int vhost_vsock_common_pre_save(void *opaque); int vhost_vsock_common_post_load(void *opaque, int version_id); void vhost_vsock_common_realize(VirtIODevice *vdev); diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index bb4b58e..38800a7 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -232,8 +232,10 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); * Stop the vhost device. After the device is stopped the notifiers * can be disabled (@vhost_dev_disable_notifiers) and the device can * be torn down (@vhost_dev_cleanup). + * + * Return: 0 on success, != 0 on error when stopping dev. */ -void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); +int vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); /** * DOC: vhost device configuration handling @@ -333,8 +335,8 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write); int vhost_virtqueue_start(struct vhost_dev *dev, struct VirtIODevice *vdev, struct vhost_virtqueue *vq, unsigned idx); -void vhost_virtqueue_stop(struct vhost_dev *dev, struct VirtIODevice *vdev, - struct vhost_virtqueue *vq, unsigned idx); +int vhost_virtqueue_stop(struct vhost_dev *dev, struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, unsigned idx); void vhost_dev_reset_inflight(struct vhost_inflight *inflight); void vhost_dev_free_inflight(struct vhost_inflight *inflight); diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index 31ec144..1dbc385 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ -155,6 +155,7 @@ struct VirtIOPCIProxy { uint32_t modern_io_bar_idx; uint32_t modern_mem_bar_idx; int config_cap; + uint16_t last_pcie_cap_offset; uint32_t flags; bool disable_modern; bool ignore_backend_features; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7e0c471..214d4a7 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -186,7 +186,7 @@ struct VirtioDeviceClass { void (*get_config)(VirtIODevice *vdev, uint8_t *config); void (*set_config)(VirtIODevice *vdev, const uint8_t *config); void (*reset)(VirtIODevice *vdev); - void (*set_status)(VirtIODevice *vdev, uint8_t val); + int (*set_status)(VirtIODevice *vdev, uint8_t val); /* Device must validate queue_index. */ void (*queue_reset)(VirtIODevice *vdev, uint32_t queue_index); /* Device must validate queue_index. */ diff --git a/include/system/runstate.h b/include/system/runstate.h index bffc371..fdd5c4a 100644 --- a/include/system/runstate.h +++ b/include/system/runstate.h @@ -12,6 +12,7 @@ bool runstate_needs_reset(void); void runstate_replay_enable(void); typedef void VMChangeStateHandler(void *opaque, bool running, RunState state); +typedef int VMChangeStateHandlerWithRet(void *opaque, bool running, RunState state); VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, void *opaque); @@ -20,21 +21,27 @@ VMChangeStateEntry *qemu_add_vm_change_state_handler_prio( VMChangeStateEntry * qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque, int priority); VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev, VMChangeStateHandler *cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque); VMChangeStateEntry *qdev_add_vm_change_state_handler_full( - DeviceState *dev, VMChangeStateHandler *cb, - VMChangeStateHandler *prepare_cb, void *opaque); + DeviceState *dev, VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque); void qemu_del_vm_change_state_handler(VMChangeStateEntry *e); /** * vm_state_notify: Notify the state of the VM * * @running: whether the VM is running or not. * @state: the #RunState of the VM. + * + * Return the result of the callback which has return value. + * If no callback has return value, still return 0 and the + * upper layer should not do additional processing. */ -void vm_state_notify(bool running, RunState state); +int vm_state_notify(bool running, RunState state); static inline bool shutdown_caused_by_guest(ShutdownCause cause) { diff --git a/include/system/vhost-user-backend.h b/include/system/vhost-user-backend.h index 5ed953c..5634ebd 100644 --- a/include/system/vhost-user-backend.h +++ b/include/system/vhost-user-backend.h @@ -43,6 +43,6 @@ struct VhostUserBackend { int vhost_user_backend_dev_init(VhostUserBackend *b, VirtIODevice *vdev, unsigned nvqs, Error **errp); void vhost_user_backend_start(VhostUserBackend *b); -void vhost_user_backend_stop(VhostUserBackend *b); +int vhost_user_backend_stop(VhostUserBackend *b); #endif diff --git a/system/cpus.c b/system/cpus.c index 2cc5f88..d16b0df 100644 --- a/system/cpus.c +++ b/system/cpus.c @@ -299,14 +299,18 @@ static int do_vm_stop(RunState state, bool send_stop) if (oldstate == RUN_STATE_RUNNING) { pause_all_vcpus(); } - vm_state_notify(0, state); + ret = vm_state_notify(0, state); if (send_stop) { qapi_event_send_stop(); } } bdrv_drain_all(); - ret = bdrv_flush_all(); + /* + * Even if vm_state_notify() return failure, + * it would be better to flush as before. + */ + ret |= bdrv_flush_all(); trace_vm_stop_flush_all(ret); return ret; diff --git a/system/runstate.c b/system/runstate.c index 272801d..de74d96 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -297,6 +297,7 @@ void qemu_system_vmstop_request(RunState state) struct VMChangeStateEntry { VMChangeStateHandler *cb; VMChangeStateHandler *prepare_cb; + VMChangeStateHandlerWithRet *cb_ret; void *opaque; QTAILQ_ENTRY(VMChangeStateEntry) entries; int priority; @@ -320,14 +321,15 @@ static QTAILQ_HEAD(, VMChangeStateEntry) vm_change_state_head = VMChangeStateEntry *qemu_add_vm_change_state_handler_prio( VMChangeStateHandler *cb, void *opaque, int priority) { - return qemu_add_vm_change_state_handler_prio_full(cb, NULL, opaque, - priority); + return qemu_add_vm_change_state_handler_prio_full(cb, NULL, NULL, + opaque, priority); } /** * qemu_add_vm_change_state_handler_prio_full: * @cb: the main callback to invoke * @prepare_cb: a callback to invoke before the main callback + * @cb_ret: the main callback to invoke with return value * @opaque: user data passed to the callbacks * @priority: low priorities execute first when the vm runs and the reverse is * true when the vm stops @@ -344,6 +346,7 @@ VMChangeStateEntry *qemu_add_vm_change_state_handler_prio( VMChangeStateEntry * qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb, + VMChangeStateHandlerWithRet *cb_ret, void *opaque, int priority) { VMChangeStateEntry *e; @@ -352,6 +355,7 @@ qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb, e = g_malloc0(sizeof(*e)); e->cb = cb; e->prepare_cb = prepare_cb; + e->cb_ret = cb_ret; e->opaque = opaque; e->priority = priority; @@ -379,9 +383,10 @@ void qemu_del_vm_change_state_handler(VMChangeStateEntry *e) g_free(e); } -void vm_state_notify(bool running, RunState state) +int vm_state_notify(bool running, RunState state) { VMChangeStateEntry *e, *next; + int ret = 0; trace_vm_state_notify(running, state, RunState_str(state)); @@ -393,7 +398,17 @@ void vm_state_notify(bool running, RunState state) } QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) { - e->cb(e->opaque, running, state); + if (e->cb) { + e->cb(e->opaque, running, state); + } else if (e->cb_ret) { + /* + * Here ignore the return value of cb_ret because + * we only care about the stopping the device during + * the VM live migration to indicate whether the + * connection between qemu and backend is normal. + */ + e->cb_ret(e->opaque, running, state); + } } } else { QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) { @@ -403,9 +418,19 @@ void vm_state_notify(bool running, RunState state) } QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) { - e->cb(e->opaque, running, state); + if (e->cb) { + e->cb(e->opaque, running, state); + } else if (e->cb_ret) { + /* + * We should execute all registered callbacks even if + * one of them returns failure, otherwise, some cleanup + * work of the device will be skipped. + */ + ret |= e->cb_ret(e->opaque, running, state); + } } } + return ret; } static ShutdownCause reset_requested; |