Diffstat (limited to 'include/system')
-rw-r--r-- | include/system/accel-irq.h | 37
-rw-r--r-- | include/system/accel-ops.h | 73
-rw-r--r-- | include/system/block-backend-global-state.h | 8
-rw-r--r-- | include/system/confidential-guest-support.h | 88
-rw-r--r-- | include/system/cpus.h | 8
-rw-r--r-- | include/system/host_iommu_device.h | 15
-rw-r--r-- | include/system/hvf.h | 37
-rw-r--r-- | include/system/hvf_int.h | 39
-rw-r--r-- | include/system/hw_accel.h | 20
-rw-r--r-- | include/system/igvm-cfg.h | 49
-rw-r--r-- | include/system/iommufd.h | 65
-rw-r--r-- | include/system/kvm.h | 44
-rw-r--r-- | include/system/kvm_int.h | 1
-rw-r--r-- | include/system/memory.h | 202
-rw-r--r-- | include/system/mshv.h | 64
-rw-r--r-- | include/system/mshv_int.h | 155
-rw-r--r-- | include/system/nvmm.h | 23
-rw-r--r-- | include/system/os-win32.h | 5
-rw-r--r-- | include/system/physmem.h | 54
-rw-r--r-- | include/system/ram_addr.h | 426
-rw-r--r-- | include/system/ramblock.h | 49
-rw-r--r-- | include/system/runstate.h | 54
-rw-r--r-- | include/system/system.h | 1
-rw-r--r-- | include/system/vhost-user-backend.h | 2
-rw-r--r-- | include/system/whpx.h | 23
25 files changed, 889 insertions, 653 deletions
diff --git a/include/system/accel-irq.h b/include/system/accel-irq.h new file mode 100644 index 0000000..671fb7d --- /dev/null +++ b/include/system/accel-irq.h @@ -0,0 +1,37 @@ +/* + * Accelerated irqchip abstraction + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef SYSTEM_ACCEL_IRQ_H +#define SYSTEM_ACCEL_IRQ_H +#include "hw/pci/msi.h" +#include "qemu/osdep.h" +#include "system/kvm.h" +#include "system/mshv.h" + +static inline bool accel_msi_via_irqfd_enabled(void) +{ + return mshv_msi_via_irqfd_enabled() || kvm_msi_via_irqfd_enabled(); +} + +static inline bool accel_irqchip_is_split(void) +{ + return mshv_msi_via_irqfd_enabled() || kvm_irqchip_is_split(); +} + +int accel_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); +int accel_irqchip_update_msi_route(int vector, MSIMessage msg, PCIDevice *dev); +void accel_irqchip_commit_route_changes(KVMRouteChange *c); +void accel_irqchip_commit_routes(void); +void accel_irqchip_release_virq(int virq); +int accel_irqchip_add_irqfd_notifier_gsi(EventNotifier *n, EventNotifier *rn, + int virq); +int accel_irqchip_remove_irqfd_notifier_gsi(EventNotifier *n, int virq); +#endif diff --git a/include/system/accel-ops.h b/include/system/accel-ops.h deleted file mode 100644 index 4c99d25..0000000 --- a/include/system/accel-ops.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Accelerator OPS, used for cpus.c module - * - * Copyright 2021 SUSE LLC - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ - -#ifndef ACCEL_OPS_H -#define ACCEL_OPS_H - -#include "exec/vaddr.h" -#include "qom/object.h" - -#define ACCEL_OPS_SUFFIX "-ops" -#define TYPE_ACCEL_OPS "accel" ACCEL_OPS_SUFFIX -#define ACCEL_OPS_NAME(name) (name "-" TYPE_ACCEL_OPS) - -DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS) - -/** - * struct AccelOpsClass - accelerator interfaces - * - * This structure is used to abstract accelerator differences from the - * core CPU code. Not all have to be implemented. - */ -struct AccelOpsClass { - /*< private >*/ - ObjectClass parent_class; - /*< public >*/ - - /* initialization function called when accel is chosen */ - void (*ops_init)(AccelOpsClass *ops); - - bool (*cpus_are_resettable)(void); - void (*cpu_reset_hold)(CPUState *cpu); - - void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */ - void (*kick_vcpu_thread)(CPUState *cpu); - bool (*cpu_thread_is_idle)(CPUState *cpu); - - void (*synchronize_post_reset)(CPUState *cpu); - void (*synchronize_post_init)(CPUState *cpu); - void (*synchronize_state)(CPUState *cpu); - void (*synchronize_pre_loadvm)(CPUState *cpu); - void (*synchronize_pre_resume)(bool step_pending); - - void (*handle_interrupt)(CPUState *cpu, int mask); - - /** - * @get_virtual_clock: fetch virtual clock - * @set_virtual_clock: set virtual clock - * - * These allow the timer subsystem to defer to the accelerator to - * fetch time. The set function is needed if the accelerator wants - * to track the changes to time as the timer is warped through - * various timer events. 
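
A minimal sketch of how a device model might drive the accel-irq.h helpers above; the function name, the zero-initialized route-change set and the fallback policy are assumptions for illustration, not part of the header:

#include "qemu/osdep.h"
#include "system/accel-irq.h"

/* Sketch: bind one MSI vector to an irqfd so that the accelerator
 * (KVM or MSHV) injects the interrupt without a trip through QEMU. */
static int wire_msi_irqfd(PCIDevice *dev, EventNotifier *n, int vector)
{
    KVMRouteChange c = { 0 }; /* assumption: a fresh change set is acceptable */
    int virq, ret;

    if (!accel_msi_via_irqfd_enabled()) {
        return -ENOTSUP; /* fall back to fully emulated MSI delivery */
    }

    virq = accel_irqchip_add_msi_route(&c, vector, dev);
    if (virq < 0) {
        return virq;
    }
    accel_irqchip_commit_route_changes(&c);

    ret = accel_irqchip_add_irqfd_notifier_gsi(n, NULL, virq);
    if (ret < 0) {
        accel_irqchip_release_virq(virq);
    }
    return ret;
}
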
- */ - int64_t (*get_virtual_clock)(void); - void (*set_virtual_clock)(int64_t time); - - int64_t (*get_elapsed_ticks)(void); - - /* gdbstub hooks */ - bool (*supports_guest_debug)(void); - int (*update_guest_debug)(CPUState *cpu); - int (*insert_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); - int (*remove_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); - void (*remove_all_breakpoints)(CPUState *cpu); -}; - -#endif /* ACCEL_OPS_H */ diff --git a/include/system/block-backend-global-state.h b/include/system/block-backend-global-state.h index 35b5e83..c384964 100644 --- a/include/system/block-backend-global-state.h +++ b/include/system/block-backend-global-state.h @@ -55,7 +55,7 @@ void monitor_remove_blk(BlockBackend *blk); BlockBackendPublic *blk_get_public(BlockBackend *blk); -void blk_remove_bs(BlockBackend *blk); +void GRAPH_UNLOCKED blk_remove_bs(BlockBackend *blk); int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp); int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp); bool GRAPH_RDLOCK bdrv_has_blk(BlockDriverState *bs); @@ -78,8 +78,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags); void blk_aio_cancel(BlockAIOCB *acb); int blk_commit_all(void); bool blk_in_drain(BlockBackend *blk); -void blk_drain(BlockBackend *blk); -void blk_drain_all(void); +void GRAPH_UNLOCKED blk_drain(BlockBackend *blk); +void GRAPH_UNLOCKED blk_drain_all(void); void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error); bool blk_supports_write_perm(BlockBackend *blk); @@ -109,7 +109,7 @@ int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz); int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo); void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg); -void blk_io_limits_disable(BlockBackend *blk); +void GRAPH_UNLOCKED blk_io_limits_disable(BlockBackend *blk); void blk_io_limits_enable(BlockBackend *blk, const char *group); void blk_io_limits_update_group(BlockBackend *blk, const char *group); void blk_set_force_allow_inactivate(BlockBackend *blk); diff --git a/include/system/confidential-guest-support.h b/include/system/confidential-guest-support.h index ea46b50..0cc8b26 100644 --- a/include/system/confidential-guest-support.h +++ b/include/system/confidential-guest-support.h @@ -19,6 +19,7 @@ #define QEMU_CONFIDENTIAL_GUEST_SUPPORT_H #include "qom/object.h" +#include "exec/hwaddr.h" #define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support" OBJECT_DECLARE_TYPE(ConfidentialGuestSupport, @@ -26,6 +27,40 @@ OBJECT_DECLARE_TYPE(ConfidentialGuestSupport, CONFIDENTIAL_GUEST_SUPPORT) +typedef enum ConfidentialGuestPlatformType { + CGS_PLATFORM_SEV, + CGS_PLATFORM_SEV_ES, + CGS_PLATFORM_SEV_SNP, +} ConfidentialGuestPlatformType; + +typedef enum ConfidentialGuestMemoryType { + CGS_MEM_RAM, + CGS_MEM_RESERVED, + CGS_MEM_ACPI, + CGS_MEM_NVS, + CGS_MEM_UNUSABLE, +} ConfidentialGuestMemoryType; + +typedef struct ConfidentialGuestMemoryMapEntry { + uint64_t gpa; + uint64_t size; + ConfidentialGuestMemoryType type; +} ConfidentialGuestMemoryMapEntry; + +typedef enum ConfidentialGuestPageType { + CGS_PAGE_TYPE_NORMAL, + CGS_PAGE_TYPE_VMSA, + CGS_PAGE_TYPE_ZERO, + CGS_PAGE_TYPE_UNMEASURED, + CGS_PAGE_TYPE_SECRETS, + CGS_PAGE_TYPE_CPUID, + CGS_PAGE_TYPE_REQUIRED_MEMORY, +} ConfidentialGuestPageType; + +typedef enum ConfidentialGuestPolicyType { + GUEST_POLICY_SEV, +} ConfidentialGuestPolicyType; + struct ConfidentialGuestSupport { Object parent; @@ -64,6 +99,59 
@@ typedef struct ConfidentialGuestSupportClass { int (*kvm_init)(ConfidentialGuestSupport *cgs, Error **errp); int (*kvm_reset)(ConfidentialGuestSupport *cgs, Error **errp); + + /* + * Check to see if this confidential guest supports a particular + * platform or configuration. + * + * Return true if supported or false if not supported. + */ + bool (*check_support)(ConfidentialGuestPlatformType platform, + uint16_t platform_version, uint8_t highest_vtl, + uint64_t shared_gpa_boundary); + + /* + * Configure part of the state of a guest for a particular set of data, page + * type and gpa. This can be used for example to pre-populate and measure + * guest memory contents, define private ranges or set the initial CPU state + * for one or more CPUs. + * + * If memory_type is CGS_PAGE_TYPE_VMSA then ptr points to the initial CPU + * context for a virtual CPU. The format of the data depends on the type of + * confidential virtual machine. For example, for SEV-ES ptr will point to a + * vmcb_save_area structure that should be copied into guest memory at the + * address specified in gpa. The cpu_index parameter contains the index of + * the CPU the VMSA applies to. + */ + int (*set_guest_state)(hwaddr gpa, uint8_t *ptr, uint64_t len, + ConfidentialGuestPageType memory_type, + uint16_t cpu_index, Error **errp); + + /* + * Set the guest policy. The policy can be used to configure the + * confidential platform, such as if debug is enabled or not and can contain + * information about expected launch measurements, signed verification of + * guest configuration and other platform data. + * + * The format of the policy data is specific to each platform. For example, + * SEV-SNP uses a policy bitfield in the 'policy' argument and provides an + * ID block and ID authentication in the 'policy_data' parameters. The type + * of policy data is identified by the 'policy_type' argument. + */ + int (*set_guest_policy)(ConfidentialGuestPolicyType policy_type, + uint64_t policy, + void *policy_data1, uint32_t policy_data1_size, + void *policy_data2, uint32_t policy_data2_size, + Error **errp); + + /* + * Iterate the system memory map, getting the entry with the given index + * that can be populated into guest memory. + * + * Returns 0 for ok, 1 if the index is out of range and -1 on error. 
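
A sketch of how a platform initializer might chain the check_support and set_guest_state hooks above; the helper name and the zeroed SEV-SNP arguments are hypothetical:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "system/confidential-guest-support.h"

/* Sketch: copy and measure a firmware blob into confidential guest memory. */
static int cgs_measure_blob(ConfidentialGuestSupport *cgs, hwaddr gpa,
                            uint8_t *blob, uint64_t len, Error **errp)
{
    ConfidentialGuestSupportClass *klass =
        CONFIDENTIAL_GUEST_SUPPORT_GET_CLASS(cgs);

    /* 0 for version/VTL/boundary is a placeholder for this sketch */
    if (klass->check_support &&
        !klass->check_support(CGS_PLATFORM_SEV_SNP, 0, 0, 0)) {
        error_setg(errp, "platform configuration not supported");
        return -1;
    }

    return klass->set_guest_state(gpa, blob, len, CGS_PAGE_TYPE_NORMAL,
                                  0 /* cpu_index: unused for data pages */,
                                  errp);
}
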
+ */ + int (*get_mem_map_entry)(int index, ConfidentialGuestMemoryMapEntry *entry, + Error **errp); } ConfidentialGuestSupportClass; static inline int confidential_guest_kvm_init(ConfidentialGuestSupport *cgs, diff --git a/include/system/cpus.h b/include/system/cpus.h index 3226c76..508444c 100644 --- a/include/system/cpus.h +++ b/include/system/cpus.h @@ -7,11 +7,6 @@ void cpus_register_accel(const AccelOpsClass *i); /* return registers ops */ const AccelOpsClass *cpus_get_accel(void); -/* accel/dummy-cpus.c */ - -/* Create a dummy vcpu for AccelOpsClass->create_vcpu_thread */ -void dummy_start_vcpu_thread(CPUState *); - /* interface available for cpus accelerator threads */ /* For temporary buffers for forming a name */ @@ -22,8 +17,7 @@ bool cpu_work_list_empty(CPUState *cpu); bool cpu_thread_is_idle(CPUState *cpu); bool all_cpu_threads_idle(void); bool cpu_can_run(CPUState *cpu); -void qemu_wait_io_event_common(CPUState *cpu); -void qemu_wait_io_event(CPUState *cpu); +void qemu_process_cpu_events_common(CPUState *cpu); void cpu_thread_signal_created(CPUState *cpu); void cpu_thread_signal_destroyed(CPUState *cpu); void cpu_handle_guest_debug(CPUState *cpu); diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h index 809cced..ab849a4 100644 --- a/include/system/host_iommu_device.h +++ b/include/system/host_iommu_device.h @@ -14,6 +14,13 @@ #include "qom/object.h" #include "qapi/error.h" +#ifdef CONFIG_LINUX +#include "linux/iommufd.h" + +typedef union VendorCaps { + struct iommu_hw_info_vtd vtd; + struct iommu_hw_info_arm_smmuv3 smmuv3; +} VendorCaps; /** * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. @@ -22,11 +29,17 @@ * * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + * + * @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on + * IOMMUFD this represents a user-space buffer filled by kernel + * with host IOMMU @type specific hardware information data) */ typedef struct HostIOMMUDeviceCaps { uint32_t type; uint64_t hw_caps; + VendorCaps vendor_caps; } HostIOMMUDeviceCaps; +#endif #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) @@ -38,7 +51,9 @@ struct HostIOMMUDevice { void *agent; /* pointer to agent device, ie. 
VFIO or VDPA device */ PCIBus *aliased_bus; int aliased_devfn; +#ifdef CONFIG_LINUX HostIOMMUDeviceCaps caps; +#endif }; /** diff --git a/include/system/hvf.h b/include/system/hvf.h index 7b45a2e..d3dcf08 100644 --- a/include/system/hvf.h +++ b/include/system/hvf.h @@ -14,9 +14,6 @@ #define HVF_H #include "qemu/accel.h" -#include "qemu/queue.h" -#include "exec/vaddr.h" -#include "qom/object.h" #ifdef COMPILING_PER_TARGET # ifdef CONFIG_HVF @@ -39,38 +36,4 @@ typedef struct HVFState HVFState; DECLARE_INSTANCE_CHECKER(HVFState, HVF_STATE, TYPE_HVF_ACCEL) -#ifdef COMPILING_PER_TARGET -struct hvf_sw_breakpoint { - vaddr pc; - vaddr saved_insn; - int use_count; - QTAILQ_ENTRY(hvf_sw_breakpoint) entry; -}; - -struct hvf_sw_breakpoint *hvf_find_sw_breakpoint(CPUState *cpu, - vaddr pc); -int hvf_sw_breakpoints_active(CPUState *cpu); - -int hvf_arch_insert_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp); -int hvf_arch_remove_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp); -int hvf_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type); -int hvf_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type); -void hvf_arch_remove_all_hw_breakpoints(void); - -/* - * hvf_update_guest_debug: - * @cs: CPUState for the CPU to update - * - * Update guest to enable or disable debugging. Per-arch specifics will be - * handled by calling down to hvf_arch_update_guest_debug. - */ -int hvf_update_guest_debug(CPUState *cpu); -void hvf_arch_update_guest_debug(CPUState *cpu); - -/* - * Return whether the guest supports debugging. - */ -bool hvf_arch_supports_guest_debug(void); -#endif /* COMPILING_PER_TARGET */ - #endif diff --git a/include/system/hvf_int.h b/include/system/hvf_int.h index 8c8b840..a3b06a3 100644 --- a/include/system/hvf_int.h +++ b/include/system/hvf_int.h @@ -12,6 +12,9 @@ #define HVF_INT_H #include "qemu/queue.h" +#include "exec/vaddr.h" +#include "qom/object.h" +#include "accel/accel-ops.h" #ifdef __aarch64__ #include <Hypervisor/Hypervisor.h> @@ -43,7 +46,8 @@ typedef struct hvf_vcpu_caps { } hvf_vcpu_caps; struct HVFState { - AccelState parent; + AccelState parent_obj; + hvf_slot slots[32]; int num_slots; @@ -59,7 +63,6 @@ struct AccelCPUState { bool vtimer_masked; sigset_t unblock_ipi_mask; bool guest_debug_enabled; - bool dirty; }; void assert_hvf_ok_impl(hv_return_t ret, const char *file, unsigned int line, @@ -76,4 +79,36 @@ int hvf_put_registers(CPUState *); int hvf_get_registers(CPUState *); void hvf_kick_vcpu_thread(CPUState *cpu); +struct hvf_sw_breakpoint { + vaddr pc; + vaddr saved_insn; + int use_count; + QTAILQ_ENTRY(hvf_sw_breakpoint) entry; +}; + +struct hvf_sw_breakpoint *hvf_find_sw_breakpoint(CPUState *cpu, + vaddr pc); +int hvf_sw_breakpoints_active(CPUState *cpu); + +int hvf_arch_insert_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp); +int hvf_arch_remove_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp); +int hvf_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type); +int hvf_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type); +void hvf_arch_remove_all_hw_breakpoints(void); + +/* + * hvf_update_guest_debug: + * @cs: CPUState for the CPU to update + * + * Update guest to enable or disable debugging. Per-arch specifics will be + * handled by calling down to hvf_arch_update_guest_debug. + */ +int hvf_update_guest_debug(CPUState *cpu); +void hvf_arch_update_guest_debug(CPUState *cpu); + +/* + * Return whether the guest supports debugging. 
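
A condensed sketch of the bookkeeping the gdbstub glue typically performs with hvf_sw_breakpoint (queue insertion and error paths omitted; the pattern mirrors the KVM equivalent):

#include "qemu/osdep.h"
#include "system/hvf_int.h"

/* Sketch: reference-count an existing breakpoint instead of re-patching. */
static int sw_breakpoint_insert(CPUState *cpu, vaddr pc)
{
    struct hvf_sw_breakpoint *bp = hvf_find_sw_breakpoint(cpu, pc);

    if (bp) {
        bp->use_count++; /* already patched, just take another reference */
        return 0;
    }

    bp = g_new0(struct hvf_sw_breakpoint, 1);
    bp->pc = pc;
    bp->use_count = 1;
    /* the arch hook patches the insn and records it in bp->saved_insn */
    return hvf_arch_insert_sw_breakpoint(cpu, bp);
}
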
+ */ +bool hvf_arch_supports_guest_debug(void); + #endif diff --git a/include/system/hw_accel.h b/include/system/hw_accel.h index 380e9e6..55497ed 100644 --- a/include/system/hw_accel.h +++ b/include/system/hw_accel.h @@ -14,12 +14,30 @@ #include "hw/core/cpu.h" #include "system/kvm.h" #include "system/hvf.h" +#include "system/mshv.h" #include "system/whpx.h" #include "system/nvmm.h" +/** + * cpu_synchronize_state: + * cpu_synchronize_pre_loadvm: + * @cpu: The vCPU to synchronize. + * + * Request to synchronize QEMU vCPU registers from the hardware accelerator + * (the hardware accelerator is the reference). + */ void cpu_synchronize_state(CPUState *cpu); +void cpu_synchronize_pre_loadvm(CPUState *cpu); + +/** + * cpu_synchronize_post_reset: + * cpu_synchronize_post_init: + * @cpu: The vCPU to synchronize. + * + * Request to synchronize QEMU vCPU registers to the hardware accelerator + * (QEMU is the reference). + */ void cpu_synchronize_post_reset(CPUState *cpu); void cpu_synchronize_post_init(CPUState *cpu); -void cpu_synchronize_pre_loadvm(CPUState *cpu); #endif /* QEMU_HW_ACCEL_H */ diff --git a/include/system/igvm-cfg.h b/include/system/igvm-cfg.h new file mode 100644 index 0000000..944f23a --- /dev/null +++ b/include/system/igvm-cfg.h @@ -0,0 +1,49 @@ +/* + * QEMU IGVM interface + * + * Copyright (C) 2024 SUSE + * + * Authors: + * Roy Hopkins <roy.hopkins@randomman.co.uk> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef QEMU_IGVM_CFG_H +#define QEMU_IGVM_CFG_H + +#include "qom/object.h" + +typedef struct IgvmCfg { + ObjectClass parent_class; + + /* + * filename: Filename that specifies a file that contains the configuration + * of the guest in Independent Guest Virtual Machine (IGVM) + * format. + */ + char *filename; +} IgvmCfg; + +typedef struct IgvmCfgClass { + ObjectClass parent_class; + + /* + * If an IGVM filename has been specified then process the IGVM file. + * Performs a no-op if no filename has been specified. + * If onlyVpContext is true then only the IGVM_VHT_VP_CONTEXT entries + * in the IGVM file will be processed, allowing information about the + * CPU state to be determined before processing the entire file. + * + * Returns 0 for ok and -1 on error. 
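
The direction convention documented above, in a compact sketch (the debugging context is invented for illustration):

#include "qemu/osdep.h"
#include "system/hw_accel.h"

/* Sketch: pull registers from the accelerator before reading them, and
 * push QEMU's copy back out after modifying them on reset. */
static void inspect_and_reset(CPUState *cpu)
{
    cpu_synchronize_state(cpu);      /* accel -> QEMU: hardware is reference */
    /* ... read or patch the vCPU state here ... */
    cpu_synchronize_post_reset(cpu); /* QEMU -> accel: QEMU is reference */
}
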
+ */ + int (*process)(IgvmCfg *cfg, ConfidentialGuestSupport *cgs, + bool onlyVpContext, Error **errp); + +} IgvmCfgClass; + +#define TYPE_IGVM_CFG "igvm-cfg" + +OBJECT_DECLARE_TYPE(IgvmCfg, IgvmCfgClass, IGVM_CFG) + +#endif diff --git a/include/system/iommufd.h b/include/system/iommufd.h index cbab75b..a659f36 100644 --- a/include/system/iommufd.h +++ b/include/system/iommufd.h @@ -32,6 +32,7 @@ struct IOMMUFDBackend { /*< protected >*/ int fd; /* /dev/iommu file descriptor */ bool owned; /* is the /dev/iommu opened internally */ + Error *cpr_blocker;/* set if be does not support CPR */ uint32_t users; /*< public >*/ @@ -43,10 +44,13 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be); bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, Error **errp); void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, uint64_t size, int fd, + unsigned long start, bool readonly); int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly); + uint64_t size, void *vaddr, bool readonly); int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, - hwaddr iova, ram_addr_t size); + hwaddr iova, uint64_t size); bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, uint32_t *type, void *data, uint32_t len, uint64_t *caps, Error **errp); @@ -61,6 +65,63 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, uint64_t iova, ram_addr_t size, uint64_t page_size, uint64_t *data, Error **errp); +bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data, + Error **errp); + +bool iommufd_change_process_capable(IOMMUFDBackend *be); +bool iommufd_change_process(IOMMUFDBackend *be, Error **errp); #define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Overload of the host IOMMU device for the iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t hwpt_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA devices can have different implementations. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA devices can have different implementations. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when detachment fails. + * + * Returns: true on success, false on failure.
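
A sketch of the map/unmap pairing with the widened uint64_t size; connecting the backend and allocating the ioas are assumed to have happened elsewhere:

#include "qemu/osdep.h"
#include "system/iommufd.h"

/* Sketch: map a host buffer for device DMA at a fixed IOVA, then tear the
 * mapping down again, propagating the backend's return values. */
static int with_dma_mapping(IOMMUFDBackend *be, uint32_t ioas_id,
                            hwaddr iova, uint64_t size, void *host)
{
    int ret = iommufd_backend_map_dma(be, ioas_id, iova, size, host,
                                      false /* readonly */);
    if (ret) {
        return ret;
    }
    /* ... the device performs DMA through the IOMMU here ... */
    return iommufd_backend_unmap_dma(be, ioas_id, iova, size);
}
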
+ */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); #endif diff --git a/include/system/kvm.h b/include/system/kvm.h index b690dda..8f9eecf 100644 --- a/include/system/kvm.h +++ b/include/system/kvm.h @@ -42,6 +42,7 @@ extern bool kvm_gsi_routing_allowed; extern bool kvm_gsi_direct_mapping; extern bool kvm_readonly_mem_allowed; extern bool kvm_msi_use_devid; +extern bool kvm_pre_fault_memory_supported; #define kvm_enabled() (kvm_allowed) /** @@ -194,6 +195,7 @@ bool kvm_has_sync_mmu(void); int kvm_has_vcpu_events(void); int kvm_max_nested_state_length(void); int kvm_has_gsi_routing(void); +void kvm_close(void); /** * kvm_arm_supports_user_irq @@ -316,31 +318,6 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test); bool kvm_device_supported(int vmfd, uint64_t type); /** - * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU - * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. - * - * @returns: 0 when success, errno (<0) when failed. - */ -int kvm_create_vcpu(CPUState *cpu); - -/** - * kvm_park_vcpu - Park QEMU KVM vCPU context - * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. - * - * @returns: none - */ -void kvm_park_vcpu(CPUState *cpu); - -/** - * kvm_unpark_vcpu - unpark QEMU KVM vCPU context - * @s: KVM State - * @vcpu_id: Architecture vCPU ID of the parked vCPU - * - * @returns: KVM fd - */ -int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); - -/** * kvm_create_and_park_vcpu - Create and park a KVM vCPU * @cpu: QOM CPUState object for which KVM vCPU has to be created and parked. 
* @@ -363,19 +340,22 @@ int kvm_arch_process_async_events(CPUState *cpu); int kvm_arch_get_registers(CPUState *cpu, Error **errp); -/* state subset only touched by the VCPU itself during runtime */ -#define KVM_PUT_RUNTIME_STATE 1 -/* state subset modified during VCPU reset */ -#define KVM_PUT_RESET_STATE 2 -/* full state set, modified during initialization or on vmload */ -#define KVM_PUT_FULL_STATE 3 +typedef enum kvm_put_state { + /* state subset only touched by the VCPU itself during runtime */ + KVM_PUT_RUNTIME_STATE = 1, + /* state subset modified during VCPU reset */ + KVM_PUT_RESET_STATE = 2, + /* full state set, modified during initialization or on vmload */ + KVM_PUT_FULL_STATE = 3, +} KvmPutState; -int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp); +int kvm_arch_put_registers(CPUState *cpu, KvmPutState level, Error **errp); int kvm_arch_get_default_type(MachineState *ms); int kvm_arch_init(MachineState *ms, KVMState *s); +int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp); int kvm_arch_init_vcpu(CPUState *cpu); int kvm_arch_destroy_vcpu(CPUState *cpu); diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h index 756a3c0..9247493 100644 --- a/include/system/kvm_int.h +++ b/include/system/kvm_int.h @@ -14,6 +14,7 @@ #include "qemu/accel.h" #include "qemu/queue.h" #include "system/kvm.h" +#include "accel/accel-ops.h" #include "hw/boards.h" #include "hw/i386/topology.h" #include "io/channel-socket.h" diff --git a/include/system/memory.h b/include/system/memory.h index fbbf4cf..3bd5ffa 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -19,7 +19,6 @@ #include "exec/memattrs.h" #include "exec/memop.h" #include "exec/ramlist.h" -#include "exec/tswap.h" #include "qemu/bswap.h" #include "qemu/queue.h" #include "qemu/int128.h" @@ -109,15 +108,34 @@ struct MemoryRegionSection { typedef struct IOMMUTLBEntry IOMMUTLBEntry; -/* See address_space_translate: bit 0 is read, bit 1 is write. */ +/* + * See address_space_translate: + * - bit 0 : read + * - bit 1 : write + * - bit 2 : exec + * - bit 3 : priv + * - bit 4 : global + * - bit 5 : untranslated only + */ typedef enum { IOMMU_NONE = 0, IOMMU_RO = 1, IOMMU_WO = 2, IOMMU_RW = 3, + IOMMU_EXEC = 4, + IOMMU_PRIV = 8, + IOMMU_GLOBAL = 16, + IOMMU_UNTRANSLATED_ONLY = 32, } IOMMUAccessFlags; -#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0)) +#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | \ + ((w) ? IOMMU_WO : 0)) +#define IOMMU_ACCESS_FLAG_FULL(r, w, x, p, g, uo) \ + (IOMMU_ACCESS_FLAG(r, w) | \ + ((x) ? IOMMU_EXEC : 0) | \ + ((p) ? IOMMU_PRIV : 0) | \ + ((g) ? IOMMU_GLOBAL : 0) | \ + ((uo) ? 
IOMMU_UNTRANSLATED_ONLY : 0)) struct IOMMUTLBEntry { AddressSpace *target_as; @@ -125,6 +143,7 @@ struct IOMMUTLBEntry { hwaddr translated_addr; hwaddr addr_mask; /* 0xfff = 4k translation */ IOMMUAccessFlags perm; + uint32_t pasid; }; /* @@ -183,6 +202,7 @@ struct IOMMUNotifier { hwaddr start; hwaddr end; int iommu_idx; + void *opaque; QLIST_ENTRY(IOMMUNotifier) node; }; typedef struct IOMMUNotifier IOMMUNotifier; @@ -575,8 +595,20 @@ static inline void ram_discard_listener_init(RamDiscardListener *rdl, rdl->double_discard_supported = double_discard_supported; } -typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque); -typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque); +/** + * typedef ReplayRamDiscardState: + * + * The callback handler for #RamDiscardManagerClass.replay_populated/ + * #RamDiscardManagerClass.replay_discarded to invoke on populated/discarded + * parts. + * + * @section: the #MemoryRegionSection of populated/discarded part + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if failed. + */ +typedef int (*ReplayRamDiscardState)(MemoryRegionSection *section, + void *opaque); /* * RamDiscardManagerClass: @@ -650,36 +682,38 @@ struct RamDiscardManagerClass { /** * @replay_populated: * - * Call the #ReplayRamPopulate callback for all populated parts within the - * #MemoryRegionSection via the #RamDiscardManager. + * Call the #ReplayRamDiscardState callback for all populated parts within + * the #MemoryRegionSection via the #RamDiscardManager. * * In case any call fails, no further calls are made. * * @rdm: the #RamDiscardManager * @section: the #MemoryRegionSection - * @replay_fn: the #ReplayRamPopulate callback + * @replay_fn: the #ReplayRamDiscardState callback * @opaque: pointer to forward to the callback * * Returns 0 on success, or a negative error if any notification failed. */ int (*replay_populated)(const RamDiscardManager *rdm, MemoryRegionSection *section, - ReplayRamPopulate replay_fn, void *opaque); + ReplayRamDiscardState replay_fn, void *opaque); /** * @replay_discarded: * - * Call the #ReplayRamDiscard callback for all discarded parts within the - * #MemoryRegionSection via the #RamDiscardManager. + * Call the #ReplayRamDiscardState callback for all discarded parts within + * the #MemoryRegionSection via the #RamDiscardManager. * * @rdm: the #RamDiscardManager * @section: the #MemoryRegionSection - * @replay_fn: the #ReplayRamDiscard callback + * @replay_fn: the #ReplayRamDiscardState callback * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. */ - void (*replay_discarded)(const RamDiscardManager *rdm, - MemoryRegionSection *section, - ReplayRamDiscard replay_fn, void *opaque); + int (*replay_discarded)(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, void *opaque); /** * @register_listener: @@ -720,15 +754,41 @@ uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, const MemoryRegionSection *section); +/** + * ram_discard_manager_replay_populated: + * + * A wrapper to call the #RamDiscardManagerClass.replay_populated callback + * of the #RamDiscardManager. 
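
Both replay hooks now share one callback signature; a sketch of a ReplayRamDiscardState callback driven through the wrapper documented here (the byte-counting purpose is illustrative):

#include "qemu/osdep.h"
#include "system/memory.h"

/* Sketch: sum the sizes of all populated parts of a section. */
static int add_populated_bytes(MemoryRegionSection *section, void *opaque)
{
    uint64_t *total = opaque;

    *total += int128_get64(section->size);
    return 0; /* a non-zero return stops the replay with that error */
}

static uint64_t populated_bytes(const RamDiscardManager *rdm,
                                MemoryRegionSection *section)
{
    uint64_t total = 0;

    ram_discard_manager_replay_populated(rdm, section,
                                         add_populated_bytes, &total);
    return total;
}
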
+ * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, MemoryRegionSection *section, - ReplayRamPopulate replay_fn, + ReplayRamDiscardState replay_fn, void *opaque); -void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, - MemoryRegionSection *section, - ReplayRamDiscard replay_fn, - void *opaque); +/** + * ram_discard_manager_replay_discarded: + * + * A wrapper to call the #RamDiscardManagerClass.replay_discarded callback + * of the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ +int ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, + void *opaque); void ram_discard_manager_register_listener(RamDiscardManager *rdm, RamDiscardListener *rdl, @@ -738,21 +798,20 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm, RamDiscardListener *rdl); /** - * memory_get_xlat_addr: Extract addresses from a TLB entry + * memory_translate_iotlb: Extract addresses from a TLB entry. + * Called with rcu_read_lock held. * * @iotlb: pointer to an #IOMMUTLBEntry - * @vaddr: virtual address - * @ram_addr: RAM address - * @read_only: indicates if writes are allowed - * @mr_has_discard_manager: indicates memory is controlled by a - * RamDiscardManager + * @xlat_p: return the offset of the entry from the start of the returned + * MemoryRegion. * @errp: pointer to Error*, to store an error if it happens. * - * Return: true on success, else false setting @errp with error. + * Return: On success, return the MemoryRegion containing the @iotlb translated + * addr. The MemoryRegion must not be accessed after rcu_read_unlock. + * On failure, return NULL, setting @errp with error. */ -bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, - ram_addr_t *ram_addr, bool *read_only, - bool *mr_has_discard_manager, Error **errp); +MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p, + Error **errp); typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; @@ -774,6 +833,7 @@ struct MemoryRegion { bool nonvolatile; bool rom_device; bool flush_coalesced_mmio; + bool lockless_io; bool unmergeable; uint8_t dirty_log_mask; bool is_iommu; @@ -1212,6 +1272,36 @@ MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s); void memory_region_section_free_copy(MemoryRegionSection *s); /** + * memory_region_section_intersect_range: Adjust the memory section to cover + * the intersection with the given range. + * + * @s: the #MemoryRegionSection to be adjusted + * @offset: the offset of the given range in the memory region + * @size: the size of the given range + * + * Returns false if the intersection is empty, otherwise returns true. 
+ */ +static inline bool memory_region_section_intersect_range(MemoryRegionSection *s, + uint64_t offset, + uint64_t size) +{ + uint64_t start = MAX(s->offset_within_region, offset); + Int128 end = int128_min(int128_add(int128_make64(s->offset_within_region), + s->size), + int128_add(int128_make64(offset), + int128_make64(size))); + + if (int128_le(end, int128_make64(start))) { + return false; + } + + s->offset_within_address_space += start - s->offset_within_region; + s->offset_within_region = start; + s->size = int128_sub(end, int128_make64(start)); + return true; +} + +/** * memory_region_init: Initialize a memory region * * The region typically acts as a container for other memory regions. Use @@ -2253,6 +2343,17 @@ void memory_region_set_flush_coalesced(MemoryRegion *mr); void memory_region_clear_flush_coalesced(MemoryRegion *mr); /** + * memory_region_enable_lockless_io: Enable lockless (BQL free) access. + * + * Enable BQL-free access for devices that are well prepared to handle + * locking during I/O themselves: either by doing fine-grained locking or + * by providing lock-free I/O schemes. + * + * @mr: the memory region to be updated. + */ +void memory_region_enable_lockless_io(MemoryRegion *mr); + +/** * memory_region_add_eventfd: Request an eventfd to be triggered when a word * is written to a location. * @@ -2469,13 +2570,13 @@ static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) * * This function must not be called for a mapped #MemoryRegion, a #MemoryRegion * that does not cover RAM, or a #MemoryRegion that already has a - * #RamDiscardManager assigned. + * #RamDiscardManager assigned. Returns 0 if the rdm is set successfully. * * @mr: the #MemoryRegion * @rdm: #RamDiscardManager to set */ -void memory_region_set_ram_discard_manager(MemoryRegion *mr, - RamDiscardManager *rdm); +int memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm); /** * memory_region_find: translate an address/size relative to a @@ -2626,15 +2727,33 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name); /** * address_space_destroy: destroy an address space * - * Releases all resources associated with an address space. After an address space - * is destroyed, its root memory region (given by address_space_init()) may be destroyed - * as well. + * Releases all resources associated with an address space. After an + * address space is destroyed, the reference the AddressSpace had to + * its root memory region is dropped, which may result in the + * destruction of that memory region as well. + * + * Note that destruction of the AddressSpace is done via RCU; + * it is therefore not valid to free the memory the AddressSpace + * struct is in until after that RCU callback has completed. + * If you want to g_free() the AddressSpace after destruction you + * can do that with address_space_destroy_free(). * * @as: address space to be destroyed */ void address_space_destroy(AddressSpace *as); /** + * address_space_destroy_free: destroy an address space and free it + * + * This does the same thing as address_space_destroy(), and then also + * frees (via g_free()) the AddressSpace itself once the destruction + * is complete.
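
A sketch of the intended use of memory_region_section_intersect_range above: clamping a throwaway copy of a listener's section to the range being notified (the section and notify_discard fields come from the full RamDiscardListener definition, which is not shown in this hunk):

/* Sketch: notify a discard listener only about the overlapping part. */
static void notify_discard_clamped(RamDiscardListener *rdl,
                                   uint64_t offset, uint64_t size)
{
    MemoryRegionSection tmp = *rdl->section;

    if (!memory_region_section_intersect_range(&tmp, offset, size)) {
        return; /* empty intersection, nothing to notify */
    }
    rdl->notify_discard(rdl, &tmp);
}
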
+ * + * @as: address space to be destroyed + */ +void address_space_destroy_free(AddressSpace *as); + +/** * address_space_remove_listeners: unregister all listeners of an address space * * Removes all callbacks previously registered with memory_listener_register() @@ -2876,6 +2995,8 @@ void address_space_cache_invalidate(MemoryRegionCache *cache, */ void address_space_cache_destroy(MemoryRegionCache *cache); +void address_space_flush_icache_range(AddressSpace *as, hwaddr addr, hwaddr len); + /* address_space_get_iotlb_entry: translate an address into an IOTLB * entry. Should be called from an RCU critical section. */ @@ -2928,6 +3049,15 @@ static inline MemoryRegion *address_space_translate(AddressSpace *as, bool address_space_access_valid(AddressSpace *as, hwaddr addr, hwaddr len, bool is_write, MemTxAttrs attrs); +/** + * address_space_is_io: check whether a guest physical address + * within an address space is I/O memory. + * + * @as: #AddressSpace to be accessed + * @addr: address within that address space + */ +bool address_space_is_io(AddressSpace *as, hwaddr addr); + /* address_space_map: map a physical memory region into a host virtual address * * May map a subset of the requested range, given by and returned in @plen. diff --git a/include/system/mshv.h b/include/system/mshv.h new file mode 100644 index 0000000..8b1fc20 --- /dev/null +++ b/include/system/mshv.h @@ -0,0 +1,64 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#ifndef QEMU_MSHV_H +#define QEMU_MSHV_H + +#include "qemu/osdep.h" +#include "qemu/accel.h" +#include "hw/hyperv/hyperv-proto.h" +#include "hw/hyperv/hvhdk.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "qapi/qapi-types-common.h" +#include "system/memory.h" +#include "accel/accel-ops.h" + +#ifdef COMPILING_PER_TARGET +#ifdef CONFIG_MSHV +#include <linux/mshv.h> +#define CONFIG_MSHV_IS_POSSIBLE +#endif +#else +#define CONFIG_MSHV_IS_POSSIBLE +#endif + +#define MSHV_MAX_MSI_ROUTES 4096 + +#define MSHV_PAGE_SHIFT 12 + +#ifdef CONFIG_MSHV_IS_POSSIBLE +extern bool mshv_allowed; +#define mshv_enabled() (mshv_allowed) +#define mshv_msi_via_irqfd_enabled() mshv_enabled() +#else /* CONFIG_MSHV_IS_POSSIBLE */ +#define mshv_enabled() false +#define mshv_msi_via_irqfd_enabled() mshv_enabled() +#endif + +typedef struct MshvState MshvState; +extern MshvState *mshv_state; + +/* interrupt */ +int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector, + uint32_t vp_index, bool logical_destination_mode, + bool level_triggered); + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev); +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev); +void mshv_irqchip_commit_routes(void); +void mshv_irqchip_release_virq(int virq); +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *n, + const EventNotifier *rn, int virq); +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *n, int virq); + +#endif diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h new file mode 100644 index 0000000..490563c --- /dev/null +++ b/include/system/mshv_int.h @@ -0,0 +1,155 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#ifndef QEMU_MSHV_INT_H +#define QEMU_MSHV_INT_H + +#define MSHV_MSR_ENTRIES_COUNT 64 + +#define MSHV_MAX_MEM_SLOTS 32 + +typedef struct hyperv_message hv_message; + +typedef struct MshvHvCallArgs { + void *base; + void *input_page; + void *output_page; +} MshvHvCallArgs; + +struct AccelCPUState { + int cpufd; + bool dirty; + MshvHvCallArgs hvcall_args; +}; + +typedef struct MshvMemoryListener { + MemoryListener listener; + int as_id; +} MshvMemoryListener; + +typedef struct MshvAddressSpace { + MshvMemoryListener *ml; + AddressSpace *as; +} MshvAddressSpace; + +typedef struct MshvMemorySlotManager { + size_t n_slots; + GList *slots; + QemuMutex mutex; +} MshvMemorySlotManager; + +struct MshvState { + AccelState parent_obj; + int vm; + MshvMemoryListener memory_listener; + /* number of listeners */ + int nr_as; + MshvAddressSpace *as; + int fd; + MshvMemorySlotManager msm; +}; + +typedef struct MshvMsiControl { + bool updated; + GHashTable *gsi_routes; +} MshvMsiControl; + +#define mshv_vcpufd(cpu) (cpu->accel->cpufd) + +/* cpu */ +typedef struct MshvFPU { + uint8_t fpr[8][16]; + uint16_t fcw; + uint16_t fsw; + uint8_t ftwx; + uint8_t pad1; + uint16_t last_opcode; + uint64_t last_ip; + uint64_t last_dp; + uint8_t xmm[16][16]; + uint32_t mxcsr; + uint32_t pad2; +} MshvFPU; + +typedef enum MshvVmExit { + MshvVmExitIgnore = 0, + MshvVmExitShutdown = 1, + MshvVmExitSpecial = 2, +} MshvVmExit; + +typedef enum MshvRemapResult { + MshvRemapOk = 0, + MshvRemapNoMapping = 1, + MshvRemapNoOverlap = 2, +} MshvRemapResult; + +void mshv_init_mmio_emu(void); +int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); +void mshv_remove_vcpu(int vm_fd, int cpu_fd); +int mshv_configure_vcpu(const CPUState *cpu, const MshvFPU *fpu, uint64_t xcr0); +int mshv_get_standard_regs(CPUState *cpu); +int mshv_get_special_regs(CPUState *cpu); +int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); +int mshv_load_regs(CPUState *cpu); +int mshv_store_regs(CPUState *cpu); +int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, + size_t n_regs); +int mshv_arch_put_registers(const CPUState *cpu); +void mshv_arch_init_vcpu(CPUState *cpu); +void mshv_arch_destroy_vcpu(CPUState *cpu); +void mshv_arch_amend_proc_features( + union hv_partition_synthetic_processor_features *features); +int mshv_arch_post_init_vm(int vm_fd); + +#if defined COMPILING_PER_TARGET && defined CONFIG_MSHV_IS_POSSIBLE +int mshv_hvcall(int fd, const struct mshv_root_hvcall *args); +#endif + +/* memory */ +typedef struct MshvMemorySlot { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + bool readonly; + bool mapped; +} MshvMemorySlot; + +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa); +int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, + bool is_secure_mode, bool instruction_fetch); +int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, + bool is_secure_mode); +void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, + bool add); +void mshv_init_memory_slot_manager(MshvState *mshv_state); + +/* msr */ +typedef struct MshvMsrEntry { + uint32_t index; + uint32_t reserved; + uint64_t data; +} MshvMsrEntry; + +typedef struct MshvMsrEntries { + MshvMsrEntry 
entries[MSHV_MSR_ENTRIES_COUNT]; + uint32_t nmsrs; +} MshvMsrEntries; + +int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs, + size_t n_msrs); + +/* interrupt */ +void mshv_init_msicontrol(void); +int mshv_reserve_ioapic_msi_routes(int vm_fd); + +#endif diff --git a/include/system/nvmm.h b/include/system/nvmm.h index 6971ddb..7390def 100644 --- a/include/system/nvmm.h +++ b/include/system/nvmm.h @@ -13,17 +13,18 @@ #define QEMU_NVMM_H #ifdef COMPILING_PER_TARGET - -#ifdef CONFIG_NVMM - -int nvmm_enabled(void); - -#else /* CONFIG_NVMM */ - -#define nvmm_enabled() (0) - -#endif /* CONFIG_NVMM */ - +# ifdef CONFIG_NVMM +# define CONFIG_NVMM_IS_POSSIBLE +# endif /* !CONFIG_NVMM */ +#else +# define CONFIG_NVMM_IS_POSSIBLE #endif /* COMPILING_PER_TARGET */ +#ifdef CONFIG_NVMM_IS_POSSIBLE +extern bool nvmm_allowed; +#define nvmm_enabled() (nvmm_allowed) +#else /* !CONFIG_NVMM_IS_POSSIBLE */ +#define nvmm_enabled() 0 +#endif /* !CONFIG_NVMM_IS_POSSIBLE */ + #endif /* QEMU_NVMM_H */ diff --git a/include/system/os-win32.h b/include/system/os-win32.h index 3aa6cee..22d72ba 100644 --- a/include/system/os-win32.h +++ b/include/system/os-win32.h @@ -168,11 +168,14 @@ static inline void qemu_funlockfile(FILE *f) #endif } -/* Helper for WSAEventSelect, to report errors */ +/* Helpers for WSAEventSelect() */ bool qemu_socket_select(int sockfd, WSAEVENT hEventObject, long lNetworkEvents, Error **errp); +void qemu_socket_select_nofail(int sockfd, WSAEVENT hEventObject, + long lNetworkEvents); bool qemu_socket_unselect(int sockfd, Error **errp); +void qemu_socket_unselect_nofail(int sockfd); /* We wrap all the sockets functions so that we can set errno based on * WSAGetLastError(), and use file-descriptors instead of SOCKET. diff --git a/include/system/physmem.h b/include/system/physmem.h new file mode 100644 index 0000000..879f6ea --- /dev/null +++ b/include/system/physmem.h @@ -0,0 +1,54 @@ +/* + * QEMU physical memory interfaces (target independent). + * + * Copyright (c) 2003 Fabrice Bellard + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#ifndef QEMU_SYSTEM_PHYSMEM_H +#define QEMU_SYSTEM_PHYSMEM_H + +#include "exec/hwaddr.h" +#include "exec/ramlist.h" + +#define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1) +#define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE)) + +bool physical_memory_get_dirty_flag(ram_addr_t addr, unsigned client); + +bool physical_memory_is_clean(ram_addr_t addr); + +uint8_t physical_memory_range_includes_clean(ram_addr_t start, + ram_addr_t length, + uint8_t mask); + +void physical_memory_set_dirty_flag(ram_addr_t addr, unsigned client); + +void physical_memory_set_dirty_range(ram_addr_t start, ram_addr_t length, + uint8_t mask); + +/* + * Contrary to physical_memory_sync_dirty_bitmap() this function returns + * the number of dirty pages in @bitmap passed as argument. On the other hand, + * physical_memory_sync_dirty_bitmap() returns newly dirtied pages that + * weren't set in the global migration bitmap. 
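
A sketch pairing two of the relocated physmem.h helpers (the DMA-completion context is invented):

#include "qemu/osdep.h"
#include "system/physmem.h"

/* Sketch: after a device writes guest RAM, record the range as dirty for
 * every client; migration can later test the per-page flag. */
static void dma_write_done(ram_addr_t addr, ram_addr_t len)
{
    physical_memory_set_dirty_range(addr, len, DIRTY_CLIENTS_ALL);
}

static bool page_dirty_for_migration(ram_addr_t addr)
{
    return physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);
}
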
+ */ +uint64_t physical_memory_set_dirty_lebitmap(unsigned long *bitmap, + ram_addr_t start, + ram_addr_t pages); + +void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length); + +bool physical_memory_test_and_clear_dirty(ram_addr_t start, + ram_addr_t length, + unsigned client); + +DirtyBitmapSnapshot * +physical_memory_snapshot_and_clear_dirty(MemoryRegion *mr, hwaddr offset, + hwaddr length, unsigned client); + +bool physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, + ram_addr_t start, + ram_addr_t length); + +#endif diff --git a/include/system/ram_addr.h b/include/system/ram_addr.h index 15a1b1a..6834859 100644 --- a/include/system/ram_addr.h +++ b/include/system/ram_addr.h @@ -19,17 +19,9 @@ #ifndef SYSTEM_RAM_ADDR_H #define SYSTEM_RAM_ADDR_H -#include "system/xen.h" -#include "system/tcg.h" -#include "exec/cputlb.h" -#include "exec/ramlist.h" #include "system/ramblock.h" -#include "system/memory.h" #include "exec/target_page.h" -#include "qemu/rcu.h" - #include "exec/hwaddr.h" -#include "exec/cpu-common.h" extern uint64_t total_dirty_pages; @@ -80,17 +72,6 @@ static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page) return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1); } -static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset) -{ - return (b && b->host && offset < b->used_length) ? true : false; -} - -static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset) -{ - assert(offset_in_ramblock(block, offset)); - return (char *)block->host + offset; -} - static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr, RAMBlock *rb) { @@ -99,8 +80,6 @@ static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr, return host_addr_offset >> TARGET_PAGE_BITS; } -bool ramblock_is_pmem(RAMBlock *rb); - /** * qemu_ram_alloc_from_file, * qemu_ram_alloc_from_fd: Allocate a ram block from the specified backing @@ -153,409 +132,4 @@ static inline void qemu_ram_block_writeback(RAMBlock *block) qemu_ram_msync(block, 0, block->used_length); } -#define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1) -#define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE)) - -static inline bool cpu_physical_memory_get_dirty(ram_addr_t start, - ram_addr_t length, - unsigned client) -{ - DirtyMemoryBlocks *blocks; - unsigned long end, page; - unsigned long idx, offset, base; - bool dirty = false; - - assert(client < DIRTY_MEMORY_NUM); - - end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; - page = start >> TARGET_PAGE_BITS; - - WITH_RCU_READ_LOCK_GUARD() { - blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); - - idx = page / DIRTY_MEMORY_BLOCK_SIZE; - offset = page % DIRTY_MEMORY_BLOCK_SIZE; - base = page - offset; - while (page < end) { - unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE); - unsigned long num = next - base; - unsigned long found = find_next_bit(blocks->blocks[idx], - num, offset); - if (found < num) { - dirty = true; - break; - } - - page = next; - idx++; - offset = 0; - base += DIRTY_MEMORY_BLOCK_SIZE; - } - } - - return dirty; -} - -static inline bool cpu_physical_memory_all_dirty(ram_addr_t start, - ram_addr_t length, - unsigned client) -{ - DirtyMemoryBlocks *blocks; - unsigned long end, page; - unsigned long idx, offset, base; - bool dirty = true; - - assert(client < DIRTY_MEMORY_NUM); - - end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; - page = start >> TARGET_PAGE_BITS; - - RCU_READ_LOCK_GUARD(); - - blocks = 
qatomic_rcu_read(&ram_list.dirty_memory[client]); - - idx = page / DIRTY_MEMORY_BLOCK_SIZE; - offset = page % DIRTY_MEMORY_BLOCK_SIZE; - base = page - offset; - while (page < end) { - unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE); - unsigned long num = next - base; - unsigned long found = find_next_zero_bit(blocks->blocks[idx], num, offset); - if (found < num) { - dirty = false; - break; - } - - page = next; - idx++; - offset = 0; - base += DIRTY_MEMORY_BLOCK_SIZE; - } - - return dirty; -} - -static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr, - unsigned client) -{ - return cpu_physical_memory_get_dirty(addr, 1, client); -} - -static inline bool cpu_physical_memory_is_clean(ram_addr_t addr) -{ - bool vga = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_VGA); - bool code = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_CODE); - bool migration = - cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION); - return !(vga && code && migration); -} - -static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start, - ram_addr_t length, - uint8_t mask) -{ - uint8_t ret = 0; - - if (mask & (1 << DIRTY_MEMORY_VGA) && - !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_VGA)) { - ret |= (1 << DIRTY_MEMORY_VGA); - } - if (mask & (1 << DIRTY_MEMORY_CODE) && - !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_CODE)) { - ret |= (1 << DIRTY_MEMORY_CODE); - } - if (mask & (1 << DIRTY_MEMORY_MIGRATION) && - !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_MIGRATION)) { - ret |= (1 << DIRTY_MEMORY_MIGRATION); - } - return ret; -} - -static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr, - unsigned client) -{ - unsigned long page, idx, offset; - DirtyMemoryBlocks *blocks; - - assert(client < DIRTY_MEMORY_NUM); - - page = addr >> TARGET_PAGE_BITS; - idx = page / DIRTY_MEMORY_BLOCK_SIZE; - offset = page % DIRTY_MEMORY_BLOCK_SIZE; - - RCU_READ_LOCK_GUARD(); - - blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]); - - set_bit_atomic(offset, blocks->blocks[idx]); -} - -static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start, - ram_addr_t length, - uint8_t mask) -{ - DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM]; - unsigned long end, page; - unsigned long idx, offset, base; - int i; - - if (!mask && !xen_enabled()) { - return; - } - - end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; - page = start >> TARGET_PAGE_BITS; - - WITH_RCU_READ_LOCK_GUARD() { - for (i = 0; i < DIRTY_MEMORY_NUM; i++) { - blocks[i] = qatomic_rcu_read(&ram_list.dirty_memory[i]); - } - - idx = page / DIRTY_MEMORY_BLOCK_SIZE; - offset = page % DIRTY_MEMORY_BLOCK_SIZE; - base = page - offset; - while (page < end) { - unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE); - - if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) { - bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx], - offset, next - page); - } - if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) { - bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx], - offset, next - page); - } - if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) { - bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx], - offset, next - page); - } - - page = next; - idx++; - offset = 0; - base += DIRTY_MEMORY_BLOCK_SIZE; - } - } - - if (xen_enabled()) { - xen_hvm_modified_memory(start, length); - } -} - -#if !defined(_WIN32) - -/* - * Contrary to cpu_physical_memory_sync_dirty_bitmap() this function returns - * the number of dirty pages in @bitmap 
passed as argument. On the other hand, - * cpu_physical_memory_sync_dirty_bitmap() returns newly dirtied pages that - * weren't set in the global migration bitmap. - */ -static inline -uint64_t cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap, - ram_addr_t start, - ram_addr_t pages) -{ - unsigned long i, j; - unsigned long page_number, c, nbits; - hwaddr addr; - ram_addr_t ram_addr; - uint64_t num_dirty = 0; - unsigned long len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS; - unsigned long hpratio = qemu_real_host_page_size() / TARGET_PAGE_SIZE; - unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS); - - /* start address is aligned at the start of a word? */ - if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) && - (hpratio == 1)) { - unsigned long **blocks[DIRTY_MEMORY_NUM]; - unsigned long idx; - unsigned long offset; - long k; - long nr = BITS_TO_LONGS(pages); - - idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE; - offset = BIT_WORD((start >> TARGET_PAGE_BITS) % - DIRTY_MEMORY_BLOCK_SIZE); - - WITH_RCU_READ_LOCK_GUARD() { - for (i = 0; i < DIRTY_MEMORY_NUM; i++) { - blocks[i] = - qatomic_rcu_read(&ram_list.dirty_memory[i])->blocks; - } - - for (k = 0; k < nr; k++) { - if (bitmap[k]) { - unsigned long temp = leul_to_cpu(bitmap[k]); - - nbits = ctpopl(temp); - qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp); - - if (global_dirty_tracking) { - qatomic_or( - &blocks[DIRTY_MEMORY_MIGRATION][idx][offset], - temp); - if (unlikely( - global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) { - total_dirty_pages += nbits; - } - } - - num_dirty += nbits; - - if (tcg_enabled()) { - qatomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset], - temp); - } - } - - if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) { - offset = 0; - idx++; - } - } - } - - if (xen_enabled()) { - xen_hvm_modified_memory(start, pages << TARGET_PAGE_BITS); - } - } else { - uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE; - - if (!global_dirty_tracking) { - clients &= ~(1 << DIRTY_MEMORY_MIGRATION); - } - - /* - * bitmap-traveling is faster than memory-traveling (for addr...) - * especially when most of the memory is not dirty. 
- */ - for (i = 0; i < len; i++) { - if (bitmap[i] != 0) { - c = leul_to_cpu(bitmap[i]); - nbits = ctpopl(c); - if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) { - total_dirty_pages += nbits; - } - num_dirty += nbits; - do { - j = ctzl(c); - c &= ~(1ul << j); - page_number = (i * HOST_LONG_BITS + j) * hpratio; - addr = page_number * TARGET_PAGE_SIZE; - ram_addr = start + addr; - cpu_physical_memory_set_dirty_range(ram_addr, - TARGET_PAGE_SIZE * hpratio, clients); - } while (c != 0); - } - } - } - - return num_dirty; -} -#endif /* not _WIN32 */ - -static inline void cpu_physical_memory_dirty_bits_cleared(ram_addr_t start, - ram_addr_t length) -{ - if (tcg_enabled()) { - tlb_reset_dirty_range_all(start, length); - } - -} -bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, - ram_addr_t length, - unsigned client); - -DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty - (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client); - -bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, - ram_addr_t start, - ram_addr_t length); - -static inline void cpu_physical_memory_clear_dirty_range(ram_addr_t start, - ram_addr_t length) -{ - cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_MIGRATION); - cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_VGA); - cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_CODE); -} - - -/* Called with RCU critical section */ -static inline -uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, - ram_addr_t start, - ram_addr_t length) -{ - ram_addr_t addr; - unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS); - uint64_t num_dirty = 0; - unsigned long *dest = rb->bmap; - - /* start address and length is aligned at the start of a word? */ - if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) == - (start + rb->offset) && - !(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) { - int k; - int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS); - unsigned long * const *src; - unsigned long idx = (word * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE; - unsigned long offset = BIT_WORD((word * BITS_PER_LONG) % - DIRTY_MEMORY_BLOCK_SIZE); - unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS); - - src = qatomic_rcu_read( - &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks; - - for (k = page; k < page + nr; k++) { - if (src[idx][offset]) { - unsigned long bits = qatomic_xchg(&src[idx][offset], 0); - unsigned long new_dirty; - new_dirty = ~dest[k]; - dest[k] |= bits; - new_dirty &= bits; - num_dirty += ctpopl(new_dirty); - } - - if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) { - offset = 0; - idx++; - } - } - if (num_dirty) { - cpu_physical_memory_dirty_bits_cleared(start, length); - } - - if (rb->clear_bmap) { - /* - * Postpone the dirty bitmap clear to the point before we - * really send the pages, also we will split the clear - * dirty procedure into smaller chunks. 
- */
-            clear_bmap_set(rb, start >> TARGET_PAGE_BITS,
-                           length >> TARGET_PAGE_BITS);
-        } else {
-            /* Slow path - still do that in a huge chunk */
-            memory_region_clear_dirty_bitmap(rb->mr, start, length);
-        }
-    } else {
-        ram_addr_t offset = rb->offset;
-
-        for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
-            if (cpu_physical_memory_test_and_clear_dirty(
-                        start + addr + offset,
-                        TARGET_PAGE_SIZE,
-                        DIRTY_MEMORY_MIGRATION)) {
-                long k = (start + addr) >> TARGET_PAGE_BITS;
-                if (!test_and_set_bit(k, dest)) {
-                    num_dirty++;
-                }
-            }
-        }
-    }
-
-    return num_dirty;
-}
-
 #endif
diff --git a/include/system/ramblock.h b/include/system/ramblock.h
index d8a116b..76694fe 100644
--- a/include/system/ramblock.h
+++ b/include/system/ramblock.h
@@ -11,17 +11,16 @@
  *
  */
 
-/*
- * This header is for use by exec.c and memory.c ONLY. Do not include it.
- * The functions declared here will be removed soon.
- */
-
 #ifndef SYSTEM_RAMBLOCK_H
 #define SYSTEM_RAMBLOCK_H
 
 #include "exec/cpu-common.h"
 #include "qemu/rcu.h"
 #include "exec/ramlist.h"
+#include "system/hostmem.h"
+
+#define TYPE_RAM_BLOCK_ATTRIBUTES "ram-block-attributes"
+OBJECT_DECLARE_SIMPLE_TYPE(RamBlockAttributes, RAM_BLOCK_ATTRIBUTES)
 
 struct RAMBlock {
     struct rcu_head rcu;
@@ -42,6 +41,7 @@ struct RAMBlock {
     int fd;
     uint64_t fd_offset;
     int guest_memfd;
+    RamBlockAttributes *attributes;
     size_t page_size;
     /* dirty bitmap used during migration */
     unsigned long *bmap;
@@ -91,4 +91,43 @@ struct RAMBlock {
     ram_addr_t postcopy_length;
 };
 
+struct RamBlockAttributes {
+    Object parent;
+
+    RAMBlock *ram_block;
+
+    /* A set bit in the bitmap means the page is populated (shared) */
+    unsigned bitmap_size;
+    unsigned long *bitmap;
+
+    QLIST_HEAD(, RamDiscardListener) rdl_list;
+};
+
+/* @offset: the offset within the RAMBlock */
+int ram_block_discard_range(RAMBlock *rb, uint64_t offset, size_t length);
+/* @offset: the offset within the RAMBlock */
+int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t offset,
+                                        size_t length);
+
+RamBlockAttributes *ram_block_attributes_create(RAMBlock *ram_block);
+void ram_block_attributes_destroy(RamBlockAttributes *attr);
+int ram_block_attributes_state_change(RamBlockAttributes *attr, uint64_t offset,
+                                      uint64_t size, bool to_discard);
+
+/**
+ * ram_block_is_pmem: Whether the RAM block is of persistent memory
+ */
+bool ram_block_is_pmem(RAMBlock *rb);
+
+static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
+{
+    return b && b->host && (offset < b->used_length);
+}
+
+static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
+{
+    assert(offset_in_ramblock(block, offset));
+    return (char *)block->host + offset;
+}
+
 #endif
diff --git a/include/system/runstate.h b/include/system/runstate.h
index bffc371..929379a 100644
--- a/include/system/runstate.h
+++ b/include/system/runstate.h
@@ -12,29 +12,76 @@ bool runstate_needs_reset(void);
 void runstate_replay_enable(void);
 
 typedef void VMChangeStateHandler(void *opaque, bool running, RunState state);
+typedef int VMChangeStateHandlerWithRet(void *opaque, bool running, RunState state);
 
+/**
+ * qemu_add_vm_change_state_handler:
+ * @cb: the callback to invoke
+ * @opaque: user data passed to the callback
+ *
+ * Register a callback function that is invoked when the vm starts or stops
+ * running.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
 VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
                                                      void *opaque);
+/**
+ * qemu_add_vm_change_state_handler_prio:
+ * @cb: the callback to invoke
+ * @opaque: user data passed to the callback
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ *            true when the vm stops
+ *
+ * Register a callback function that is invoked when the vm starts or stops
+ * running.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
 VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
         VMChangeStateHandler *cb, void *opaque, int priority);
 VMChangeStateEntry *
+/**
+ * qemu_add_vm_change_state_handler_prio_full:
+ * @cb: the main callback to invoke
+ * @prepare_cb: a callback to invoke before the main callback
+ * @cb_ret: the main callback to invoke, with a return value
+ * @opaque: user data passed to the callbacks
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ *            true when the vm stops
+ *
+ * Register a main callback function and an optional prepare callback function
+ * that are invoked when the vm starts or stops running. The main callback and
+ * the prepare callback are called in two separate phases: First all prepare
+ * callbacks are called and only then all main callbacks are called. As its
+ * name suggests, the prepare callback can be used to do some preparatory work
+ * before invoking the main callback.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
 qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb,
                                            VMChangeStateHandler *prepare_cb,
+                                           VMChangeStateHandlerWithRet *cb_ret,
                                            void *opaque, int priority);
 VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev,
                                                      VMChangeStateHandler *cb,
+                                                     VMChangeStateHandlerWithRet *cb_ret,
                                                      void *opaque);
 VMChangeStateEntry *qdev_add_vm_change_state_handler_full(
-    DeviceState *dev, VMChangeStateHandler *cb,
-    VMChangeStateHandler *prepare_cb, void *opaque);
+    DeviceState *dev, VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb,
+    VMChangeStateHandlerWithRet *cb_ret, void *opaque);
 void qemu_del_vm_change_state_handler(VMChangeStateEntry *e);
 
 /**
  * vm_state_notify: Notify the state of the VM
  *
  * @running: whether the VM is running or not.
  * @state: the #RunState of the VM.
+ *
+ * Returns the result of the callback that has a return value.
+ * If no callback has a return value, 0 is returned and the
+ * upper layer needs no additional processing.
 */
-void vm_state_notify(bool running, RunState state);
+int vm_state_notify(bool running, RunState state);
 
 static inline bool shutdown_caused_by_guest(ShutdownCause cause)
 {
@@ -100,6 +147,7 @@ void qemu_system_vmstop_request(RunState reason);
 void qemu_system_vmstop_request_prepare(void);
 bool qemu_vmstop_requested(RunState *r);
 ShutdownCause qemu_shutdown_requested_get(void);
+bool qemu_force_shutdown_requested(void);
 ShutdownCause qemu_reset_requested_get(void);
 void qemu_system_killed(int signal, pid_t pid);
 void qemu_system_reset(ShutdownCause reason);
diff --git a/include/system/system.h b/include/system/system.h
index a7effe7..03a2d0e 100644
--- a/include/system/system.h
+++ b/include/system/system.h
@@ -42,7 +42,6 @@ extern int graphic_height;
 extern int graphic_depth;
 extern int display_opengl;
 extern const char *keyboard_layout;
-extern int old_param;
 extern uint8_t *boot_splash_filedata;
 extern bool enable_cpu_pm;
 extern QEMUClockType rtc_clock;
diff --git a/include/system/vhost-user-backend.h b/include/system/vhost-user-backend.h
index 5ed953c..5634ebd 100644
--- a/include/system/vhost-user-backend.h
+++ b/include/system/vhost-user-backend.h
@@ -43,6 +43,6 @@ struct VhostUserBackend {
 int vhost_user_backend_dev_init(VhostUserBackend *b, VirtIODevice *vdev,
                                 unsigned nvqs, Error **errp);
 void vhost_user_backend_start(VhostUserBackend *b);
-void vhost_user_backend_stop(VhostUserBackend *b);
+int vhost_user_backend_stop(VhostUserBackend *b);
 
 #endif
diff --git a/include/system/whpx.h b/include/system/whpx.h
index 00ff409..00f6a3e 100644
--- a/include/system/whpx.h
+++ b/include/system/whpx.h
@@ -16,19 +16,20 @@
 #define QEMU_WHPX_H
 
 #ifdef COMPILING_PER_TARGET
+# ifdef CONFIG_WHPX
+#  define CONFIG_WHPX_IS_POSSIBLE
+# endif /* CONFIG_WHPX */
+#else
+# define CONFIG_WHPX_IS_POSSIBLE
+#endif /* COMPILING_PER_TARGET */
 
-#ifdef CONFIG_WHPX
-
-int whpx_enabled(void);
+#ifdef CONFIG_WHPX_IS_POSSIBLE
+extern bool whpx_allowed;
+#define whpx_enabled() (whpx_allowed)
 bool whpx_apic_in_platform(void);
-
-#else /* CONFIG_WHPX */
-
-#define whpx_enabled() (0)
+#else /* !CONFIG_WHPX_IS_POSSIBLE */
+#define whpx_enabled() 0
 #define whpx_apic_in_platform() (0)
-
-#endif /* CONFIG_WHPX */
-
-#endif /* COMPILING_PER_TARGET */
+#endif /* !CONFIG_WHPX_IS_POSSIBLE */
 
 #endif /* QEMU_WHPX_H */
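
The cpu_physical_memory_set_dirty_lebitmap() path shown in the ram_addr.h hunks above consumes a little-endian bitmap such as the one KVM's dirty log produces. A minimal feeding sketch, assuming a slot descriptor filled elsewhere; DirtySlot and slot_sync_dirty_log are hypothetical names, not part of the patch:

#include "qemu/osdep.h"
#include "system/ram_addr.h"   /* or wherever the helper now lives */

/* Hypothetical slot descriptor; real accelerators keep their own. */
typedef struct {
    unsigned long *dirty_bmap;  /* little-endian bitmap from the kernel */
    ram_addr_t start;           /* guest ram_addr_t where the slot begins */
    ram_addr_t npages;          /* slot size in target pages */
} DirtySlot;

static uint64_t slot_sync_dirty_log(DirtySlot *slot)
{
    /* Marks the VGA/CODE/MIGRATION clients dirty as appropriate and
     * returns how many pages were newly dirtied. */
    return cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap,
                                                  slot->start,
                                                  slot->npages);
}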
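
Its counterpart cpu_physical_memory_sync_dirty_bitmap() must run inside an RCU critical section; it folds the DIRTY_MEMORY_MIGRATION bits into the block's own bmap and returns only the pages that were not already set there. A caller sketch, assuming a valid RAMBlock *rb; sync_block_dirty is a hypothetical helper name:

#include "qemu/osdep.h"
#include "qemu/rcu.h"
#include "system/ram_addr.h"   /* or wherever the helper now lives */

/* Hypothetical helper: pull the migration-dirty bits of a whole block
 * into rb->bmap and report how many pages became newly dirty. */
static uint64_t sync_block_dirty(RAMBlock *rb)
{
    uint64_t newly_dirty = 0;

    /* The header requires the caller to hold the RCU read lock. */
    WITH_RCU_READ_LOCK_GUARD() {
        newly_dirty = cpu_physical_memory_sync_dirty_bitmap(rb, 0,
                                                            rb->used_length);
    }
    return newly_dirty;
}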
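
The new RamBlockAttributes object in ramblock.h tracks, page by page, whether guest_memfd-backed RAM is populated (shared) or discarded (private), and ram_block_attributes_state_change() flips a range between the two states. A sketch under the assumptions that rb->attributes was set up with ram_block_attributes_create() and that offset/size meet the block's alignment requirements; share_range is a hypothetical name:

#include "qemu/osdep.h"
#include "system/ramblock.h"

/* Hypothetical helper: convert a range of a guest_memfd-backed block
 * to the populated (shared) state; pass true to discard it instead. */
static int share_range(RAMBlock *rb, uint64_t offset, uint64_t size)
{
    return ram_block_attributes_state_change(rb->attributes, offset,
                                             size, false /* to_discard */);
}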
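
The runstate.h changes let a handler report failure through the new VMChangeStateHandlerWithRet type, whose result vm_state_notify() now propagates to its caller. A hypothetical registration sketch; MyDev and mydev_flush() are placeholders, not QEMU APIs:

#include "qemu/osdep.h"
#include "system/runstate.h"

typedef struct MyDev MyDev;          /* placeholder device type */
extern bool mydev_flush(MyDev *d);   /* placeholder flush routine */

/* Returning non-zero when the VM stops surfaces the error upward. */
static int mydev_vm_state_change(void *opaque, bool running, RunState state)
{
    MyDev *d = opaque;

    if (running) {
        return 0;
    }
    return mydev_flush(d) ? 0 : -EIO;
}

/* Registration: only the cb_ret slot is used here (cb and prepare_cb
 * are assumed to be optional and left NULL). */
static VMChangeStateEntry *mydev_register(MyDev *d)
{
    return qemu_add_vm_change_state_handler_prio_full(NULL, NULL,
                                                      mydev_vm_state_change,
                                                      d, 0);
}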
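
Finally, whpx.h now follows the CONFIG_*_IS_POSSIBLE pattern: whpx_enabled() is callable from common (non-per-target) code and folds to the constant 0 wherever WHPX cannot be configured in, so the branch below is eliminated on non-WHPX hosts. A trivial sketch with a hypothetical function name:

#include "qemu/osdep.h"
#include "system/whpx.h"

/* Compiles everywhere; on non-WHPX hosts the condition is constant 0. */
static const char *accel_name_hint(void)
{
    return whpx_enabled() ? "whpx" : "other";
}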