aboutsummaryrefslogtreecommitdiff
path: root/include/system
diff options
context:
space:
mode:
Diffstat (limited to 'include/system')
-rw-r--r--include/system/accel-irq.h37
-rw-r--r--include/system/accel-ops.h73
-rw-r--r--include/system/block-backend-global-state.h8
-rw-r--r--include/system/confidential-guest-support.h88
-rw-r--r--include/system/cpus.h8
-rw-r--r--include/system/host_iommu_device.h15
-rw-r--r--include/system/hvf.h37
-rw-r--r--include/system/hvf_int.h39
-rw-r--r--include/system/hw_accel.h20
-rw-r--r--include/system/igvm-cfg.h49
-rw-r--r--include/system/iommufd.h65
-rw-r--r--include/system/kvm.h44
-rw-r--r--include/system/kvm_int.h1
-rw-r--r--include/system/memory.h202
-rw-r--r--include/system/mshv.h64
-rw-r--r--include/system/mshv_int.h155
-rw-r--r--include/system/nvmm.h23
-rw-r--r--include/system/os-win32.h5
-rw-r--r--include/system/physmem.h54
-rw-r--r--include/system/ram_addr.h426
-rw-r--r--include/system/ramblock.h49
-rw-r--r--include/system/runstate.h54
-rw-r--r--include/system/system.h1
-rw-r--r--include/system/vhost-user-backend.h2
-rw-r--r--include/system/whpx.h23
25 files changed, 889 insertions, 653 deletions
diff --git a/include/system/accel-irq.h b/include/system/accel-irq.h
new file mode 100644
index 0000000..671fb7d
--- /dev/null
+++ b/include/system/accel-irq.h
@@ -0,0 +1,37 @@
+/*
+ * Accelerated irqchip abstraction
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com>
+ * Magnus Kulke <magnuskulke@microsoft.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef SYSTEM_ACCEL_IRQ_H
+#define SYSTEM_ACCEL_IRQ_H
+#include "hw/pci/msi.h"
+#include "qemu/osdep.h"
+#include "system/kvm.h"
+#include "system/mshv.h"
+
+static inline bool accel_msi_via_irqfd_enabled(void)
+{
+ return mshv_msi_via_irqfd_enabled() || kvm_msi_via_irqfd_enabled();
+}
+
+static inline bool accel_irqchip_is_split(void)
+{
+ return mshv_msi_via_irqfd_enabled() || kvm_irqchip_is_split();
+}
+
+int accel_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev);
+int accel_irqchip_update_msi_route(int vector, MSIMessage msg, PCIDevice *dev);
+void accel_irqchip_commit_route_changes(KVMRouteChange *c);
+void accel_irqchip_commit_routes(void);
+void accel_irqchip_release_virq(int virq);
+int accel_irqchip_add_irqfd_notifier_gsi(EventNotifier *n, EventNotifier *rn,
+ int virq);
+int accel_irqchip_remove_irqfd_notifier_gsi(EventNotifier *n, int virq);
+#endif
diff --git a/include/system/accel-ops.h b/include/system/accel-ops.h
deleted file mode 100644
index 4c99d25..0000000
--- a/include/system/accel-ops.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Accelerator OPS, used for cpus.c module
- *
- * Copyright 2021 SUSE LLC
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- */
-
-#ifndef ACCEL_OPS_H
-#define ACCEL_OPS_H
-
-#include "exec/vaddr.h"
-#include "qom/object.h"
-
-#define ACCEL_OPS_SUFFIX "-ops"
-#define TYPE_ACCEL_OPS "accel" ACCEL_OPS_SUFFIX
-#define ACCEL_OPS_NAME(name) (name "-" TYPE_ACCEL_OPS)
-
-DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS)
-
-/**
- * struct AccelOpsClass - accelerator interfaces
- *
- * This structure is used to abstract accelerator differences from the
- * core CPU code. Not all have to be implemented.
- */
-struct AccelOpsClass {
- /*< private >*/
- ObjectClass parent_class;
- /*< public >*/
-
- /* initialization function called when accel is chosen */
- void (*ops_init)(AccelOpsClass *ops);
-
- bool (*cpus_are_resettable)(void);
- void (*cpu_reset_hold)(CPUState *cpu);
-
- void (*create_vcpu_thread)(CPUState *cpu); /* MANDATORY NON-NULL */
- void (*kick_vcpu_thread)(CPUState *cpu);
- bool (*cpu_thread_is_idle)(CPUState *cpu);
-
- void (*synchronize_post_reset)(CPUState *cpu);
- void (*synchronize_post_init)(CPUState *cpu);
- void (*synchronize_state)(CPUState *cpu);
- void (*synchronize_pre_loadvm)(CPUState *cpu);
- void (*synchronize_pre_resume)(bool step_pending);
-
- void (*handle_interrupt)(CPUState *cpu, int mask);
-
- /**
- * @get_virtual_clock: fetch virtual clock
- * @set_virtual_clock: set virtual clock
- *
- * These allow the timer subsystem to defer to the accelerator to
- * fetch time. The set function is needed if the accelerator wants
- * to track the changes to time as the timer is warped through
- * various timer events.
- */
- int64_t (*get_virtual_clock)(void);
- void (*set_virtual_clock)(int64_t time);
-
- int64_t (*get_elapsed_ticks)(void);
-
- /* gdbstub hooks */
- bool (*supports_guest_debug)(void);
- int (*update_guest_debug)(CPUState *cpu);
- int (*insert_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len);
- int (*remove_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len);
- void (*remove_all_breakpoints)(CPUState *cpu);
-};
-
-#endif /* ACCEL_OPS_H */
diff --git a/include/system/block-backend-global-state.h b/include/system/block-backend-global-state.h
index 35b5e83..c384964 100644
--- a/include/system/block-backend-global-state.h
+++ b/include/system/block-backend-global-state.h
@@ -55,7 +55,7 @@ void monitor_remove_blk(BlockBackend *blk);
BlockBackendPublic *blk_get_public(BlockBackend *blk);
-void blk_remove_bs(BlockBackend *blk);
+void GRAPH_UNLOCKED blk_remove_bs(BlockBackend *blk);
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp);
int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp);
bool GRAPH_RDLOCK bdrv_has_blk(BlockDriverState *bs);
@@ -78,8 +78,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags);
void blk_aio_cancel(BlockAIOCB *acb);
int blk_commit_all(void);
bool blk_in_drain(BlockBackend *blk);
-void blk_drain(BlockBackend *blk);
-void blk_drain_all(void);
+void GRAPH_UNLOCKED blk_drain(BlockBackend *blk);
+void GRAPH_UNLOCKED blk_drain_all(void);
void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
BlockdevOnError on_write_error);
bool blk_supports_write_perm(BlockBackend *blk);
@@ -109,7 +109,7 @@ int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz);
int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo);
void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg);
-void blk_io_limits_disable(BlockBackend *blk);
+void GRAPH_UNLOCKED blk_io_limits_disable(BlockBackend *blk);
void blk_io_limits_enable(BlockBackend *blk, const char *group);
void blk_io_limits_update_group(BlockBackend *blk, const char *group);
void blk_set_force_allow_inactivate(BlockBackend *blk);
diff --git a/include/system/confidential-guest-support.h b/include/system/confidential-guest-support.h
index ea46b50..0cc8b26 100644
--- a/include/system/confidential-guest-support.h
+++ b/include/system/confidential-guest-support.h
@@ -19,6 +19,7 @@
#define QEMU_CONFIDENTIAL_GUEST_SUPPORT_H
#include "qom/object.h"
+#include "exec/hwaddr.h"
#define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support"
OBJECT_DECLARE_TYPE(ConfidentialGuestSupport,
@@ -26,6 +27,40 @@ OBJECT_DECLARE_TYPE(ConfidentialGuestSupport,
CONFIDENTIAL_GUEST_SUPPORT)
+typedef enum ConfidentialGuestPlatformType {
+ CGS_PLATFORM_SEV,
+ CGS_PLATFORM_SEV_ES,
+ CGS_PLATFORM_SEV_SNP,
+} ConfidentialGuestPlatformType;
+
+typedef enum ConfidentialGuestMemoryType {
+ CGS_MEM_RAM,
+ CGS_MEM_RESERVED,
+ CGS_MEM_ACPI,
+ CGS_MEM_NVS,
+ CGS_MEM_UNUSABLE,
+} ConfidentialGuestMemoryType;
+
+typedef struct ConfidentialGuestMemoryMapEntry {
+ uint64_t gpa;
+ uint64_t size;
+ ConfidentialGuestMemoryType type;
+} ConfidentialGuestMemoryMapEntry;
+
+typedef enum ConfidentialGuestPageType {
+ CGS_PAGE_TYPE_NORMAL,
+ CGS_PAGE_TYPE_VMSA,
+ CGS_PAGE_TYPE_ZERO,
+ CGS_PAGE_TYPE_UNMEASURED,
+ CGS_PAGE_TYPE_SECRETS,
+ CGS_PAGE_TYPE_CPUID,
+ CGS_PAGE_TYPE_REQUIRED_MEMORY,
+} ConfidentialGuestPageType;
+
+typedef enum ConfidentialGuestPolicyType {
+ GUEST_POLICY_SEV,
+} ConfidentialGuestPolicyType;
+
struct ConfidentialGuestSupport {
Object parent;
@@ -64,6 +99,59 @@ typedef struct ConfidentialGuestSupportClass {
int (*kvm_init)(ConfidentialGuestSupport *cgs, Error **errp);
int (*kvm_reset)(ConfidentialGuestSupport *cgs, Error **errp);
+
+ /*
+ * Check to see if this confidential guest supports a particular
+ * platform or configuration.
+ *
+ * Return true if supported or false if not supported.
+ */
+ bool (*check_support)(ConfidentialGuestPlatformType platform,
+ uint16_t platform_version, uint8_t highest_vtl,
+ uint64_t shared_gpa_boundary);
+
+ /*
+ * Configure part of the state of a guest for a particular set of data, page
+ * type and gpa. This can be used for example to pre-populate and measure
+ * guest memory contents, define private ranges or set the initial CPU state
+ * for one or more CPUs.
+ *
+ * If memory_type is CGS_PAGE_TYPE_VMSA then ptr points to the initial CPU
+ * context for a virtual CPU. The format of the data depends on the type of
+ * confidential virtual machine. For example, for SEV-ES ptr will point to a
+ * vmcb_save_area structure that should be copied into guest memory at the
+ * address specified in gpa. The cpu_index parameter contains the index of
+ * the CPU the VMSA applies to.
+ */
+ int (*set_guest_state)(hwaddr gpa, uint8_t *ptr, uint64_t len,
+ ConfidentialGuestPageType memory_type,
+ uint16_t cpu_index, Error **errp);
+
+ /*
+ * Set the guest policy. The policy can be used to configure the
+ * confidential platform, such as if debug is enabled or not and can contain
+ * information about expected launch measurements, signed verification of
+ * guest configuration and other platform data.
+ *
+ * The format of the policy data is specific to each platform. For example,
+ * SEV-SNP uses a policy bitfield in the 'policy' argument and provides an
+ * ID block and ID authentication in the 'policy_data' parameters. The type
+ * of policy data is identified by the 'policy_type' argument.
+ */
+ int (*set_guest_policy)(ConfidentialGuestPolicyType policy_type,
+ uint64_t policy,
+ void *policy_data1, uint32_t policy_data1_size,
+ void *policy_data2, uint32_t policy_data2_size,
+ Error **errp);
+
+ /*
+ * Iterate the system memory map, getting the entry with the given index
+ * that can be populated into guest memory.
+ *
+ * Returns 0 for ok, 1 if the index is out of range and -1 on error.
+ */
+ int (*get_mem_map_entry)(int index, ConfidentialGuestMemoryMapEntry *entry,
+ Error **errp);
} ConfidentialGuestSupportClass;
static inline int confidential_guest_kvm_init(ConfidentialGuestSupport *cgs,
diff --git a/include/system/cpus.h b/include/system/cpus.h
index 3226c76..508444c 100644
--- a/include/system/cpus.h
+++ b/include/system/cpus.h
@@ -7,11 +7,6 @@ void cpus_register_accel(const AccelOpsClass *i);
/* return registers ops */
const AccelOpsClass *cpus_get_accel(void);
-/* accel/dummy-cpus.c */
-
-/* Create a dummy vcpu for AccelOpsClass->create_vcpu_thread */
-void dummy_start_vcpu_thread(CPUState *);
-
/* interface available for cpus accelerator threads */
/* For temporary buffers for forming a name */
@@ -22,8 +17,7 @@ bool cpu_work_list_empty(CPUState *cpu);
bool cpu_thread_is_idle(CPUState *cpu);
bool all_cpu_threads_idle(void);
bool cpu_can_run(CPUState *cpu);
-void qemu_wait_io_event_common(CPUState *cpu);
-void qemu_wait_io_event(CPUState *cpu);
+void qemu_process_cpu_events_common(CPUState *cpu);
void cpu_thread_signal_created(CPUState *cpu);
void cpu_thread_signal_destroyed(CPUState *cpu);
void cpu_handle_guest_debug(CPUState *cpu);
diff --git a/include/system/host_iommu_device.h b/include/system/host_iommu_device.h
index 809cced..ab849a4 100644
--- a/include/system/host_iommu_device.h
+++ b/include/system/host_iommu_device.h
@@ -14,6 +14,13 @@
#include "qom/object.h"
#include "qapi/error.h"
+#ifdef CONFIG_LINUX
+#include "linux/iommufd.h"
+
+typedef union VendorCaps {
+ struct iommu_hw_info_vtd vtd;
+ struct iommu_hw_info_arm_smmuv3 smmuv3;
+} VendorCaps;
/**
* struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities.
@@ -22,11 +29,17 @@
*
* @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents
* the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl)
+ *
+ * @vendor_caps: host platform IOMMU vendor specific capabilities (e.g. on
+ * IOMMUFD this represents a user-space buffer filled by kernel
+ * with host IOMMU @type specific hardware information data)
*/
typedef struct HostIOMMUDeviceCaps {
uint32_t type;
uint64_t hw_caps;
+ VendorCaps vendor_caps;
} HostIOMMUDeviceCaps;
+#endif
#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
@@ -38,7 +51,9 @@ struct HostIOMMUDevice {
void *agent; /* pointer to agent device, ie. VFIO or VDPA device */
PCIBus *aliased_bus;
int aliased_devfn;
+#ifdef CONFIG_LINUX
HostIOMMUDeviceCaps caps;
+#endif
};
/**
diff --git a/include/system/hvf.h b/include/system/hvf.h
index 7b45a2e..d3dcf08 100644
--- a/include/system/hvf.h
+++ b/include/system/hvf.h
@@ -14,9 +14,6 @@
#define HVF_H
#include "qemu/accel.h"
-#include "qemu/queue.h"
-#include "exec/vaddr.h"
-#include "qom/object.h"
#ifdef COMPILING_PER_TARGET
# ifdef CONFIG_HVF
@@ -39,38 +36,4 @@ typedef struct HVFState HVFState;
DECLARE_INSTANCE_CHECKER(HVFState, HVF_STATE,
TYPE_HVF_ACCEL)
-#ifdef COMPILING_PER_TARGET
-struct hvf_sw_breakpoint {
- vaddr pc;
- vaddr saved_insn;
- int use_count;
- QTAILQ_ENTRY(hvf_sw_breakpoint) entry;
-};
-
-struct hvf_sw_breakpoint *hvf_find_sw_breakpoint(CPUState *cpu,
- vaddr pc);
-int hvf_sw_breakpoints_active(CPUState *cpu);
-
-int hvf_arch_insert_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp);
-int hvf_arch_remove_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp);
-int hvf_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type);
-int hvf_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type);
-void hvf_arch_remove_all_hw_breakpoints(void);
-
-/*
- * hvf_update_guest_debug:
- * @cs: CPUState for the CPU to update
- *
- * Update guest to enable or disable debugging. Per-arch specifics will be
- * handled by calling down to hvf_arch_update_guest_debug.
- */
-int hvf_update_guest_debug(CPUState *cpu);
-void hvf_arch_update_guest_debug(CPUState *cpu);
-
-/*
- * Return whether the guest supports debugging.
- */
-bool hvf_arch_supports_guest_debug(void);
-#endif /* COMPILING_PER_TARGET */
-
#endif
diff --git a/include/system/hvf_int.h b/include/system/hvf_int.h
index 8c8b840..a3b06a3 100644
--- a/include/system/hvf_int.h
+++ b/include/system/hvf_int.h
@@ -12,6 +12,9 @@
#define HVF_INT_H
#include "qemu/queue.h"
+#include "exec/vaddr.h"
+#include "qom/object.h"
+#include "accel/accel-ops.h"
#ifdef __aarch64__
#include <Hypervisor/Hypervisor.h>
@@ -43,7 +46,8 @@ typedef struct hvf_vcpu_caps {
} hvf_vcpu_caps;
struct HVFState {
- AccelState parent;
+ AccelState parent_obj;
+
hvf_slot slots[32];
int num_slots;
@@ -59,7 +63,6 @@ struct AccelCPUState {
bool vtimer_masked;
sigset_t unblock_ipi_mask;
bool guest_debug_enabled;
- bool dirty;
};
void assert_hvf_ok_impl(hv_return_t ret, const char *file, unsigned int line,
@@ -76,4 +79,36 @@ int hvf_put_registers(CPUState *);
int hvf_get_registers(CPUState *);
void hvf_kick_vcpu_thread(CPUState *cpu);
+struct hvf_sw_breakpoint {
+ vaddr pc;
+ vaddr saved_insn;
+ int use_count;
+ QTAILQ_ENTRY(hvf_sw_breakpoint) entry;
+};
+
+struct hvf_sw_breakpoint *hvf_find_sw_breakpoint(CPUState *cpu,
+ vaddr pc);
+int hvf_sw_breakpoints_active(CPUState *cpu);
+
+int hvf_arch_insert_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp);
+int hvf_arch_remove_sw_breakpoint(CPUState *cpu, struct hvf_sw_breakpoint *bp);
+int hvf_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type);
+int hvf_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type);
+void hvf_arch_remove_all_hw_breakpoints(void);
+
+/*
+ * hvf_update_guest_debug:
+ * @cpu: CPUState for the CPU to update
+ *
+ * Update guest to enable or disable debugging. Per-arch specifics will be
+ * handled by calling down to hvf_arch_update_guest_debug.
+ */
+int hvf_update_guest_debug(CPUState *cpu);
+void hvf_arch_update_guest_debug(CPUState *cpu);
+
+/*
+ * Return whether the guest supports debugging.
+ */
+bool hvf_arch_supports_guest_debug(void);
+
#endif
diff --git a/include/system/hw_accel.h b/include/system/hw_accel.h
index 380e9e6..55497ed 100644
--- a/include/system/hw_accel.h
+++ b/include/system/hw_accel.h
@@ -14,12 +14,30 @@
#include "hw/core/cpu.h"
#include "system/kvm.h"
#include "system/hvf.h"
+#include "system/mshv.h"
#include "system/whpx.h"
#include "system/nvmm.h"
+/**
+ * cpu_synchronize_state:
+ * cpu_synchronize_pre_loadvm:
+ * @cpu: The vCPU to synchronize.
+ *
+ * Request to synchronize QEMU vCPU registers from the hardware accelerator
+ * (the hardware accelerator is the reference).
+ */
void cpu_synchronize_state(CPUState *cpu);
+void cpu_synchronize_pre_loadvm(CPUState *cpu);
+
+/**
+ * cpu_synchronize_post_reset:
+ * cpu_synchronize_post_init:
+ * @cpu: The vCPU to synchronize.
+ *
+ * Request to synchronize QEMU vCPU registers to the hardware accelerator
+ * (QEMU is the reference).
+ */
void cpu_synchronize_post_reset(CPUState *cpu);
void cpu_synchronize_post_init(CPUState *cpu);
-void cpu_synchronize_pre_loadvm(CPUState *cpu);
#endif /* QEMU_HW_ACCEL_H */
diff --git a/include/system/igvm-cfg.h b/include/system/igvm-cfg.h
new file mode 100644
index 0000000..944f23a
--- /dev/null
+++ b/include/system/igvm-cfg.h
@@ -0,0 +1,49 @@
+/*
+ * QEMU IGVM interface
+ *
+ * Copyright (C) 2024 SUSE
+ *
+ * Authors:
+ * Roy Hopkins <roy.hopkins@randomman.co.uk>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef QEMU_IGVM_CFG_H
+#define QEMU_IGVM_CFG_H
+
+#include "qom/object.h"
+
+typedef struct IgvmCfg {
+ ObjectClass parent_class;
+
+ /*
+ * filename: Filename that specifies a file that contains the configuration
+ * of the guest in Independent Guest Virtual Machine (IGVM)
+ * format.
+ */
+ char *filename;
+} IgvmCfg;
+
+typedef struct IgvmCfgClass {
+ ObjectClass parent_class;
+
+ /*
+ * If an IGVM filename has been specified then process the IGVM file.
+ * Performs a no-op if no filename has been specified.
+ * If onlyVpContext is true then only the IGVM_VHT_VP_CONTEXT entries
+ * in the IGVM file will be processed, allowing information about the
+ * CPU state to be determined before processing the entire file.
+ *
+ * Returns 0 for ok and -1 on error.
+ */
+ int (*process)(IgvmCfg *cfg, ConfidentialGuestSupport *cgs,
+ bool onlyVpContext, Error **errp);
+
+} IgvmCfgClass;
+
+#define TYPE_IGVM_CFG "igvm-cfg"
+
+OBJECT_DECLARE_TYPE(IgvmCfg, IgvmCfgClass, IGVM_CFG)
+
+#endif
diff --git a/include/system/iommufd.h b/include/system/iommufd.h
index cbab75b..a659f36 100644
--- a/include/system/iommufd.h
+++ b/include/system/iommufd.h
@@ -32,6 +32,7 @@ struct IOMMUFDBackend {
/*< protected >*/
int fd; /* /dev/iommu file descriptor */
bool owned; /* is the /dev/iommu opened internally */
+ Error *cpr_blocker;/* set if be does not support CPR */
uint32_t users;
/*< public >*/
@@ -43,10 +44,13 @@ void iommufd_backend_disconnect(IOMMUFDBackend *be);
bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
Error **errp);
void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
+int iommufd_backend_map_file_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+ hwaddr iova, uint64_t size, int fd,
+ unsigned long start, bool readonly);
int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
- ram_addr_t size, void *vaddr, bool readonly);
+ uint64_t size, void *vaddr, bool readonly);
int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
- hwaddr iova, ram_addr_t size);
+ hwaddr iova, uint64_t size);
bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
uint32_t *type, void *data, uint32_t len,
uint64_t *caps, Error **errp);
@@ -61,6 +65,63 @@ bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id,
uint64_t iova, ram_addr_t size,
uint64_t page_size, uint64_t *data,
Error **errp);
+bool iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t id,
+ uint32_t data_type, uint32_t entry_len,
+ uint32_t *entry_num, void *data,
+ Error **errp);
+
+bool iommufd_change_process_capable(IOMMUFDBackend *be);
+bool iommufd_change_process(IOMMUFDBackend *be, Error **errp);
#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
+OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass,
+ HOST_IOMMU_DEVICE_IOMMUFD)
+
+/* Overload of the host IOMMU device for the iommufd backend */
+struct HostIOMMUDeviceIOMMUFD {
+ HostIOMMUDevice parent_obj;
+
+ IOMMUFDBackend *iommufd;
+ uint32_t devid;
+ uint32_t hwpt_id;
+};
+
+struct HostIOMMUDeviceIOMMUFDClass {
+ HostIOMMUDeviceClass parent_class;
+
+ /**
+ * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table.
+ * VFIO and VDPA device can have different implementation.
+ *
+ * Mandatory callback.
+ *
+ * @idev: host IOMMU device backed by IOMMUFD backend.
+ *
+ * @hwpt_id: ID of IOMMUFD hardware page table.
+ *
+ * @errp: pass an Error out when attachment fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+ bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id,
+ Error **errp);
+ /**
+ * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table.
+ * VFIO and VDPA device can have different implementation.
+ *
+ * Mandatory callback.
+ *
+ * @idev: host IOMMU device backed by IOMMUFD backend.
+ *
+ * @errp: pass an Error out when detachment fails.
+ *
+ * Returns: true on success, false on failure.
+ */
+ bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp);
+};
+
+bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+ uint32_t hwpt_id, Error **errp);
+bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+ Error **errp);
#endif
diff --git a/include/system/kvm.h b/include/system/kvm.h
index b690dda..8f9eecf 100644
--- a/include/system/kvm.h
+++ b/include/system/kvm.h
@@ -42,6 +42,7 @@ extern bool kvm_gsi_routing_allowed;
extern bool kvm_gsi_direct_mapping;
extern bool kvm_readonly_mem_allowed;
extern bool kvm_msi_use_devid;
+extern bool kvm_pre_fault_memory_supported;
#define kvm_enabled() (kvm_allowed)
/**
@@ -194,6 +195,7 @@ bool kvm_has_sync_mmu(void);
int kvm_has_vcpu_events(void);
int kvm_max_nested_state_length(void);
int kvm_has_gsi_routing(void);
+void kvm_close(void);
/**
* kvm_arm_supports_user_irq
@@ -316,31 +318,6 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test);
bool kvm_device_supported(int vmfd, uint64_t type);
/**
- * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU
- * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created.
- *
- * @returns: 0 when success, errno (<0) when failed.
- */
-int kvm_create_vcpu(CPUState *cpu);
-
-/**
- * kvm_park_vcpu - Park QEMU KVM vCPU context
- * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked.
- *
- * @returns: none
- */
-void kvm_park_vcpu(CPUState *cpu);
-
-/**
- * kvm_unpark_vcpu - unpark QEMU KVM vCPU context
- * @s: KVM State
- * @vcpu_id: Architecture vCPU ID of the parked vCPU
- *
- * @returns: KVM fd
- */
-int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id);
-
-/**
* kvm_create_and_park_vcpu - Create and park a KVM vCPU
* @cpu: QOM CPUState object for which KVM vCPU has to be created and parked.
*
@@ -363,19 +340,22 @@ int kvm_arch_process_async_events(CPUState *cpu);
int kvm_arch_get_registers(CPUState *cpu, Error **errp);
-/* state subset only touched by the VCPU itself during runtime */
-#define KVM_PUT_RUNTIME_STATE 1
-/* state subset modified during VCPU reset */
-#define KVM_PUT_RESET_STATE 2
-/* full state set, modified during initialization or on vmload */
-#define KVM_PUT_FULL_STATE 3
+typedef enum kvm_put_state {
+ /* state subset only touched by the VCPU itself during runtime */
+ KVM_PUT_RUNTIME_STATE = 1,
+ /* state subset modified during VCPU reset */
+ KVM_PUT_RESET_STATE = 2,
+ /* full state set, modified during initialization or on vmload */
+ KVM_PUT_FULL_STATE = 3,
+} KvmPutState;
-int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp);
+int kvm_arch_put_registers(CPUState *cpu, KvmPutState level, Error **errp);
int kvm_arch_get_default_type(MachineState *ms);
int kvm_arch_init(MachineState *ms, KVMState *s);
+int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp);
int kvm_arch_init_vcpu(CPUState *cpu);
int kvm_arch_destroy_vcpu(CPUState *cpu);
diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h
index 756a3c0..9247493 100644
--- a/include/system/kvm_int.h
+++ b/include/system/kvm_int.h
@@ -14,6 +14,7 @@
#include "qemu/accel.h"
#include "qemu/queue.h"
#include "system/kvm.h"
+#include "accel/accel-ops.h"
#include "hw/boards.h"
#include "hw/i386/topology.h"
#include "io/channel-socket.h"
diff --git a/include/system/memory.h b/include/system/memory.h
index fbbf4cf..3bd5ffa 100644
--- a/include/system/memory.h
+++ b/include/system/memory.h
@@ -19,7 +19,6 @@
#include "exec/memattrs.h"
#include "exec/memop.h"
#include "exec/ramlist.h"
-#include "exec/tswap.h"
#include "qemu/bswap.h"
#include "qemu/queue.h"
#include "qemu/int128.h"
@@ -109,15 +108,34 @@ struct MemoryRegionSection {
typedef struct IOMMUTLBEntry IOMMUTLBEntry;
-/* See address_space_translate: bit 0 is read, bit 1 is write. */
+/*
+ * See address_space_translate:
+ * - bit 0 : read
+ * - bit 1 : write
+ * - bit 2 : exec
+ * - bit 3 : priv
+ * - bit 4 : global
+ * - bit 5 : untranslated only
+ */
typedef enum {
IOMMU_NONE = 0,
IOMMU_RO = 1,
IOMMU_WO = 2,
IOMMU_RW = 3,
+ IOMMU_EXEC = 4,
+ IOMMU_PRIV = 8,
+ IOMMU_GLOBAL = 16,
+ IOMMU_UNTRANSLATED_ONLY = 32,
} IOMMUAccessFlags;
-#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | ((w) ? IOMMU_WO : 0))
+#define IOMMU_ACCESS_FLAG(r, w) (((r) ? IOMMU_RO : 0) | \
+ ((w) ? IOMMU_WO : 0))
+#define IOMMU_ACCESS_FLAG_FULL(r, w, x, p, g, uo) \
+ (IOMMU_ACCESS_FLAG(r, w) | \
+ ((x) ? IOMMU_EXEC : 0) | \
+ ((p) ? IOMMU_PRIV : 0) | \
+ ((g) ? IOMMU_GLOBAL : 0) | \
+ ((uo) ? IOMMU_UNTRANSLATED_ONLY : 0))
struct IOMMUTLBEntry {
AddressSpace *target_as;
@@ -125,6 +143,7 @@ struct IOMMUTLBEntry {
hwaddr translated_addr;
hwaddr addr_mask; /* 0xfff = 4k translation */
IOMMUAccessFlags perm;
+ uint32_t pasid;
};
/*
@@ -183,6 +202,7 @@ struct IOMMUNotifier {
hwaddr start;
hwaddr end;
int iommu_idx;
+ void *opaque;
QLIST_ENTRY(IOMMUNotifier) node;
};
typedef struct IOMMUNotifier IOMMUNotifier;
@@ -575,8 +595,20 @@ static inline void ram_discard_listener_init(RamDiscardListener *rdl,
rdl->double_discard_supported = double_discard_supported;
}
-typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque);
-typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque);
+/**
+ * typedef ReplayRamDiscardState:
+ *
+ * The callback handler for #RamDiscardManagerClass.replay_populated/
+ * #RamDiscardManagerClass.replay_discarded to invoke on populated/discarded
+ * parts.
+ *
+ * @section: the #MemoryRegionSection of populated/discarded part
+ * @opaque: pointer to forward to the callback
+ *
+ * Returns 0 on success, or a negative error if failed.
+ */
+typedef int (*ReplayRamDiscardState)(MemoryRegionSection *section,
+ void *opaque);
/*
* RamDiscardManagerClass:
@@ -650,36 +682,38 @@ struct RamDiscardManagerClass {
/**
* @replay_populated:
*
- * Call the #ReplayRamPopulate callback for all populated parts within the
- * #MemoryRegionSection via the #RamDiscardManager.
+ * Call the #ReplayRamDiscardState callback for all populated parts within
+ * the #MemoryRegionSection via the #RamDiscardManager.
*
* In case any call fails, no further calls are made.
*
* @rdm: the #RamDiscardManager
* @section: the #MemoryRegionSection
- * @replay_fn: the #ReplayRamPopulate callback
+ * @replay_fn: the #ReplayRamDiscardState callback
* @opaque: pointer to forward to the callback
*
* Returns 0 on success, or a negative error if any notification failed.
*/
int (*replay_populated)(const RamDiscardManager *rdm,
MemoryRegionSection *section,
- ReplayRamPopulate replay_fn, void *opaque);
+ ReplayRamDiscardState replay_fn, void *opaque);
/**
* @replay_discarded:
*
- * Call the #ReplayRamDiscard callback for all discarded parts within the
- * #MemoryRegionSection via the #RamDiscardManager.
+ * Call the #ReplayRamDiscardState callback for all discarded parts within
+ * the #MemoryRegionSection via the #RamDiscardManager.
*
* @rdm: the #RamDiscardManager
* @section: the #MemoryRegionSection
- * @replay_fn: the #ReplayRamDiscard callback
+ * @replay_fn: the #ReplayRamDiscardState callback
* @opaque: pointer to forward to the callback
+ *
+ * Returns 0 on success, or a negative error if any notification failed.
*/
- void (*replay_discarded)(const RamDiscardManager *rdm,
- MemoryRegionSection *section,
- ReplayRamDiscard replay_fn, void *opaque);
+ int (*replay_discarded)(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamDiscardState replay_fn, void *opaque);
/**
* @register_listener:
@@ -720,15 +754,41 @@ uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm,
bool ram_discard_manager_is_populated(const RamDiscardManager *rdm,
const MemoryRegionSection *section);
+/**
+ * ram_discard_manager_replay_populated:
+ *
+ * A wrapper to call the #RamDiscardManagerClass.replay_populated callback
+ * of the #RamDiscardManager.
+ *
+ * @rdm: the #RamDiscardManager
+ * @section: the #MemoryRegionSection
+ * @replay_fn: the #ReplayRamDiscardState callback
+ * @opaque: pointer to forward to the callback
+ *
+ * Returns 0 on success, or a negative error if any notification failed.
+ */
int ram_discard_manager_replay_populated(const RamDiscardManager *rdm,
MemoryRegionSection *section,
- ReplayRamPopulate replay_fn,
+ ReplayRamDiscardState replay_fn,
void *opaque);
-void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
- MemoryRegionSection *section,
- ReplayRamDiscard replay_fn,
- void *opaque);
+/**
+ * ram_discard_manager_replay_discarded:
+ *
+ * A wrapper to call the #RamDiscardManagerClass.replay_discarded callback
+ * of the #RamDiscardManager.
+ *
+ * @rdm: the #RamDiscardManager
+ * @section: the #MemoryRegionSection
+ * @replay_fn: the #ReplayRamDiscardState callback
+ * @opaque: pointer to forward to the callback
+ *
+ * Returns 0 on success, or a negative error if any notification failed.
+ */
+int ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
+ MemoryRegionSection *section,
+ ReplayRamDiscardState replay_fn,
+ void *opaque);
void ram_discard_manager_register_listener(RamDiscardManager *rdm,
RamDiscardListener *rdl,
@@ -738,21 +798,20 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
RamDiscardListener *rdl);
/**
- * memory_get_xlat_addr: Extract addresses from a TLB entry
+ * memory_translate_iotlb: Extract addresses from a TLB entry.
+ * Called with rcu_read_lock held.
*
* @iotlb: pointer to an #IOMMUTLBEntry
- * @vaddr: virtual address
- * @ram_addr: RAM address
- * @read_only: indicates if writes are allowed
- * @mr_has_discard_manager: indicates memory is controlled by a
- * RamDiscardManager
+ * @xlat_p: return the offset of the entry from the start of the returned
+ * MemoryRegion.
* @errp: pointer to Error*, to store an error if it happens.
*
- * Return: true on success, else false setting @errp with error.
+ * Return: On success, return the MemoryRegion containing the @iotlb translated
+ * addr. The MemoryRegion must not be accessed after rcu_read_unlock.
+ * On failure, return NULL, setting @errp with error.
*/
-bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
- ram_addr_t *ram_addr, bool *read_only,
- bool *mr_has_discard_manager, Error **errp);
+MemoryRegion *memory_translate_iotlb(IOMMUTLBEntry *iotlb, hwaddr *xlat_p,
+ Error **errp);
typedef struct CoalescedMemoryRange CoalescedMemoryRange;
typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
@@ -774,6 +833,7 @@ struct MemoryRegion {
bool nonvolatile;
bool rom_device;
bool flush_coalesced_mmio;
+ bool lockless_io;
bool unmergeable;
uint8_t dirty_log_mask;
bool is_iommu;
@@ -1212,6 +1272,36 @@ MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s);
void memory_region_section_free_copy(MemoryRegionSection *s);
/**
+ * memory_region_section_intersect_range: Adjust the memory section to cover
+ * the intersection with the given range.
+ *
+ * @s: the #MemoryRegionSection to be adjusted
+ * @offset: the offset of the given range in the memory region
+ * @size: the size of the given range
+ *
+ * Returns false if the intersection is empty, otherwise returns true.
+ */
+static inline bool memory_region_section_intersect_range(MemoryRegionSection *s,
+                                                         uint64_t offset,
+                                                         uint64_t size)
+{
+    /* Exclusive ends are kept as Int128, like the section size itself. */
+    Int128 sect_end = int128_add(int128_make64(s->offset_within_region),
+                                 s->size);
+    Int128 range_end = int128_add(int128_make64(offset), int128_make64(size));
+    Int128 limit = int128_min(sect_end, range_end);
+    uint64_t first = MAX(s->offset_within_region, offset);
+    bool overlaps = !int128_le(limit, int128_make64(first));
+    if (overlaps) {
+        /* Trim the section to [first, limit); keep both offsets in step. */
+        s->offset_within_address_space += first - s->offset_within_region;
+        s->offset_within_region = first;
+        s->size = int128_sub(limit, int128_make64(first));
+    }
+    return overlaps;
+}
+
+/**
* memory_region_init: Initialize a memory region
*
* The region typically acts as a container for other memory regions. Use
@@ -2253,6 +2343,17 @@ void memory_region_set_flush_coalesced(MemoryRegion *mr);
void memory_region_clear_flush_coalesced(MemoryRegion *mr);
/**
+ * memory_region_enable_lockless_io: Enable lockless (BQL free) access.
+ *
+ * Enable BQL-free access for devices that are well prepared to handle
+ * locking during I/O themselves: either by doing fine grained locking or
+ * by providing lock-free I/O schemes.
+ *
+ * @mr: the memory region to be updated.
+ */
+void memory_region_enable_lockless_io(MemoryRegion *mr);
+
+/**
* memory_region_add_eventfd: Request an eventfd to be triggered when a word
* is written to a location.
*
@@ -2469,13 +2570,13 @@ static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr)
*
* This function must not be called for a mapped #MemoryRegion, a #MemoryRegion
* that does not cover RAM, or a #MemoryRegion that already has a
- * #RamDiscardManager assigned.
+ * #RamDiscardManager assigned. Return 0 if the rdm is set successfully.
*
* @mr: the #MemoryRegion
* @rdm: #RamDiscardManager to set
*/
-void memory_region_set_ram_discard_manager(MemoryRegion *mr,
- RamDiscardManager *rdm);
+int memory_region_set_ram_discard_manager(MemoryRegion *mr,
+ RamDiscardManager *rdm);
/**
* memory_region_find: translate an address/size relative to a
@@ -2626,15 +2727,33 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name);
/**
* address_space_destroy: destroy an address space
*
- * Releases all resources associated with an address space. After an address space
- * is destroyed, its root memory region (given by address_space_init()) may be destroyed
- * as well.
+ * Releases all resources associated with an address space. After an
+ * address space is destroyed, the reference the AddressSpace had to
+ * its root memory region is dropped, which may result in the
+ * destruction of that memory region as well.
+ *
+ * Note that destruction of the AddressSpace is done via RCU;
+ * it is therefore not valid to free the memory the AddressSpace
+ * struct is in until after that RCU callback has completed.
+ * If you want to g_free() the AddressSpace after destruction you
+ * can do that with address_space_destroy_free().
*
* @as: address space to be destroyed
*/
void address_space_destroy(AddressSpace *as);
/**
+ * address_space_destroy_free: destroy an address space and free it
+ *
+ * This does the same thing as address_space_destroy(), and then also
+ * frees (via g_free()) the AddressSpace itself once the destruction
+ * is complete.
+ *
+ * @as: address space to be destroyed
+ */
+void address_space_destroy_free(AddressSpace *as);
+
+/**
* address_space_remove_listeners: unregister all listeners of an address space
*
* Removes all callbacks previously registered with memory_listener_register()
@@ -2876,6 +2995,8 @@ void address_space_cache_invalidate(MemoryRegionCache *cache,
*/
void address_space_cache_destroy(MemoryRegionCache *cache);
+void address_space_flush_icache_range(AddressSpace *as, hwaddr addr, hwaddr len);
+
/* address_space_get_iotlb_entry: translate an address into an IOTLB
* entry. Should be called from an RCU critical section.
*/
@@ -2928,6 +3049,15 @@ static inline MemoryRegion *address_space_translate(AddressSpace *as,
bool address_space_access_valid(AddressSpace *as, hwaddr addr, hwaddr len,
bool is_write, MemTxAttrs attrs);
+/**
+ * address_space_is_io: check whether a guest physical address
+ * within an address space is I/O memory.
+ *
+ * @as: #AddressSpace to be accessed
+ * @addr: address within that address space
+ */
+bool address_space_is_io(AddressSpace *as, hwaddr addr);
+
/* address_space_map: map a physical memory region into a host virtual address
*
* May map a subset of the requested range, given by and returned in @plen.
diff --git a/include/system/mshv.h b/include/system/mshv.h
new file mode 100644
index 0000000..8b1fc20
--- /dev/null
+++ b/include/system/mshv.h
@@ -0,0 +1,64 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com>
+ * Magnus Kulke <magnuskulke@microsoft.com>
+ * Jinank Jain <jinankjain@microsoft.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ */
+
+#ifndef QEMU_MSHV_H
+#define QEMU_MSHV_H
+
+#include "qemu/osdep.h"
+#include "qemu/accel.h"
+#include "hw/hyperv/hyperv-proto.h"
+#include "hw/hyperv/hvhdk.h"
+#include "hw/hyperv/hvgdk_mini.h"
+#include "qapi/qapi-types-common.h"
+#include "system/memory.h"
+#include "accel/accel-ops.h"
+
+#ifdef COMPILING_PER_TARGET
+#ifdef CONFIG_MSHV
+#include <linux/mshv.h>
+#define CONFIG_MSHV_IS_POSSIBLE
+#endif
+#else
+#define CONFIG_MSHV_IS_POSSIBLE
+#endif
+
+#define MSHV_MAX_MSI_ROUTES 4096
+
+#define MSHV_PAGE_SHIFT 12
+
+#ifdef CONFIG_MSHV_IS_POSSIBLE
+extern bool mshv_allowed;
+#define mshv_enabled() (mshv_allowed)
+#define mshv_msi_via_irqfd_enabled() mshv_enabled()
+#else /* CONFIG_MSHV_IS_POSSIBLE */
+#define mshv_enabled() false
+#define mshv_msi_via_irqfd_enabled() mshv_enabled()
+#endif
+
+typedef struct MshvState MshvState;
+extern MshvState *mshv_state;
+
+/* interrupt */
+int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector,
+ uint32_t vp_index, bool logical_destination_mode,
+ bool level_triggered);
+
+int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev);
+int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev);
+void mshv_irqchip_commit_routes(void);
+void mshv_irqchip_release_virq(int virq);
+int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *n,
+ const EventNotifier *rn, int virq);
+int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *n, int virq);
+
+#endif
diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h
new file mode 100644
index 0000000..490563c
--- /dev/null
+++ b/include/system/mshv_int.h
@@ -0,0 +1,155 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com>
+ * Magnus Kulke <magnuskulke@microsoft.com>
+ * Jinank Jain <jinankjain@microsoft.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ */
+
+#ifndef QEMU_MSHV_INT_H
+#define QEMU_MSHV_INT_H
+
+#define MSHV_MSR_ENTRIES_COUNT 64
+
+#define MSHV_MAX_MEM_SLOTS 32
+
+typedef struct hyperv_message hv_message;
+
+typedef struct MshvHvCallArgs {
+ void *base;
+ void *input_page;
+ void *output_page;
+} MshvHvCallArgs;
+
+struct AccelCPUState {
+ int cpufd;
+ bool dirty;
+ MshvHvCallArgs hvcall_args;
+};
+
+typedef struct MshvMemoryListener {
+ MemoryListener listener;
+ int as_id;
+} MshvMemoryListener;
+
+typedef struct MshvAddressSpace {
+ MshvMemoryListener *ml;
+ AddressSpace *as;
+} MshvAddressSpace;
+
+typedef struct MshvMemorySlotManager {
+ size_t n_slots;
+ GList *slots;
+ QemuMutex mutex;
+} MshvMemorySlotManager;
+
+struct MshvState {
+ AccelState parent_obj;
+ int vm;
+ MshvMemoryListener memory_listener;
+ /* number of listeners */
+ int nr_as;
+ MshvAddressSpace *as;
+ int fd;
+ MshvMemorySlotManager msm;
+};
+
+typedef struct MshvMsiControl {
+ bool updated;
+ GHashTable *gsi_routes;
+} MshvMsiControl;
+
+#define mshv_vcpufd(cpu) ((cpu)->accel->cpufd) /* cpufd of cpu's accel state */
+
+/* cpu */
+typedef struct MshvFPU {
+ uint8_t fpr[8][16];
+ uint16_t fcw;
+ uint16_t fsw;
+ uint8_t ftwx;
+ uint8_t pad1;
+ uint16_t last_opcode;
+ uint64_t last_ip;
+ uint64_t last_dp;
+ uint8_t xmm[16][16];
+ uint32_t mxcsr;
+ uint32_t pad2;
+} MshvFPU;
+
+typedef enum MshvVmExit {
+ MshvVmExitIgnore = 0,
+ MshvVmExitShutdown = 1,
+ MshvVmExitSpecial = 2,
+} MshvVmExit;
+
+typedef enum MshvRemapResult {
+ MshvRemapOk = 0,
+ MshvRemapNoMapping = 1,
+ MshvRemapNoOverlap = 2,
+} MshvRemapResult;
+
+void mshv_init_mmio_emu(void);
+int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd);
+void mshv_remove_vcpu(int vm_fd, int cpu_fd);
+int mshv_configure_vcpu(const CPUState *cpu, const MshvFPU *fpu, uint64_t xcr0);
+int mshv_get_standard_regs(CPUState *cpu);
+int mshv_get_special_regs(CPUState *cpu);
+int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit);
+int mshv_load_regs(CPUState *cpu);
+int mshv_store_regs(CPUState *cpu);
+int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs,
+ size_t n_regs);
+int mshv_arch_put_registers(const CPUState *cpu);
+void mshv_arch_init_vcpu(CPUState *cpu);
+void mshv_arch_destroy_vcpu(CPUState *cpu);
+void mshv_arch_amend_proc_features(
+ union hv_partition_synthetic_processor_features *features);
+int mshv_arch_post_init_vm(int vm_fd);
+
+#if defined COMPILING_PER_TARGET && defined CONFIG_MSHV_IS_POSSIBLE
+int mshv_hvcall(int fd, const struct mshv_root_hvcall *args);
+#endif
+
+/* memory */
+typedef struct MshvMemorySlot {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ bool readonly;
+ bool mapped;
+} MshvMemorySlot;
+
+MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa);
+int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size,
+ bool is_secure_mode, bool instruction_fetch);
+int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size,
+ bool is_secure_mode);
+void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section,
+ bool add);
+void mshv_init_memory_slot_manager(MshvState *mshv_state);
+
+/* msr */
+typedef struct MshvMsrEntry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t data;
+} MshvMsrEntry;
+
+typedef struct MshvMsrEntries {
+ MshvMsrEntry entries[MSHV_MSR_ENTRIES_COUNT];
+ uint32_t nmsrs;
+} MshvMsrEntries;
+
+int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs,
+ size_t n_msrs);
+
+/* interrupt */
+void mshv_init_msicontrol(void);
+int mshv_reserve_ioapic_msi_routes(int vm_fd);
+
+#endif
diff --git a/include/system/nvmm.h b/include/system/nvmm.h
index 6971ddb..7390def 100644
--- a/include/system/nvmm.h
+++ b/include/system/nvmm.h
@@ -13,17 +13,18 @@
#define QEMU_NVMM_H
#ifdef COMPILING_PER_TARGET
-
-#ifdef CONFIG_NVMM
-
-int nvmm_enabled(void);
-
-#else /* CONFIG_NVMM */
-
-#define nvmm_enabled() (0)
-
-#endif /* CONFIG_NVMM */
-
+# ifdef CONFIG_NVMM
+# define CONFIG_NVMM_IS_POSSIBLE
+# endif /* CONFIG_NVMM */
+#else
+# define CONFIG_NVMM_IS_POSSIBLE
#endif /* COMPILING_PER_TARGET */
+#ifdef CONFIG_NVMM_IS_POSSIBLE
+extern bool nvmm_allowed;
+#define nvmm_enabled() (nvmm_allowed)
+#else /* !CONFIG_NVMM_IS_POSSIBLE */
+#define nvmm_enabled() 0
+#endif /* !CONFIG_NVMM_IS_POSSIBLE */
+
#endif /* QEMU_NVMM_H */
diff --git a/include/system/os-win32.h b/include/system/os-win32.h
index 3aa6cee..22d72ba 100644
--- a/include/system/os-win32.h
+++ b/include/system/os-win32.h
@@ -168,11 +168,14 @@ static inline void qemu_funlockfile(FILE *f)
#endif
}
-/* Helper for WSAEventSelect, to report errors */
+/* Helpers for WSAEventSelect() */
bool qemu_socket_select(int sockfd, WSAEVENT hEventObject,
long lNetworkEvents, Error **errp);
+void qemu_socket_select_nofail(int sockfd, WSAEVENT hEventObject,
+ long lNetworkEvents);
bool qemu_socket_unselect(int sockfd, Error **errp);
+void qemu_socket_unselect_nofail(int sockfd);
/* We wrap all the sockets functions so that we can set errno based on
* WSAGetLastError(), and use file-descriptors instead of SOCKET.
diff --git a/include/system/physmem.h b/include/system/physmem.h
new file mode 100644
index 0000000..879f6ea
--- /dev/null
+++ b/include/system/physmem.h
@@ -0,0 +1,54 @@
+/*
+ * QEMU physical memory interfaces (target independent).
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#ifndef QEMU_SYSTEM_PHYSMEM_H
+#define QEMU_SYSTEM_PHYSMEM_H
+
+#include "exec/hwaddr.h"
+#include "exec/ramlist.h"
+
+#define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1)
+#define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE))
+
+bool physical_memory_get_dirty_flag(ram_addr_t addr, unsigned client);
+
+bool physical_memory_is_clean(ram_addr_t addr);
+
+uint8_t physical_memory_range_includes_clean(ram_addr_t start,
+ ram_addr_t length,
+ uint8_t mask);
+
+void physical_memory_set_dirty_flag(ram_addr_t addr, unsigned client);
+
+void physical_memory_set_dirty_range(ram_addr_t start, ram_addr_t length,
+ uint8_t mask);
+
+/*
+ * Contrary to physical_memory_sync_dirty_bitmap() this function returns
+ * the number of dirty pages in @bitmap passed as argument. On the other hand,
+ * physical_memory_sync_dirty_bitmap() returns newly dirtied pages that
+ * weren't set in the global migration bitmap.
+ */
+uint64_t physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
+ ram_addr_t start,
+ ram_addr_t pages);
+
+void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length);
+
+bool physical_memory_test_and_clear_dirty(ram_addr_t start,
+ ram_addr_t length,
+ unsigned client);
+
+DirtyBitmapSnapshot *
+physical_memory_snapshot_and_clear_dirty(MemoryRegion *mr, hwaddr offset,
+ hwaddr length, unsigned client);
+
+bool physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
+ ram_addr_t start,
+ ram_addr_t length);
+
+#endif
diff --git a/include/system/ram_addr.h b/include/system/ram_addr.h
index 15a1b1a..6834859 100644
--- a/include/system/ram_addr.h
+++ b/include/system/ram_addr.h
@@ -19,17 +19,9 @@
#ifndef SYSTEM_RAM_ADDR_H
#define SYSTEM_RAM_ADDR_H
-#include "system/xen.h"
-#include "system/tcg.h"
-#include "exec/cputlb.h"
-#include "exec/ramlist.h"
#include "system/ramblock.h"
-#include "system/memory.h"
#include "exec/target_page.h"
-#include "qemu/rcu.h"
-
#include "exec/hwaddr.h"
-#include "exec/cpu-common.h"
extern uint64_t total_dirty_pages;
@@ -80,17 +72,6 @@ static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page)
return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1);
}
-static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
-{
- return (b && b->host && offset < b->used_length) ? true : false;
-}
-
-static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
-{
- assert(offset_in_ramblock(block, offset));
- return (char *)block->host + offset;
-}
-
static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr,
RAMBlock *rb)
{
@@ -99,8 +80,6 @@ static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr,
return host_addr_offset >> TARGET_PAGE_BITS;
}
-bool ramblock_is_pmem(RAMBlock *rb);
-
/**
* qemu_ram_alloc_from_file,
* qemu_ram_alloc_from_fd: Allocate a ram block from the specified backing
@@ -153,409 +132,4 @@ static inline void qemu_ram_block_writeback(RAMBlock *block)
qemu_ram_msync(block, 0, block->used_length);
}
-#define DIRTY_CLIENTS_ALL ((1 << DIRTY_MEMORY_NUM) - 1)
-#define DIRTY_CLIENTS_NOCODE (DIRTY_CLIENTS_ALL & ~(1 << DIRTY_MEMORY_CODE))
-
-static inline bool cpu_physical_memory_get_dirty(ram_addr_t start,
- ram_addr_t length,
- unsigned client)
-{
- DirtyMemoryBlocks *blocks;
- unsigned long end, page;
- unsigned long idx, offset, base;
- bool dirty = false;
-
- assert(client < DIRTY_MEMORY_NUM);
-
- end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
- page = start >> TARGET_PAGE_BITS;
-
- WITH_RCU_READ_LOCK_GUARD() {
- blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
-
- idx = page / DIRTY_MEMORY_BLOCK_SIZE;
- offset = page % DIRTY_MEMORY_BLOCK_SIZE;
- base = page - offset;
- while (page < end) {
- unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
- unsigned long num = next - base;
- unsigned long found = find_next_bit(blocks->blocks[idx],
- num, offset);
- if (found < num) {
- dirty = true;
- break;
- }
-
- page = next;
- idx++;
- offset = 0;
- base += DIRTY_MEMORY_BLOCK_SIZE;
- }
- }
-
- return dirty;
-}
-
-static inline bool cpu_physical_memory_all_dirty(ram_addr_t start,
- ram_addr_t length,
- unsigned client)
-{
- DirtyMemoryBlocks *blocks;
- unsigned long end, page;
- unsigned long idx, offset, base;
- bool dirty = true;
-
- assert(client < DIRTY_MEMORY_NUM);
-
- end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
- page = start >> TARGET_PAGE_BITS;
-
- RCU_READ_LOCK_GUARD();
-
- blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
-
- idx = page / DIRTY_MEMORY_BLOCK_SIZE;
- offset = page % DIRTY_MEMORY_BLOCK_SIZE;
- base = page - offset;
- while (page < end) {
- unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
- unsigned long num = next - base;
- unsigned long found = find_next_zero_bit(blocks->blocks[idx], num, offset);
- if (found < num) {
- dirty = false;
- break;
- }
-
- page = next;
- idx++;
- offset = 0;
- base += DIRTY_MEMORY_BLOCK_SIZE;
- }
-
- return dirty;
-}
-
-static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr,
- unsigned client)
-{
- return cpu_physical_memory_get_dirty(addr, 1, client);
-}
-
-static inline bool cpu_physical_memory_is_clean(ram_addr_t addr)
-{
- bool vga = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_VGA);
- bool code = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_CODE);
- bool migration =
- cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);
- return !(vga && code && migration);
-}
-
-static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
- ram_addr_t length,
- uint8_t mask)
-{
- uint8_t ret = 0;
-
- if (mask & (1 << DIRTY_MEMORY_VGA) &&
- !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_VGA)) {
- ret |= (1 << DIRTY_MEMORY_VGA);
- }
- if (mask & (1 << DIRTY_MEMORY_CODE) &&
- !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_CODE)) {
- ret |= (1 << DIRTY_MEMORY_CODE);
- }
- if (mask & (1 << DIRTY_MEMORY_MIGRATION) &&
- !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_MIGRATION)) {
- ret |= (1 << DIRTY_MEMORY_MIGRATION);
- }
- return ret;
-}
-
-static inline void cpu_physical_memory_set_dirty_flag(ram_addr_t addr,
- unsigned client)
-{
- unsigned long page, idx, offset;
- DirtyMemoryBlocks *blocks;
-
- assert(client < DIRTY_MEMORY_NUM);
-
- page = addr >> TARGET_PAGE_BITS;
- idx = page / DIRTY_MEMORY_BLOCK_SIZE;
- offset = page % DIRTY_MEMORY_BLOCK_SIZE;
-
- RCU_READ_LOCK_GUARD();
-
- blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
-
- set_bit_atomic(offset, blocks->blocks[idx]);
-}
-
-static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
- ram_addr_t length,
- uint8_t mask)
-{
- DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
- unsigned long end, page;
- unsigned long idx, offset, base;
- int i;
-
- if (!mask && !xen_enabled()) {
- return;
- }
-
- end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
- page = start >> TARGET_PAGE_BITS;
-
- WITH_RCU_READ_LOCK_GUARD() {
- for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
- blocks[i] = qatomic_rcu_read(&ram_list.dirty_memory[i]);
- }
-
- idx = page / DIRTY_MEMORY_BLOCK_SIZE;
- offset = page % DIRTY_MEMORY_BLOCK_SIZE;
- base = page - offset;
- while (page < end) {
- unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
-
- if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
- bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
- offset, next - page);
- }
- if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
- bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
- offset, next - page);
- }
- if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
- bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
- offset, next - page);
- }
-
- page = next;
- idx++;
- offset = 0;
- base += DIRTY_MEMORY_BLOCK_SIZE;
- }
- }
-
- if (xen_enabled()) {
- xen_hvm_modified_memory(start, length);
- }
-}
-
-#if !defined(_WIN32)
-
-/*
- * Contrary to cpu_physical_memory_sync_dirty_bitmap() this function returns
- * the number of dirty pages in @bitmap passed as argument. On the other hand,
- * cpu_physical_memory_sync_dirty_bitmap() returns newly dirtied pages that
- * weren't set in the global migration bitmap.
- */
-static inline
-uint64_t cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
- ram_addr_t start,
- ram_addr_t pages)
-{
- unsigned long i, j;
- unsigned long page_number, c, nbits;
- hwaddr addr;
- ram_addr_t ram_addr;
- uint64_t num_dirty = 0;
- unsigned long len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
- unsigned long hpratio = qemu_real_host_page_size() / TARGET_PAGE_SIZE;
- unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS);
-
- /* start address is aligned at the start of a word? */
- if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
- (hpratio == 1)) {
- unsigned long **blocks[DIRTY_MEMORY_NUM];
- unsigned long idx;
- unsigned long offset;
- long k;
- long nr = BITS_TO_LONGS(pages);
-
- idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
- offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
- DIRTY_MEMORY_BLOCK_SIZE);
-
- WITH_RCU_READ_LOCK_GUARD() {
- for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
- blocks[i] =
- qatomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
- }
-
- for (k = 0; k < nr; k++) {
- if (bitmap[k]) {
- unsigned long temp = leul_to_cpu(bitmap[k]);
-
- nbits = ctpopl(temp);
- qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
-
- if (global_dirty_tracking) {
- qatomic_or(
- &blocks[DIRTY_MEMORY_MIGRATION][idx][offset],
- temp);
- if (unlikely(
- global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
- total_dirty_pages += nbits;
- }
- }
-
- num_dirty += nbits;
-
- if (tcg_enabled()) {
- qatomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset],
- temp);
- }
- }
-
- if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
- offset = 0;
- idx++;
- }
- }
- }
-
- if (xen_enabled()) {
- xen_hvm_modified_memory(start, pages << TARGET_PAGE_BITS);
- }
- } else {
- uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
-
- if (!global_dirty_tracking) {
- clients &= ~(1 << DIRTY_MEMORY_MIGRATION);
- }
-
- /*
- * bitmap-traveling is faster than memory-traveling (for addr...)
- * especially when most of the memory is not dirty.
- */
- for (i = 0; i < len; i++) {
- if (bitmap[i] != 0) {
- c = leul_to_cpu(bitmap[i]);
- nbits = ctpopl(c);
- if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
- total_dirty_pages += nbits;
- }
- num_dirty += nbits;
- do {
- j = ctzl(c);
- c &= ~(1ul << j);
- page_number = (i * HOST_LONG_BITS + j) * hpratio;
- addr = page_number * TARGET_PAGE_SIZE;
- ram_addr = start + addr;
- cpu_physical_memory_set_dirty_range(ram_addr,
- TARGET_PAGE_SIZE * hpratio, clients);
- } while (c != 0);
- }
- }
- }
-
- return num_dirty;
-}
-#endif /* not _WIN32 */
-
-static inline void cpu_physical_memory_dirty_bits_cleared(ram_addr_t start,
- ram_addr_t length)
-{
- if (tcg_enabled()) {
- tlb_reset_dirty_range_all(start, length);
- }
-
-}
-bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
- ram_addr_t length,
- unsigned client);
-
-DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
- (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client);
-
-bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
- ram_addr_t start,
- ram_addr_t length);
-
-static inline void cpu_physical_memory_clear_dirty_range(ram_addr_t start,
- ram_addr_t length)
-{
- cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_MIGRATION);
- cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_VGA);
- cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_CODE);
-}
-
-
-/* Called with RCU critical section */
-static inline
-uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
- ram_addr_t start,
- ram_addr_t length)
-{
- ram_addr_t addr;
- unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
- uint64_t num_dirty = 0;
- unsigned long *dest = rb->bmap;
-
- /* start address and length is aligned at the start of a word? */
- if (((word * BITS_PER_LONG) << TARGET_PAGE_BITS) ==
- (start + rb->offset) &&
- !(length & ((BITS_PER_LONG << TARGET_PAGE_BITS) - 1))) {
- int k;
- int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS);
- unsigned long * const *src;
- unsigned long idx = (word * BITS_PER_LONG) / DIRTY_MEMORY_BLOCK_SIZE;
- unsigned long offset = BIT_WORD((word * BITS_PER_LONG) %
- DIRTY_MEMORY_BLOCK_SIZE);
- unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS);
-
- src = qatomic_rcu_read(
- &ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION])->blocks;
-
- for (k = page; k < page + nr; k++) {
- if (src[idx][offset]) {
- unsigned long bits = qatomic_xchg(&src[idx][offset], 0);
- unsigned long new_dirty;
- new_dirty = ~dest[k];
- dest[k] |= bits;
- new_dirty &= bits;
- num_dirty += ctpopl(new_dirty);
- }
-
- if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
- offset = 0;
- idx++;
- }
- }
- if (num_dirty) {
- cpu_physical_memory_dirty_bits_cleared(start, length);
- }
-
- if (rb->clear_bmap) {
- /*
- * Postpone the dirty bitmap clear to the point before we
- * really send the pages, also we will split the clear
- * dirty procedure into smaller chunks.
- */
- clear_bmap_set(rb, start >> TARGET_PAGE_BITS,
- length >> TARGET_PAGE_BITS);
- } else {
- /* Slow path - still do that in a huge chunk */
- memory_region_clear_dirty_bitmap(rb->mr, start, length);
- }
- } else {
- ram_addr_t offset = rb->offset;
-
- for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
- if (cpu_physical_memory_test_and_clear_dirty(
- start + addr + offset,
- TARGET_PAGE_SIZE,
- DIRTY_MEMORY_MIGRATION)) {
- long k = (start + addr) >> TARGET_PAGE_BITS;
- if (!test_and_set_bit(k, dest)) {
- num_dirty++;
- }
- }
- }
- }
-
- return num_dirty;
-}
-
#endif
diff --git a/include/system/ramblock.h b/include/system/ramblock.h
index d8a116b..76694fe 100644
--- a/include/system/ramblock.h
+++ b/include/system/ramblock.h
@@ -11,17 +11,16 @@
*
*/
-/*
- * This header is for use by exec.c and memory.c ONLY. Do not include it.
- * The functions declared here will be removed soon.
- */
-
#ifndef SYSTEM_RAMBLOCK_H
#define SYSTEM_RAMBLOCK_H
#include "exec/cpu-common.h"
#include "qemu/rcu.h"
#include "exec/ramlist.h"
+#include "system/hostmem.h"
+
+#define TYPE_RAM_BLOCK_ATTRIBUTES "ram-block-attributes"
+OBJECT_DECLARE_SIMPLE_TYPE(RamBlockAttributes, RAM_BLOCK_ATTRIBUTES)
struct RAMBlock {
struct rcu_head rcu;
@@ -42,6 +41,7 @@ struct RAMBlock {
int fd;
uint64_t fd_offset;
int guest_memfd;
+ RamBlockAttributes *attributes;
size_t page_size;
/* dirty bitmap used during migration */
unsigned long *bmap;
@@ -91,4 +91,43 @@ struct RAMBlock {
ram_addr_t postcopy_length;
};
+struct RamBlockAttributes {
+ Object parent;
+
+ RAMBlock *ram_block;
+
+ /* A set bit in the bitmap means the corresponding RAM is populated (shared) */
+ unsigned bitmap_size;
+ unsigned long *bitmap;
+
+ QLIST_HEAD(, RamDiscardListener) rdl_list;
+};
+
+/* @offset: the offset within the RAMBlock */
+int ram_block_discard_range(RAMBlock *rb, uint64_t offset, size_t length);
+/* @offset: the offset within the RAMBlock */
+int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t offset,
+ size_t length);
+
+RamBlockAttributes *ram_block_attributes_create(RAMBlock *ram_block);
+void ram_block_attributes_destroy(RamBlockAttributes *attr);
+int ram_block_attributes_state_change(RamBlockAttributes *attr, uint64_t offset,
+ uint64_t size, bool to_discard);
+
+/**
+ * ram_block_is_pmem: Whether the RAM block is of persistent memory
+ */
+bool ram_block_is_pmem(RAMBlock *rb);
+
+static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
+{
+    return (b && b->host) ? offset < b->used_length : false; /* NULL-safe */
+}
+
+static inline void *ramblock_ptr(RAMBlock *block, ram_addr_t offset)
+{
+    assert(offset_in_ramblock(block, offset)); /* offset must be valid */
+    return &((char *)block->host)[offset];
+}
+
#endif
diff --git a/include/system/runstate.h b/include/system/runstate.h
index bffc371..929379a 100644
--- a/include/system/runstate.h
+++ b/include/system/runstate.h
@@ -12,29 +12,76 @@ bool runstate_needs_reset(void);
void runstate_replay_enable(void);
typedef void VMChangeStateHandler(void *opaque, bool running, RunState state);
+typedef int VMChangeStateHandlerWithRet(void *opaque, bool running, RunState state);
+/**
+ * qemu_add_vm_change_state_handler:
+ * @cb: the callback to invoke
+ * @opaque: user data passed to the callback
+ *
+ * Register a callback function that is invoked when the vm starts or stops
+ * running.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
void *opaque);
+/**
+ * qemu_add_vm_change_state_handler_prio:
+ * @cb: the callback to invoke
+ * @opaque: user data passed to the callback
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ * true when the vm stops
+ *
+ * Register a callback function that is invoked when the vm starts or stops
+ * running.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
VMChangeStateHandler *cb, void *opaque, int priority);
+/**
+ * qemu_add_vm_change_state_handler_prio_full:
+ * @cb: the main callback to invoke
+ * @prepare_cb: a callback to invoke before the main callback
+ * @cb_ret: the main callback to invoke with return value
+ * @opaque: user data passed to the callbacks
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ * true when the vm stops
+ *
+ * Register a main callback function and an optional prepare callback function
+ * that are invoked when the vm starts or stops running. The main callback and
+ * the prepare callback are called in two separate phases: First all prepare
+ * callbacks are called and only then all main callbacks are called. As its
+ * name suggests, the prepare callback can be used to do some preparatory work
+ * before invoking the main callback.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
VMChangeStateEntry *
qemu_add_vm_change_state_handler_prio_full(VMChangeStateHandler *cb,
VMChangeStateHandler *prepare_cb,
+ VMChangeStateHandlerWithRet *cb_ret,
void *opaque, int priority);
VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev,
VMChangeStateHandler *cb,
+ VMChangeStateHandlerWithRet *cb_ret,
void *opaque);
VMChangeStateEntry *qdev_add_vm_change_state_handler_full(
- DeviceState *dev, VMChangeStateHandler *cb,
- VMChangeStateHandler *prepare_cb, void *opaque);
+ DeviceState *dev, VMChangeStateHandler *cb, VMChangeStateHandler *prepare_cb,
+ VMChangeStateHandlerWithRet *cb_ret, void *opaque);
void qemu_del_vm_change_state_handler(VMChangeStateEntry *e);
/**
* vm_state_notify: Notify the state of the VM
*
* @running: whether the VM is running or not.
* @state: the #RunState of the VM.
+ *
+ * Returns the result of the callback that has a return value.
+ * If no callback has a return value, 0 is still returned and the
+ * upper layer should not do additional processing.
*/
-void vm_state_notify(bool running, RunState state);
+int vm_state_notify(bool running, RunState state);
static inline bool shutdown_caused_by_guest(ShutdownCause cause)
{
@@ -100,6 +147,7 @@ void qemu_system_vmstop_request(RunState reason);
void qemu_system_vmstop_request_prepare(void);
bool qemu_vmstop_requested(RunState *r);
ShutdownCause qemu_shutdown_requested_get(void);
+bool qemu_force_shutdown_requested(void);
ShutdownCause qemu_reset_requested_get(void);
void qemu_system_killed(int signal, pid_t pid);
void qemu_system_reset(ShutdownCause reason);
diff --git a/include/system/system.h b/include/system/system.h
index a7effe7..03a2d0e 100644
--- a/include/system/system.h
+++ b/include/system/system.h
@@ -42,7 +42,6 @@ extern int graphic_height;
extern int graphic_depth;
extern int display_opengl;
extern const char *keyboard_layout;
-extern int old_param;
extern uint8_t *boot_splash_filedata;
extern bool enable_cpu_pm;
extern QEMUClockType rtc_clock;
diff --git a/include/system/vhost-user-backend.h b/include/system/vhost-user-backend.h
index 5ed953c..5634ebd 100644
--- a/include/system/vhost-user-backend.h
+++ b/include/system/vhost-user-backend.h
@@ -43,6 +43,6 @@ struct VhostUserBackend {
int vhost_user_backend_dev_init(VhostUserBackend *b, VirtIODevice *vdev,
unsigned nvqs, Error **errp);
void vhost_user_backend_start(VhostUserBackend *b);
-void vhost_user_backend_stop(VhostUserBackend *b);
+int vhost_user_backend_stop(VhostUserBackend *b);
#endif
diff --git a/include/system/whpx.h b/include/system/whpx.h
index 00ff409..00f6a3e 100644
--- a/include/system/whpx.h
+++ b/include/system/whpx.h
@@ -16,19 +16,20 @@
#define QEMU_WHPX_H
#ifdef COMPILING_PER_TARGET
+# ifdef CONFIG_WHPX
+# define CONFIG_WHPX_IS_POSSIBLE
+# endif /* CONFIG_WHPX */
+#else
+# define CONFIG_WHPX_IS_POSSIBLE
+#endif /* COMPILING_PER_TARGET */
-#ifdef CONFIG_WHPX
-
-int whpx_enabled(void);
+#ifdef CONFIG_WHPX_IS_POSSIBLE
+extern bool whpx_allowed;
+#define whpx_enabled() (whpx_allowed)
bool whpx_apic_in_platform(void);
-
-#else /* CONFIG_WHPX */
-
-#define whpx_enabled() (0)
+#else /* !CONFIG_WHPX_IS_POSSIBLE */
+#define whpx_enabled() 0
#define whpx_apic_in_platform() (0)
-
-#endif /* CONFIG_WHPX */
-
-#endif /* COMPILING_PER_TARGET */
+#endif /* !CONFIG_WHPX_IS_POSSIBLE */
#endif /* QEMU_WHPX_H */